#GO_STP_379
# In this task we predict whether an employee left the company (the 'left' column of HR_comma_sep.csv).
# This is a classification problem, solved here with logistic regression on a few employee features.
import pandas as pd

# Load the HR dataset; the path is resolved relative to the current working
# directory, so run the script from the folder that holds the CSV.
data = pd.read_csv('HR_comma_sep.csv')

# exploration of data
print("-------exploration of data------------")
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
data.info()
print(data.head())
# Label-encode the categorical columns so the model receives numeric inputs.
from sklearn.preprocessing import LabelEncoder

col = ['Department', 'salary']

# Keep one fitted encoder per column. Refitting a single shared encoder would
# overwrite its learned classes, making it impossible to map the codes of an
# earlier column back to the original labels with inverse_transform().
encoders = {}
for column_name in col:
    label_encoder = LabelEncoder()
    data[column_name] = label_encoder.fit_transform(data[column_name])
    encoders[column_name] = label_encoder
# After the loop `label_encoder` is the encoder fitted on the last column
# ('salary'), which is what the name referred to before this rewrite.

# NOTE(review): LabelEncoder numbers the classes in sorted order, which is not
# necessarily a meaningful ranking for 'salary' -- confirm this is acceptable.
print("after the laber encoder : \n", data)
# Train and evaluate a logistic-regression classifier for the 'left' column.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score

# Feature matrix and target vector.
ft = data[['Department', 'satisfaction_level', 'salary']]
label = data['left']

# A fixed seed makes the shuffle reproducible, so the metrics printed below
# and the model fitted here are the same on every run.
xtrain, xtest, ytrain, ytest = train_test_split(ft, label, random_state=42)

my_model = LogisticRegression()
my_model.fit(xtrain, ytrain)

# Predicted labels for the held-out rows (compared against ytest below).
y_pred = my_model.predict(xtest)

cm = confusion_matrix(ytest, y_pred)
print("confusion matrix: ", cm)
print("accuracy socre: ", accuracy_score(ytest, y_pred))
# This last figure is the mean accuracy on the *training* split, not on the
# held-out split; compare it with the value above to spot over-fitting.
print("socre: ", my_model.score(xtrain, ytrain))
# visualization of data: a 2x2 grid of scatter plots.
import matplotlib.pyplot as plt

# Panel 1: true labels of the held-out split against the model's predictions.
plt.subplot(2, 2, 1)
# The scatter needs a label: plt.legend() with no labelled artist only emits
# a warning and draws an empty legend.
plt.scatter(ytest, y_pred, marker='+', label='actual vs predicted')
# The x-axis carries ytest (the true labels), not xtest.
plt.xlabel('ytest')
plt.ylabel('y prediction')
plt.legend()
plt.title('Prediction of company')

# Panels 2-4: one column each plotted against the 'left' target. The caption
# doubles as legend entry and subplot title.
feature_panels = [
    ('salary', 'salary and left'),
    ('satisfaction_level', 'satisfaction level and left'),
    ('time_spend_company', 'time_spend_company and left'),
]
for position, (column_name, caption) in enumerate(feature_panels, start=2):
    plt.subplot(2, 2, position)
    plt.scatter(x=data[column_name], y=data['left'], label=caption)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.legend()
    plt.title(caption)

plt.show()
# Persist the fitted logistic-regression model to disk and load it back,
# once with pickle and once with joblib.
#
# `from sklearn.externals import joblib` was dropped here: that module no
# longer exists in current scikit-learn releases (the import raises
# ImportError), and the name was immediately rebound to the standalone
# `joblib` package by the next import anyway.
import joblib as joblib
import pickle

# Save with pickle. The body of each `with` must be indented; the context
# manager closes the file as soon as the body finishes.
with open('model_save', 'wb') as file:
    pickle.dump(my_model, file)

# load model and prediction
# NOTE: unpickling executes code from the file -- only load files you trust.
with open('model_save', 'rb') as file:
    newmodel = pickle.load(file)
# newmodel.coef_

# Same round trip using joblib.
joblib.dump(my_model, 'model_joblib')
mymodel = joblib.load('model_joblib')

print("my model: ", mymodel)
print("new model: ", newmodel)
# `file` is the already-closed handle from the last `with`; this prints its repr.
print("file is :", file)