import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score,confusion_matrix
import matplotlib.pyplot as plt
# Read the loan dataset.
data = pd.read_csv('data loan.csv')

# Treat the null values.
# These columns are filled with their most frequent value.  Series.mode()
# returns a Series (several entries on ties, empty when the column is entirely
# null), so its first entry is taken as the scalar fill value instead of
# broadcasting the whole Series against the column.
for column in ('Gender', 'Married', 'Dependents', 'Self_Employed'):
    modes = data[column].mode()
    if not modes.empty:
        data[column] = data[column].fillna(modes.iloc[0])

# These columns are filled with their median.
for column in ('LoanAmount', 'Loan_Amount_Term', 'Credit_History'):
    data[column] = data[column].fillna(data[column].median())
# print(data.info())
# Label-encode the columns below, then train and evaluate a decision tree.
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score

label_encoder = LabelEncoder()
# fit_transform() refits the encoder on every call, so a single instance is
# reused for all columns; each column is overwritten with its integer codes.
# 'Self_Employed' is encoded in place (previously the result was stored under
# the misspelled name 'Self_Emplyed', leaving the real column unencoded).
for column in ('Loan_ID', 'Gender', 'Married', 'Dependents', 'Education',
               'Self_Employed', 'Property_Area', 'Loan_Status'):
    data[column] = label_encoder.fit_transform(data[column])
# print(data['Loan_Status'].value_counts())

# Feature matrix and boolean target (True where the encoded status equals 1).
X = data[['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
          'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
          'Loan_Amount_Term', 'Credit_History', 'Property_Area']]
Y = data['Loan_Status'] == 1

# Hold out 30% of the rows for testing; the split is seeded for repeatability.
x_train, x_test, y_train, ytest = train_test_split(
    X, Y, test_size=0.3, random_state=1)

# Seed the tree as well so the printed scores are reproducible across runs.
clf_model = DecisionTreeClassifier(random_state=1)
clf_model.fit(x_train, y_train)
print("classifier decision tree score: ",clf_model.score(x_test,ytest))

# Predictions for every row (training rows included), printed for inspection.
Y_pred = clf_model.predict(X)
print("predict value : ",Y_pred)

# Evaluate on the held-out rows only.
Y_pred = clf_model.predict(x_test)
print("accuracy",accuracy_score(ytest,Y_pred))
cm = confusion_matrix(ytest, Y_pred)
print("confusion matrix: ",cm)
# Plot the fitted decision tree and the dataset distributions.
from sklearn import tree
import seaborn as sns

# fontsize must be an integer; a string such as '5' is not a valid value.
tree.plot_tree(clf_model, fontsize=5)
text_rep = tree.export_text(clf_model)
print("decision tree",(text_rep))

# Histograms of the numeric columns (drawn on their own figure).
data.hist(figsize=(15,12))

# Count plots of the columns below, split by loan status.  Each plot gets its
# own subplot; previously all six were drawn onto the same current axes and
# overwrote one another.  Columns are passed by name with data= so the call
# does not depend on countplot's positional-argument order.
count_columns = ('Education', 'Married', 'Gender',
                 'Self_Employed', 'Property_Area', 'Dependents')
fig, axes = plt.subplots(2, 3, figsize=(15, 8))
for ax, column in zip(axes.ravel(), count_columns):
    sns.countplot(x=column, hue='Loan_Status', data=data, ax=ax)
fig.tight_layout()
plt.show()