Loan Prediction Notebook
Jupyter notebook for loan prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_csv("/Users/SPECTRE/Downloads/archive-3/train.csv")
pd.set_option('display.max_columns', None)
data.head()
ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 65087372 | 10000 | 32236 | 12329.36286 | 59 | BAT2522922 | 11.135007 | B | C4 | MORTGAGE | 176346.62670 | Not Verified | n | Debt Consolidation | 16.284758 | 1 | 0 | 13 | 0 | 24246 | 74.932551 | 7 | w | 2929.646315 | 0.102055 | 2.498291 | 0.793724 | 0 | INDIVIDUAL | 49 | 0 | 31 | 311301 | 6619 | 0 |
1 | 1450153 | 3609 | 11940 | 12191.99692 | 59 | BAT1586599 | 12.237563 | C | D3 | RENT | 39833.92100 | Source Verified | n | Debt consolidation | 15.412409 | 0 | 0 | 12 | 0 | 812 | 78.297186 | 13 | f | 772.769385 | 0.036181 | 2.377215 | 0.974821 | 0 | INDIVIDUAL | 109 | 0 | 53 | 182610 | 20885 | 0 |
2 | 1969101 | 28276 | 9311 | 21603.22455 | 59 | BAT2136391 | 12.545884 | F | D4 | MORTGAGE | 91506.69105 | Source Verified | n | Debt Consolidation | 28.137619 | 0 | 0 | 14 | 0 | 1843 | 2.073040 | 20 | w | 863.324396 | 18.778660 | 4.316277 | 1.020075 | 0 | INDIVIDUAL | 66 | 0 | 34 | 89801 | 26155 | 0 |
3 | 6651430 | 11170 | 6954 | 17877.15585 | 59 | BAT2428731 | 16.731201 | C | C3 | MORTGAGE | 108286.57590 | Source Verified | n | Debt consolidation | 18.043730 | 1 | 0 | 7 | 0 | 13819 | 67.467951 | 12 | w | 288.173196 | 0.044131 | 0.107020 | 0.749971 | 0 | INDIVIDUAL | 39 | 0 | 40 | 9189 | 60214 | 0 |
4 | 14354669 | 16890 | 13226 | 13539.92667 | 59 | BAT5341619 | 15.008300 | C | D4 | MORTGAGE | 44234.82545 | Source Verified | n | Credit card refinancing | 17.209886 | 1 | 3 | 13 | 1 | 1544 | 85.250761 | 22 | w | 129.239553 | 19.306646 | 1294.818751 | 0.368953 | 0 | INDIVIDUAL | 18 | 0 | 430 | 126029 | 22579 | 0 |
data.head()['Inquires - six months']
0 0 1 0 2 0 3 0 4 3 Name: Inquires - six months, dtype: int64
# plot recoveries vs. loan status
data.groupby('Loan Status')['Recoveries'].median().plot(kind='bar')
<AxesSubplot:xlabel='Loan Status'>
data.groupby("Loan Status").mean().round(2)
# This shows how similar the statistics of the two classes are. There are two likely reasons:
# 1. the dataset is highly randomized, which makes strong correlations with the label unlikely;
# 2. the classes are imbalanced (class 1 is underrepresented).
ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Interest Rate | Home Ownership | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loan Status | |||||||||||||||||||||||||
0 | 25624430.71 | 16860.85 | 15767.05 | 14622.00 | 58.18 | 11.84 | 80488.08 | 23.31 | 0.32 | 0.15 | 14.28 | 0.08 | 7702.03 | 52.86 | 18.63 | 2067.80 | 1.13 | 59.77 | 1.13 | 0.02 | 71.08 | 0.0 | 144.59 | 159137.67 | 23113.30 |
1 | 25658773.35 | 16731.67 | 15805.42 | 14619.87 | 58.14 | 11.88 | 81065.54 | 23.22 | 0.35 | 0.15 | 14.13 | 0.09 | 7673.00 | 53.18 | 18.63 | 2080.68 | 1.30 | 58.96 | 1.08 | 0.02 | 71.99 | 0.0 | 164.87 | 163853.53 | 23218.24 |
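To quantify the second reason above (class imbalance), the class balance can be checked directly; a minimal sketch using the data frame loaded above:
# share of each Loan Status class in the raw data
data['Loan Status'].value_counts(normalize=True)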
# plot a bar plot of Initial List Status by the number of 0s and 1s in Loan Status
data.groupby('Initial List Status')['Loan Status'].value_counts().unstack().plot(kind='bar')
<AxesSubplot:xlabel='Initial List Status'>
data.groupby("Application Type").sum()["Loan Status"]
Application Type INDIVIDUAL 6230 JOINT 11 Name: Loan Status, dtype: int64
data.groupby("Verification Status").sum()["Loan Status"]
Verification Status Not Verified 1506 Source Verified 3085 Verified 1650 Name: Loan Status, dtype: int64
# revolving utilization and revolving balance compared to loan status
data.groupby('Loan Status')[['Revolving Balance', 'Revolving Utilities']].median().plot(kind='bar', color=['lightgreen', 'lightblue'])
<AxesSubplot:xlabel='Loan Status'>
data.groupby('Loan Status')['Revolving Balance'].median().plot(kind='bar', color=['lightgreen'])
<AxesSubplot:xlabel='Loan Status'>
data.groupby('Loan Status')['Revolving Utilities'].median().plot(kind='bar', color=['lightblue'])
<AxesSubplot:xlabel='Loan Status'>
# inquiries 6 months vs loan status
data.groupby('Loan Status')['Inquires - six months'].mean().plot(kind='bar', color = 'purple')
<AxesSubplot:xlabel='Loan Status'>
# inquiries 6 months value counts
data['Inquires - six months'].value_counts()
0 60486 1 4558 2 2042 3 320 4 54 5 3 Name: Inquires - six months, dtype: int64
def drop_useless(df):
    # drop the identifier column and other columns judged not useful for modeling
    df = df.copy()
    df = df.drop(columns=['ID', 'Term', 'Payment Plan', 'Open Account', 'Batch Enrolled',
                          'Accounts Delinquent', 'Total Accounts', 'Recoveries', 'Initial List Status',
                          'Collection Recovery Fee', 'Last week Pay', 'Revolving Utilities', 'Funded Amount',
                          'Revolving Balance', 'Inquires - six months', 'Loan Title'])
    return df
EDA¶
data_clean = drop_useless(data)
data_clean.head()
Loan Amount | Funded Amount Investor | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Debit to Income | Delinquency - two years | Public Record | Total Received Interest | Total Received Late Fee | Collection 12 months Medical | Application Type | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10000 | 12329.36286 | 11.135007 | B | C4 | MORTGAGE | 176346.62670 | Not Verified | 16.284758 | 1 | 0 | 2929.646315 | 0.102055 | 0 | INDIVIDUAL | 31 | 311301 | 6619 | 0 |
1 | 3609 | 12191.99692 | 12.237563 | C | D3 | RENT | 39833.92100 | Source Verified | 15.412409 | 0 | 0 | 772.769385 | 0.036181 | 0 | INDIVIDUAL | 53 | 182610 | 20885 | 0 |
2 | 28276 | 21603.22455 | 12.545884 | F | D4 | MORTGAGE | 91506.69105 | Source Verified | 28.137619 | 0 | 0 | 863.324396 | 18.778660 | 0 | INDIVIDUAL | 34 | 89801 | 26155 | 0 |
3 | 11170 | 17877.15585 | 16.731201 | C | C3 | MORTGAGE | 108286.57590 | Source Verified | 18.043730 | 1 | 0 | 288.173196 | 0.044131 | 0 | INDIVIDUAL | 40 | 9189 | 60214 | 0 |
4 | 16890 | 13539.92667 | 15.008300 | C | D4 | MORTGAGE | 44234.82545 | Source Verified | 17.209886 | 1 | 1 | 129.239553 | 19.306646 | 0 | INDIVIDUAL | 430 | 126029 | 22579 | 0 |
data_clean.columns
Index(['Loan Amount', 'Funded Amount Investor', 'Interest Rate', 'Grade', 'Sub Grade', 'Employment Duration', 'Home Ownership', 'Verification Status', 'Debit to Income', 'Delinquency - two years', 'Public Record', 'Total Received Interest', 'Total Received Late Fee', 'Collection 12 months Medical', 'Application Type', 'Total Collection Amount', 'Total Current Balance', 'Total Revolving Credit Limit', 'Loan Status'], dtype='object')
UNIVARIATE ANALYSIS¶
# plot histogram of all numerical columns
data_clean.hist(figsize=(20, 20))
array([[<AxesSubplot:title={'center':'Loan Amount'}>, <AxesSubplot:title={'center':'Funded Amount Investor'}>, <AxesSubplot:title={'center':'Interest Rate'}>, <AxesSubplot:title={'center':'Home Ownership'}>], [<AxesSubplot:title={'center':'Debit to Income'}>, <AxesSubplot:title={'center':'Delinquency - two years'}>, <AxesSubplot:title={'center':'Public Record'}>, <AxesSubplot:title={'center':'Total Received Interest'}>], [<AxesSubplot:title={'center':'Total Received Late Fee'}>, <AxesSubplot:title={'center':'Collection 12 months Medical'}>, <AxesSubplot:title={'center':'Total Collection Amount'}>, <AxesSubplot:title={'center':'Total Current Balance'}>], [<AxesSubplot:title={'center':'Total Revolving Credit Limit'}>, <AxesSubplot:title={'center':'Loan Status'}>, <AxesSubplot:>, <AxesSubplot:>]], dtype=object)
Interesting findings from the histograms¶
Loan Status has a large discrepancy between the two classes, so when training the model we will reduce the number of class-0 rows (undersampling) to limit the bias towards the 0 class.
Total Current Balance, Total Revolving Credit Limit, Total Collection Amount, Total Received Late Fee and Total Received Interest are highly skewed. We will try to reduce the skewness of these variables.
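One way to check and reduce that skew is a log transform; a sketch that is not applied in the notebook and assumes these columns are non-negative:
skewed_cols = ['Total Current Balance', 'Total Revolving Credit Limit',
               'Total Collection Amount', 'Total Received Late Fee', 'Total Received Interest']
# skewness before and after a log1p transform (log1p handles the zero values in these columns)
print(data_clean[skewed_cols].skew())
print(np.log1p(data_clean[skewed_cols]).skew())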
# Box plots of `Total Current Balance`, `Total Revolving Credit Limit`, `Total Collection Amount`, `Total Received Late Fee` and `Total Received Interest` using seaborn and subplots
plt.figure(figsize=(20, 20))
plt.subplot(3, 2, 1)
sns.boxplot(x=data_clean['Total Current Balance'])
plt.subplot(3, 2, 2)
sns.boxplot(x=data_clean['Total Revolving Credit Limit'])
plt.subplot(3, 2, 3)
sns.boxplot(x=data_clean['Total Collection Amount'])
plt.subplot(3, 2, 4)
sns.boxplot(x=data_clean['Total Received Late Fee'])
plt.subplot(3, 2, 5)
sns.boxplot(x=data_clean['Total Received Interest'])
<AxesSubplot:xlabel='Total Received Interest'>
BIVARIATE ANALYSIS¶
# group by Grade and plot the counts of 0s and 1s in Loan Status for each grade
data_clean.groupby('Grade')['Loan Status'].value_counts().unstack().plot(kind='bar', color=['lightgreen', 'lightblue'])
# rename legend 0 to not approved and 1 to approved
plt.legend(['Not Approved', 'Approved'])
<matplotlib.legend.Legend at 0x7feed40c91f0>
# get the percentage of 0s and 1s for each grade
data_clean.groupby('Grade')['Loan Status'].value_counts(normalize=True).unstack()
Loan Status | 0 | 1 |
---|---|---|
Grade | ||
A | 0.908752 | 0.091248 |
B | 0.912763 | 0.087237 |
C | 0.906104 | 0.093896 |
D | 0.903620 | 0.096380 |
E | 0.904127 | 0.095873 |
F | 0.896260 | 0.103740 |
G | 0.893651 | 0.106349 |
# Group by Grade, count the number of 0s and 1s for each grade, and get the percentage of 1s per grade
temp = data_clean.groupby('Grade')['Loan Status'].value_counts()
# get the percentages of 1s for each grade
temp = temp.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
# plot all the 1s percentages for each grade
temp.loc[temp.index.get_level_values(1) == 1].plot(kind='bar')
<AxesSubplot:xlabel='Grade,Loan Status'>
# groupby grade and subgrade
temp = data_clean.groupby(['Grade', 'Sub Grade'])['Loan Status'].value_counts()
# one subplot per grade, plotting the counts of Loan Status == 1 for each sub grade
fig = plt.figure(figsize=(20, 60))
for i, g in enumerate(['A', 'B', 'C', 'D', 'E', 'F', 'G'], start=1):
    plt.subplot(7, 1, i)
    temp.loc[g].loc[temp.loc[g].index.get_level_values(1) == 1].plot(kind='bar')
    plt.title('Grade ' + g)
Text(0.5, 1.0, 'Grade G')
# plot application type vs loan status as percentage of 1s for each application type
data_clean.groupby('Application Type')['Loan Status'].value_counts(normalize=True).unstack().plot(kind='bar')
<AxesSubplot:xlabel='Application Type'>
# undersample the majority class (0) down to the size of the minority class (1)
zeros = data_clean.loc[data_clean["Loan Status"] == 0]
ones = data_clean.loc[data_clean["Loan Status"] == 1]
balanced = zeros.sample(ones.shape[0])
balanced_df = pd.concat([ones, balanced], axis=0).reset_index()
# delinquency 2 years vs loan status, one group of bars for each Loan Status class
data_clean.groupby('Loan Status')['Delinquency - two years'].value_counts().unstack().plot(kind='bar', figsize=(20, 10))
# title
plt.title('Delinquency - two years vs Loan Status')
# rename the legend name
Text(0.5, 1.0, 'Delinquency - two years vs Loan Status')
# delinquency 2 years vs loan status on the balanced data, one group of bars for each Loan Status class
balanced_df.groupby('Loan Status')['Delinquency - two years'].value_counts().unstack().plot(kind='bar', figsize=(20, 10))
# title
plt.title('Delinquency - two years vs Loan Status')
# rename the legend name
Text(0.5, 1.0, 'Delinquency - two years vs Loan Status')
data_clean[data_clean['Loan Status'] == 0].groupby('Delinquency - two years')['Loan Status'].value_counts()/data_clean[data_clean['Loan Status'] == 0].shape[0]
Delinquency - two years  Loan Status
0                        0              0.772222
1                        0              0.173859
2                        0              0.039136
3                        0              0.006550
4                        0              0.000212
5                        0              0.001062
6                        0              0.002679
7                        0              0.003675
8                        0              0.000604
Name: Loan Status, dtype: float64
data_clean[data_clean['Loan Status'] == 1].groupby('Delinquency - two years')['Loan Status'].value_counts()/data_clean[data_clean['Loan Status'] == 1].shape[0]
Delinquency - two years  Loan Status
0                        1              0.765422
1                        1              0.174972
2                        1              0.040859
3                        1              0.007050
4                        1              0.000481
5                        1              0.001442
6                        1              0.004326
7                        1              0.004326
8                        1              0.001122
Name: Loan Status, dtype: float64
# delinquency 2 years vs loan status, as percentages within each Loan Status class
temp = data_clean.groupby('Loan Status')['Delinquency - two years'].value_counts(normalize=True).unstack()
# add a row with the difference between the two classes
temp.loc["Diff Approved vs Not Approved"] = temp.loc[1] - temp.loc[0]
temp
Delinquency - two years | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
---|---|---|---|---|---|---|---|---|---|
Loan Status | |||||||||
0 | 0.772222 | 0.173859 | 0.039136 | 0.00655 | 0.000212 | 0.001062 | 0.002679 | 0.003675 | 0.000604 |
1 | 0.765422 | 0.174972 | 0.040859 | 0.00705 | 0.000481 | 0.001442 | 0.004326 | 0.004326 | 0.001122 |
Diff Approved vs Not Approved | -0.006800 | 0.001113 | 0.001723 | 0.00050 | 0.000268 | 0.000380 | 0.001647 | 0.000651 | 0.000517 |
# plot scatter of Total Revolving Credit Limit vs Total Collection Amount and color the points by loan status
plt.scatter(data_clean['Total Revolving Credit Limit'], data_clean['Total Collection Amount'], c=data_clean['Loan Status'])
# title
plt.title('Total Revolving Credit Limit vs Total Collection Amount')
# legend
plt.legend(['Not Approved', 'Approved'])
# axis labels
plt.xlabel('Total Revolving Credit Limit')
plt.ylabel('Total Collection Amount')
Text(0, 0.5, 'Total Collection Amount')
# plot scatter of Total Revolving Credit Limit vs Total Collection Amount and color the points by loan status
plt.scatter(balanced_df['Total Revolving Credit Limit'], balanced_df['Total Collection Amount'], c=balanced_df['Loan Status'])
# title
plt.title('Total Revolving Credit Limit vs Total Collection Amount')
# legend
plt.legend(['Not Approved', 'Approved'])
# axis labels
plt.xlabel('Total Revolving Credit Limit')
plt.ylabel('Total Collection Amount')
Text(0, 0.5, 'Total Collection Amount')
# Verification Status vs loan status
data_clean.groupby('Loan Status')['Verification Status'].value_counts().unstack().plot(kind='bar')
<AxesSubplot:xlabel='Loan Status'>
# Verification Status vs loan status on the balanced data
balanced_df.groupby('Loan Status')['Verification Status'].value_counts().unstack().plot(kind='bar')
<AxesSubplot:xlabel='Loan Status'>
# Loan Amount vs loan status
data_clean.groupby('Loan Status')['Loan Amount'].min().plot(kind='bar')
<AxesSubplot:xlabel='Loan Status'>
BASELINE MODEL¶
Models evaluated¶
For the baseline model we evaluate Logistic Regression, a Decision Tree Classifier, and a Random Forest Classifier. The features we use are the columns of data_clean listed above, minus the Loan Status label. We evaluate these models with the F1 score because this dataset has imbalanced classes; F1 combines precision and recall into a single score and so balances both measures. We perform basic one-hot encoding on the categorical features and leave all other features as they are.
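As a reminder, F1 is the harmonic mean of precision and recall; a tiny sanity check on toy labels (not from this dataset):
from sklearn.metrics import f1_score, precision_score, recall_score
y_true = [0, 0, 0, 1, 1]   # toy labels for illustration only
y_hat = [0, 0, 1, 1, 0]
p, r = precision_score(y_true, y_hat), recall_score(y_true, y_hat)
print(2 * p * r / (p + r), f1_score(y_true, y_hat))   # both print 0.5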
Logistic Regression assumes a linear relationship between the features and the log-odds of the label (Loan Status). The F1 score for this model is 0, which means it never predicts the positive class and is not useful here.
Decision Tree Classifier is a non-parametric supervised learning algorithm used for both classification and regression. Decision trees break complex data down into more manageable parts, can handle missing values automatically, capture non-linear relationships, and work with both numerical and categorical data. Tested against our unbalanced dataset, this model achieves an F1 score of 0.015, which is better than Logistic Regression.
Random Forest Classifier: random forests are an ensemble learning method for classification, regression and other tasks that operates by constructing many decision trees at training time. They sometimes overfit the data, but there are ways to tune them to limit overfitting. The F1 score for this model on our unbalanced dataset is 0.13.
The baseline models are fitted on the same training set. All hyperparameters are left at their defaults to test basic model performance.
Result¶
From the analysis above, we choose to continue with the Decision Tree and Random Forest classifiers. We aim to perform feature engineering, choose suitable hyperparameters, and apply further optimizations before selecting our final model.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV
X = data_clean.drop('Loan Status', axis=1)
y = data_clean['Loan Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
preproc = ColumnTransformer([
("one-hot", OneHotEncoder(handle_unknown="ignore"), ["Grade", "Sub Grade", "Employment Duration", "Verification Status", "Application Type"])
], remainder="passthrough")
rf = RandomForestClassifier() #Random Forest Classifier
log = LogisticRegression() #Logistic Regression
dt = DecisionTreeClassifier() #Decision Tree Classifier
# fit and evaluate each baseline model with the same preprocessing pipeline
for model in [rf, log, dt]:
    pipe = Pipeline([
        ("preprocessing", preproc),
        ("model", model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print("Model: ", model)
    print("Accuracy Score: ", accuracy_score(y_test, y_pred))
    print("F1 Score: ", f1_score(y_test, y_pred))
    print("Class 0:", (y_pred == 0).sum())
    print("Class 1:", (y_pred == 1).sum())
    print("")
Model:  RandomForestClassifier()
Accuracy Score:  0.9097309716149115
F1 Score:  0.0
Class 0: 13493
Class 1: 0

Model:  LogisticRegression()
Accuracy Score:  0.9097309716149115
F1 Score:  0.0
Class 0: 13493
Class 1: 0

Model:  DecisionTreeClassifier()
Accuracy Score:  0.8205736307715111
F1 Score:  0.10033444816053513
Class 0: 12020
Class 1: 1473
FEATURE ENGINEERING¶
OneHotEncoding - One-hot encoding converts categorical information into a numeric format that can be fed to machine learning algorithms. We one-hot encode the categorical features "Employment Duration", "Verification Status", "Application Type", "Delinquency - two years", "Collection 12 months Medical". Because these features have no inherent order and cannot be treated as ordinal, we choose one-hot encoding over other encoding methods.
Ordinal Encoding - We combine the columns Grade and Sub Grade into a new column called GradeSubGrade. Since these grade categories are ordinal, we prefer ordinal encoding over one-hot encoding or other methods; for example, the combined grade AF1 is worse than AA1. This also makes the model more efficient, since a single integer column uses less memory and is faster to process than the many columns one-hot encoding would create.
Since this dataset is highly imbalanced, we balance it by undersampling. Because in the real world the two loan-status outcomes are not equally likely, we try to keep the class split unequal yet close. We run a test and choose the most suitable balanced set.
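A sketch of what an "unequal yet close" undersampling could look like; ratio is a hypothetical knob, and the code below actually uses a strict one-to-one split:
# keep `ratio` times as many class-0 rows as class-1 rows, e.g. a 60/40 split for ratio = 1.5
ratio = 1.5
zeros_all = data_clean.loc[data_clean["Loan Status"] == 0]
ones_all = data_clean.loc[data_clean["Loan Status"] == 1]
candidate = pd.concat([ones_all, zeros_all.sample(int(ratio * ones_all.shape[0]), random_state=42)])
candidate["Loan Status"].value_counts(normalize=True)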
Combining grade and subgrade and ordinally encoding it¶
data = pd.read_csv("/Users/SPECTRE/Downloads/archive-3/train.csv")
train = drop_useless(data)
grade = train["Grade"].unique()
subgrade = train["Sub Grade"].unique()
train["GradeSubGrade"] = train["Grade"] + train["Sub Grade"]
train = train.drop(columns= ["Grade", "Sub Grade"])
# build every possible Grade + Sub Grade combination and rank the combinations alphabetically
features = []
for i in grade:
    for j in subgrade:
        features.append(i + j)
features = np.unique(features)
# map each combination to an integer rank starting at 1 (dicta is used as the replacement mapping below)
dicta = {}
k = 1
for i in features:
    dicta[i] = k
    k = k + 1
dicta
train["GradeSubGrade"] = train["GradeSubGrade"].replace(dicta)
train.head()
Loan Amount | Funded Amount Investor | Interest Rate | Employment Duration | Home Ownership | Verification Status | Debit to Income | Delinquency - two years | Public Record | Total Received Interest | Total Received Late Fee | Collection 12 months Medical | Application Type | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | GradeSubGrade | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10000 | 12329.36286 | 11.135007 | MORTGAGE | 176346.62670 | Not Verified | 16.284758 | 1 | 0 | 2929.646315 | 0.102055 | 0 | INDIVIDUAL | 31 | 311301 | 6619 | 0 | 49 |
1 | 3609 | 12191.99692 | 12.237563 | RENT | 39833.92100 | Source Verified | 15.412409 | 0 | 0 | 772.769385 | 0.036181 | 0 | INDIVIDUAL | 53 | 182610 | 20885 | 0 | 88 |
2 | 28276 | 21603.22455 | 12.545884 | MORTGAGE | 91506.69105 | Source Verified | 28.137619 | 0 | 0 | 863.324396 | 18.778660 | 0 | INDIVIDUAL | 34 | 89801 | 26155 | 0 | 194 |
3 | 11170 | 17877.15585 | 16.731201 | MORTGAGE | 108286.57590 | Source Verified | 18.043730 | 1 | 0 | 288.173196 | 0.044131 | 0 | INDIVIDUAL | 40 | 9189 | 60214 | 0 | 83 |
4 | 16890 | 13539.92667 | 15.008300 | MORTGAGE | 44234.82545 | Source Verified | 17.209886 | 1 | 1 | 129.239553 | 19.306646 | 0 | INDIVIDUAL | 430 | 126029 | 22579 | 0 | 89 |
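For reference, a more compact version of this mapping could use sklearn's OrdinalEncoder (already imported above); a sketch that assumes alphabetical order of the combined strings matches the intended ranking:
from sklearn.preprocessing import OrdinalEncoder
# note: unlike the hand-built dict, this only ranks combinations that actually occur in the data
combined = (data["Grade"] + data["Sub Grade"]).to_frame("GradeSubGrade")
enc = OrdinalEncoder()                   # categories default to the sorted unique values
codes = enc.fit_transform(combined) + 1  # shift so the first category maps to 1, like dicta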
BALANCING THE DATASET¶
zeros = train.loc[train["Loan Status"] == 0]
ones = train.loc[train["Loan Status"] == 1]
balanced = zeros.sample(ones.shape[0])
balanced_df = pd.concat([ones, balanced], axis = 0).reset_index()
balanced_df = balanced_df.drop(columns = ["index"])
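A quick check that the undersampled frame now has an even class split:
# both classes should now have the same number of rows
balanced_df["Loan Status"].value_counts()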
Feature Engineering via sklearn¶
preproc = ColumnTransformer([
("one-hot", OneHotEncoder(handle_unknown="ignore"), ["Employment Duration", "Verification Status", "Application Type"]),
("Std-scale", StandardScaler(), ['Total Collection Amount', 'Total Received Late Fee'])
], remainder="passthrough")
Random Forest Classifier¶
X = balanced_df.drop('Loan Status', axis=1)
y = balanced_df['Loan Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_pipe = Pipeline([
("preprocessing", preproc),
("rf", rf)
])
param_grid = {'rf__n_estimators' : [10, 30, 50, 75, 100],
              'rf__max_depth' : [2, 5, 10, 20, None],
              # note: min_samples_split=1 is invalid for sklearn trees, so those candidates fail and score as NaN
              'rf__min_samples_split' : [1, 2, 5, 10],
              'rf__class_weight' : ["balanced", "balanced_subsample", None]}
grid_search_rf = GridSearchCV(rf_pipe, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train, y_train)
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
375 of the 1500 fits failed with "ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1"; sklearn set the scores for those candidates to NaN and warned that some test scores are non-finite. The remaining candidates scored roughly 0.40-0.52 mean cross-validated F1.
GridSearchCV(cv=5, estimator=Pipeline(steps=[('preprocessing', ColumnTransformer(remainder='passthrough', transformers=[('one-hot', OneHotEncoder(handle_unknown='ignore'), ['Employment ' 'Duration', 'Verification ' 'Status', 'Application ' 'Type']), ('Std-scale', StandardScaler(), ['Total ' 'Collection ' 'Amount', 'Total ' 'Received ' 'Late ' 'Fee'])])), ('rf', RandomForestClassifier())]), n_jobs=-1, param_grid={'rf__class_weight': ['balanced', 'balanced_subsample', None], 'rf__max_depth': [2, 5, 10, 20, None], 'rf__min_samples_split': [1, 2, 5, 10], 'rf__n_estimators': [10, 30, 50, 75, 100]}, scoring='f1', verbose=1)
grid_search_rf.best_params_
{'rf__class_weight': 'balanced', 'rf__max_depth': 2, 'rf__min_samples_split': 2, 'rf__n_estimators': 30}
grid_search_rf.best_score_
0.5191041701889237
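To see how the other candidates compared, cv_results_ can be inspected as a DataFrame; a short sketch:
cv_results = pd.DataFrame(grid_search_rf.cv_results_)
# top candidates by mean cross-validated F1
cv_results.sort_values("mean_test_score", ascending=False)[["params", "mean_test_score", "std_test_score"]].head()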
# get accuracy on the test set (note: grid_search_rf.score returns the grid's scoring metric, F1, not accuracy)
grid_search_rf.score(X_test, y_test)
accuracy_score(y_test, grid_search_rf.predict(X_test))
0.5282338806567881
dt_pipe = Pipeline([
("preprocessing", preproc),
("dt", dt)
])
param_grid = {'dt__max_depth' : [2, 5, 10, 20, None],
              # note: min_samples_split=1 is invalid, so those candidates fail and score as NaN
              'dt__min_samples_split' : [1, 2, 5, 10],
              'dt__class_weight' : ["balanced", None]}
grid_search_dt = GridSearchCV(dt_pipe, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
grid_search_dt.fit(X_train, y_train)
Fitting 5 folds for each of 40 candidates, totalling 200 fits
50 of the 200 fits failed with the same "min_samples_split must be an integer greater than 1" ValueError; their scores were set to NaN, and sklearn warned that some test scores are non-finite. The remaining candidates scored roughly 0.37-0.58 mean cross-validated F1.
GridSearchCV(cv=5, estimator=Pipeline(steps=[('preprocessing', ColumnTransformer(remainder='passthrough', transformers=[('one-hot', OneHotEncoder(handle_unknown='ignore'), ['Employment ' 'Duration', 'Verification ' 'Status', 'Application ' 'Type']), ('Std-scale', StandardScaler(), ['Total ' 'Collection ' 'Amount', 'Total ' 'Received ' 'Late ' 'Fee'])])), ('dt', DecisionTreeClassifier())]), n_jobs=-1, param_grid={'dt__class_weight': ['balanced', None], 'dt__max_depth': [2, 5, 10, 20, None], 'dt__min_samples_split': [1, 2, 5, 10]}, scoring='f1', verbose=1)
grid_search_dt.best_params_
{'dt__class_weight': 'balanced', 'dt__max_depth': 2, 'dt__min_samples_split': 2}
grid_search_dt.best_score_
0.5801652170704724
# get the accuracy score of the tuned random forest
y_pred = grid_search_rf.predict(X_test)
accuracy_score(y_test, y_pred)
0.5282338806567881
# get recall
from sklearn.metrics import recall_score
recall_score(y_test, y_pred)
0.46280344557556774
# get precision
from sklearn.metrics import precision_score
precision_score(y_test, y_pred)
0.5457063711911357
# get AUC score
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)
0.5297623785254888
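These metrics can also be produced in a single call with classification_report; a sketch using the same y_test and y_pred as above:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=["Not Approved", "Approved"]))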
# export the dt model
import pickle
pickle.dump(grid_search_dt, open('dt_model.pkl', 'wb'))
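Loading the pickled model back is symmetric; a sketch using the file written above:
with open('dt_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
loaded_model.predict(X_test)[:5]   # the loaded GridSearchCV predicts like the original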
FINAL MODEL - DECISION TREE CLASSIFIER¶
final_pipe = Pipeline([
("preprocessing", preproc),
("dt", DecisionTreeClassifier(max_depth = 2, min_samples_split = 2, class_weight = "balanced"))
])
final_pipe.fit(X_train, y_train)
Pipeline(steps=[('preprocessing', ColumnTransformer(remainder='passthrough', transformers=[('one-hot', OneHotEncoder(handle_unknown='ignore'), ['Employment Duration', 'Verification Status', 'Application Type']), ('Std-scale', StandardScaler(), ['Total Collection Amount', 'Total Received Late ' 'Fee'])])), ('dt', DecisionTreeClassifier(class_weight='balanced', max_depth=2))])
f1_score(y_test, final_pipe.predict(X_test))
0.6049868766404198
tn, fp, fn, tp = confusion_matrix(y_test, final_pipe.predict(X_test)).ravel()
print("True negative ", tn)
print("True positive ", tp)
print("False negative ", fn)
print("False positive ", fp)
True negative 371 True positive 922 False negative 355 False positive 849
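As a consistency check, the reported F1 of about 0.605 can be recovered from these counts:
precision = tp / (tp + fp)                              # 922 / (922 + 849) ≈ 0.521
recall = tp / (tp + fn)                                 # 922 / (922 + 355) ≈ 0.722
print(2 * precision * recall / (precision + recall))    # ≈ 0.605, matching the F1 reported above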