Loan Prediction Notebook
Jupyter notebook for loan prediction
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_csv("/Users/SPECTRE/Downloads/archive-3/train.csv")
pd.set_option('display.max_columns', None)
data.head()
ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Batch Enrolled | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Payment Plan | Loan Title | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Initial List Status | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Application Type | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 65087372 | 10000 | 32236 | 12329.36286 | 59 | BAT2522922 | 11.135007 | B | C4 | MORTGAGE | 176346.62670 | Not Verified | n | Debt Consolidation | 16.284758 | 1 | 0 | 13 | 0 | 24246 | 74.932551 | 7 | w | 2929.646315 | 0.102055 | 2.498291 | 0.793724 | 0 | INDIVIDUAL | 49 | 0 | 31 | 311301 | 6619 | 0 |
1 | 1450153 | 3609 | 11940 | 12191.99692 | 59 | BAT1586599 | 12.237563 | C | D3 | RENT | 39833.92100 | Source Verified | n | Debt consolidation | 15.412409 | 0 | 0 | 12 | 0 | 812 | 78.297186 | 13 | f | 772.769385 | 0.036181 | 2.377215 | 0.974821 | 0 | INDIVIDUAL | 109 | 0 | 53 | 182610 | 20885 | 0 |
2 | 1969101 | 28276 | 9311 | 21603.22455 | 59 | BAT2136391 | 12.545884 | F | D4 | MORTGAGE | 91506.69105 | Source Verified | n | Debt Consolidation | 28.137619 | 0 | 0 | 14 | 0 | 1843 | 2.073040 | 20 | w | 863.324396 | 18.778660 | 4.316277 | 1.020075 | 0 | INDIVIDUAL | 66 | 0 | 34 | 89801 | 26155 | 0 |
3 | 6651430 | 11170 | 6954 | 17877.15585 | 59 | BAT2428731 | 16.731201 | C | C3 | MORTGAGE | 108286.57590 | Source Verified | n | Debt consolidation | 18.043730 | 1 | 0 | 7 | 0 | 13819 | 67.467951 | 12 | w | 288.173196 | 0.044131 | 0.107020 | 0.749971 | 0 | INDIVIDUAL | 39 | 0 | 40 | 9189 | 60214 | 0 |
4 | 14354669 | 16890 | 13226 | 13539.92667 | 59 | BAT5341619 | 15.008300 | C | D4 | MORTGAGE | 44234.82545 | Source Verified | n | Credit card refinancing | 17.209886 | 1 | 3 | 13 | 1 | 1544 | 85.250761 | 22 | w | 129.239553 | 19.306646 | 1294.818751 | 0.368953 | 0 | INDIVIDUAL | 18 | 0 | 430 | 126029 | 22579 | 0 |
data.head()['Inquires - six months']
0 0 1 0 2 0 3 0 4 3 Name: Inquires - six months, dtype: int64
# plot recoveries vs. loan status
data.groupby('Loan Status')['Recoveries'].median().plot(kind='bar')
<AxesSubplot:xlabel='Loan Status'>
data.groupby("Loan Status").mean().round(2)
# This shows how similar the statistics of the two classes are. There are two likely reasons:
# 1. the dataset is highly randomized, which makes strong correlations with the label unlikely;
# 2. the classes are imbalanced (class 1 is underrepresented).
ID | Loan Amount | Funded Amount | Funded Amount Investor | Term | Interest Rate | Home Ownership | Debit to Income | Delinquency - two years | Inquires - six months | Open Account | Public Record | Revolving Balance | Revolving Utilities | Total Accounts | Total Received Interest | Total Received Late Fee | Recoveries | Collection Recovery Fee | Collection 12 months Medical | Last week Pay | Accounts Delinquent | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
Loan Status | |||||||||||||||||||||||||
0 | 25624430.71 | 16860.85 | 15767.05 | 14622.00 | 58.18 | 11.84 | 80488.08 | 23.31 | 0.32 | 0.15 | 14.28 | 0.08 | 7702.03 | 52.86 | 18.63 | 2067.80 | 1.13 | 59.77 | 1.13 | 0.02 | 71.08 | 0.0 | 144.59 | 159137.67 | 23113.30 |
1 | 25658773.35 | 16731.67 | 15805.42 | 14619.87 | 58.14 | 11.88 | 81065.54 | 23.22 | 0.35 | 0.15 | 14.13 | 0.09 | 7673.00 | 53.18 | 18.63 | 2080.68 | 1.30 | 58.96 | 1.08 | 0.02 | 71.99 | 0.0 | 164.87 | 163853.53 | 23218.24 |
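To quantify the second reason above (class imbalance), the class balance can be checked directly; a minimal sketch using the data frame loaded above:
# share of each Loan Status class in the raw data
data['Loan Status'].value_counts(normalize=True)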
# plot a bar plot of Initial List Status by the number of 0s and 1s in Loan Status
data.groupby('Initial List Status')['Loan Status'].value_counts().unstack().plot(kind='bar')
<AxesSubplot:xlabel='Initial List Status'>
data.groupby("Application Type").sum()["Loan Status"]
Application Type INDIVIDUAL 6230 JOINT 11 Name: Loan Status, dtype: int64
data.groupby("Verification Status").sum()["Loan Status"]
Verification Status Not Verified 1506 Source Verified 3085 Verified 1650 Name: Loan Status, dtype: int64
# revolving utilization and revolving balance compared to loan status
data.groupby('Loan Status')[['Revolving Balance', 'Revolving Utilities']].median().plot(kind='bar', color=['lightgreen', 'lightblue'])
<AxesSubplot:xlabel='Loan Status'>
data.groupby('Loan Status')['Revolving Balance'].median().plot(kind='bar', color=['lightgreen'])
<AxesSubplot:xlabel='Loan Status'>
data.groupby('Loan Status')['Revolving Utilities'].median().plot(kind='bar', color=['lightblue'])
<AxesSubplot:xlabel='Loan Status'>
# inquiries 6 months vs loan status
data.groupby('Loan Status')['Inquires - six months'].mean().plot(kind='bar', color = 'purple')
<AxesSubplot:xlabel='Loan Status'>
# inquiries 6 months value counts
data['Inquires - six months'].value_counts()
0 60486 1 4558 2 2042 3 320 4 54 5 3 Name: Inquires - six months, dtype: int64
def drop_useless(df):
    # drop the identifier column and other columns judged not useful for modeling
    df = df.copy()
    df = df.drop(columns=['ID', 'Term', 'Payment Plan', 'Open Account', 'Batch Enrolled',
                          'Accounts Delinquent', 'Total Accounts', 'Recoveries', 'Initial List Status',
                          'Collection Recovery Fee', 'Last week Pay', 'Revolving Utilities', 'Funded Amount',
                          'Revolving Balance', 'Inquires - six months', 'Loan Title'])
    return df
EDA¶
data_clean = drop_useless(data)
data_clean.head()
Loan Amount | Funded Amount Investor | Interest Rate | Grade | Sub Grade | Employment Duration | Home Ownership | Verification Status | Debit to Income | Delinquency - two years | Public Record | Total Received Interest | Total Received Late Fee | Collection 12 months Medical | Application Type | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10000 | 12329.36286 | 11.135007 | B | C4 | MORTGAGE | 176346.62670 | Not Verified | 16.284758 | 1 | 0 | 2929.646315 | 0.102055 | 0 | INDIVIDUAL | 31 | 311301 | 6619 | 0 |
1 | 3609 | 12191.99692 | 12.237563 | C | D3 | RENT | 39833.92100 | Source Verified | 15.412409 | 0 | 0 | 772.769385 | 0.036181 | 0 | INDIVIDUAL | 53 | 182610 | 20885 | 0 |
2 | 28276 | 21603.22455 | 12.545884 | F | D4 | MORTGAGE | 91506.69105 | Source Verified | 28.137619 | 0 | 0 | 863.324396 | 18.778660 | 0 | INDIVIDUAL | 34 | 89801 | 26155 | 0 |
3 | 11170 | 17877.15585 | 16.731201 | C | C3 | MORTGAGE | 108286.57590 | Source Verified | 18.043730 | 1 | 0 | 288.173196 | 0.044131 | 0 | INDIVIDUAL | 40 | 9189 | 60214 | 0 |
4 | 16890 | 13539.92667 | 15.008300 | C | D4 | MORTGAGE | 44234.82545 | Source Verified | 17.209886 | 1 | 1 | 129.239553 | 19.306646 | 0 | INDIVIDUAL | 430 | 126029 | 22579 | 0 |
data_clean.columns
Index(['Loan Amount', 'Funded Amount Investor', 'Interest Rate', 'Grade', 'Sub Grade', 'Employment Duration', 'Home Ownership', 'Verification Status', 'Debit to Income', 'Delinquency - two years', 'Public Record', 'Total Received Interest', 'Total Received Late Fee', 'Collection 12 months Medical', 'Application Type', 'Total Collection Amount', 'Total Current Balance', 'Total Revolving Credit Limit', 'Loan Status'], dtype='object')
UNIVARIATE ANALYSIS¶
# plot histogram of all numerical columns
data_clean.hist(figsize=(20, 20))
array([[<AxesSubplot:title={'center':'Loan Amount'}>, <AxesSubplot:title={'center':'Funded Amount Investor'}>, <AxesSubplot:title={'center':'Interest Rate'}>, <AxesSubplot:title={'center':'Home Ownership'}>], [<AxesSubplot:title={'center':'Debit to Income'}>, <AxesSubplot:title={'center':'Delinquency - two years'}>, <AxesSubplot:title={'center':'Public Record'}>, <AxesSubplot:title={'center':'Total Received Interest'}>], [<AxesSubplot:title={'center':'Total Received Late Fee'}>, <AxesSubplot:title={'center':'Collection 12 months Medical'}>, <AxesSubplot:title={'center':'Total Collection Amount'}>, <AxesSubplot:title={'center':'Total Current Balance'}>], [<AxesSubplot:title={'center':'Total Revolving Credit Limit'}>, <AxesSubplot:title={'center':'Loan Status'}>, <AxesSubplot:>, <AxesSubplot:>]], dtype=object)
Interesting findings from the histograms¶
Loan Status has a large discrepancy between the two classes, so when training the model we will reduce the number of class-0 rows (undersampling) to limit the bias towards the 0 class.
Total Current Balance, Total Revolving Credit Limit, Total Collection Amount, Total Received Late Fee and Total Received Interest are highly skewed. We will try to reduce the skewness of these variables.
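One way to check and reduce that skew is a log transform; a sketch that is not applied in the notebook and assumes these columns are non-negative:
skewed_cols = ['Total Current Balance', 'Total Revolving Credit Limit',
               'Total Collection Amount', 'Total Received Late Fee', 'Total Received Interest']
# skewness before and after a log1p transform (log1p handles the zero values in these columns)
print(data_clean[skewed_cols].skew())
print(np.log1p(data_clean[skewed_cols]).skew())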
# Box plots of `Total Current Balance`, `Total Revolving Credit Limit`, `Total Collection Amount`, `Total Received Late Fee` and `Total Received Interest` using seaborn and subplots
plt.figure(figsize=(20, 20))
plt.subplot(3, 2, 1)
sns.boxplot(x=data_clean['Total Current Balance'])
plt.subplot(3, 2, 2)
sns.boxplot(x=data_clean['Total Revolving Credit Limit'])
plt.subplot(3, 2, 3)
sns.boxplot(x=data_clean['Total Collection Amount'])
plt.subplot(3, 2, 4)
sns.boxplot(x=data_clean['Total Received Late Fee'])
plt.subplot(3, 2, 5)
sns.boxplot(x=data_clean['Total Received Interest'])
<AxesSubplot:xlabel='Total Received Interest'>
BIVARIATE ANALYSIS¶
# group by Grade and plot the counts of 0s and 1s in Loan Status for each grade
data_clean.groupby('Grade')['Loan Status'].value_counts().unstack().plot(kind='bar', color=['lightgreen', 'lightblue'])
# rename legend 0 to not approved and 1 to approved
plt.legend(['Not Approved', 'Approved'])
<matplotlib.legend.Legend at 0x7feed40c91f0>
# get the percentage of 0s and 1s for each grade
data_clean.groupby('Grade')['Loan Status'].value_counts(normalize=True).unstack()
Loan Status | 0 | 1 |
---|---|---|
Grade | ||
A | 0.908752 | 0.091248 |
B | 0.912763 | 0.087237 |
C | 0.906104 | 0.093896 |
D | 0.903620 | 0.096380 |
E | 0.904127 | 0.095873 |
F | 0.896260 | 0.103740 |
G | 0.893651 | 0.106349 |
# Group by Grade, count the number of 0s and 1s for each grade, and get the percentage of 1s per grade
temp = data_clean.groupby('Grade')['Loan Status'].value_counts()
# get the percentages of 1s for each grade
temp = temp.groupby(level=0).apply(lambda x: 100 * x / float(x.sum()))
# plot all the 1s percentages for each grade
temp.loc[temp.index.get_level_values(1) == 1].plot(kind='bar')
<AxesSubplot:xlabel='Grade,Loan Status'>
# groupby grade and subgrade
temp = data_clean.groupby(['Grade', 'Sub Grade'])['Loan Status'].value_counts()
# one subplot per grade, plotting the counts of Loan Status == 1 for each sub grade
fig = plt.figure(figsize=(20, 60))
for i, g in enumerate(['A', 'B', 'C', 'D', 'E', 'F', 'G'], start=1):
    plt.subplot(7, 1, i)
    temp.loc[g].loc[temp.loc[g].index.get_level_values(1) == 1].plot(kind='bar')
    plt.title('Grade ' + g)
Text(0.5, 1.0, 'Grade G')
# plot application type vs loan status as percentage of 1s for each application type
data_clean.groupby('Application Type')['Loan Status'].value_counts(normalize=True).unstack().plot(kind='bar')
<AxesSubplot:xlabel='Application Type'>
# undersample the majority class (0) down to the size of the minority class (1)
zeros = data_clean.loc[data_clean["Loan Status"] == 0]
ones = data_clean.loc[data_clean["Loan Status"] == 1]
balanced = zeros.sample(ones.shape[0])
balanced_df = pd.concat([ones, balanced], axis=0).reset_index()
# delinquency 2 years vs loan status, one group of bars for each Loan Status class
data_clean.groupby('Loan Status')['Delinquency - two years'].value_counts().unstack().plot(kind='bar', figsize=(20, 10))
# title
plt.title('Delinquency - two years vs Loan Status')
# rename the legend name
Text(0.5, 1.0, 'Delinquency - two years vs Loan Status')
# delinquency 2 years vs loan status on the balanced data, one group of bars for each Loan Status class
balanced_df.groupby('Loan Status')['Delinquency - two years'].value_counts().unstack().plot(kind='bar', figsize=(20, 10))
# title
plt.title('Delinquency - two years vs Loan Status')
# rename the legend name
Text(0.5, 1.0, 'Delinquency - two years vs Loan Status')
data_clean[data_clean['Loan Status'] == 0].groupby('Delinquency - two years')['Loan Status'].value_counts()/data_clean[data_clean['Loan Status'] == 0].shape[0]
Delinquency - two years  Loan Status
0                        0              0.772222
1                        0              0.173859
2                        0              0.039136
3                        0              0.006550
4                        0              0.000212
5                        0              0.001062
6                        0              0.002679
7                        0              0.003675
8                        0              0.000604
Name: Loan Status, dtype: float64
data_clean[data_clean['Loan Status'] == 1].groupby('Delinquency - two years')['Loan Status'].value_counts()/data_clean[data_clean['Loan Status'] == 1].shape[0]
Delinquency - two years  Loan Status
0                        1              0.765422
1                        1              0.174972
2                        1              0.040859
3                        1              0.007050
4                        1              0.000481
5                        1              0.001442
6                        1              0.004326
7                        1              0.004326
8                        1              0.001122
Name: Loan Status, dtype: float64
# delinquency 2 years vs loan status, as percentages within each Loan Status class
temp = data_clean.groupby('Loan Status')['Delinquency - two years'].value_counts(normalize=True).unstack()
# add a row with the difference between the two classes
temp.loc["Diff Approved vs Not Approved"] = temp.loc[1] - temp.loc[0]
temp
Delinquency - two years | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 |
---|---|---|---|---|---|---|---|---|---|
Loan Status | |||||||||
0 | 0.772222 | 0.173859 | 0.039136 | 0.00655 | 0.000212 | 0.001062 | 0.002679 | 0.003675 | 0.000604 |
1 | 0.765422 | 0.174972 | 0.040859 | 0.00705 | 0.000481 | 0.001442 | 0.004326 | 0.004326 | 0.001122 |
Diff Approved vs Not Approved | -0.006800 | 0.001113 | 0.001723 | 0.00050 | 0.000268 | 0.000380 | 0.001647 | 0.000651 | 0.000517 |
# plot scatter of Total Revolving Credit Limit vs Total Collection Amount and color the points by loan status
plt.scatter(data_clean['Total Revolving Credit Limit'], data_clean['Total Collection Amount'], c=data_clean['Loan Status'])
# title
plt.title('Total Revolving Credit Limit vs Total Collection Amount')
# legend
plt.legend(['Not Approved', 'Approved'])
# axis labels
plt.xlabel('Total Revolving Credit Limit')
plt.ylabel('Total Collection Amount')
Text(0, 0.5, 'Total Collection Amount')
# plot scatter of Total Revolving Credit Limit vs Total Collection Amount and color the points by loan status
plt.scatter(balanced_df['Total Revolving Credit Limit'], balanced_df['Total Collection Amount'], c=balanced_df['Loan Status'])
# title
plt.title('Total Revolving Credit Limit vs Total Collection Amount')
# legend
plt.legend(['Not Approved', 'Approved'])
# axis labels
plt.xlabel('Total Revolving Credit Limit')
plt.ylabel('Total Collection Amount')
Text(0, 0.5, 'Total Collection Amount')
# Verification Status vs loan status
data_clean.groupby('Loan Status')['Verification Status'].value_counts().unstack().plot(kind='bar')
<AxesSubplot:xlabel='Loan Status'>
# Verification Status vs loan status on the balanced data
balanced_df.groupby('Loan Status')['Verification Status'].value_counts().unstack().plot(kind='bar')
<AxesSubplot:xlabel='Loan Status'>
# Loan Amount vs loan status
data_clean.groupby('Loan Status')['Loan Amount'].min().plot(kind='bar')
<AxesSubplot:xlabel='Loan Status'>
BASELINE MODEL¶
Models evaluated¶
For the baseline model we evaluate Logistic Regression, a Decision Tree Classifier, and a Random Forest Classifier. The features we use are the columns of data_clean listed above, minus the Loan Status label. We evaluate these models with the F1 score because this dataset has imbalanced classes; F1 combines precision and recall into a single score and so balances both measures. We perform basic one-hot encoding on the categorical features and leave all other features as they are.
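As a reminder, F1 is the harmonic mean of precision and recall; a tiny sanity check on toy labels (not from this dataset):
from sklearn.metrics import f1_score, precision_score, recall_score
y_true = [0, 0, 0, 1, 1]   # toy labels for illustration only
y_hat = [0, 0, 1, 1, 0]
p, r = precision_score(y_true, y_hat), recall_score(y_true, y_hat)
print(2 * p * r / (p + r), f1_score(y_true, y_hat))   # both print 0.5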
Logistic Regression assumes a linear relationship between the features and the log-odds of the label (Loan Status). The F1 score for this model is 0, which means it never predicts the positive class and is not useful here.
Decision Tree Classifier is a non-parametric supervised learning algorithm used for both classification and regression. Decision trees break complex data down into more manageable parts, can handle missing values automatically, capture non-linear relationships, and work with both numerical and categorical data. Tested against our unbalanced dataset, this model achieves an F1 score of 0.015, which is better than Logistic Regression.
Random Forest Classifier: random forests are an ensemble learning method for classification, regression and other tasks that operates by constructing many decision trees at training time. They sometimes overfit the data, but there are ways to tune them to limit overfitting. The F1 score for this model on our unbalanced dataset is 0.13.
The baseline models are fitted on the same training set. All hyperparameters are left at their defaults to test basic model performance.
Result¶
From the analysis above, we choose to continue with the Decision Tree and Random Forest classifiers. We aim to perform feature engineering, choose suitable hyperparameters, and apply further optimizations before selecting our final model.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV
X = data_clean.drop('Loan Status', axis=1)
y = data_clean['Loan Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
preproc = ColumnTransformer([
("one-hot", OneHotEncoder(handle_unknown="ignore"), ["Grade", "Sub Grade", "Employment Duration", "Verification Status", "Application Type"])
], remainder="passthrough")
rf = RandomForestClassifier() #Random Forest Classifier
log = LogisticRegression() #Logistic Regression
dt = DecisionTreeClassifier() #Decision Tree Classifier
# fit and evaluate each baseline model with the same preprocessing pipeline
for model in [rf, log, dt]:
    pipe = Pipeline([
        ("preprocessing", preproc),
        ("model", model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    print("Model: ", model)
    print("Accuracy Score: ", accuracy_score(y_test, y_pred))
    print("F1 Score: ", f1_score(y_test, y_pred))
    print("Class 0:", (y_pred == 0).sum())
    print("Class 1:", (y_pred == 1).sum())
    print("")
Model:  RandomForestClassifier()
Accuracy Score:  0.9097309716149115
F1 Score:  0.0
Class 0: 13493
Class 1: 0

Model:  LogisticRegression()
Accuracy Score:  0.9097309716149115
F1 Score:  0.0
Class 0: 13493
Class 1: 0

Model:  DecisionTreeClassifier()
Accuracy Score:  0.8205736307715111
F1 Score:  0.10033444816053513
Class 0: 12020
Class 1: 1473
FEATURE ENGINEERING¶
OneHotEncoding - One-hot encoding converts categorical information into a numeric format that can be fed to machine learning algorithms. We one-hot encode the categorical features "Employment Duration", "Verification Status", "Application Type", "Delinquency - two years", "Collection 12 months Medical". Because these features have no inherent order and cannot be treated as ordinal, we choose one-hot encoding over other encoding methods.
Ordinal Encoding - We combine the columns Grade and Sub Grade into a new column called GradeSubGrade. Since these grade categories are ordinal, we prefer ordinal encoding over one-hot encoding or other methods; for example, the combined grade AF1 is worse than AA1. This also makes the model more efficient, since a single integer column uses less memory and is faster to process than the many columns one-hot encoding would create.
Since this dataset is highly imbalanced, we balance it by undersampling. Because in the real world the two loan-status outcomes are not equally likely, we try to keep the class split unequal yet close. We run a test and choose the most suitable balanced set.
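A sketch of what an "unequal yet close" undersampling could look like; ratio is a hypothetical knob, and the code below actually uses a strict one-to-one split:
# keep `ratio` times as many class-0 rows as class-1 rows, e.g. a 60/40 split for ratio = 1.5
ratio = 1.5
zeros_all = data_clean.loc[data_clean["Loan Status"] == 0]
ones_all = data_clean.loc[data_clean["Loan Status"] == 1]
candidate = pd.concat([ones_all, zeros_all.sample(int(ratio * ones_all.shape[0]), random_state=42)])
candidate["Loan Status"].value_counts(normalize=True)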
Combining grade and subgrade and ordinally encoding it¶
data = pd.read_csv("/Users/SPECTRE/Downloads/archive-3/train.csv")
train = drop_useless(data)
grade = train["Grade"].unique()
subgrade = train["Sub Grade"].unique()
train["GradeSubGrade"] = train["Grade"] + train["Sub Grade"]
train = train.drop(columns= ["Grade", "Sub Grade"])
# build every possible Grade + Sub Grade combination and rank the combinations alphabetically
features = []
for i in grade:
    for j in subgrade:
        features.append(i + j)
features = np.unique(features)
# map each combination to an integer rank starting at 1 (dicta is used as the replacement mapping below)
dicta = {}
k = 1
for i in features:
    dicta[i] = k
    k = k + 1
dicta
train["GradeSubGrade"] = train["GradeSubGrade"].replace(dicta)
train.head()
Loan Amount | Funded Amount Investor | Interest Rate | Employment Duration | Home Ownership | Verification Status | Debit to Income | Delinquency - two years | Public Record | Total Received Interest | Total Received Late Fee | Collection 12 months Medical | Application Type | Total Collection Amount | Total Current Balance | Total Revolving Credit Limit | Loan Status | GradeSubGrade | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 10000 | 12329.36286 | 11.135007 | MORTGAGE | 176346.62670 | Not Verified | 16.284758 | 1 | 0 | 2929.646315 | 0.102055 | 0 | INDIVIDUAL | 31 | 311301 | 6619 | 0 | 49 |
1 | 3609 | 12191.99692 | 12.237563 | RENT | 39833.92100 | Source Verified | 15.412409 | 0 | 0 | 772.769385 | 0.036181 | 0 | INDIVIDUAL | 53 | 182610 | 20885 | 0 | 88 |
2 | 28276 | 21603.22455 | 12.545884 | MORTGAGE | 91506.69105 | Source Verified | 28.137619 | 0 | 0 | 863.324396 | 18.778660 | 0 | INDIVIDUAL | 34 | 89801 | 26155 | 0 | 194 |
3 | 11170 | 17877.15585 | 16.731201 | MORTGAGE | 108286.57590 | Source Verified | 18.043730 | 1 | 0 | 288.173196 | 0.044131 | 0 | INDIVIDUAL | 40 | 9189 | 60214 | 0 | 83 |
4 | 16890 | 13539.92667 | 15.008300 | MORTGAGE | 44234.82545 | Source Verified | 17.209886 | 1 | 1 | 129.239553 | 19.306646 | 0 | INDIVIDUAL | 430 | 126029 | 22579 | 0 | 89 |
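For reference, a more compact version of this mapping could use sklearn's OrdinalEncoder (already imported above); a sketch that assumes alphabetical order of the combined strings matches the intended ranking:
from sklearn.preprocessing import OrdinalEncoder
# note: unlike the hand-built dict, this only ranks combinations that actually occur in the data
combined = (data["Grade"] + data["Sub Grade"]).to_frame("GradeSubGrade")
enc = OrdinalEncoder()                   # categories default to the sorted unique values
codes = enc.fit_transform(combined) + 1  # shift so the first category maps to 1, like dicta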
BALANCING THE DATASET¶
zeros = train.loc[train["Loan Status"] == 0]
ones = train.loc[train["Loan Status"] == 1]
balanced = zeros.sample(ones.shape[0])
balanced_df = pd.concat([ones, balanced], axis = 0).reset_index()
balanced_df = balanced_df.drop(columns = ["index"])
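A quick check that the undersampled frame now has an even class split:
# both classes should now have the same number of rows
balanced_df["Loan Status"].value_counts()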
Feature Engineering via sklearn¶
preproc = ColumnTransformer([
("one-hot", OneHotEncoder(handle_unknown="ignore"), ["Employment Duration", "Verification Status", "Application Type"]),
("Std-scale", StandardScaler(), ['Total Collection Amount', 'Total Received Late Fee'])
], remainder="passthrough")
Random Forest Classifier¶
X = balanced_df.drop('Loan Status', axis=1)
y = balanced_df['Loan Status']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
rf_pipe = Pipeline([
("preprocessing", preproc),
("rf", rf)
])
param_grid = {'rf__n_estimators' : [10, 30, 50, 75, 100],
              'rf__max_depth' : [2, 5, 10, 20, None],
              # note: min_samples_split=1 is invalid for sklearn trees, so those candidates fail and score as NaN
              'rf__min_samples_split' : [1, 2, 5, 10],
              'rf__class_weight' : ["balanced", "balanced_subsample", None]}
grid_search_rf = GridSearchCV(rf_pipe, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
grid_search_rf.fit(X_train, y_train)
Fitting 5 folds for each of 300 candidates, totalling 1500 fits
375 of the 1500 fits failed with "ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1"; sklearn set the scores for those candidates to NaN and warned that some test scores are non-finite. The remaining candidates scored roughly 0.40-0.52 mean cross-validated F1.
GridSearchCV(cv=5, estimator=Pipeline(steps=[('preprocessing', ColumnTransformer(remainder='passthrough', transformers=[('one-hot', OneHotEncoder(handle_unknown='ignore'), ['Employment ' 'Duration', 'Verification ' 'Status', 'Application ' 'Type']), ('Std-scale', StandardScaler(), ['Total ' 'Collection ' 'Amount', 'Total ' 'Received ' 'Late ' 'Fee'])])), ('rf', RandomForestClassifier())]), n_jobs=-1, param_grid={'rf__class_weight': ['balanced', 'balanced_subsample', None], 'rf__max_depth': [2, 5, 10, 20, None], 'rf__min_samples_split': [1, 2, 5, 10], 'rf__n_estimators': [10, 30, 50, 75, 100]}, scoring='f1', verbose=1)
grid_search_rf.best_params_
{'rf__class_weight': 'balanced', 'rf__max_depth': 2, 'rf__min_samples_split': 2, 'rf__n_estimators': 30}
grid_search_rf.best_score_
0.5191041701889237
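To see how the other candidates compared, cv_results_ can be inspected as a DataFrame; a short sketch:
cv_results = pd.DataFrame(grid_search_rf.cv_results_)
# top candidates by mean cross-validated F1
cv_results.sort_values("mean_test_score", ascending=False)[["params", "mean_test_score", "std_test_score"]].head()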
# get accuracy on the test set (note: grid_search_rf.score returns the grid's scoring metric, F1, not accuracy)
grid_search_rf.score(X_test, y_test)
accuracy_score(y_test, grid_search_rf.predict(X_test))
0.5282338806567881
dt_pipe = Pipeline([
("preprocessing", preproc),
("dt", dt)
])
param_grid = {'dt__max_depth' : [2, 5, 10, 20, None],
              # note: min_samples_split=1 is invalid, so those candidates fail and score as NaN
              'dt__min_samples_split' : [1, 2, 5, 10],
              'dt__class_weight' : ["balanced", None]}
grid_search_dt = GridSearchCV(dt_pipe, param_grid=param_grid, cv=5, scoring='f1', n_jobs=-1, verbose=1)
grid_search_dt.fit(X_train, y_train)
Fitting 5 folds for each of 40 candidates, totalling 200 fits
50 of the 200 fits failed with the same "min_samples_split must be an integer greater than 1" ValueError; their scores were set to NaN, and sklearn warned that some test scores are non-finite. The remaining candidates scored roughly 0.37-0.58 mean cross-validated F1.
GridSearchCV(cv=5, estimator=Pipeline(steps=[('preprocessing', ColumnTransformer(remainder='passthrough', transformers=[('one-hot', OneHotEncoder(handle_unknown='ignore'), ['Employment ' 'Duration', 'Verification ' 'Status', 'Application ' 'Type']), ('Std-scale', StandardScaler(), ['Total ' 'Collection ' 'Amount', 'Total ' 'Received ' 'Late ' 'Fee'])])), ('dt', DecisionTreeClassifier())]), n_jobs=-1, param_grid={'dt__class_weight': ['balanced', None], 'dt__max_depth': [2, 5, 10, 20, None], 'dt__min_samples_split': [1, 2, 5, 10]}, scoring='f1', verbose=1)
grid_search_dt.best_params_
{'dt__class_weight': 'balanced', 'dt__max_depth': 2, 'dt__min_samples_split': 2}
grid_search_dt.best_score_
0.5801652170704724
# get the accuracy score of the tuned random forest
y_pred = grid_search_rf.predict(X_test)
accuracy_score(y_test, y_pred)
0.5282338806567881
# get recall
from sklearn.metrics import recall_score
recall_score(y_test, y_pred)
0.46280344557556774
# get precision
from sklearn.metrics import precision_score
precision_score(y_test, y_pred)
0.5457063711911357
# get AUC score
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_pred)
0.5297623785254888
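These metrics can also be produced in a single call with classification_report; a sketch using the same y_test and y_pred as above:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred, target_names=["Not Approved", "Approved"]))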
# export the dt model
import pickle
pickle.dump(grid_search_dt, open('dt_model.pkl', 'wb'))
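Loading the pickled model back is symmetric; a sketch using the file written above:
with open('dt_model.pkl', 'rb') as f:
    loaded_model = pickle.load(f)
loaded_model.predict(X_test)[:5]   # the loaded GridSearchCV predicts like the original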
FINAL MODEL - DECISION TREE CLASSIFIER¶
final_pipe = Pipeline([
("preprocessing", preproc),
("dt", DecisionTreeClassifier(max_depth = 2, min_samples_split = 2, class_weight = "balanced"))
])
final_pipe.fit(X_train, y_train)
Pipeline(steps=[('preprocessing', ColumnTransformer(remainder='passthrough', transformers=[('one-hot', OneHotEncoder(handle_unknown='ignore'), ['Employment Duration', 'Verification Status', 'Application Type']), ('Std-scale', StandardScaler(), ['Total Collection Amount', 'Total Received Late ' 'Fee'])])), ('dt', DecisionTreeClassifier(class_weight='balanced', max_depth=2))])
f1_score(y_test, final_pipe.predict(X_test))
0.6049868766404198
tn, fp, fn, tp = confusion_matrix(y_test, final_pipe.predict(X_test)).ravel()
print("True negative ", tn)
print("True positive ", tp)
print("False negative ", fn)
print("False positive ", fp)
True negative 371 True positive 922 False negative 355 False positive 849
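As a consistency check, the reported F1 of about 0.605 can be recovered from these counts:
precision = tp / (tp + fp)                              # 922 / (922 + 849) ≈ 0.521
recall = tp / (tp + fn)                                 # 922 / (922 + 355) ≈ 0.722
print(2 * precision * recall / (precision + recall))    # ≈ 0.605, matching the F1 reported above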