from mywebstyle import plot_style
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import kaggle
plot_style('#f4f4f4' )
loandata = pd.read_csv('lending_club_loan_two.csv' )
first_13_cols = loandata.iloc[:, :13 ]
first_13_cols.head()
Warning: Your Kaggle API key is readable by other users on this system! To fix this, you can run 'chmod 600 /Users/macpc/.kaggle/kaggle.json'
0
10000
36 months
11.44
329.48
B
B4
Marketing
10+ years
RENT
117000.0
Not Verified
Jan-15
Fully Paid
1
8000
36 months
11.99
265.68
B
B5
Credit analyst
4 years
MORTGAGE
65000.0
Not Verified
Jan-15
Fully Paid
2
15600
36 months
10.49
506.97
B
B3
Statistician
< 1 year
RENT
43057.0
Source Verified
Jan-15
Fully Paid
3
7200
36 months
6.49
220.65
A
A2
Client Advocate
6 years
RENT
54000.0
Not Verified
Nov-14
Fully Paid
4
24375
60 months
17.27
609.33
C
C5
Destiny Management Inc.
9 years
MORTGAGE
55000.0
Verified
Apr-13
Charged Off
second_13_cols = loandata.iloc[:, 13 :]
second_13_cols.head()
0
vacation
Vacation
26.24
Jun-90
16
0
36369
41.8
25
w
INDIVIDUAL
0.0
0.0
0174 Michelle Gateway\r\nMendozaberg, OK 22690
1
debt_consolidation
Debt consolidation
22.05
Jul-04
17
0
20131
53.3
27
f
INDIVIDUAL
3.0
0.0
1076 Carney Fort Apt. 347\r\nLoganmouth, SD 05113
2
credit_card
Credit card refinancing
12.79
Aug-07
13
0
11987
92.2
26
f
INDIVIDUAL
0.0
0.0
87025 Mark Dale Apt. 269\r\nNew Sabrina, WV 05113
3
credit_card
Credit card refinancing
2.60
Sep-06
6
0
5472
21.5
13
f
INDIVIDUAL
0.0
0.0
823 Reid Ford\r\nDelacruzside, MA 00813
4
credit_card
Credit Card Refinance
33.95
Mar-99
13
0
24584
69.8
43
f
INDIVIDUAL
1.0
0.0
679 Luna Roads\r\nGreggshire, VA 11650
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395900 entries, 0 to 395899
Data columns (total 27 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 loan_amnt 395900 non-null int64
1 term 395900 non-null object
2 int_rate 395900 non-null float64
3 installment 395900 non-null float64
4 grade 395900 non-null object
5 sub_grade 395900 non-null object
6 emp_title 372982 non-null object
7 emp_length 377608 non-null object
8 home_ownership 395900 non-null object
9 annual_inc 395900 non-null float64
10 verification_status 395900 non-null object
11 issue_d 395900 non-null object
12 loan_status 395900 non-null object
13 purpose 395900 non-null object
14 title 394145 non-null object
15 dti 395900 non-null float64
16 earliest_cr_line 395900 non-null object
17 open_acc 395900 non-null int64
18 pub_rec 395900 non-null int64
19 revol_bal 395900 non-null int64
20 revol_util 395624 non-null float64
21 total_acc 395900 non-null int64
22 initial_list_status 395900 non-null object
23 application_type 395900 non-null object
24 mort_acc 358117 non-null float64
25 pub_rec_bankruptcies 395365 non-null float64
26 address 395900 non-null object
dtypes: float64(7), int64(5), object(15)
memory usage: 81.6+ MB
sns.heatmap(loandata.isnull(), yticklabels= False , cbar= False , cmap= 'viridis' )
loan_amnt
395900.0
14114.249305
8357.637338
500.00
8000.00
12000.00
20000.00
40000.00
int_rate
395900.0
13.639385
4.472112
5.32
10.49
13.33
16.49
30.99
installment
395900.0
431.859947
250.733444
16.08
250.33
375.43
567.30
1533.81
annual_inc
395900.0
74206.819251
61645.032777
0.00
45000.00
64000.00
90000.00
8706582.00
dti
395900.0
17.379187
18.021550
0.00
11.28
16.91
22.98
9999.00
open_acc
395900.0
11.311081
5.137591
0.00
8.00
10.00
14.00
90.00
pub_rec
395900.0
0.178204
0.530716
0.00
0.00
0.00
0.00
86.00
revol_bal
395900.0
15844.331435
20589.846553
0.00
6026.00
11181.00
19620.00
1743266.00
revol_util
395624.0
53.793449
24.452575
0.00
35.80
54.80
72.90
892.30
total_acc
395900.0
25.414622
11.887279
2.00
17.00
24.00
32.00
151.00
mort_acc
358117.0
1.814091
2.148006
0.00
0.00
1.00
3.00
34.00
pub_rec_bankruptcies
395365.0
0.121647
0.356176
0.00
0.00
0.00
0.00
8.00
sns.countplot(x= 'loan_status' , data= loandata)
fp = np.round (
len (loandata[loandata['loan_status' ] == 'Fully Paid' ])/ len (loandata)* 100 , 2
)
co = np.round (
len (loandata[loandata['loan_status' ] == 'Charged Off' ]) /
len (loandata)* 100 , 2
)
fig = plt.figure(figsize= (9 , 4 ))
ax1 = fig.add_subplot(121 )
loandata[loandata['loan_status' ] == 'Charged Off' ]['loan_amnt' ].hist(
alpha= 0.5 , color= 'red' , bins= 30 , ax= ax1,
label= 'Charged Off'
)
loandata[loandata['loan_status' ] == 'Fully Paid' ]['loan_amnt' ].hist(
alpha= 0.5 , color= 'blue' , bins= 30 , ax= ax1,
label= 'Fully Paid'
)
ax1.set_title('Loan Amount Distribution' )
ax1.set_xlabel('Loan Amount' )
ax1.legend()
ax2 = fig.add_subplot(122 )
loandata[loandata['loan_status' ] == 'Charged Off' ]['installment' ].hist(
alpha= 0.5 , color= 'red' , bins= 30 , ax= ax2,
label= 'Charged Off'
)
loandata[loandata['loan_status' ] == 'Fully Paid' ]['installment' ].hist(
alpha= 0.5 , color= 'blue' , bins= 30 , ax= ax2,
label= 'Fully Paid'
)
ax2.set_title('Installment Distribution' )
ax2.set_xlabel('Installment' )
ax2.legend()
fig = plt.figure(figsize= (8.8 , 4 ))
ax1 = fig.add_subplot(121 )
sns.boxplot(
x= 'loan_status' , y= 'loan_amnt' , hue= 'loan_status' ,
data= loandata, ax= ax1, palette= 'winter'
)
ax1.set_title('Loan Amount Boxplot' )
ax2 = fig.add_subplot(122 )
sns.boxplot(
x= 'loan_status' , y= 'installment' , hue= 'loan_status' ,
data= loandata, ax= ax2, palette= 'winter'
)
ax2.set_title('Installment Boxplot' )
Text(0.5, 1.0, 'Installment Boxplot')
fig = plt.figure(figsize= (8.8 , 4 ))
ax1 = fig.add_subplot(121 )
sns.countplot(
x= 'loan_status' ,
hue= 'term' , data= loandata,
palette= 'RdBu_r' , ax= ax1
)
ax2 = fig.add_subplot(122 )
sns.countplot(
x= 'loan_status' ,
hue= 'grade' , data= loandata,
palette= 'winter' , ax= ax2
)
fig, ax = plt.subplots(figsize= (10 , 5 ))
sns.histplot(x= 'sub_grade' , hue= 'loan_status' , data= loandata, ax= ax)
loandata['emp_title' ] = loandata['emp_title' ].str .lower()
loandata.emp_title.value_counts()[:25 ]
emp_title
manager 5635
teacher 5426
registered nurse 2626
supervisor 2589
sales 2381
driver 2306
owner 2200
rn 2072
project manager 1776
office manager 1638
general manager 1460
truck driver 1288
director 1192
engineer 1187
police officer 1041
vice president 961
sales manager 961
operations manager 960
store manager 941
president 877
administrative assistant 865
accountant 845
account manager 845
technician 839
mechanic 753
Name: count, dtype: int64
pd.set_option('future.no_silent_downcasting' , True )
loandata['emp_length' ] = loandata['emp_length' ].replace({
'< 1 year' : 0 ,
'1 year' : 1 ,
'2 years' : 2 ,
'3 years' : 3 ,
'4 years' : 4 ,
'5 years' : 5 ,
'6 years' : 6 ,
'7 years' : 7 ,
'8 years' : 8 ,
'9 years' : 9 ,
'10+ years' : 10
}
).infer_objects(copy= False )
loandata['emp_length_group' ] = pd.cut(
loandata['emp_length' ],
bins= [- 1 , 2 , 7 , 10 ], # Bins: <3 years, 3-7 years, > 7 years
labels= ['Short-term' , 'Mid-term' , 'Long-term' ]
)
sns.countplot(
x= 'emp_length_group' ,
hue= 'loan_status' ,
data= loandata,
palette= 'winter' ,
stat= 'count'
)
fig = plt.figure(figsize= (8 , 10 ))
ax1 = fig.add_subplot(211 )
annual_income_threshod = loandata['annual_inc' ].quantile(0.95 )
filtered_income = loandata[loandata['annual_inc' ] <= annual_income_threshod]
sns.boxplot(
x= 'loan_status' , y= 'annual_inc' ,
hue= 'loan_status' , palette= 'winter' ,
data= filtered_income, ax= ax1
)
ax1.set_title('Income Distribution' )
ax1.set_xlabel('Loan Status' )
ax1.set_ylabel('Annual Income' )
ax2 = fig.add_subplot(212 )
sns.countplot(
x= 'home_ownership' , hue= 'loan_status' ,
data= loandata, ax= ax2
)
loandata['issue_d' ] = pd.to_datetime(
loandata['issue_d' ], format = '%b-%y'
)
loandata = loandata.sort_values('issue_d' )
loan_status_trend = loandata.groupby(
['issue_d' , 'loan_status' ]).size().unstack()
fig = plt.figure(figsize= (8 , 10 ))
ax1 = fig.add_subplot(211 )
loan_status_trend.plot(
kind= 'line' , marker= 'o' , ax= ax1
)
ax1.set_title('Loan Status Over Time by Issue Date' )
ax1.set_xlabel('Issue Date(mm-yyyy)' )
ax1.set_ylabel('Number of Loans' )
ax1.legend(title= 'Loan Status' )
ax2 = fig.add_subplot(212 )
sns.countplot(
x= 'verification_status' , hue= 'loan_status' ,
data= loandata, palette= 'winter' , ax= ax2
)
loandata['purpose' ].value_counts()
purpose
debt_consolidation 234420
credit_card 82998
home_improvement 24024
other 21177
major_purchase 8788
small_business 5701
car 4696
medical 4194
moving 2853
vacation 2452
house 2201
wedding 1811
renewable_energy 328
educational 257
Name: count, dtype: int64
fig = plt.figure(figsize= (9 , 4 ))
ax1 = fig.add_subplot(121 )
sns.countplot(
y= 'purpose' , hue= 'loan_status' ,
data= loandata, palette= 'coolwarm'
)
ax1.set_title('Loan Purpose' )
ax2 = fig.add_subplot(122 )
dti_threshold = loandata['dti' ].quantile(0.95 )
filtereddata = loandata[loandata['dti' ] <= dti_threshold]
sns.boxplot(
x= 'loan_status' , y= 'dti' ,
hue= 'loan_status' , data= filtereddata,
palette= 'coolwarm' , ax= ax2
)
ax2.set_title('Debt-to-Income Ratio on Loan Status' )
Text(0.5, 1.0, 'Debt-to-Income Ratio on Loan Status')
Insights: From the purpose column, we see that most of the loans that were charged off were used to make debt consolidation. Therefore, debt consolidation may have been a significant factor when a loan is charged off. Another insight we obtain from the debt-to-income ratio is that the charged off loans have higher dti ratio
fig = plt.figure(figsize= (9 , 4 ))
ax1 = fig.add_subplot(121 )
filtered_open_account_threshold = loandata['open_acc' ].quantile(0.98 )
filtered_open_account = loandata[loandata['open_acc' ]
<= filtered_open_account_threshold]
filtered_open_account[filtered_open_account['loan_status' ] == 'Fully Paid' ]['open_acc' ].hist(
alpha= 0.5 , color= 'green' , bins= 30 , label= 'Fully Paid' , ax= ax1
)
filtered_open_account[filtered_open_account['loan_status' ] == 'Charged Off' ]['open_acc' ].hist(
alpha= 0.5 , color= 'red' , bins= 30 , label= 'Charged Off' , ax= ax1
)
ax1.set_xlabel('Number of Credit Acc.' )
ax1.legend()
ax1.set_title('Number of credit accounts and Loan Status' )
loandata['pub_rec_group' ] = pd.cut(
loandata['pub_rec' ], bins= [- 1 , 0 , 1 , 3 , loandata['pub_rec' ].max ()],
labels= ['0' , '1' , '2-3' , '4+' ]
)
loan_status_by_pub_rec = loandata.groupby(
['pub_rec_group' , 'loan_status' ], observed= False
).size().unstack()
ax2 = fig.add_subplot(122 )
loan_status_by_pub_rec.plot(
kind= 'bar' , stacked= False , edgecolor= 'black' ,
color= ['#1f77b4' , '#ff7f0e' ], ax= ax2
)
ax2.set_title('Public Records and Loan Status' )
ax2.set_xlabel('Public Records' )
Text(0.5, 0, 'Public Records')
sns.boxplot(
y= 'open_acc' , x= 'loan_status' ,
hue= 'loan_status' ,
data= filtered_open_account
)
Insights: Number of credit account seems normally distributed among both groups except for some outliers. However, the mean number of credit accounts are slightly higher for the charged off category than the fully paid category. So, higher credit account has some sort of relation with loan being charged off. Also, people who doesn’t have any public record seems to have higher chance of loan status being charged off
revol_bal_threshold = loandata['revol_bal' ].quantile(0.95 )
filtered_revol_bal = loandata[loandata['revol_bal' ] <= revol_bal_threshold]
sns.boxplot(
x= 'loan_status' , y= 'revol_bal' ,
data= filtered_revol_bal, hue= 'loan_status' ,
palette= 'Set2'
)
revol_util_threshold = loandata['revol_util' ].quantile(0.95 )
filtered_revol_util = loandata[loandata['revol_util' ] <= revol_util_threshold]
sns.boxplot(
x= 'loan_status' , y= 'revol_util' ,
hue= 'loan_status' , data= filtered_revol_util,
palette= 'Set1'
)
fig = plt.figure(figsize= (9 , 4 ))
ax1 = fig.add_subplot(121 )
revol_bal_threshold = loandata['revol_bal' ].quantile(0.95 )
filtered_revol_bal = loandata[loandata['revol_bal' ] <= revol_bal_threshold]
sns.boxplot(
x= 'loan_status' , y= 'revol_bal' ,
data= filtered_revol_bal, hue= 'loan_status' ,
palette= 'Set2' , ax= ax1
)
ax1.set_xlabel('Loan Status' )
ax1.set_ylabel('Revolving Balance' )
ax1.set_title('Revolving Balance vs Loan Status' )
ax2 = fig.add_subplot(122 )
revol_util_threshold = loandata['revol_util' ].quantile(0.95 )
filtered_revol_util = loandata[loandata['revol_util' ] <= revol_util_threshold]
sns.boxplot(
x= 'loan_status' , y= 'revol_util' ,
hue= 'loan_status' , data= filtered_revol_util,
palette= 'Set1' , ax= ax2
)
ax2.set_xlabel('Loan Status' )
ax2.set_ylabel('Revolving Utilization' )
ax2.set_title('Revolving Utilization vs Loan Status' )
Text(0.5, 1.0, 'Revolving Utilization vs Loan Status')
loandata['total_acc' ].value_counts()
total_acc
21 14274
22 14255
20 14220
23 13915
24 13874
...
151 1
104 1
135 1
108 1
115 1
Name: count, Length: 118, dtype: int64
fig = plt.figure(figsize= (9 , 4 ))
ax1 = fig.add_subplot(121 )
filtered_total_account_threshold = loandata['total_acc' ].quantile(0.95 )
filtered_total_account = loandata[loandata['total_acc' ]
<= filtered_total_account_threshold]
filtered_total_account[filtered_total_account['loan_status' ] == 'Fully Paid' ]['total_acc' ].hist(
alpha= 0.5 , color= 'green' , bins= 30 , label= 'Fully Paid' , ax= ax1
)
filtered_total_account[filtered_total_account['loan_status' ] == 'Charged Off' ]['total_acc' ].hist(
alpha= 0.5 , color= 'red' , bins= 30 , label= 'Charged Off' , ax= ax1
)
ax1.set_xlabel('Number of Total Credit Acc.' )
ax1.legend()
ax1.set_title('Number of Total credit accounts and Loan Status' )
ax2 = fig.add_subplot(122 )
sns.countplot(
x= 'initial_list_status' , hue= 'loan_status' ,
data= loandata, palette= 'winter'
)
ax2.set_title('Initial Status and Loan Status' )
ax2.set_xlabel('Initial Status (Funded or Withdrawn)' )
Text(0.5, 0, 'Initial Status (Funded or Withdrawn)')
sns.boxplot(
y= 'total_acc' , x= 'loan_status' ,
hue= 'loan_status' ,
data= filtered_total_account
)
loandata['application_type' ].value_counts()
application_type
INDIVIDUAL 395189
JOINT 425
DIRECT_PAY 286
Name: count, dtype: int64
loandata['mort_acc' ].value_counts()
mort_acc
0.0 139727
1.0 60392
2.0 49931
3.0 38040
4.0 27880
5.0 18188
6.0 11067
7.0 6050
8.0 3121
9.0 1655
10.0 865
11.0 479
12.0 264
13.0 146
14.0 107
15.0 61
16.0 37
17.0 22
18.0 18
19.0 15
20.0 13
24.0 10
22.0 7
25.0 4
21.0 4
27.0 3
23.0 2
31.0 2
32.0 2
26.0 2
34.0 1
30.0 1
28.0 1
Name: count, dtype: int64
mort_acc_summary = loandata.groupby('loan_status' )['mort_acc' ].describe()
print (mort_acc_summary)
count mean std min 25% 50% 75% max
loan_status
Charged Off 72103.0 1.501214 1.974335 0.0 0.0 1.0 2.0 23.0
Fully Paid 286014.0 1.892967 2.182550 0.0 0.0 1.0 3.0 34.0
# Bin the mort_acc column into categories
loandata['mort_acc_group' ] = pd.cut(
loandata['mort_acc' ], bins= [- 1 , 0 , 2 , 5 , 10 , loandata['mort_acc' ].max ()],
labels= ['0' , '1-2' , '3-5' , '6-10' , '10+' ]
)
# Plot the grouped bar chart
mort_acc_counts = loandata.groupby(
['mort_acc_group' , 'loan_status' ], observed= False
).size().unstack()
mort_acc_counts.plot(kind= 'bar' , stacked= False ,
figsize= (10 , 6 ), colormap= 'viridis' )
plt.title('Loan Status by Number of Mortgage Accounts' )
plt.xlabel('Number of Mortgage Accounts (Grouped)' )
plt.ylabel('Number of Loans' )
plt.xticks(rotation= 0 )
plt.legend(title= 'Loan Status' )
plt.show()
loandata['pub_rec_bankruptcies' ].value_counts()
pub_rec_bankruptcies
0.0 350265
1.0 42776
2.0 1846
3.0 351
4.0 82
5.0 32
6.0 7
7.0 4
8.0 2
Name: count, dtype: int64
loandata['pub_rec_bankruptcies_group' ] = pd.cut(
loandata['pub_rec_bankruptcies' ], bins= [- 1 , 0 ,
loandata['pub_rec_bankruptcies' ].max ()],
labels= ['0' , '1 or More' ]
)
# Plot the grouped bar chart
pub_rec_bankruptcies_counts = loandata.groupby(
['pub_rec_bankruptcies_group' , 'loan_status' ], observed= False
).size().unstack()
pub_rec_bankruptcies_counts.plot(
kind= 'bar' , stacked= False , figsize= (10 , 6 ), colormap= 'viridis' )
plt.title('Loan Status by Number of Public Record of Bankruptcies' )
plt.xlabel('Public Record of Bankruptcies (Grouped)' )
plt.ylabel('Number of Loans' )
plt.xticks(rotation= 0 )
plt.legend(title= 'Loan Status' )
plt.show()
fig = plt.figure(figsize= (9 , 4 ))
ax1 = fig.add_subplot(121 )
loandata['mort_acc_group' ] = pd.cut(
loandata['mort_acc' ], bins= [- 1 , 0 , 2 , 5 , 10 , loandata['mort_acc' ].max ()],
labels= ['0' , '1-2' , '3-5' , '6-10' , '10+' ]
)
mort_acc_counts = loandata.groupby(
['mort_acc_group' , 'loan_status' ], observed= False
).size().unstack()
mort_acc_counts.plot(
kind= 'bar' , stacked= False ,
colormap= 'viridis' , ax= ax1
)
ax1.set_title('Loan Status by the # of Mort. Acc' )
ax1.set_xlabel('Number of Mortgage Accounts (Grouped)' )
ax1.set_ylabel('Number of Loans' )
ax1.set_xticklabels(ax1.get_xticklabels(), rotation= 0 )
ax1.legend(title= 'Loan Status' )
ax2 = fig.add_subplot(122 )
loandata['pub_rec_bankruptcies_group' ] = pd.cut(
loandata['pub_rec_bankruptcies' ], bins= [- 1 , 0 ,
loandata['pub_rec_bankruptcies' ].max ()],
labels= ['0' , '1 or More' ]
)
# Plot the grouped bar chart
pub_rec_bankruptcies_counts = loandata.groupby(
['pub_rec_bankruptcies_group' , 'loan_status' ], observed= False
).size().unstack()
pub_rec_bankruptcies_counts.plot(
kind= 'bar' , stacked= False ,
colormap= 'coolwarm' , ax= ax2
)
ax2.set_title('Loan Status by the # of Pub Rec of Bankruptcies' )
ax2.set_xlabel('Public Record of Bankruptcies (Grouped)' )
ax2.set_ylabel('Number of Loans' )
ax2.set_xticklabels(ax2.get_xticklabels(), rotation= 0 )
ax2.legend(title= 'Loan Status' )
plt.show()
loandata['application_type' ].value_counts()
application_type
INDIVIDUAL 395189
JOINT 425
DIRECT_PAY 286
Name: count, dtype: int64
sns.countplot(
x= 'loan_status' , hue= 'application_type' ,
data= loandata, palette= 'winter'
)
numeric_loandata = loandata.select_dtypes(include= ['float64' ,'int64' ])
plt.figure(figsize= (10 ,8 ))
sns.heatmap(numeric_loandata.corr(), annot= True , cmap= 'coolwarm' , vmin=- 1 , vmax= 1 )
plt.show()
Back to top