KAPED Documentation
Search…
⌃K

Underwriting Schema

This is a schema for enterprise clients that perform their own underwriting.
LogisticRegression.py
1
# -*- coding: utf-8 -*-
2
"""
3
Created on Fri Jul 15 18:19:17 2022
4
Property of KAPED INC.
5
"""
6
7
# import libraries
8
from typing import Any
9
10
from pandas import DataFrame
11
from pandas.io.parsers import TextFileReader
12
from sklearn.linear_model import LogisticRegression
13
from sklearn.model_selection import train_test_split
14
from sklearn.preprocessing import StandardScaler, MinMaxScaler
15
from sklearn.metrics import accuracy_score
16
from sklearn.metrics import classification_report
17
from sklearn.metrics import confusion_matrix
18
from sklearn.model_selection import GridSearchCV
19
20
import pandas as pd
21
import numpy as np
22
import os
23
import matplotlib.pyplot as plt
24
25
from imblearn.over_sampling import SMOTE
26
from collections import OrderedDict
27
# Set matplotlib's default font size to 14 for the plots drawn by this script.
plt.rc("font", size=14)
28
29
30
class IndividualPredictions:
    """Train a logistic-regression credit model and underwrite one applicant.

    Constructing an instance immediately:

    1. separates the ``Credit_Approval`` target from the feature columns and
       discards the first remaining column (presumably a row identifier --
       confirm against the training CSV),
    2. makes a 60/40 train/test split (``random_state=0``),
    3. min-max scales the features with a scaler fitted on the training
       split only,
    4. fits ``LogisticRegression`` and prints/plots the evaluation output,
    5. runs a ``GridSearchCV`` over ``tol`` and ``max_iter`` and prints the
       best cross-validated score.

    ``preliminary_check`` and ``individual_prediction`` then evaluate the
    single applicant handed to the constructor.  Labels are treated as
    numeric, with ``1`` meaning "approved".
    """

    def __init__(self, training_data, individual_testing_data, individual_testing_data_features, probability_approval,
                 find_features, overdraft):
        """Fit and evaluate the model.

        Args:
            training_data: DataFrame containing a ``Credit_Approval`` column
                plus the feature columns.
            individual_testing_data: the applicant's feature values as a
                single row, in the same order as the training features.
            individual_testing_data_features: mapping/Series of the applicant's
                features by name; must provide ``Balance_2d_Mo`` and
                ``Limit_Requested``.
            probability_approval: minimum predicted probability of class ``1``
                required to count as an approval.
            find_features: when equal to ``1``, print and plot the model
                coefficients.
            overdraft: stored on the instance; not used by the class itself.
        """
        self.training_data = training_data
        self.individual_testing_data = individual_testing_data
        self.individual_testing_data_features = individual_testing_data_features
        self.probability_approval = probability_approval
        self.find_features = find_features
        self.overdraft = overdraft

        # Filled in by preliminary_check() / individual_prediction().
        self.ind_pred = ""
        self.preliminary = ""
        self.prel_var = 0

        self.y_training = training_data['Credit_Approval']
        self.x_training = training_data.drop('Credit_Approval', axis=1).iloc[:, 1:]

        self._fit_model()
        if self.find_features == 1:
            self._report_feature_importance()
        self._evaluate_holdout()
        self._grid_search()

        print('accuracy score: ', accuracy_score(y_true=self.os_y_test, y_pred=self.predictions))

    def _fit_model(self):
        """Split the data, fit the scaler on the training split, fit the model."""
        self.os_x_train, self.os_x_test, self.os_y_train, self.os_y_test = train_test_split(
            self.x_training, self.y_training, test_size=0.4, random_state=0)

        # NOTE(review): no over-sampling is applied even though the attributes
        # carry an ``os_`` prefix; if class balancing is wanted, resample the
        # training split here (never the test split).

        # The scaler is kept on the instance so that the hold-out split and the
        # individual applicant are scaled with the *training* minima/maxima.
        self.scaler = MinMaxScaler()
        self.os_x_training = pd.DataFrame(self.scaler.fit_transform(self.os_x_train),
                                          columns=self.os_x_train.columns)
        self.os_x_testing = pd.DataFrame(self.scaler.transform(self.os_x_test),
                                         columns=self.os_x_test.columns)
        self.columns = self.os_x_training.columns

        self.model = LogisticRegression(random_state=0)
        self.model.fit(self.os_x_training, self.os_y_train)

    def _approval_probability(self, features):
        """Return the predicted probability of class ``1`` for each row."""
        # predict_proba columns are ordered like model.classes_.
        positive_column = list(self.model.classes_).index(1)
        return self.model.predict_proba(features)[:, positive_column]

    def _report_feature_importance(self):
        """Print the model coefficients, largest first, and show them as a bar chart."""
        importance = pd.DataFrame(data={
            'Attribute': self.os_x_training.columns,
            'Importance': self.model.coef_[0]})
        importance = importance.sort_values(by='Importance', ascending=False)
        print(importance)

        plt.bar(x=importance['Attribute'], height=importance['Importance'], color='#087E8B')
        plt.title('Feature importance obtained from coefficients', size=15)
        plt.xticks(rotation='vertical')
        plt.xlabel('Attribute', fontsize=8)
        plt.ylabel('Importance', fontsize=8)
        plt.show()

    def _evaluate_holdout(self):
        """Threshold the hold-out probabilities, then print and plot the results."""
        # Compare the *probability* of approval with the threshold; comparing
        # predict()'s 0/1 labels would make the threshold meaningless.
        hold_out_probability = self._approval_probability(self.os_x_testing)
        self.predictions = (hold_out_probability >= self.probability_approval).astype(int)

        print(classification_report(self.os_y_test, self.predictions))

        # labels=[0, 1] keeps the matrix 2x2 even if one class never occurs.
        self.confusion_matrix_training = confusion_matrix(self.os_y_test, self.predictions, labels=[0, 1])
        print(self.confusion_matrix_training)

        _, ax = plt.subplots(figsize=(8, 8))
        ax.imshow(self.confusion_matrix_training)
        ax.grid(False)
        ax.set_xlabel('Predicted labels')
        ax.set_ylabel('True labels')
        # Row/column 0 is class 0 (not approved), row/column 1 is class 1.
        ax.xaxis.set(ticks=(0, 1), ticklabels=('Predicted Not Approved', 'Predicted Approved'))
        ax.yaxis.set(ticks=(0, 1), ticklabels=('Actual Not Approved', 'Actual Approved'))
        ax.set_ylim(1.5, -0.5)
        for (row, col), count in np.ndenumerate(self.confusion_matrix_training):
            ax.text(col, row, count, ha='center', va='center', color='red')
        plt.show()

    def _grid_search(self):
        """Grid-search ``tol`` and ``max_iter`` with 5-fold CV and print the best result."""
        param_grid = dict(tol=[0.01, 0.001, 0.0001], max_iter=[100, 150, 200])
        grid_model = GridSearchCV(estimator=self.model, param_grid=param_grid, cv=5)

        # The training features are already min-max scaled; reuse them as-is.
        self.rescaled_x = self.os_x_training.to_numpy()
        grid_model_result = grid_model.fit(self.rescaled_x, self.os_y_train)

        best_score, best_params = grid_model_result.best_score_, grid_model_result.best_params_
        print("Best: %f using %s" % (best_score, best_params))

    def preliminary_check(self):
        """Decide whether the applicant may go to automatic underwriting.

        Manual underwriting is required when ``Balance_2d_Mo`` does not exceed
        ``Limit_Requested``.  Sets ``self.prel_var`` (1 = proceed, 0 = manual)
        and ``self.preliminary``.

        Returns:
            The status message, also stored in ``self.preliminary``.
        """
        features = self.individual_testing_data_features
        headroom = features["Balance_2d_Mo"] - features["Limit_Requested"]

        if headroom <= 0:
            # Reset explicitly so a failed re-check cannot inherit an earlier pass.
            self.prel_var = 0
            self.preliminary = "Manual Underwriting Required. Limit Should be Lowered."
        else:
            self.prel_var = 1
            self.preliminary = "Proceed to Automatic Underwriting"

        return self.preliminary

    def individual_prediction(self):
        """Return the underwriting outcome for the applicant.

        Returns:
            ``self.preliminary`` when the preliminary check has not passed;
            otherwise ``"Approved"`` or ``"Disapproved"`` depending on whether
            the predicted approval probability reaches
            ``self.probability_approval``.
        """
        if not self.prel_var:
            return self.preliminary

        # Label the row with the training column names and scale it exactly
        # like the training data before asking the model for a probability.
        applicant = pd.DataFrame(np.asarray(self.individual_testing_data).reshape(1, -1),
                                 columns=self.columns)
        scaled_applicant = pd.DataFrame(self.scaler.transform(applicant), columns=self.columns)

        approved = self._approval_probability(scaled_applicant)[0] >= self.probability_approval
        self.ind_pred = "Approved" if approved else "Disapproved"
        return self.ind_pred
192
193
194
# Selects the training set and applicant schema: a truthy value uses the
# variant that includes the 'Overdraft_Protection' feature.
overdraft = 0

# Load the training set that matches the selected schema.
training_data = pd.read_csv(
    'credit_default_training_set.csv' if overdraft
    else 'credit_default_training_set_no_overdraft.csv'
)

# Example applicant.  Both schemas share the same features; the overdraft
# schema only adds 'Overdraft_Protection' as the leading entry.
new_customer = OrderedDict()
if overdraft:
    new_customer['Overdraft_Protection'] = 1
new_customer.update([
    ('Balance_1st_Mo', 9880),
    ('Balance_2d_Mo', 1740),
    ('Expenses_1st_Mo', 44460),
    ('Expenses_2d_Mo', 17060),
    ('Deposit_Credit_1st_Mo', 54340),
    ('Deposit_Credit_2d_Month', 18800),
    ('Two_Mo_Expenses_Select_Categories', 61520),
    ('Two_Mo_Income', 73140),
    ('Two_Mo_DTI_Ratio', .80),
    ('Limit_Requested', 5400),
    ('Free_Cash_Flow', 5810),
    ('Limit_Free_Cash_Ratio', 1.08),
])

# Named features (read by the preliminary check) and the same values as a
# single-row 2-D array (fed to the model).
individual_testing_data_features = pd.Series(new_customer)
individual_testing_data = individual_testing_data_features.values.reshape(1, -1)

probability_approval = 0.4

# 1 = print and plot the model coefficients while fitting.
find_features = 1

individualPrediction = IndividualPredictions(
    training_data=training_data,
    individual_testing_data=individual_testing_data,
    individual_testing_data_features=individual_testing_data_features,
    probability_approval=probability_approval,
    find_features=find_features,
    overdraft=overdraft,
)

# The preliminary check must run first: it sets the flag that
# individual_prediction() consults.
preliminary_status = individualPrediction.preliminary_check()
underwriting_prediction = individualPrediction.individual_prediction()

print(underwriting_prediction)