This post builds a model from loan data downloaded from the Lending Club website to predict the outcome of loan applications. The data are publicly provided on the company's site at: https://www.lendingclub.com/info/download-data.action
1. Data Processing
First, look at the number of samples and features in the dataset.
import pandas as pd
loans_2016S3 = pd.read_csv('2016S3.csv', skiprows=1)
print(loans_2016S3.shape)
(42538, 145)
The raw data downloaded from the site contain a large amount of missing data.
First, drop the features with many missing values.
# Keep only columns that are non-null in at least two-thirds of the rows
thresh_count = int(len(loans_2016S3) * 2 / 3)
# dropna keeps the columns with at least `thresh` non-null values
loans_2016S3 = loans_2016S3.dropna(thresh=thresh_count, axis=1)
print(loans_2016S3.shape)
(42538, 54)
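To see how the thresh argument behaves, here is a tiny self-contained sketch (a toy frame, not part of the original pipeline): a column is kept only if it has at least thresh non-null entries.
import pandas as pd
import numpy as np

# Toy frame: 'b' has only 1 of 3 values non-null, so thresh=2 drops it
df = pd.DataFrame({'a': [1, 2, 3], 'b': [np.nan, np.nan, 3], 'c': [4, 5, 6]})
print(df.dropna(thresh=2, axis=1).columns.tolist())  # ['a', 'c']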
Next, drop some features that I personally judge irrelevant to the outcome (this may be one reason the final model results are not ideal).
# Drop the desc column, a long free-text description of the reason for the loan
loans_2016S3 = loans_2016S3.drop(['desc'],axis=1)
loans_2016S3.to_csv('loans_2016S3.csv', index=False)
loans_2016S3 = pd.read_csv("loans_2016S3.csv")
# Drop features that duplicate other columns or describe the loan after issuance
loans_2016S3 = loans_2016S3.drop([ "funded_amnt", "funded_amnt_inv", "grade", "sub_grade", "emp_title", "issue_d"], axis=1)
# Drop identifiers and outstanding-principal/payment columns, which leak the outcome
loans_2016S3 = loans_2016S3.drop(["zip_code", "out_prncp", "out_prncp_inv", "total_pymnt", "total_pymnt_inv", "total_rec_prncp"], axis=1)
# Drop the remaining payment, recovery, and fee columns for the same reason
loans_2016S3 = loans_2016S3.drop(["total_rec_int", "total_rec_late_fee", "recoveries", "collection_recovery_fee", "last_pymnt_d", "last_pymnt_amnt"], axis=1)
print(loans_2016S3.shape)
print(loans_2016S3.info())
(42538, 35)
RangeIndex: 42538 entries, 0 to 42537
Data columns (total 35 columns):
loan_amnt                     42535 non-null float64
term                          42535 non-null object
int_rate                      42535 non-null object
installment                   42535 non-null float64
emp_length                    42535 non-null object
home_ownership                42535 non-null object
annual_inc                    42531 non-null float64
verification_status           42535 non-null object
loan_status                   42535 non-null object
pymnt_plan                    42535 non-null object
purpose                       42535 non-null object
title                         42523 non-null object
addr_state                    42535 non-null object
dti                           42535 non-null float64
delinq_2yrs                   42506 non-null float64
earliest_cr_line              42506 non-null object
inq_last_6mths                42506 non-null float64
open_acc                      42506 non-null float64
pub_rec                       42506 non-null float64
revol_bal                     42535 non-null float64
revol_util                    42445 non-null object
total_acc                     42506 non-null float64
initial_list_status           42535 non-null object
last_credit_pull_d            42531 non-null object
collections_12_mths_ex_med    42390 non-null float64
policy_code                   42535 non-null float64
application_type              42535 non-null object
acc_now_delinq                42506 non-null float64
chargeoff_within_12_mths      42390 non-null float64
delinq_amnt                   42506 non-null float64
pub_rec_bankruptcies          41170 non-null float64
tax_liens                     42430 non-null float64
hardship_flag                 42535 non-null object
disbursement_method           42535 non-null object
debt_settlement_flag          42535 non-null object
dtypes: float64(17), object(18)
memory usage: 11.4+ MB
None
Look at the loan outcomes in the dataset:
# Print the loan-status categories and their counts
print(loans_2016S3['loan_status'].value_counts())
Fully Paid                                             34116
Charged Off                                             5670
Does not meet the credit policy. Status:Fully Paid      1988
Does not meet the credit policy. Status:Charged Off      761
Name: loan_status, dtype: int64
We only need the loans that were ultimately paid off or charged off; drop the other statuses, and label the Fully Paid samples as 1 and the Charged Off samples as 0.
# Keep only the final outcomes; drop the loans that are still in progress
loans_2016S3 = loans_2016S3[(loans_2016S3['loan_status'] == "Fully Paid") | (loans_2016S3['loan_status'] == "Charged Off")]
status_replace = {
"loan_status" : { "Fully Paid": 1,
"Charged Off": 0, }
}
# Replace Fully Paid with 1 and Charged Off with 0
loans_2016S3 = loans_2016S3.replace(status_replace)
print(loans_2016S3.shape)
(39786, 35)
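A quick sanity check (assuming the steps above ran as shown) confirms the relabeling: the counts should match the Fully Paid and Charged Off totals printed earlier.
# Labels should now be only 1 (Fully Paid, 34116) and 0 (Charged Off, 5670)
print(loans_2016S3['loan_status'].value_counts())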
Drop the columns whose values are all identical, since such features have no influence on the result.
# Drop single-valued columns, which carry no information for prediction
orig_columns = loans_2016S3.columns
drop_columns = []
for col in orig_columns:
    # check whether the column holds a single value once NaNs are dropped
    col_series = loans_2016S3[col].dropna().unique()
    if len(col_series) == 1:
        drop_columns.append(col)
# Drop the single-valued features
loans_2016S3 = loans_2016S3.drop(drop_columns, axis=1)
# Print the single-valued features
print(drop_columns)
# Print how many features remain after dropping them
print(loans_2016S3.shape)
['pymnt_plan', 'initial_list_status', 'collections_12_mths_ex_med', 'policy_code', 'application_type', 'acc_now_delinq', 'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens', 'hardship_flag', 'disbursement_method']
(39786, 24)
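The same single-value filter can be written more compactly with nunique, which ignores NaNs by default; this is an equivalent sketch, not a required extra step.
# One-liner equivalent of the loop above
single_valued = [c for c in loans_2016S3.columns if loans_2016S3[c].nunique() == 1]
loans_2016S3 = loans_2016S3.drop(single_valued, axis=1)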
Count the missing values in each feature: if a feature has only a few missing values, drop the affected rows; if it has many, drop the feature itself.
loans = loans_2016S3  # shorter alias used from here on
null_counts = loans.isnull().sum()  # count the null values in each column
print(null_counts)
loans = loans.drop("pub_rec_bankruptcies", axis=1)  # many nulls: drop the whole column
loans = loans.dropna(axis=0)  # few nulls: drop the affected rows
loan_amnt                 0
term                      0
int_rate                  0
installment               0
emp_length                0
home_ownership            0
annual_inc                0
verification_status       0
loan_status               0
purpose                   0
title                    10
addr_state                0
dti                       0
delinq_2yrs               0
earliest_cr_line          0
inq_last_6mths            0
open_acc                  0
pub_rec                   0
revol_bal                 0
revol_util               50
total_acc                 0
last_credit_pull_d        2
pub_rec_bankruptcies    697
debt_settlement_flag      0
dtype: int64
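Expressing the counts as ratios (computed on the frame before the drops) makes the column-versus-row decision easier to justify; a small sketch. pub_rec_bankruptcies is missing in roughly 1.8% of rows, an order of magnitude more than any other column, so it alone is dropped as a column.
# Fraction of missing values per column, largest first
null_ratio = loans.isnull().mean().sort_values(ascending=False)
print(null_ratio[null_ratio > 0])  # pub_rec_bankruptcies ~0.018; every other column < 0.002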
Count the dtypes across all features:
# Count how many features there are of each dtype
print(loans.dtypes.value_counts())
object     12
float64    10
int64       1
dtype: int64
# Select the object-dtype features and print their first row
object_columns_df = loans.select_dtypes(include=["object"])
print(object_columns_df.iloc[0])
term                     36 months
int_rate                    10.65%
emp_length               10+ years
home_ownership                RENT
verification_status       Verified
purpose                credit_card
title                     Computer
addr_state                      AZ
earliest_cr_line          Jan-1985
revol_util                   83.7%
last_credit_pull_d        Jan-2018
debt_settlement_flag             N
Name: 0, dtype: object
The code below handles the object-dtype features whose values mix numbers with other characters.
# First handle the features whose values embed numbers
# Drop redundant columns
loans = loans.drop(["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"], axis=1)
# Strip the percent sign and cast to float
loans["int_rate"] = loans["int_rate"].str.rstrip("%").astype("float")
loans["revol_util"] = loans["revol_util"].str.rstrip("%").astype("float")
# Map employment length onto integers
mapping_dict = {
"emp_length": {
"10+ years": 10,
"9 years": 9,
"8 years": 8,
"7 years": 7,
"6 years": 6,
"5 years": 5,
"4 years": 4,
"3 years": 3,
"2 years": 2,
"1 year": 1,
"< 1 year": 0,
"n/a": 0
}
}
loans = loans.replace(mapping_dict)
# For term, strip the trailing ' months' and keep just the number of months
loans['term'] = loans['term'].apply(lambda x: int(x[:-7]))
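Slicing off the last seven characters works only while every string ends in exactly ' months'; extracting the digits with a regex is a more robust alternative sketch (run instead of, not after, the line above).
# Pull out the leading number regardless of surrounding text
loans['term'] = loans['term'].str.extract(r'(\d+)', expand=False).astype(int)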
Check the remaining object-dtype features:
object_columns_df = loans.select_dtypes(include=["object"])
# Print the first row of the object-dtype features
print(object_columns_df.iloc[0])
home_ownership                  RENT
verification_status         Verified
purpose                  credit_card
debt_settlement_flag               N
Name: 0, dtype: object
Look at the value categories of these four features and count each value:
cols = ['home_ownership', 'verification_status', "purpose", "debt_settlement_flag"]
for c in cols:
    print(loans[c].value_counts())  # print each object feature's values and counts
    print("================================")
RENT        18881
MORTGAGE    17688
OWN          3056
OTHER          96
NONE            3
Name: home_ownership, dtype: int64
================================
Not Verified       16890
Verified           12833
Source Verified    10001
Name: verification_status, dtype: int64
================================
debt_consolidation    18661
credit_card            5134
other                  3985
home_improvement       2980
major_purchase         2182
small_business         1827
car                    1549
wedding                 947
medical                 693
moving                  581
house                   382
vacation                380
educational             320
renewable_energy        103
Name: purpose, dtype: int64
================================
N    39576
Y      148
Name: debt_settlement_flag, dtype: int64
================================
One-hot encode these features and their values:
# get_dummies turns each string-valued categorical column into indicator columns
cat_columns = ["home_ownership", "verification_status", "purpose", "debt_settlement_flag"]
dummy_df = pd.get_dummies(loans[cat_columns])
loans = pd.concat([loans, dummy_df], axis=1)
loans = loans.drop(cat_columns, axis=1)
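One design note: encoding every level (for example both debt_settlement_flag_N and debt_settlement_flag_Y) makes the dummy columns perfectly collinear. Regularized logistic regression tolerates this, but dropping one level per category avoids it; a sketch of the alternative:
# drop_first=True keeps k-1 dummies for a k-level category
dummy_df = pd.get_dummies(loans[cat_columns], drop_first=True)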
Check the final result of the data processing:
# Inspect the final processed dataset
print(loans.info())
print(loans.shape)
loans[0:20]
RangeIndex: 39724 entries, 0 to 39723
Data columns (total 39 columns):
loan_amnt                              39724 non-null float64
term                                   39724 non-null int64
int_rate                               39724 non-null float64
installment                            39724 non-null float64
emp_length                             39724 non-null int64
annual_inc                             39724 non-null float64
loan_status                            39724 non-null int64
dti                                    39724 non-null float64
delinq_2yrs                            39724 non-null float64
inq_last_6mths                         39724 non-null float64
open_acc                               39724 non-null float64
pub_rec                                39724 non-null float64
revol_bal                              39724 non-null float64
revol_util                             39724 non-null float64
total_acc                              39724 non-null float64
home_ownership_MORTGAGE                39724 non-null int64
home_ownership_NONE                    39724 non-null int64
home_ownership_OTHER                   39724 non-null int64
home_ownership_OWN                     39724 non-null int64
home_ownership_RENT                    39724 non-null int64
verification_status_Not Verified       39724 non-null int64
verification_status_Source Verified    39724 non-null int64
verification_status_Verified           39724 non-null int64
purpose_car                            39724 non-null int64
purpose_credit_card                    39724 non-null int64
purpose_debt_consolidation             39724 non-null int64
purpose_educational                    39724 non-null int64
purpose_home_improvement               39724 non-null int64
purpose_house                          39724 non-null int64
purpose_major_purchase                 39724 non-null int64
purpose_medical                        39724 non-null int64
purpose_moving                         39724 non-null int64
purpose_other                          39724 non-null int64
purpose_renewable_energy               39724 non-null int64
purpose_small_business                 39724 non-null int64
purpose_vacation                       39724 non-null int64
purpose_wedding                        39724 non-null int64
debt_settlement_flag_N                 39724 non-null int64
debt_settlement_flag_Y                 39724 non-null int64
dtypes: float64(12), int64(27)
memory usage: 11.8 MB
None
(39724, 39)
Finally, standardize the feature values.
# Separate the features from the label
cols = loans.columns
train_cols = cols.drop("loan_status")
# label
target = loans["loan_status"]
# features
features = loans[train_cols]
# standardize to zero mean and unit variance
features = (features - features.mean(axis=0)) / (features.std(axis=0))
features["loan_status"] = target
print(features.shape)
features.to_csv('dealed_loans_2016S3--7.csv', index=False)
(39724, 39)
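Note that standardizing with the mean and std of the whole dataset lets test-set statistics leak into training. A leakage-free variant (a sketch using scikit-learn, with the train_cols and target names defined above) fits the scaler on the training split only:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_tr, X_te, y_tr, y_te = train_test_split(loans[train_cols], target, test_size=0.3, random_state=0)
scaler = StandardScaler().fit(X_tr)   # statistics come from the training split only
X_tr_scaled = scaler.transform(X_tr)
X_te_scaled = scaler.transform(X_te)  # reuse the training statistics on the test split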
2. Model Analysis
Part 1 preprocessed the data in preparation for modeling; now build the model and make predictions.
import pandas as pd
import matplotlib.pyplot as plt
# Font settings so that non-ASCII labels and minus signs render correctly
plt.rcParams['font.sans-serif'] = 'SimHei'
plt.rcParams['axes.unicode_minus'] = False
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix,recall_score,classification_report
%matplotlib inline
data = pd.read_csv("dealed_loans_2016S3--7.csv")
data.head()
The output looks like this:
loan_amnt term int_rate installment emp_length annual_inc dti delinq_2yrs inq_last_6mths open_acc ... purpose_medical purpose_moving purpose_other purpose_renewable_energy purpose_small_business purpose_vacation purpose_wedding debt_settlement_flag_N debt_settlement_flag_Y loan_status
0 -0.836004 -0.606506 -0.368436 -0.775832 1.430128 -0.705878 2.146047 -0.298007 0.122478 -1.432408 ... -0.133247 -0.12183 -0.333916 -0.050986 -0.219564 -0.098276 -0.156272 0.061152 -0.061152 1
1 -1.170923 1.648746 0.871349 -1.268983 -1.342721 -0.611784 -1.845732 -0.298007 3.859990 -1.432408 ... -0.133247 -0.12183 -0.333916 -0.050986 -0.219564 -0.098276 -0.156272 0.061152 -0.061152 0
2 -1.184319 -0.606506 1.056512 -1.151726 1.430128 -0.890114 -0.689390 -0.298007 1.056856 -1.659771 ... -0.133247 -0.12183 -0.333916 -0.050986 4.554363 -0.098276 -0.156272 0.061152 -0.061152 1
3 -0.166166 -0.606506 0.393683 0.068614 1.430128 -0.310683 1.000189 -0.298007 0.122478 0.159132 ... -0.133247 -0.12183 2.994688 -0.050986 -0.219564 -0.098276 -0.156272 0.061152 -0.061152 1
4 -1.103939 1.648746 0.179001 -1.230886 -1.065436 0.172334 0.691631 -0.298007 -0.811900 1.295947 ... -0.133247 -0.12183 2.994688 -0.050986 -0.219564 -0.098276 -0.156272 0.061152 -0.061152 1
5 rows × 39 columns
使用pd进行简单的画图,查看数据的不平衡型
count_classes = pd.value_counts(data['loan_status'], sort = True).sort_index()
print(count_classes)
count_classes.plot(kind = 'bar')
plt.title("Loan application outcomes")
plt.xlabel("loan_status")
plt.ylabel("Frequency")
X = data.loc[:, data.columns != 'loan_status']
print(X[0:2])
y = data.loc[:, data.columns == 'loan_status']
print(y[0:2])
   loan_amnt      term  int_rate  installment  emp_length  annual_inc  \
0  -0.836004 -0.606506 -0.368436    -0.775832    1.430128   -0.705878
1  -1.170923  1.648746  0.871349    -1.268983   -1.342721   -0.611784

        dti  delinq_2yrs  inq_last_6mths  open_acc  ...
0  2.146047    -0.298007        0.122478 -1.432408  ...
1 -1.845732    -0.298007        3.859990 -1.432408  ...

   purpose_major_purchase  purpose_medical  purpose_moving  purpose_other  \
0               -0.241081        -0.133247        -0.12183      -0.333916
1               -0.241081        -0.133247        -0.12183      -0.333916

   purpose_renewable_energy  purpose_small_business  purpose_vacation  \
0                 -0.050986               -0.219564         -0.098276
1                 -0.050986               -0.219564         -0.098276

   purpose_wedding  debt_settlement_flag_N  debt_settlement_flag_Y
0        -0.156272                0.061152               -0.061152
1        -0.156272                0.061152               -0.061152

[2 rows x 38 columns]
   loan_status
0            1
1            0
Split the dataset into training and test sets:
from sklearn.model_selection import train_test_split
# Whole dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 0)
print("Number transactions train dataset: ", len(X_train))
print("Number transactions test dataset: ", len(X_test))
print("Total number of transactions: ", len(X_train)+len(X_test))
Number transactions train dataset: 27806
Number transactions test dataset: 11918
Total number of transactions: 39724
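With roughly 86% of samples labeled 1, a plain random split can shift the class ratio between the two sets; passing stratify preserves it (an optional sketch):
# stratify=y keeps the 1:0 ratio identical in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0, stratify = y)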
Write a function to search for the best hyperparameters:
def printing_Kfold_scores(x_train_data, y_train_data):
    # 5-fold cross-validation (KFold from sklearn.model_selection)
    fold = KFold(n_splits=5, shuffle=False)
    penalty_range = ['l1', 'l2']
    c_param_range = [0.05, 0.1, 1, 10, 25, 50, 100]
    results_table = pd.DataFrame(index=range(len(penalty_range) * len(c_param_range)),
                                 columns=['C_parameter', 'Mean recall score'])
    j = 0
    for penalty in penalty_range:
        for c_param in c_param_range:
            TPR = []  # recall (true positive rate) per fold
            TNR = []  # true negative rate per fold
            for train_idx, test_idx in fold.split(x_train_data):
                # liblinear supports both the l1 and l2 penalties
                lr = LogisticRegression(C=c_param, penalty=penalty, solver='liblinear')
                lr.fit(x_train_data.iloc[train_idx, :], y_train_data.iloc[train_idx, :].values.ravel())
                y_pred = lr.predict(x_train_data.iloc[test_idx, :].values)
                # confusion matrix on the validation fold
                cnf_matrix = confusion_matrix(y_train_data.iloc[test_idx, :].values.ravel(), y_pred)
                np.set_printoptions(precision=2)
                tpr = cnf_matrix[1, 1] / (cnf_matrix[1, 0] + cnf_matrix[1, 1])  # recall (TPR)
                tnr = cnf_matrix[0, 0] / (cnf_matrix[0, 0] + cnf_matrix[0, 1])  # true negative rate
                TPR.append(tpr)
                TNR.append(tnr)
            results_table.loc[j, 'C_parameter'] = c_param
            results_table.loc[j, 'Mean recall score'] = np.mean(TPR)
            j += 1
            print('penalty =', penalty, '\tC =', c_param, '\tmean recall =', np.mean(TPR), '\tmean TNR =', np.mean(TNR))
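The same search can be expressed with GridSearchCV, which runs the fold loop and the scoring internally; this is an equivalent sketch, with scoring='recall' matching the mean-recall criterion above.
from sklearn.model_selection import GridSearchCV

param_grid = {'penalty': ['l1', 'l2'], 'C': [0.05, 0.1, 1, 10, 25, 50, 100]}
grid = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, scoring='recall', cv=5)
grid.fit(X_train, y_train.values.ravel())
print(grid.best_params_, grid.best_score_)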
Plot the confusion matrix:
import itertools

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
"""
This function prints and plots the confusion matrix.
"""
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes, rotation=0)
plt.yticks(tick_marks, classes)
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, cm[i, j], horizontalalignment="center", color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
printing_Kfold_scores(X_train,y_train)
Pick the best parameters from the results and build the classifier:
lr = LogisticRegression(C = 100, penalty = 'l1', solver = 'liblinear')
lr.fit(X_train,y_train.values.ravel())
y_pred_undersample = lr.predict(X_test.values)
# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test,y_pred_undersample)
np.set_printoptions(precision=2)
print("真负率: ", cnf_matrix[0,0]/(cnf_matrix[0,0]+cnf_matrix[0,1]))
print("召回率: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
print("准确率: ", (cnf_matrix[1,1] + cnf_matrix[0,0])/(cnf_matrix[1,0]+cnf_matrix[1,1]+cnf_matrix[0,0]+cnf_matrix[0,1]))
# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
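classification_report, already imported above, condenses precision, recall, and F1 per class into one call (a usage sketch on the same predictions):
# Per-class precision/recall/F1; label 0 is Charged Off, label 1 is Fully Paid
print(classification_report(y_test, y_pred_undersample, target_names=['Charged Off', 'Fully Paid']))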
# Label 1 (Fully Paid) is the majority class here; the "fraud"/"normal" variable
# names are carried over from a credit-card fraud example
number_records_fraud = len(data[data.loan_status == 1])
print(number_records_fraud)
fraud_indices = np.array(data[data.loan_status == 1].index)
34073
Find the indices and count of the samples labeled 0 (the small Charged Off class):
normal_indices = data[data.loan_status == 0].index
number_normal_indices = len(normal_indices)
print(number_normal_indices)
normal_indices
# Draw as many majority (label 1) samples as there are minority samples, without replacement
random_normal_indices = np.random.choice(fraud_indices, number_normal_indices, replace = False)
random_normal_indices = np.array(random_normal_indices)
print(len(random_normal_indices))
random_normal_indices[0:20]
under_sample_indices = np.concatenate([normal_indices,random_normal_indices])
print(len(under_sample_indices))
11302
Use the merged indices to pull the undersampled data from the original dataset:
under_sample_data = data.iloc[under_sample_indices,:]
under_sample_data.head()
loan_amnt term int_rate installment emp_length annual_inc dti delinq_2yrs inq_last_6mths open_acc ... purpose_medical purpose_moving purpose_other purpose_renewable_energy purpose_small_business purpose_vacation purpose_wedding debt_settlement_flag_N debt_settlement_flag_Y loan_status
1 -1.170923 1.648746 0.871349 -1.268983 -1.342721 -0.611784 -1.845732 -0.298007 3.859990 -1.432408 ... -0.133247 -0.12183 -0.333916 -0.050986 -0.219564 -0.098276 -0.156272 0.061152 -0.061152 0
8 -0.755623 1.648746 2.484143 -0.825989 -0.233581 -0.454960 -1.164209 -0.298007 1.056856 0.386495 ... -0.133247 -0.12183 -0.333916 -0.050986 4.554363 -0.098276 -0.156272 0.061152 -0.061152 0
9 -0.785766 1.648746 0.179001 -0.974069 -1.342721 -0.847019 0.712601 -0.298007 -0.811900 -1.659771 ... -0.133247 -0.12183 2.994688 -0.050986 -0.219564 -0.098276 -0.156272 0.061152 -0.061152 0
12 -0.300134 -0.606506 0.393683 -0.093776 -1.342721 -0.611784 -0.485682 -0.298007 0.122478 -1.205045 ... -0.133247 -0.12183 -0.333916 -0.050986 -0.219564 -0.098276 -0.156272 0.061152 -0.061152 0
14 -0.166166 -0.606506 -0.368436 0.003667 -0.510866 0.485981 -0.938033 -0.298007 1.056856 1.068584 ... -0.133247 -0.12183 2.994688 -0.050986 -0.219564 -0.098276 -0.156272 0.061152 -0.061152 0
X_undersample = under_sample_data.loc[:, under_sample_data.columns != 'loan_status']
y_undersample = under_sample_data.loc[:, under_sample_data.columns == 'loan_status']
print("Percentage of normal transactions: ", len(under_sample_data[under_sample_data.loan_status == 0])/len(under_sample_data))
print("Percentage of fraud transactions: ", len(under_sample_data[under_sample_data.loan_status == 1])/len(under_sample_data))
print("Total number of transactions in resampled data: ", len(under_sample_data))
Percentage of normal transactions:  0.5
Percentage of fraud transactions:  0.5
Total number of transactions in resampled data:  11302
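The index juggling above can also be done with a pandas-native draw of an equal-sized sample per class (an alternative sketch; DataFrameGroupBy.sample requires pandas >= 1.1):
# Draw min-class-size rows from each class, without replacement
n_min = data['loan_status'].value_counts().min()
balanced = data.groupby('loan_status', group_keys=False).sample(n=n_min, random_state=0)
print(balanced['loan_status'].value_counts())  # both classes equal to n_min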
count_classes = pd.value_counts(under_sample_data['loan_status'], sort = True).sort_index()
print(count_classes)
count_classes.plot(kind = 'bar')
X_train_undersample, X_test_undersample, \
y_train_undersample, y_test_undersample = train_test_split(X_undersample, y_undersample,test_size = 0.3,random_state = 0)
print("")
print("Number transactions train dataset: ", len(X_train_undersample))
print("Number transactions test dataset: ", len(X_test_undersample))
print("Total number of transactions: ", len(X_train_undersample)+len(X_test_undersample))
Number transactions train dataset:  7911
Number transactions test dataset:  3391
Total number of transactions:  11302
Find the best parameters for the undersampled dataset:
printing_Kfold_scores(X_train_undersample,y_train_undersample)
penalty = l1 	C = 0.05 	mean recall = 0.63856736542 	mean TNR = 0.647293869207
penalty = l1 	C = 0.1 	mean recall = 0.640638410934 	mean TNR = 0.643538009998
penalty = l1 	C = 1 	mean recall = 0.646274901414 	mean TNR = 0.640283210678
penalty = l1 	C = 10 	mean recall = 0.646529100623 	mean TNR = 0.640027914019
penalty = l1 	C = 25 	mean recall = 0.64653296593 	mean TNR = 0.640278855048
penalty = l1 	C = 50 	mean recall = 0.646791030446 	mean TNR = 0.640278855048
penalty = l1 	C = 100 	mean recall = 0.646791030446 	mean TNR = 0.640278855048
penalty = l2 	C = 0.05 	mean recall = 0.638309300904 	mean TNR = 0.647293869207
penalty = l2 	C = 0.1 	mean recall = 0.640638410934 	mean TNR = 0.643538009998
penalty = l2 	C = 1 	mean recall = 0.646274901414 	mean TNR = 0.640283210678
penalty = l2 	C = 10 	mean recall = 0.646529100623 	mean TNR = 0.640027914019
penalty = l2 	C = 25 	mean recall = 0.64653296593 	mean TNR = 0.640278855048
penalty = l2 	C = 50 	mean recall = 0.646791030446 	mean TNR = 0.640278855048
penalty = l2 	C = 100 	mean recall = 0.64653296593 	mean TNR = 0.640278855048
lr = LogisticRegression(C = 50, penalty = 'l1', solver = 'liblinear')
lr.fit(X_train_undersample,y_train_undersample.values.ravel())
y_pred_undersample = lr.predict(X_test_undersample.values)
cnf_matrix = confusion_matrix(y_test_undersample,y_pred_undersample)
np.set_printoptions(precision=2)
print("真负率: ", cnf_matrix[0, 0]/(cnf_matrix[0,0]+cnf_matrix[0,1]))
print("召回率: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Now apply the model trained on the undersampled data to the original, imbalanced test set:
lr = LogisticRegression(C = 50, penalty = 'l1', solver = 'liblinear')
lr.fit(X_train_undersample,y_train_undersample.values.ravel())
y_pred = lr.predict(X_test.values)
cnf_matrix = confusion_matrix(y_test,y_pred)
np.set_printoptions(precision=2)
print("真负率: ", cnf_matrix[0, 0]/(cnf_matrix[0,0]+cnf_matrix[0,1]))
print("召回率: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, title='Confusion matrix')
plt.show()
Next, try oversampling instead: SMOTE synthesizes new minority-class samples.
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
# The credit_cards name is again carried over from a credit-card fraud example
credit_cards=pd.read_csv('dealed_loans_2016S3--7.csv')
columns=credit_cards.columns
features_columns=columns.delete(len(columns)-1)
features=credit_cards[features_columns]
labels=credit_cards['loan_status']
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, test_size=0.2, random_state=0)
# Create the SMOTE object and oversample the training split only
oversampler=SMOTE(random_state=0)
# fit_resample is the current imblearn API (older versions called it fit_sample)
os_features,os_labels=oversampler.fit_resample(features_train,labels_train)
print(len(os_labels[os_labels==1]))
print(len(os_labels[os_labels==0]))
27280
27280
SMOTE produces as many negative samples as positive ones.
# Convert the resampled arrays back to DataFrames so printing_Kfold_scores can index them
os_features = pd.DataFrame(os_features)
os_labels = pd.DataFrame(os_labels)
# Search for the best parameters on the oversampled data
printing_Kfold_scores(os_features,os_labels)
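One caveat: printing_Kfold_scores here cross-validates on data that was already oversampled, so synthetic points derived from a validation fold can leak into its training folds. imblearn's Pipeline re-applies SMOTE inside each fold instead; a sketch:
from imblearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

pipe = Pipeline([('smote', SMOTE(random_state=0)),
                 ('lr', LogisticRegression(solver='liblinear'))])
# SMOTE is refit on the training part of every fold, never on validation data
print(cross_val_score(pipe, features_train, labels_train, scoring='recall', cv=5).mean())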
lr = LogisticRegression(C = 25, penalty = 'l1', solver = 'liblinear')
lr.fit(os_features,os_labels.values.ravel())
y_pred = lr.predict(features_test.values)
# Compute confusion matrix
cnf_matrix = confusion_matrix(labels_test,y_pred)
np.set_printoptions(precision=2)
print("真负率: ", cnf_matrix[0, 0]/(cnf_matrix[0,0]+cnf_matrix[0,1]))
print("召回率: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))
# Plot non-normalized confusion matrix
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names , title='Confusion matrix')
plt.show()
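As a closing note, resampling is not the only lever for class imbalance: class_weight='balanced' reweights the loss inversely to class frequency and often reaches similar recall with no sampling machinery at all (a sketch on the original imbalanced split; the C value is reused from above, not tuned for this variant):
# Cost-sensitive alternative to under/oversampling
lr_weighted = LogisticRegression(C = 25, penalty = 'l1', solver = 'liblinear', class_weight = 'balanced')
lr_weighted.fit(features_train, labels_train)
print(confusion_matrix(labels_test, lr_weighted.predict(features_test)))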