注:参考多篇csdn及b站文章所得
一、实验背景
某机构想要预测哪些客户可能会产生贷款违约行为。他们搜集了历史客户行为的部分数据以及目标客户的信息,希望利用历史数据预测目标客户中哪些是潜在的违约客户,从而缩小目标范围,实现低风险贷款发放。
搜集到的数据以.CSV存储,分别包括历史客户和目标客户两个文件。数据描述如下:
| 字段名 | 字段描述 | 数据类型 |
| --- | --- | --- |
| income | 客户收入 | int |
| age | 客户年龄 | int |
| experience | 工作年限 | int |
| profession | 职业 | string |
| married | 婚否 | string |
| house_ownership | 有房/租房/其它 | string |
| car_ownership | 是否有车 | string |
| risk_flag | 是否拖欠贷款 | int(0/1) |
| currentjobyears | 现有工作年限 | int |
| currenthouseyears | 在当前住所居住时长 | int |
| city | 居住城市 | string |
| state | 居住州/邦 | string |
二、实验内容
基于分类方法,根据客户历史行为预测潜在的贷款违约客户
三、实验步骤
1.导入数据
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
from sklearn.preprocessing import LabelEncoder
# Silence library warnings so the script output stays readable.
warnings.filterwarnings('ignore')

# Part 1: data import.
# The historic-customer file is used as the training set and the
# target-customer file as the set to predict on.
train_csv_path = 'D:/班级作业/数据挖掘/实验报告/实验二/archive/historic customer behavior.csv'
test_csv_path = 'D:/班级作业/数据挖掘/实验报告/实验二/archive/target customer.csv'
app_train = pd.read_csv(train_csv_path)
app_test = pd.read_csv(test_csv_path)
2.数据探索性分析
# Part 2: exploratory data analysis.
# Plotting helpers.
def plot_stats(feature, label_rotation=False, horizontal_layout=True):
    """Plot category counts and the default rate for one ``app_train`` column.

    The first panel shows how many rows fall into each value of ``feature``.
    The second panel shows, for each value, the mean of ``Risk_Flag`` expressed
    as a percentage, ordered from highest to lowest.

    Args:
        feature: Name of the ``app_train`` column to summarise.
        label_rotation: If true, rotate the x tick labels by 90 degrees.
        horizontal_layout: If true, place the panels side by side; otherwise
            stack them vertically.
    """
    counts = app_train[feature].value_counts()
    count_df = pd.DataFrame({feature: counts.index, 'Number of contracts': counts.values})

    # The per-category mean of Risk_Flag is the default rate (assumes the flag
    # is coded 0/1). Scale it to percent so the values match the axis label.
    cat_perc = app_train[[feature, 'Risk_Flag']].groupby([feature], as_index=False).mean()
    cat_perc['Risk_Flag'] = cat_perc['Risk_Flag'] * 100
    cat_perc = cat_perc.sort_values(by='Risk_Flag', ascending=False)

    if horizontal_layout:
        fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 6))
    else:
        fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(12, 14))

    sns.set_color_codes("pastel")
    sns.barplot(ax=ax1, x=feature, y="Number of contracts", data=count_df)
    sns.barplot(ax=ax2, x=feature, y='Risk_Flag', order=cat_perc[feature], data=cat_perc)
    if label_rotation:
        for axis in (ax1, ax2):
            axis.tick_params(axis='x', labelrotation=90)

    # Address the second panel explicitly instead of relying on pyplot's
    # notion of the "current" axes.
    ax2.set_ylabel('Percent of Risk_Flag with value 1 [%]', fontsize=10)
    ax2.tick_params(axis='both', which='major', labelsize=10)
    fig.subplots_adjust(left=0.05, top=0.91, bottom=0.09)
    plt.show()
def plot_distribution(var):
    """Draw per-class density curves for every feature named in ``var``.

    Each feature gets its own panel with two kernel density estimates: one for
    rows of ``app_train`` whose ``Risk_Flag`` is non-zero and one for rows
    whose ``Risk_Flag`` is zero.

    Args:
        var: Iterable of ``app_train`` column names to plot.
    """
    features = list(var)
    flagged = app_train.loc[app_train['Risk_Flag'] != 0]
    unflagged = app_train.loc[app_train['Risk_Flag'] == 0]

    sns.set_style('whitegrid')
    # Two columns; at least the original 2x2 grid, growing by rows so that
    # more than four features no longer overflow the layout.
    n_rows = max(2, -(-len(features) // 2))
    fig, axes = plt.subplots(n_rows, 2, figsize=(6, 3 * n_rows))

    for ax, feature in zip(axes.flat, features):
        # ``bw_method`` replaces the deprecated ``bw`` keyword of kdeplot.
        sns.kdeplot(flagged[feature], bw_method=0.5, label="Risk_Flag = 1", ax=ax)
        sns.kdeplot(unflagged[feature], bw_method=0.5, label="Risk_Flag = 0", ax=ax)
        ax.set_ylabel('Density plot', fontsize=12)
        ax.set_xlabel(feature, fontsize=12)
        ax.tick_params(axis='both', which='major', labelsize=12)
        # Without an explicit legend the two curves cannot be told apart.
        ax.legend()
    plt.show()
# 1. Effect of age: kernel density estimate of Age for each Risk_Flag value.
plt.figure(figsize=(5, 6))
# Customers whose Risk_Flag is 0.
sns.kdeplot(app_train.loc[app_train['Risk_Flag'] == 0, 'Age'], label='Risk_Flag == 0')
# Customers whose Risk_Flag is 1.
sns.kdeplot(app_train.loc[app_train['Risk_Flag'] == 1, 'Age'], label='Risk_Flag == 1')
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages')
# Show the legend so the two curves can be told apart.
plt.legend()
plt.gcf().subplots_adjust(left=0.05, right=0.91, top=0.9, bottom=0.09)
plt.show()

# Count / default-rate bar charts per feature.
# Age
plot_stats('Age', False, False)
# 2. Income
plot_stats('Income', False, False)
# 3. Car and house ownership. Use the column names this file uses elsewhere
# for this dataset (Car_Ownership / House_Ownership) instead of
# FLAG_OWN_CAR / FLAG_OWN_REALTY.
plot_stats('Car_Ownership')
plot_stats('House_Ownership')
# 4. Marital status
plot_stats('Married/Single', True, True)
# 5. Work experience
plot_stats('Experience', False, False)
1)用户年龄特征探索
数据表处理结果:
图2:不同违约情况(Risk_Flag)下用户年龄分布的核密度估计(KDE)图
分析:
由图2可知,违约用户中20-30岁的年轻用户分布更多,因此可以假设用户年龄越小,违约的可能性越大;
由图3可知,对用户年龄进行分桶后进一步观察不同年龄段用户的违约概率,发现[20,25]、[25,30]年龄段的用户违约可能性最高,其余年龄段违约可能性相近;
2)用户有无房否
分析:由图可知,没有房的人比有房的人违约率更高
3) 用户有无车否
分析:由图可知,没有车的人比有车的人违约率更高,但相差并不大
4)用户婚否
分析:由图可知,未婚的人比已婚的违约率更高
5)用户收入
分析:由图可以看出,收入较低的人违约几率较高
6)用户工作经验
分析:由图可知工作时间较短的人违约几率较高
7)热力图
3.特征预处理
构建新特征DAYS_EMPLOYED_PERCENT: 用户工作年限experience/客户年龄
#三、特征预处理
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc
# Work on copies so the raw train/test frames stay untouched.
app_train_domain = app_train.copy()
app_test_domain = app_test.copy()

# Engineered feature: work experience divided by age.
app_train_domain['DAYS_EMPLOYED_PERCENT'] = (
    app_train_domain['Experience'] / app_train_domain['Age']
)

# One density panel per engineered feature, split by Risk_Flag.
engineered_features = ['DAYS_EMPLOYED_PERCENT']
flag_is_zero = app_train_domain['Risk_Flag'] == 0
flag_is_one = app_train_domain['Risk_Flag'] == 1

plt.figure(figsize=(10, 10))
for panel_number, feature in enumerate(engineered_features, start=1):
    plt.subplot(len(engineered_features), 1, panel_number)
    # KDE for rows with Risk_Flag == 0.
    sns.kdeplot(app_train_domain.loc[flag_is_zero, feature], label='Risk_Flag == 0')
    # KDE for rows with Risk_Flag == 1.
    sns.kdeplot(app_train_domain.loc[flag_is_one, feature], label='Risk_Flag == 1')
    plt.title('Distribution of %s by Target Value' % feature)
    plt.xlabel('%s' % feature)
    plt.ylabel('Density')
plt.tight_layout(h_pad=2.5)
plt.show()

# The test frame needs the same engineered column.
app_test_domain['DAYS_EMPLOYED_PERCENT'] = (
    app_test_domain['Experience'] / app_test_domain['Age']
)
4. 建模分析
#四、建模
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc
# 1. LightGBM classifier evaluated with K-fold cross-validation.
def _encode_features(features, test_features, encoding):
    """Encode categorical columns of the train and test frames consistently.

    Returns the encoded train frame, the encoded test frame (same columns in
    the same order) and the value to pass as LightGBM's
    ``categorical_feature`` argument.

    Raises:
        ValueError: If ``encoding`` is neither ``'ohe'`` nor ``'le'``.
    """
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        # Keep only the columns present in both frames, in the same order.
        features, test_features = features.align(test_features, join='inner', axis=1)
        return features, test_features, 'auto'

    if encoding == 'le':
        # Align first so the column positions recorded below are valid for
        # both frames.
        features, test_features = features.align(test_features, join='inner', axis=1)
        cat_indices = []
        for position, col in enumerate(features.columns):
            if features[col].dtype == 'object':
                train_values = features[col].astype(str)
                test_values = test_features[col].astype(str)
                # Fit on the union of both frames: a category that occurs only
                # in the test data would otherwise make transform() raise.
                encoder = LabelEncoder()
                encoder.fit(pd.concat([train_values, test_values]))
                features[col] = encoder.transform(train_values)
                test_features[col] = encoder.transform(test_values)
                cat_indices.append(position)
        return features, test_features, cat_indices

    raise ValueError("Encoding must be either 'ohe' or 'le'")


def model(features, test_features, encoding='ohe', n_folds=5):
    """Train LightGBM with K-fold cross-validation and predict on the test set.

    Args:
        features: Training frame; must contain the ``Id`` and ``Risk_Flag``
            columns.
        test_features: Test frame; must contain the ``ID`` column.
        encoding: ``'ohe'`` for one-hot encoding or ``'le'`` for integer label
            encoding of the categorical columns.
        n_folds: Number of cross-validation folds.

    Returns:
        A tuple ``(submission, feature_importances, metrics)``:
        ``submission`` holds ``ID`` and the fold-averaged predicted probability
        as ``Risk_Flag``; ``feature_importances`` holds the fold-averaged
        importance per feature; ``metrics`` holds the train/valid AUC per fold
        plus an ``overall`` row.

    Raises:
        ValueError: If ``encoding`` is neither ``'ohe'`` nor ``'le'``.
    """
    test_ids = test_features['ID']
    # Plain NumPy array: the fold indices produced by KFold are positional, so
    # the labels must not be looked up through a pandas index.
    labels = np.array(features['Risk_Flag'])

    features = features.drop(columns=['Id', 'Risk_Flag'])
    test_features = test_features.drop(columns=['ID'])

    features, test_features, cat_indices = _encode_features(features, test_features, encoding)

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    feature_names = list(features.columns)
    # Force a float matrix; mixed bool/int dummy columns would otherwise
    # produce an object-dtype array.
    features = np.array(features, dtype=float)
    test_features = np.array(test_features, dtype=float)

    k_fold = KFold(n_splits=n_folds, shuffle=True, random_state=50)

    feature_importance_values = np.zeros(len(feature_names))
    test_predictions = np.zeros(test_features.shape[0])
    out_of_fold = np.zeros(features.shape[0])
    valid_scores = []
    train_scores = []

    for train_indices, valid_indices in k_fold.split(features):
        train_features, train_labels = features[train_indices], labels[train_indices]
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Named ``clf`` so the estimator does not shadow this function's name.
        clf = lgb.LGBMClassifier(n_estimators=1000, objective='binary',
                                 class_weight='balanced', learning_rate=0.05,
                                 reg_alpha=0.1, reg_lambda=0.1,
                                 subsample=0.8, n_jobs=-1, random_state=50)

        # Early stopping and periodic logging are passed as callbacks; the
        # ``early_stopping_rounds`` / ``verbose`` keywords of fit() no longer
        # exist in LightGBM 4.
        clf.fit(train_features, train_labels, eval_metric='auc',
                eval_set=[(valid_features, valid_labels), (train_features, train_labels)],
                eval_names=['valid', 'train'], categorical_feature=cat_indices,
                callbacks=[lgb.early_stopping(stopping_rounds=100),
                           lgb.log_evaluation(period=200)])

        # Predict with the iteration selected by early stopping.
        best_iteration = clf.best_iteration_

        feature_importance_values += clf.feature_importances_ / k_fold.n_splits
        test_predictions += clf.predict_proba(test_features, num_iteration=best_iteration)[:, 1] / k_fold.n_splits
        out_of_fold[valid_indices] = clf.predict_proba(valid_features, num_iteration=best_iteration)[:, 1]

        valid_scores.append(clf.best_score_['valid']['auc'])
        train_scores.append(clf.best_score_['train']['auc'])

        # Free the per-fold objects before the next fold allocates its own.
        del clf, train_features, valid_features
        gc.collect()

    submission = pd.DataFrame({'ID': test_ids, 'Risk_Flag': test_predictions})
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation score is the AUC of the stitched out-of-fold
    # predictions; overall train score is the mean of the per-fold scores.
    valid_scores.append(roc_auc_score(labels, out_of_fold))
    train_scores.append(np.mean(train_scores))

    fold_names = list(range(n_folds))
    fold_names.append('overall')

    metrics = pd.DataFrame({'fold': fold_names,
                            'train': train_scores,
                            'valid': valid_scores})
    return submission, feature_importances, metrics
# Train with the model() helper (5 folds by default) and report the
# cross-validation scores.
submission, fi, metrics = model(app_train_domain, app_test_domain)
print('Baseline metrics')
print(metrics)
submission.to_csv('my_submission1.csv', index=False)
# Release the engineered frames. gc.collect must actually be called: the bare
# name ``gc.collect`` is an expression that does nothing.
del app_train_domain, app_test_domain
gc.collect()
def plot_feature_importances(df):
    """Show the 15 most important features as a horizontal bar chart.

    Args:
        df: Frame with a ``feature`` column and an ``importance`` column.

    Returns:
        The frame sorted by descending importance with its index reset (the
        previous index is kept as a column) and an added
        ``importance_normalized`` column holding each importance divided by
        the total importance.
    """
    ranked = df.sort_values('importance', ascending=False).reset_index()
    total_importance = ranked['importance'].sum()
    ranked['importance_normalized'] = ranked['importance'] / total_importance

    top = ranked.head(15)
    # Reversed bar positions put the most important feature at the top.
    positions = list(top.index)[::-1]

    plt.figure(figsize=(5, 3))
    ax = plt.subplot()
    ax.barh(positions, top['importance_normalized'], align='center', edgecolor='k')
    ax.set_yticks(positions)
    ax.set_yticklabels(top['feature'])

    plt.xlabel('Normalized Importance')
    plt.title('Feature Importances')
    plt.show()
    return ranked
# Plot the feature importances returned by model() and keep the sorted,
# normalised table.
fi_sorted = plot_feature_importances(fi)
# 2. XGBoost model.
# Inspect the categories of the three categorical columns that are encoded
# below. They are selected by name; a positional slice depends on the CSV
# column order.
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
enc.fit(app_train[['Married/Single', 'House_Ownership', 'Car_Ownership']])
enc.categories_  # bare expression: only displays when run as a notebook cell

# Manual ordinal encoding. NOTE(review): any value missing from these
# dictionaries becomes NaN - confirm against the categories above.
app_train['Married/Single'] = app_train['Married/Single'].map({'single': 0, 'married': 1})
app_train['House_Ownership'] = app_train['House_Ownership'].map({'norent_noown': 0, 'rented': 1, 'owned': 2})
app_train['Car_Ownership'] = app_train['Car_Ownership'].map({'no': 0, 'yes': 1})
app_train

# Standardise the numeric features (zero mean, unit variance).
# NOTE(review): the scaler is fitted on all rows before the train/test split
# below, so hold-out rows influence the scaling statistics.
from sklearn.preprocessing import StandardScaler
numeric_columns = ["Income", "Age", "Experience", "CURRENT_JOB_YRS", "CURRENT_HOUSE_YRS"]
app_train[numeric_columns] = StandardScaler().fit_transform(app_train[numeric_columns])
app_train

# Split into training and hold-out sets.
from sklearn.model_selection import train_test_split
X = app_train[
    ["Income", "Age", "Experience", "Married/Single", "House_Ownership", "Car_Ownership", "CURRENT_JOB_YRS",
     "CURRENT_HOUSE_YRS"]]
# 1-D target: a single-column DataFrame is a column vector, which
# scikit-learn estimators warn about.
y = app_train["Risk_Flag"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_test

# Train the XGBoost classifier.
from xgboost import XGBClassifier
print("---------------------xgboost forest---------------------")
# ``n_estimators`` was misspelled, so the requested tree count was never
# applied. ``early_stopping_rounds`` is dropped: no validation set is passed
# to fit(), and 200 rounds could never trigger within 180 trees.
# ``n_jobs`` replaces the deprecated ``nthread`` alias.
xgbc = XGBClassifier(n_estimators=180, n_jobs=-1, max_depth=10).fit(X_train, y_train)
# score() reports mean accuracy.
print("训练集精度:")
result = xgbc.score(X_train, y_train)
print(result)
print("验证集精度:")
result = xgbc.score(X_test, y_test)
print(result)
# ROC curve for the XGBoost model.
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
# Score the hold-out set with the already-fitted model; refitting on the same
# data here only repeated the training.
y_score = xgbc.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
# Best operating point: maximise Youden's J statistic (tpr - fpr).
right_index = (tpr + (1 - fpr) - 1)
index = int(np.argmax(right_index))
# ``yuzhi`` ("threshold") is the probability cut-off at that point, not the
# J value itself.
yuzhi = thresholds[index]
tpr_val = tpr[index]
fpr_val = fpr[index]
# Draw the ROC curve with the chance diagonal for reference.
plt.subplots(figsize=(7, 5.5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.grid()
plt.show()
# 3. Random forest classifier.
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
print("---------------------random forest---------------------")
# Build the estimator, then fit it; fit() returns the estimator itself, so
# this is the same as chaining the two calls.
rf = RandomForestClassifier(max_depth=20, n_estimators=22, random_state=0)
rf.fit(X_train, y_train)
# score() reports mean accuracy; print it for the training and hold-out sets.
for heading, inputs, targets in (("训练集精度:", X_train, y_train),
                                 ("验证集精度:", X_test, y_test)):
    print(heading)
    result = rf.score(inputs, targets)
    print(result)
# ROC curve for the random forest model.
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
# Score the hold-out set with the already-fitted forest; refitting on the same
# data here only repeated the training.
y_score = rf.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)
# Best operating point: maximise Youden's J statistic (tpr - fpr).
right_index = (tpr + (1 - fpr) - 1)
index = int(np.argmax(right_index))
# ``yuzhi`` ("threshold") is the probability cut-off at that point, not the
# J value itself.
yuzhi = thresholds[index]
tpr_val = tpr[index]
fpr_val = fpr[index]
# Draw the ROC curve with the chance diagonal for reference.
plt.subplots(figsize=(7, 5.5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.grid()
plt.show()
1)利用LightGBM模型进行建模预测
2)利用随机森林模型进行建模预测
3)利用xgboost模型进行建模预测
四、实验总结及结果讨论分析
1、数据探索性分析
可以得出用户年龄、有无房或车以及婚否、收入、工作经验对其欠贷违约率的影响:
1)年龄[20,30]的用户违约的可能性最高,其余年龄段违约可能性相近,也可以近似看成年龄越小,违约率越大
2)没有房/车的人比有房/车的人违约率更高
3)未婚的人比已婚的违约率更高
4)收入较低的人违约几率较高
5)工作时间较短的人违约几率较高
2、特征预处理分析
假设客户工作年限较短会带来更高的欠贷违约风险,因此构建新特征DAYS_EMPLOYED_PERCENT = 用户工作年限experience / 客户年龄。
3、建模预测分析
1)利用LightGBM模型进行建模预测,K折交叉验证得到模型在训练集上的AUC得分为 0.926473 ,在验证集上的AUC得分为 0.885765
2)利用随机森林模型进行建模预测,得到的训练集精度为0.9116326530612245,得到的验证集精度为0.8934920634920634
3)利用xgboost模型进行建模预测,得到的训练集精度为0.9096315192743765,得到的验证集精度为0.8965079365079365
4)比较三种模型:LightGBM以AUC评估,随机森林和xgboost以精度(accuracy)评估,两类指标不能直接比较;仅就精度而言,xgboost模型的验证集精度最大,随机森林模型的验证集精度比xgboost模型略小,训练集精度比其略大。