Using customer data, we improve the quality of the bank's credit scoring by predicting the probability that a customer will run into financial distress within the next two years.
a. Credit scoring means applying a scoring model to a bank customer's historical credit data to produce a tiered credit score. From that score, a lender can judge how likely the customer is to repay on time and decide whether to extend credit, and at what limit and interest rate. A lender could reach the same conclusions by manually reviewing the customer's credit history, but credit scoring is faster, more objective, and more consistent.
b. Credit scoring models are often described as "four cards": the A card (Application score card), B card (Behavior score card), C card (Collection score card), and F card (Anti-Fraud score card), applied at the pre-loan, in-loan, and post-loan stages respectively.
c. The scorecard built here is an A card (Application score card), i.e. an application scorecard: applicants are scored before a loan is granted, so as to judge whether the customer is a good risk.
The main development workflow for a credit risk rating model is as follows:
1) Data loading: training set and test set
2) Exploratory data analysis (EDA): variable distributions, medians, means, etc.
3) Data preprocessing: missing values, outliers, feature correlation analysis
4) Feature selection: variable discretization, WOE transformation
5) Model development: logistic regression
6) Model evaluation: K-S statistic, fit curves
7) Credit scoring: good/bad odds, base score, etc., used to build the standard scorecard
8) Score the test set and convert the predictions into the scorecard
Based on the business context, the customer attributes fall into the following groups:
Basic information: age, the borrower's age at the time
Debt information: RevolvingUtilizationOfUnsecuredLines (revolving utilization of unsecured lines), DebtRatio (debt ratio), NumberOfOpenCreditLinesAndLoans (number of open credit lines and loans), NumberRealEstateLoansOrLines (number of real-estate loans or lines)
Repayment capacity: MonthlyIncome, monthly income
Credit history: NumberOfTime30-59DaysPastDueNotWorse, NumberOfTime60-89DaysPastDueNotWorse, NumberOfTimes90DaysLate (number of times 30-59 days past due, 60-89 days past due, and 90 or more days past due within two years)
Family information: NumberOfDependents, the borrower's number of dependents (excluding the borrower)
Target variable: SeriousDlqin2yrs, serious delinquency within the next two years (1 = yes, 0 = no)
Time windows: the independent variables are observed over the past two years; the target is observed over the following two years
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import copy
from scipy import stats
from sklearn.ensemble import RandomForestRegressor
import warnings
warnings.filterwarnings('ignore')
# Common statistics helpers (scipy.stats is already imported above)
from scipy.stats import norm, skew
# Load the training and test sets
train_data = pd.read_csv('cs-training.csv')
test_data = pd.read_csv('cs-test v2.csv')
# Inspect data shape and info
# Training set
print(train_data.shape)
print('-------------------')
print(train_data.info())
print('-------------------')
# Test set
print(test_data.shape)
print('-------------------')
print(test_data.info())
# Distribution of good vs. bad customers
# Create the figure
f,ax = plt.subplots(figsize=(10,5))
sns.countplot(x='SeriousDlqin2yrs',data=train_data)
plt.show()
badnum=train_data['SeriousDlqin2yrs'].sum()
goodnum=train_data['SeriousDlqin2yrs'].count()-train_data['SeriousDlqin2yrs'].sum()
print('In the training set there are %i good customers and %i bad customers; bad customers account for %.2f%%' %(goodnum,badnum,(badnum/train_data['SeriousDlqin2yrs'].count())*100))
# The class labels are highly imbalanced; a class-balancing option will be needed at modelling time (see the sketch below)
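A minimal sketch of that balancing option, assuming the final model is scikit-learn's LogisticRegression (class_weight='balanced' reweights each class inversely to its frequency):
from sklearn.linear_model import LogisticRegression
# 'balanced' weights samples inversely to class frequency, so the rare bad-customer
# class is not drowned out by the good-customer majority
clf_balanced = LogisticRegression(class_weight='balanced')
#clf_balanced.fit(x_train, y_train)  # x_train / y_train are built later in this notebook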
# Check the data
print("Size before processing: {} ".format(train_data.shape))
# Count duplicate rows across all columns
train_data.duplicated().sum()
# Count duplicates on a specific column
train_data.duplicated(subset=['CustomerID']).sum()
# Column name list used for deduplication
cols_df = train_data.columns.values.tolist()
cols_df
# Drop duplicate rows from the training data
train_data.drop_duplicates(subset = cols_df, inplace=True)
# Check the data
print("Size after processing: {} ".format(train_data.shape))
# Inspect missing values
import missingno as msno
msno.matrix(train_data)
train_data.isnull().mean()
## NumberOfDependents has a missing rate of about 0.026, so we simply drop those rows.
# Check the data
print("Dataset size before processing: {} ".format(train_data.shape))
train_data.dropna(subset=['NumberOfDependents'],inplace=True)
# Check the data
print('--------------------------------')
print("Dataset size after processing: {} ".format(train_data.shape))
# MonthlyIncome has a missing rate of about 0.198. Too many records are missing to drop them, so we impute the gaps from the relationships between variables, using a random forest:
# Fill missing values with random-forest predictions
from sklearn.ensemble import RandomForestRegressor
def set_missing(df):
    # Take the numeric columns, with MonthlyIncome first
    process_df = df.iloc[:,[5,0,1,2,3,4,6,7,8,9]] # 5 = MonthlyIncome, 0 = ID, rest = the other numeric fields
    # Split into rows where MonthlyIncome is known and rows where it is missing
    known = process_df[process_df.MonthlyIncome.notnull()].values # known values
    unknown = process_df[process_df.MonthlyIncome.isnull()].values # missing values
    # X: feature values (all rows, every column after the first)
    X = known[:, 1:]
    # y: target values (all rows, first column, i.e. MonthlyIncome)
    y = known[:, 0]
    # Fit a RandomForestRegressor
    rfr = RandomForestRegressor(random_state=0, n_estimators=200,max_depth=3,n_jobs=-1)
    rfr.fit(X,y)
    # Predict the missing values with the fitted model
    predicted = rfr.predict(unknown[:, 1:]).round(0) # pass in the features of the missing rows
    print(predicted)
    # Fill the original missing entries with the predictions
    df.loc[(df.MonthlyIncome.isnull()), 'MonthlyIncome'] = predicted
    # .loc filters to the rows where MonthlyIncome is null and assigns the predicted values
    return df
train_data = set_missing(train_data)
# Custom helper functions used for the univariate exploration
## Quantile-based outlier handling
def IQR_Cleaner(df,atrribute_name,floor,ceiling,method):
    ## df: DataFrame to clean
    ## atrribute_name: name of the column to clean (string)
    ## floor: lower quantile used for cleaning (float)
    ## ceiling: upper quantile used for cleaning (float)
    ## method: 'drop' removes the outliers; 'fill' replaces them with the fence values
    Q1 = df[atrribute_name].quantile(floor)
    Q3 = df[atrribute_name].quantile(ceiling)
    IQR = Q3 - Q1
    down = Q1 - 1.5*IQR
    up = Q3 + 1.5*IQR
    if method == 'drop':
        df = df.loc[(df[atrribute_name]<up)&(df[atrribute_name]>down),:]
    else:
        df.loc[df[atrribute_name]>up,atrribute_name]=up # cap values above the upper fence at up
        df.loc[df[atrribute_name]<down,atrribute_name]=down # floor values below the lower fence at down
    return df
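The helper Distribution_worker(target, bins) is called throughout the exploration below, but its definition appears to have been lost; what follows is a minimal reconstruction under the assumption that it showed a histogram (with the given bin count) alongside a boxplot:
def Distribution_worker(target, bins):
    # Assumed reconstruction: plot one numeric series as a histogram and a boxplot
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    axes[0].hist(target.dropna(), bins=bins)
    axes[0].set_title('histogram')
    sns.boxplot(x=target.dropna(), ax=axes[1])
    axes[1].set_title('boxplot')
    plt.show()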
print(" RevolvingUtilizationOfUnsecuredLines字段大于1的记录占比: {} ".format(outlier_num/119587)) #计算大于1的比例
# Outliers are only about 2.7% of records, a small share, so drop them
train_data = train_data.loc[train_data.RevolvingUtilizationOfUnsecuredLines<=1]
# Distribution after processing
# Variable to inspect
target = train_data.RevolvingUtilizationOfUnsecuredLines
# Distribution of the variable
Distribution_worker(target,30)
train_data.RevolvingUtilizationOfUnsecuredLines.describe()
# Variable to inspect
target = train_data.DebtRatio
# Distribution of the variable
Distribution_worker(target,30)
# Outlier handling
# By business logic, a debt ratio above 1 is an outlier
outlier_num = train_data.loc[train_data.DebtRatio>1].shape[0]
print(" Share of DebtRatio records above 1: {} ".format(outlier_num/119587))
# Outliers are about 25.5% of records, too many to drop, so cap them instead (ceiling method)
train_data = IQR_Cleaner(train_data, 'DebtRatio', 0.25, 0.75, 'fill')
# Distribution after processing
# Variable to inspect
target = train_data.DebtRatio
# Distribution of the variable
Distribution_worker(target,30)
train_data.DebtRatio.describe()
# Variable to inspect
target = train_data.NumberOfOpenCreditLinesAndLoans
# Distribution of the variable
Distribution_worker(target,20)
# The boxplot shows values beyond the upper fence; cap them (ceiling method)
train_data = IQR_Cleaner(train_data, 'NumberOfOpenCreditLinesAndLoans', 0.25, 0.75, 'fill')
# Distribution after processing
# Variable to inspect
target = train_data.NumberOfOpenCreditLinesAndLoans
# Distribution of the variable
Distribution_worker(target,20)
train_data.NumberOfOpenCreditLinesAndLoans.describe()
train_data.NumberOfOpenCreditLinesAndLoans.unique() # discrete variable
# Variable to inspect
target = train_data.NumberRealEstateLoansOrLines
# Distribution of the variable
Distribution_worker(target,5)
# Cap outliers (ceiling method)
train_data = IQR_Cleaner(train_data, 'NumberRealEstateLoansOrLines', 0.25, 0.75, 'fill')
# Distribution after processing
# Variable to inspect
target = train_data.NumberRealEstateLoansOrLines
# Distribution of the variable
Distribution_worker(target,5)
train_data.NumberRealEstateLoansOrLines.describe()
train_data.NumberRealEstateLoansOrLines.unique() # discrete variable
# Variable to inspect
target = train_data.NumberOfDependents
# Distribution of the variable
Distribution_worker(target,5)
# Cap outliers (ceiling method)
train_data = IQR_Cleaner(train_data, 'NumberOfDependents', 0.25, 0.75, 'fill')
# Distribution after processing
# Variable to inspect
target = train_data.NumberOfDependents
# Distribution of the variable
Distribution_worker(target,5)
train_data.NumberOfDependents.describe()
train_data.NumberOfDependents.unique() # discrete variable
## Distribution of the credit-history variables
target = train_data[['NumberOfTime30-59DaysPastDueNotWorse','NumberOfTime60-89DaysPastDueNotWorse','NumberOfTimes90DaysLate']]
sns.boxplot(data =target, width=0.2,orient='h')
# The boxplots show outlier values above 90; dropping them on one variable also drops them on the other two, since they occur on the same rows
train_data=train_data[train_data['NumberOfTime30-59DaysPastDueNotWorse']<90]
target = train_data[['NumberOfTime30-59DaysPastDueNotWorse','NumberOfTime60-89DaysPastDueNotWorse','NumberOfTimes90DaysLate']]
sns.boxplot(data =target, width=0.2,orient='h')
print(train_data['NumberOfTime30-59DaysPastDueNotWorse'].unique())
print(train_data['NumberOfTime60-89DaysPastDueNotWorse'].unique())
print(train_data['NumberOfTimes90DaysLate'].unique())
# Variable to inspect
target = train_data.dropna().MonthlyIncome
# Distribution of the variable
Distribution_worker(target,30)
# Cap outliers (ceiling method)
train_data = IQR_Cleaner(train_data, 'MonthlyIncome', 0.25, 0.75, 'fill')
# Distribution after processing
# Variable to inspect
target = train_data.dropna().MonthlyIncome
# Distribution of the variable
Distribution_worker(target,30)
train_data.dropna().MonthlyIncome.describe()
# Binning
# Continuous variables: define an automatic optimal-binning function
def mono_bin(Y, X, n=10): # X: variable to bin, Y: target variable, n: initial number of bins
    r = 0 # initial value of the Spearman correlation
    badnum=Y.sum() # number of bad samples
    goodnum=Y.count()-badnum # number of good samples
    # Core of the binning: reduce n until the bucket means are perfectly monotonic,
    # letting the data pick the optimal cut points instead of setting them by hand
    while np.abs(r) < 1:
        d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.qcut(X, n,duplicates="drop")}) # quantile binning: split X into n buckets
        d2 = d1.groupby('Bucket', as_index = True) # aggregate by bucket
        r, p = stats.spearmanr(d2.mean().X, d2.mean().Y) # stop when |Spearman| == 1, i.e. fully monotonic
        n = n - 1
    d3 = pd.DataFrame(d2.X.min(), columns = ['min'])
    d3['min']=d2.min().X # left edge of each bucket
    d3['max'] = d2.max().X # right edge of each bucket
    d3['bad'] = d2.sum().Y # number of bad samples in each bucket
    d3['total'] = d2.count().Y # total samples in each bucket
    d3['rate'] = d2.mean().Y
    d3['badattr'] = d3['bad']/badnum # share of all bad samples falling in the bucket
    d3['goodattr'] = (d3['total'] - d3['bad'])/goodnum # share of all good samples falling in the bucket
    d3['woe']=np.log(d3['badattr']/d3['goodattr']) # WOE of each bucket = ln(bad share / good share)
    d3['iv']=(d3['badattr']-d3['goodattr'])*d3['woe'] # per-bucket IV contribution = (bad share - good share) * WOE
    iv = ((d3['badattr']-d3['goodattr'])*d3['woe']).sum() # total IV of the variable
    d4 = (d3.sort_values(by = 'min')).reset_index(drop=True) # sort the buckets by their left edge
    print('Binning result:')
    print(d4)
    print('IV value:')
    print(iv)
    woe=list(d4['woe'].round(3))
    cut=[] # cut holds the bin boundaries
    cut.append(float('-inf')) # prepend -inf
    for i in range(1,n+1): # after the loop, n is the number of interior cut points, giving n+1 bins
        qua=X.quantile(i/(n+1)) # quantile giving the boundary
        cut.append(round(qua,4)) # keep 4 decimal places
    cut.append(float('inf')) # append +inf
    print('Bin boundaries:')
    print(cut)
    return d4,iv,cut,woe
x1_d,x1_iv,x1_cut,x1_woe = mono_bin(train_data['SeriousDlqin2yrs'],train_data.RevolvingUtilizationOfUnsecuredLines,n=10)
x2_d,x2_iv,x2_cut,x2_woe = mono_bin(train_data['SeriousDlqin2yrs'],train_data.age)
x4_d,x4_iv,x4_cut,x4_woe = mono_bin(train_data['SeriousDlqin2yrs'],train_data.DebtRatio)
x5_d,x5_iv,x5_cut,x5_woe = mono_bin(train_data['SeriousDlqin2yrs'],train_data.MonthlyIncome)
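As a quick sanity check on the WOE/IV arithmetic inside mono_bin, here is a tiny worked example with assumed toy counts: one bin holding 20 of 100 bad customers and 10 of 900 good customers.
badattr = 20/100 # share of all bad customers falling in this bin
goodattr = 10/900 # share of all good customers falling in this bin
woe = np.log(badattr/goodattr) # ln(0.2/0.0111) ≈ 2.89
iv_contrib = (badattr - goodattr)*woe # ≈ 0.55, this bin's IV contribution
print(woe, iv_contrib)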
# Discrete variables: manual binning
def self_bin(Y,X,cut):
    badnum=Y.sum() # number of bad customers
    goodnum=Y.count()-badnum # number of good customers
    d1 = pd.DataFrame({"X": X, "Y": Y, "Bucket": pd.cut(X, cut)}) # X: feature, Y: good/bad label, Bucket: assigned bin
    d2 = d1.groupby('Bucket', as_index = True) # aggregate by bucket
    d3 = pd.DataFrame(d2.X.min(), columns = ['min']) # initialise the 'min' column
    d3['min']=d2.min().X
    d3['max'] = d2.max().X
    d3['bad'] = d2.sum().Y
    d3['total'] = d2.count().Y
    d3['rate'] = d2.mean().Y
    d3['badattr'] = d3['bad']/badnum # share of all bad samples falling in the bucket
    d3['goodattr'] = (d3['total'] - d3['bad'])/goodnum # share of all good samples falling in the bucket
    d3['woe']=np.log(d3['badattr']/d3['goodattr']) # WOE of each bucket = ln(bad share / good share)
    d3['iv']=(d3['badattr']-d3['goodattr'])*d3['woe'] # per-bucket IV contribution
    iv = ((d3['badattr']-d3['goodattr'])*d3['woe']).sum() # total IV of the variable
    d4 = (d3.sort_values(by = 'min')).reset_index(drop=True) # sort the buckets by their left edge
    woe=list(d4['woe'].round(3))
    return d4,iv,woe
# Inspect how the discrete fields break into levels
# Percentile ranges
def a_range(a_rate):
    # Percentiles of the data from 0% to 100% in steps of 10%
    percentiles = list(range(0, 101, 10))
    rate_name = ['rate_{}'.format(pct) for pct in percentiles]
    rate_value = [np.percentile(a_rate, pct) for pct in percentiles]
    range_rate = pd.DataFrame({"range": rate_name, "range_value": rate_value})
    return range_rate
a_range(train_data["NumberOfTime30-59DaysPastDueNotWorse"].dropna().values)
ninf = float('-inf') # negative infinity
pinf = float('inf') # positive infinity
cutx3 = [ninf, 0, 1, pinf]
dfx3,ivx3,woex3 = self_bin(train_data.SeriousDlqin2yrs,train_data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3)
dfx3
ivx3
a_range(train_data["NumberOfOpenCreditLinesAndLoans"].dropna().values)
cutx6 = [ninf, 4, 7, 9,12,pinf]
dfx6,ivx6 ,woex6= self_bin(train_data.SeriousDlqin2yrs, train_data['NumberOfOpenCreditLinesAndLoans'], cutx6)
dfx6
ivx6
a_range(train_data["NumberOfTimes90DaysLate"].dropna().values)
cutx7 = [ninf, 0,1, pinf]
dfx7,ivx7,woex7 = self_bin(train_data.SeriousDlqin2yrs, train_data['NumberOfTimes90DaysLate'], cutx7)
dfx7
ivx7
a_range(train_data["NumberRealEstateLoansOrLines"].dropna().values)
cutx8 = [ninf, 0,1,2, 3, pinf]
dfx8, ivx8,woex8 = self_bin(train_data.SeriousDlqin2yrs, train_data['NumberRealEstateLoansOrLines'], cutx8)
dfx8
ivx8
a_range(train_data["NumberOfTime60-89DaysPastDueNotWorse"].dropna().values)
cutx9 = [ninf, 0, 1, pinf]
dfx9, ivx9,woex9 = self_bin(train_data.SeriousDlqin2yrs, train_data['NumberOfTime60-89DaysPastDueNotWorse'], cutx9)
dfx9
ivx9
a_range(train_data["NumberOfDependents"].dropna().values)
cutx10 = [ninf, 0, 1, 2, pinf]
dfx10,ivx10,woex10 = self_bin(train_data.SeriousDlqin2yrs, train_data['NumberOfDependents'], cutx10)
dfx10
ivx10
corr = train_data.corr() # correlation matrix of the variables
plt.subplots(figsize=(12, 12))
sns.heatmap(corr, annot=True, vmax=1, square=True, cmap='Blues')
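To pull out the strongest pairwise correlation as a number (a sketch; note that max(corr) on a DataFrame only returns the largest column label, not a correlation):
# Mask the diagonal, then locate the strongest absolute off-diagonal correlation
corr_offdiag = corr.where(~np.eye(len(corr), dtype=bool))
print(corr_offdiag.abs().max().max()) # value of the strongest pairwise correlation
print(corr_offdiag.abs().stack().idxmax()) # the pair of variables involved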
# Variable screening by IV
# Rules of thumb for judging predictive power by IV: below 0.02, unpredictive; 0.02 to 0.1, weak; 0.1 to 0.3, medium; 0.3 to 0.5, strong (see the helper sketched below)
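A small helper, sketched here as an assumption rather than taken from the original analysis, that maps an IV value onto those labels:
def iv_strength(iv):
    # Rule-of-thumb interpretation of Information Value (thresholds above)
    if iv < 0.02:
        return 'unpredictive'
    elif iv < 0.1:
        return 'weak'
    elif iv < 0.3:
        return 'medium'
    else:
        return 'strong' # 0.3 to 0.5 by the rule; even larger values deserve extra scrutiny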
ivlist=[x1_iv,x2_iv,ivx3,x4_iv,x5_iv,ivx6,ivx7,ivx8,ivx9,ivx10] # IV of each variable
index=['x1','x2','x3','x4','x5','x6','x7','x8','x9','x10'] # x-axis labels
fig1 = plt.figure(1,figsize=(8,5))
ax1 = fig1.add_subplot(1, 1, 1)
x = np.arange(len(index))+1
ax1.bar(x,ivlist,width=.4) # draw the bar chart
ax1.set_xticks(x)
ax1.set_xticklabels(index, rotation=0, fontsize=15)
ax1.set_ylabel('IV', fontsize=16) # IV (Information Value)
# Add value labels above the bars
for a, b in zip(x, ivlist):
plt.text(a, b + 0.01, '%.4f' % b, ha='center', va='bottom', fontsize=12)
plt.show()
# WOE encoding: replace each value with the WOE of its bin
def trans_woe(var,var_name,woe,cut):
    woe_name=var_name+'_woe'
    for i in range(len(woe)): # len(woe) is the number of bins
        if i==0:
            var.loc[(var[var_name]<=cut[i+1]),woe_name]=woe[i] # first bin: everything up to the first interior cut point
        elif (i>0) and (i<=len(woe)-2):
            var.loc[((var[var_name]>cut[i])&(var[var_name]<=cut[i+1])),woe_name]=woe[i] # middle bins
        else:
            var.loc[(var[var_name]>cut[len(woe)-1]),woe_name]=woe[len(woe)-1] # last bin: above the last finite cut point (upper bound is +inf)
    return var
x1_name='RevolvingUtilizationOfUnsecuredLines'
x2_name='age'
x3_name='NumberOfTime30-59DaysPastDueNotWorse'
x7_name='NumberOfTimes90DaysLate'
x9_name='NumberOfTime60-89DaysPastDueNotWorse'
train_data=trans_woe(train_data,x1_name,x1_woe,x1_cut)
train_data=trans_woe(train_data,x2_name,x2_woe,x2_cut)
train_data=trans_woe(train_data,x3_name,woex3,cutx3)
train_data=trans_woe(train_data,x7_name,woex7,cutx7)
train_data=trans_woe(train_data,x9_name,woex9,cutx9)
Y=train_data['SeriousDlqin2yrs'] # target variable
# Independent variables: drop those with little effect on the target
X=train_data.drop(['SeriousDlqin2yrs','DebtRatio','MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents'],axis=1)
X=train_data.iloc[:,-5:] # keep just the five WOE columns added above
X.head()
# Fit a logistic regression with the statsmodels package; its coefficients feed the standard scorecard later
import statsmodels.api as sm
X1=sm.add_constant(X)
logit=sm.Logit(Y,X1)
result=logit.fit()
print(result)
print(result.summary())
# Evaluate the fitted model on the training data
# train_data already carries the five *_woe columns from the transformation above
# Build the evaluation features and labels (taken from the training data itself, so this is an in-sample evaluation)
test_X=train_data.iloc[:,-5:] # evaluation features
test_Y=train_data['SeriousDlqin2yrs'] # evaluation labels
# Evaluation
from sklearn import metrics
X3=sm.add_constant(test_X)
resu = result.predict(X3) # predict
fpr,tpr,threshold=metrics.roc_curve(test_Y,resu) # compute the ROC curve
rocauc=metrics.auc(fpr,tpr) # compute the AUC
plt.figure(figsize=(8,5)) # figure size must be set before plotting
plt.plot(fpr,tpr,'b',label='AUC=%0.2f'% rocauc)
plt.legend(loc='lower right',fontsize=14)
plt.plot([0.0, 1.0], [0.0, 1.0], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylabel('TPR (true positive rate)',fontsize=16)
plt.xlabel('FPR (false positive rate)',fontsize=16)
plt.show()
p=20/np.log(2) # scale factor: PDO of 20 points per doubling of the odds
q=600-20*np.log(20)/np.log(2) # offset: anchors a score of 600 at odds of 20:1
x_coe=[-2.7340,0.6526,0.5201,0.5581,0.5943,0.4329] # regression coefficients, hard-coded from the statsmodels summary above
baseScore=round(q+p*x_coe[0],0)
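The scaling above follows the usual scorecard convention Score = offset + factor * ln(odds); reading the code, the factor corresponds to a PDO (points to double the odds) of 20, and the offset pins a score of 600 at odds of 20:1. A quick check under that reading:
factor = 20/np.log(2) # PDO / ln(2): points added per doubling of the odds
offset = 600 - factor*np.log(20) # anchors a score of 600 at odds of 20:1
print(factor, offset) # matches p and q above
print(offset + factor*np.log(20)) # odds 20:1 -> 600
print(offset + factor*np.log(40)) # odds 40:1 -> 620 = 600 + PDO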
# Total score = base score + per-variable scores
def get_score(coe,woe,factor):
    scores=[]
    for w in woe:
        score=round(coe*w*factor,0)
        scores.append(score)
    return scores
# Score contribution of each variable
x1_score=get_score(x_coe[1],x1_woe,p)
x2_score=get_score(x_coe[2],x2_woe,p)
x3_score=get_score(x_coe[3],woex3,p)
x7_score=get_score(x_coe[4],woex7,p)
x9_score=get_score(x_coe[5],woex9,p)
def compute_score(series,cut,score):
    scores = [] # renamed from 'list' to avoid shadowing the built-in
    i = 0
    while i < len(series):
        value = series.iloc[i]
        j = len(cut) - 2
        m = len(cut) - 2
        # Walk the cut points from the right until the bin containing value is found
        while j >= 0:
            if value >= cut[j]:
                j = -1
            else:
                j -= 1
                m -= 1
        scores.append(score[m])
        i += 1
    return scores
train_data['BaseScore']=np.zeros(len(train_data))+baseScore
train_data['x1'] =compute_score(train_data['RevolvingUtilizationOfUnsecuredLines'], x1_cut, x1_score)
train_data['x2'] = compute_score(train_data['age'], x2_cut, x2_score)
train_data['x3'] = compute_score(train_data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, x3_score)
train_data['x7'] = compute_score(train_data['NumberOfTimes90DaysLate'], cutx7, x7_score)
train_data['x9'] = compute_score(train_data['NumberOfTime60-89DaysPastDueNotWorse'],cutx9,x9_score)
train_data['Score'] = train_data['x1'] + train_data['x2'] + train_data['x3'] + train_data['x7'] +train_data['x9'] + baseScore
scoretable1=train_data.iloc[:,[0,-7,-6,-5,-4,-3,-2,-1]] # keep the ID column and the score columns
scoretable1.head()
colNameDict={'x1': 'RevolvingUtilizationOfUnsecuredLines' ,'x2':'age','x3':'NumberOfTime30-59DaysPastDueNotWorse',
'x7':'NumberOfTimes90DaysLate', 'x9':'NumberOfTime60-89DaysPastDueNotWorse'}
scoretable1=scoretable1.rename(columns=colNameDict,inplace=False)
scoretable1.to_excel(r'training_scorecard.xlsx')
# Logistic regression: split the training data into train and test subsets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Independent variables: drop those with little effect on the target
train_data=train_data.drop(['DebtRatio','MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents'],axis=1)
training,testing=train_test_split(train_data,test_size=0.25,random_state=1)
#Y=train_data['SeriousDlqin2yrs'] # target variable
#X=train_data.iloc[:,-5:]
# Build features and labels for the train and test splits
woe_cols=[x1_name+'_woe',x2_name+'_woe',x3_name+'_woe',x7_name+'_woe',x9_name+'_woe']
x_train=training[woe_cols] # select the WOE columns by name; iloc[:,-5:] would now pick up the score columns appended above
y_train=training['SeriousDlqin2yrs']
x_test=testing[woe_cols]
y_test=testing['SeriousDlqin2yrs']
clf = LogisticRegression()
clf.fit(x_train,y_train)
# Predict on the test split
score_proba = clf.predict_proba(x_test)
y_predproba=score_proba[:,1]
coe = clf.coef_
print(coe)
# Evaluate the model
from sklearn.metrics import roc_curve,auc
fpr,tpr,threshold = roc_curve(y_test,y_predproba)
auc_score=auc(fpr,tpr)
plt.figure(figsize=(8,5)) # figure size must be set before plotting
plt.plot(fpr,tpr,'b',label='AUC=%0.2f'% auc_score)
plt.legend(loc='lower right',fontsize=14)
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.ylabel('TPR (true positive rate)',fontsize=16)
plt.xlabel('FPR (false positive rate)',fontsize=16)
plt.show()
KS statistic: measures the model's ability to separate good and bad customers, computed as the maximum gap between the cumulative bad-customer and cumulative good-customer percentages. KS ranges from 0% to 100%, with the following rules of thumb:
KS < 20%: poor
KS 20%-40%: fair
KS 41%-50%: good
KS 51%-75%: very good
KS > 75%: suspiciously high; validate the model carefully
KS = max(tpr-fpr)
fig,ax = plt.subplots(figsize=(10,6)) # set the figure size here rather than in a separate, empty figure
ax.plot(1-threshold,tpr,label='tpr')
ax.plot(1-threshold,fpr,label='fpr')
ax.plot(1-threshold,tpr-fpr,label='KS')
plt.xlabel('score')
plt.title('KS curve')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
legend=ax.legend(loc='upper left')
plt.show()
print("KS",KS)
# Score the test set
# Transform the test set to WOE values
test_data=trans_woe(test_data,x1_name,x1_woe,x1_cut)
test_data=trans_woe(test_data,x2_name,x2_woe,x2_cut)
test_data=trans_woe(test_data,x3_name,woex3,cutx3)
test_data=trans_woe(test_data,x7_name,woex7,cutx7)
test_data=trans_woe(test_data,x9_name,woex9,cutx9)
# Independent variables: drop those with little effect on the target
test_data=test_data.drop(['DebtRatio','MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans','NumberRealEstateLoansOrLines','NumberOfDependents'],axis=1)
X_test=test_data[woe_cols] # select the WOE columns by name
Y_test=test_data['SeriousDlqin2yrs'] # target variable
X_train=training[woe_cols]
Y_train=training['SeriousDlqin2yrs']
clf = LogisticRegression()
clf.fit(X_train,Y_train)
# Predict on the test set
score_proba = clf.predict_proba(X_test)
#print(score_proba)
test_data['y_predproba']=score_proba[:,1]
#y_predproba = score_proba[:,1]
coe = clf.coef_
print('-------------------')
print(coe)
# p, q, x_coe, baseScore, get_score and compute_score are reused from the training-set scoring above
test_data['BaseScore']=np.zeros(len(test_data))+baseScore
test_data['x1'] =compute_score(test_data['RevolvingUtilizationOfUnsecuredLines'], x1_cut, x1_score)
test_data['x2'] = compute_score(test_data['age'], x2_cut, x2_score)
test_data['x3'] = compute_score(test_data['NumberOfTime30-59DaysPastDueNotWorse'], cutx3, x3_score)
test_data['x7'] = compute_score(test_data['NumberOfTimes90DaysLate'], cutx7, x7_score)
test_data['x9'] = compute_score(test_data['NumberOfTime60-89DaysPastDueNotWorse'],cutx9,x9_score)
test_data['Score'] = test_data['x1'] + test_data['x2'] + test_data['x3'] + test_data['x7'] +test_data['x9'] + baseScore
scoretable2=test_data.iloc[:,[0,-8,-7,-6,-5,-4,-3,-2,-1]] # keep the ID column and the score columns
print(scoretable2.head())
colNameDict={'x1': 'RevolvingUtilizationOfUnsecuredLines' ,'x2':'age','x3':'NumberOfTime30-59DaysPastDueNotWorse',
'x7':'NumberOfTimes90DaysLate', 'x9':'NumberOfTime60-89DaysPastDueNotWorse'}
scoretable2=scoretable2.rename(columns=colNameDict,inplace=False)
scoretable2.to_excel(r'test_scorecard.xlsx')
# After exporting, SeriousDlqin2yrs can be filled with 0 where y_predproba < 0.5 and 1 otherwise, as sketched below
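A minimal sketch of that fill step, if done in pandas before exporting rather than in the spreadsheet:
# Label a customer bad (1) when the predicted probability of distress is at least 0.5, else good (0)
test_data['SeriousDlqin2yrs'] = (test_data['y_predproba'] >= 0.5).astype(int)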