Python wrappers for the methods most commonly used in credit-scorecard development
import math

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics


# EDA
# Distribution of categorical variables
def plot_cate_var(df, col_list, hspace=0.4, wspace=0.4, plt_size=None, plt_num=None, x=None, y=None):
    """
    df: dataset
    col_list: list of variable names
    hspace: vertical gap between subplots
    wspace: horizontal gap between subplots
    plt_size: figure size
    plt_num: number of subplots
    x: number of rows in the subplot grid
    y: number of columns in the subplot grid
    return: distribution plot for each variable (bar chart)
    """
    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']  # only needed when labels contain CJK characters
    plt.rcParams['axes.unicode_minus'] = False
    for i, col in zip(range(1, plt_num + 1, 1), col_list):
        plt.subplot(x, y, i)
        plt.title(col)
        sns.countplot(data=df, y=col)
        plt.ylabel('')
    return plt.show()


# Distribution of numeric variables
def plot_num_col(df, col_list, hspace=0.4, wspace=0.4, plt_type=None, plt_size=None, plt_num=None, x=None, y=None):
    """
    df: dataset
    col_list: list of variable names
    hspace: vertical gap between subplots
    wspace: horizontal gap between subplots
    plt_type: 'hist' for histograms, 'box' for box plots
    plt_size: figure size
    plt_num: number of subplots
    x: number of rows in the subplot grid
    y: number of columns in the subplot grid
    return: distribution plot for each variable (histogram/box plot)
    """
    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    if plt_type == 'hist':
        for i, col in zip(range(1, plt_num + 1, 1), col_list):
            plt.subplot(x, y, i)
            plt.title(col)
            sns.histplot(df[col].dropna(), kde=True)  # histplot replaces the deprecated sns.distplot
            plt.xlabel('')
    if plt_type == 'box':
        for i, col in zip(range(1, plt_num + 1, 1), col_list):
            plt.subplot(x, y, i)
            plt.title(col)
            sns.boxplot(data=df, x=col)
            plt.xlabel('')
    return plt.show()


# Default-rate analysis for categorical variables
def plot_default_cate(df, col_list, target, hspace=0.4, wspace=0.4, plt_size=None, plt_num=None, x=None, y=None):
    """
    df: dataset
    col_list: list of variable names
    target: name of the target column
    hspace: vertical gap between subplots
    wspace: horizontal gap between subplots
    plt_size: figure size
    plt_num: number of subplots
    x: number of rows in the subplot grid
    y: number of columns in the subplot grid
    return: default-rate plot per category (bar chart)
    """
    all_bad = df[target].sum()
    total = df[target].count()
    all_default_rate = all_bad * 1.0 / total

    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
    for i, col in zip(range(1, plt_num + 1, 1), col_list):
        d1 = df.groupby(col)
        d2 = pd.DataFrame()
        d2['total'] = d1[target].count()
        d2['bad'] = d1[target].sum()
        d2['default_rate'] = d2['bad'] / d2['total']
        d2 = d2.reset_index()
        plt.subplot(x, y, i)
        plt.title(col)
        plt.axvline(x=all_default_rate)  # reference line: overall default rate
        sns.barplot(data=d2, y=col, x='default_rate')
        plt.ylabel('')
    return plt.show()


# Default-rate analysis for numeric variables
def plot_default_num(df, col_list, target, hspace=0.4, wspace=0.4, q=None, plt_size=None, plt_num=None, x=None, y=None):
    """
    df: dataset
    col_list: list of variable names
    target: name of the target column
    hspace: vertical gap between subplots
    wspace: horizontal gap between subplots
    q: number of equal-frequency bins
    plt_size: figure size
    plt_num: number of subplots
    x: number of rows in the subplot grid
    y: number of columns in the subplot grid
    return: default-rate plot per bin (line chart)
    """
    all_bad = df[target].sum()
    total = df[target].count()
    all_default_rate = all_bad * 1.0 / total

    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    for i, col in zip(range(1, plt_num + 1, 1), col_list):
        bucket = pd.qcut(df[col], q=q, duplicates='drop')
        d1 = df.groupby(bucket)
        d2 = pd.DataFrame()
        d2['total'] = d1[target].count()
        d2['bad'] = d1[target].sum()
        d2['default_rate'] = d2['bad'] / d2['total']
        d2 = d2.reset_index()
        plt.subplot(x, y, i)
        plt.title(col)
        plt.axhline(y=all_default_rate)  # reference line: overall default rate
        sns.pointplot(data=d2, x=col, y='default_rate', color='hotpink')
        plt.xticks(rotation=60)
        plt.xlabel('')
    return plt.show()
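# Usage sketch for the EDA helpers (illustrative only -- `data` and `label` are
# hypothetical names for a modelling DataFrame and its binary target column):
#   cate_cols = ['grade', 'home_ownership']
#   num_cols = ['annual_inc', 'dti']
#   plot_cate_var(data, cate_cols, plt_size=(10, 4), plt_num=2, x=1, y=2)
#   plot_num_col(data, num_cols, plt_type='hist', plt_size=(10, 4), plt_num=2, x=1, y=2)
#   plot_default_cate(data, cate_cols, 'label', plt_size=(10, 4), plt_num=2, x=1, y=2)
#   plot_default_num(data, num_cols, 'label', q=10, plt_size=(10, 4), plt_num=2, x=1, y=2)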
# WOE discretisation of variables
# WOE result table
def woe_df_concat(bin_df):
    """
    bin_df: list holding the binning result of each variable
    return: WOE result table
    """
    woe_df_list = []
    for df in bin_df:
        woe_df = df.reset_index().assign(col=df.index.name).rename(columns={df.index.name: 'bin'})
        woe_df_list.append(woe_df)
    woe_result = pd.concat(woe_df_list, axis=0)
    # move the variable-name column to the front for readability
    woe_result1 = woe_result['col']
    woe_result2 = woe_result.iloc[:, :-1]
    woe_result_df = pd.concat([woe_result1, woe_result2], axis=1)
    woe_result_df = woe_result_df.reset_index(drop=True)
    return woe_result_df


# WOE transformation
def woe_transform(df, target, df_woe):
    """
    df: dataset
    target: name of the target column
    df_woe: WOE result table
    return: WOE-transformed dataset
    """
    df2 = df.copy()
    for col in df2.drop([target], axis=1).columns:
        x = df2[col]
        bin_map = df_woe[df_woe.col == col]
        bin_res = np.array([0] * x.shape[0], dtype=float)
        for i in bin_map.index:
            lower = bin_map['min_bin'][i]
            upper = bin_map['max_bin'][i]
            if lower == upper:
                # positional indexing: assumes df has a default RangeIndex
                x1 = x[np.where(x == lower)[0]]
            else:
                x1 = x[np.where((x >= lower) & (x <= upper))[0]]
            mask = np.isin(x, x1)  # np.isin replaces the deprecated np.in1d
            bin_res[mask] = bin_map['woe'][i]
        bin_res = pd.Series(bin_res, index=x.index)
        bin_res.name = x.name
        df2[col] = bin_res
    return df2


# Variable binning
# Binning of categorical variables
def binning_cate(df, col_list, target):
    """
    df: dataset
    col_list: list of variable names
    target: name of the target column
    return:
    bin_df: list holding the binning result of each variable
    iv_value: list holding the IV of each variable
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad
    all_odds = good * 1.0 / bad
    bin_df = []
    iv_value = []
    for col in col_list:
        d1 = df.groupby([col], as_index=True)
        d2 = pd.DataFrame()
        d2['min_bin'] = d1[col].min()
        d2['max_bin'] = d1[col].max()
        d2['total'] = d1[target].count()
        d2['totalrate'] = d2['total'] / total
        d2['bad'] = d1[target].sum()
        d2['badrate'] = d2['bad'] / d2['total']
        d2['good'] = d2['total'] - d2['bad']
        d2['goodrate'] = d2['good'] / d2['total']
        d2['badattr'] = d2['bad'] / bad
        d2['goodattr'] = (d2['total'] - d2['bad']) / good
        d2['odds'] = d2['good'] / d2['bad']
        GB_list = []
        for i in d2.odds:
            if i >= all_odds:
                GB_index = str(round((i / all_odds) * 100, 0)) + 'G'
            else:
                GB_index = str(round((all_odds / i) * 100, 0)) + 'B'
            GB_list.append(GB_index)
        d2['GB_index'] = GB_list
        d2['woe'] = np.log(d2['badattr'] / d2['goodattr'])
        d2['bin_iv'] = (d2['badattr'] - d2['goodattr']) * d2['woe']
        d2['IV'] = d2['bin_iv'].sum()
        iv = d2['bin_iv'].sum().round(3)
        print('variable: {}'.format(col))
        print('IV: {}'.format(iv))
        print('\t')
        bin_df.append(d2)
        iv_value.append(iv)
    return bin_df, iv_value


# IV detail table for categorical variables
def iv_cate(df, col_list, target):
    """
    df: dataset
    col_list: list of variable names
    target: name of the target column
    return: IV detail table
    """
    bin_df, iv_value = binning_cate(df, col_list, target)
    iv_df = pd.DataFrame({'col': col_list, 'iv': iv_value})
    iv_df = iv_df.sort_values('iv', ascending=False)
    return iv_df
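# Usage sketch (hypothetical `data`/`label`): bin the categorical variables,
# stack the per-variable results into one WOE table, then WOE-encode a copy of
# the data (woe_transform assumes a default RangeIndex on `df`):
#   cate_bin_df, cate_iv = binning_cate(data, ['grade', 'purpose'], 'label')
#   iv_table = iv_cate(data, ['grade', 'purpose'], 'label')
#   woe_table = woe_df_concat(cate_bin_df)
#   data_woe = woe_transform(data[['grade', 'purpose', 'label']], 'label', woe_table)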
# Binning of numeric variables
# First use chi-square binning to produce the variable's split points
def split_data(df, col, split_num):
    """
    df: original dataset
    col: the variable to bin
    split_num: number of split points
    return: list of split values
    """
    df2 = df.copy()
    count = df2.shape[0]  # total sample count
    n = math.floor(count / split_num)  # samples per group after an equal split
    split_index = [i * n for i in range(1, split_num)]  # indices of the split points
    values = sorted(list(df2[col]))  # sort the variable's values ascending
    split_value = [values[i] for i in split_index]  # values at the split points
    split_value = sorted(list(set(split_value)))  # deduplicate and sort the split values
    return split_value


def assign_group(x, split_bin):
    """
    x: a value of the variable
    split_bin: the split-point list produced by split_data
    """
    n = len(split_bin)
    if x <= min(split_bin):
        return min(split_bin)  # below the smallest split point: map to the smallest split point
    elif x > max(split_bin):
        return 10e10  # above the largest split point: map to a large sentinel value
    else:
        for i in range(n - 1):
            if split_bin[i] < x <= split_bin[i + 1]:
                return split_bin[i + 1]  # between two split points: map to the larger one


def bin_bad_rate(df, col, target, grantRateIndicator=0):
    """
    df: original dataset
    col: the original variable, or the column the variable was mapped to
    target: name of the target column
    grantRateIndicator: whether to also return the overall default rate
    """
    total = df.groupby([col])[target].count()
    bad = df.groupby([col])[target].sum()
    total_df = pd.DataFrame({'total': total})
    bad_df = pd.DataFrame({'bad': bad})
    regroup = pd.merge(total_df, bad_df, left_index=True, right_index=True, how='left')
    regroup = regroup.reset_index()
    regroup['bad_rate'] = regroup['bad'] / regroup['total']  # default rate of each group
    dict_bad = dict(zip(regroup[col], regroup['bad_rate']))  # as a dict
    if grantRateIndicator == 0:
        return (dict_bad, regroup)
    total_all = df.shape[0]
    bad_all = df[target].sum()
    all_bad_rate = bad_all / total_all  # overall default rate
    return (dict_bad, regroup, all_bad_rate)


def cal_chi2(df, all_bad_rate):
    """
    df: the regroup frame produced by bin_bad_rate
    all_bad_rate: the overall default rate produced by bin_bad_rate
    """
    df2 = df.copy()
    df2['expected'] = df2['total'] * all_bad_rate  # expected number of bads per group
    combined = zip(df2['expected'], df2['bad'])  # expected and observed bads per group
    chi = [(i[0] - i[1]) ** 2 / i[0] for i in combined]  # chi-square contribution per group
    chi2 = sum(chi)  # total chi-square value
    return chi2


def assign_bin(x, cutoffpoints):
    """
    x: a value of the variable
    cutoffpoints: the bin cutoffs
    """
    bin_num = len(cutoffpoints) + 1  # number of bins
    if x <= cutoffpoints[0]:  # below the smallest cutoff: Bin 0
        return 'Bin 0'
    elif x > cutoffpoints[-1]:  # above the largest cutoff: Bin (bin_num-1)
        return 'Bin {}'.format(bin_num - 1)
    else:
        for i in range(0, bin_num - 1):
            if cutoffpoints[i] < x <= cutoffpoints[i + 1]:  # between two cutoffs: Bin (i+1)
                return 'Bin {}'.format(i + 1)
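# Worked example of the chi-square statistic (toy numbers): with an overall bad
# rate of 0.2, a group of 100 with 30 bads is expected to hold 20 bads and
# contributes (20 - 30)^2 / 20 = 5.0 to the statistic:
#   toy_regroup = pd.DataFrame({'total': [100, 100], 'bad': [30, 10]})
#   cal_chi2(toy_regroup, all_bad_rate=0.2)   # 5.0 + 5.0 = 10.0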
def ChiMerge(df, col, target, max_bin=5, min_binpct=0):
    """
    df: dataset
    col: the variable to bin
    target: name of the target column
    max_bin: maximum number of bins
    min_binpct: minimum share of the sample a bin must hold
    return: list of interior cutoff points
    """
    col_unique = sorted(list(set(df[col])))  # unique values, sorted
    n = len(col_unique)  # number of unique values
    df2 = df.copy()
    if n > 100:
        # more than 100 unique values: map them onto at most 100 split values
        # via split_data and assign_group
        split_col = split_data(df2, col, 100)
        df2['col_map'] = df2[col].map(lambda x: assign_group(x, split_col))
    else:
        df2['col_map'] = df2[col]  # 100 or fewer unique values: no mapping needed
    # produce the (dict_bad, regroup, all_bad_rate) tuple
    (dict_bad, regroup, all_bad_rate) = bin_bad_rate(df2, 'col_map', target, grantRateIndicator=1)
    col_map_unique = sorted(list(set(df2['col_map'])))  # deduplicate and sort the mapped values
    group_interval = [[i] for i in col_map_unique]  # one list per mapped value

    # merge the adjacent pair with the smallest chi-square value until
    # no more than max_bin groups remain
    while (len(group_interval) > max_bin):
        chi_list = []
        for i in range(len(group_interval) - 1):
            temp_group = group_interval[i] + group_interval[i + 1]  # candidate merged interval, e.g. [1, 3]
            chi_df = regroup[regroup['col_map'].isin(temp_group)]
            chi_value = cal_chi2(chi_df, all_bad_rate)  # chi-square of each adjacent pair
            chi_list.append(chi_value)
        best_combined = chi_list.index(min(chi_list))  # index of the smallest chi-square
        # merge the pair with the smallest chi-square value
        group_interval[best_combined] = group_interval[best_combined] + group_interval[best_combined + 1]
        # drop the right half of the merged pair
        group_interval.remove(group_interval[best_combined + 1])
    # sort each interval
    group_interval = [sorted(i) for i in group_interval]
    # the cutoff point is the maximum of each interval (except the last)
    cutoffpoints = [max(i) for i in group_interval[:-1]]

    # check for bins that hold only good or only bad samples
    df2['col_map_bin'] = df2['col_map'].apply(lambda x: assign_bin(x, cutoffpoints))  # map col_map to its bin
    # default rate per bin
    (dict_bad, regroup) = bin_bad_rate(df2, 'col_map_bin', target)
    # smallest and largest default rates
    [min_bad_rate, max_bad_rate] = [min(dict_bad.values()), max(dict_bad.values())]
    # a rate of 0 means a bin holds only goods; a rate of 1 means only bads
    while min_bad_rate == 0 or max_bad_rate == 1:
        bad01_index = regroup[regroup['bad_rate'].isin([0, 1])].col_map_bin.tolist()  # bins with rate 0 or 1
        bad01_bin = bad01_index[0]
        if bad01_bin == max(regroup.col_map_bin):
            cutoffpoints = cutoffpoints[:-1]  # bad01_bin is the last bin: drop the largest cutoff
        elif bad01_bin == min(regroup.col_map_bin):
            cutoffpoints = cutoffpoints[1:]  # bad01_bin is the first bin: drop the smallest cutoff
        else:
            bad01_bin_index = list(regroup.col_map_bin).index(bad01_bin)  # position of bad01_bin
            prev_bin = list(regroup.col_map_bin)[bad01_bin_index - 1]  # bin before bad01_bin
            df3 = df2[df2.col_map_bin.isin([prev_bin, bad01_bin])]
            (dict_bad, regroup1) = bin_bad_rate(df3, 'col_map_bin', target)
            chi1 = cal_chi2(regroup1, all_bad_rate)  # chi-square of bad01_bin with the previous bin
            later_bin = list(regroup.col_map_bin)[bad01_bin_index + 1]  # bin after bad01_bin
            df4 = df2[df2.col_map_bin.isin([later_bin, bad01_bin])]
            (dict_bad, regroup2) = bin_bad_rate(df4, 'col_map_bin', target)
            chi2 = cal_chi2(regroup2, all_bad_rate)  # chi-square of bad01_bin with the next bin
            if chi1 < chi2:
                # merging with the previous bin gives the smaller chi-square
                cutoffpoints.remove(cutoffpoints[bad01_bin_index - 1])
            else:
                # merging with the next bin gives the smaller chi-square
                cutoffpoints.remove(cutoffpoints[bad01_bin_index])
        # re-map col_map to the bins and recompute the extreme default rates;
        # the loop stops once no bin has a rate of 0 or 1
        df2['col_map_bin'] = df2['col_map'].apply(lambda x: assign_bin(x, cutoffpoints))
        (dict_bad, regroup) = bin_bad_rate(df2, 'col_map_bin', target)
        [min_bad_rate, max_bad_rate] = [min(dict_bad.values()), max(dict_bad.values())]

    # check the minimum bin share after binning
    if min_binpct > 0:
        group_values = df2['col_map'].apply(lambda x: assign_bin(x, cutoffpoints))
        df2['col_map_bin'] = group_values  # map col_map to its bin
        group_df = group_values.value_counts().to_frame('total')
        group_df['bin_pct'] = group_df['total'] / df2.shape[0]  # share of the sample per bin
        min_pct = group_df.bin_pct.min()  # smallest bin share
        # while the smallest share is below min_binpct and more than 2 cutoffs remain
        while min_pct < min_binpct and len(cutoffpoints) > 2:
            # the logic mirrors the only-good/only-bad check above
            min_pct_index = group_df[group_df.bin_pct == min_pct].index.tolist()
            min_pct_bin = min_pct_index[0]
            if min_pct_bin == max(group_df.index):
                cutoffpoints = cutoffpoints[:-1]
            elif min_pct_bin == min(group_df.index):
                cutoffpoints = cutoffpoints[1:]
            else:
                minpct_bin_index = list(group_df.index).index(min_pct_bin)
                prev_pct_bin = list(group_df.index)[minpct_bin_index - 1]
                df5 = df2[df2['col_map_bin'].isin([min_pct_bin, prev_pct_bin])]
                (dict_bad, regroup3) = bin_bad_rate(df5, 'col_map_bin', target)
                chi3 = cal_chi2(regroup3, all_bad_rate)
                later_pct_bin = list(group_df.index)[minpct_bin_index + 1]
                df6 = df2[df2['col_map_bin'].isin([min_pct_bin, later_pct_bin])]
                (dict_bad, regroup4) = bin_bad_rate(df6, 'col_map_bin', target)
                chi4 = cal_chi2(regroup4, all_bad_rate)
                if chi3 < chi4:
                    cutoffpoints.remove(cutoffpoints[minpct_bin_index - 1])
                else:
                    cutoffpoints.remove(cutoffpoints[minpct_bin_index])
            # recompute the bin shares so the loop condition can terminate
            group_values = df2['col_map'].apply(lambda x: assign_bin(x, cutoffpoints))
            df2['col_map_bin'] = group_values
            group_df = group_values.value_counts().to_frame('total')
            group_df['bin_pct'] = group_df['total'] / df2.shape[0]
            min_pct = group_df.bin_pct.min()
    return cutoffpoints
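# Usage sketch (hypothetical `data`/`label`): chi-merge a numeric column into
# at most 5 bins, each holding at least 5% of the sample; the returned cutoffs
# are interior points, so binning_num below brackets them with -inf/+inf:
#   cuts = ChiMerge(data, 'annual_inc', 'label', max_bin=5, min_binpct=0.05)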
# Binning of numeric variables (chi-square binning)
def binning_num(df, target, col_list, max_bin=None, min_binpct=None):
    """
    df: dataset
    target: name of the target column
    col_list: list of variable names
    max_bin: maximum number of bins
    min_binpct: minimum share of the sample a bin must hold
    return:
    bin_df: list holding the binning result of each variable
    iv_value: list holding the IV of each variable
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad
    all_odds = good / bad
    inf = float('inf')
    ninf = float('-inf')
    bin_df = []
    iv_value = []
    for col in col_list:
        cut = ChiMerge(df, col, target, max_bin=max_bin, min_binpct=min_binpct)
        cut.insert(0, ninf)
        cut.append(inf)
        bucket = pd.cut(df[col], cut)
        d1 = df.groupby(bucket)
        d2 = pd.DataFrame()
        d2['min_bin'] = d1[col].min()
        d2['max_bin'] = d1[col].max()
        d2['total'] = d1[target].count()
        d2['totalrate'] = d2['total'] / total
        d2['bad'] = d1[target].sum()
        d2['badrate'] = d2['bad'] / d2['total']
        d2['good'] = d2['total'] - d2['bad']
        d2['goodrate'] = d2['good'] / d2['total']
        d2['badattr'] = d2['bad'] / bad
        d2['goodattr'] = (d2['total'] - d2['bad']) / good
        d2['odds'] = d2['good'] / d2['bad']
        GB_list = []
        for i in d2.odds:
            if i >= all_odds:
                GB_index = str(round((i / all_odds) * 100, 0)) + 'G'
            else:
                GB_index = str(round((all_odds / i) * 100, 0)) + 'B'
            GB_list.append(GB_index)
        d2['GB_index'] = GB_list
        d2['woe'] = np.log(d2['badattr'] / d2['goodattr'])
        d2['bin_iv'] = (d2['badattr'] - d2['goodattr']) * d2['woe']
        d2['IV'] = d2['bin_iv'].sum()
        iv = d2['bin_iv'].sum().round(3)
        print('variable: {}'.format(col))
        print('IV: {}'.format(iv))
        print('\t')
        bin_df.append(d2)
        iv_value.append(iv)
    return bin_df, iv_value


# IV detail table for numeric variables
def iv_num(df, target, col_list, max_bin=None, min_binpct=None):
    """
    df: dataset
    target: name of the target column
    col_list: list of variable names
    max_bin: maximum number of bins
    min_binpct: minimum share of the sample a bin must hold
    return: IV detail table
    """
    bin_df, iv_value = binning_num(df, target, col_list, max_bin=max_bin, min_binpct=min_binpct)
    iv_df = pd.DataFrame({'col': col_list, 'iv': iv_value})
    iv_df = iv_df.sort_values('iv', ascending=False)
    return iv_df


# Custom binning
def binning_self(df, col, target, cut=None, right_border=True):
    """
    df: dataset
    col: the single variable to bin
    target: name of the target column
    cut: list of interval edges
    right_border: True for right-closed intervals, False for left-closed
    return:
    bin_df: DataFrame with the variable's binning result
    iv_value: the variable's IV
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad
    all_odds = good / bad
    bucket = pd.cut(df[col], cut, right=right_border)
    d1 = df.groupby(bucket)
    d2 = pd.DataFrame()
    d2['min_bin'] = d1[col].min()
    d2['max_bin'] = d1[col].max()
    d2['total'] = d1[target].count()
    d2['totalrate'] = d2['total'] / total
    d2['bad'] = d1[target].sum()
    d2['badrate'] = d2['bad'] / d2['total']
    d2['good'] = d2['total'] - d2['bad']
    d2['goodrate'] = d2['good'] / d2['total']
    d2['badattr'] = d2['bad'] / bad
    d2['goodattr'] = (d2['total'] - d2['bad']) / good
    d2['odds'] = d2['good'] / d2['bad']
    GB_list = []
    for i in d2.odds:
        if i >= all_odds:
            GB_index = str(round((i / all_odds) * 100, 0)) + 'G'
        else:
            GB_index = str(round((all_odds / i) * 100, 0)) + 'B'
        GB_list.append(GB_index)
    d2['GB_index'] = GB_list
    d2['woe'] = np.log(d2['badattr'] / d2['goodattr'])
    d2['bin_iv'] = (d2['badattr'] - d2['goodattr']) * d2['woe']
    d2['IV'] = d2['bin_iv'].sum()
    iv_value = d2['bin_iv'].sum().round(3)
    print('variable: {}'.format(col))
    print('IV: {}'.format(iv_value))
    bin_df = d2.copy()
    return bin_df, iv_value


# Checks on the binning results
# WOE visualisation
def plot_woe(bin_df, hspace=0.4, wspace=0.4, plt_size=None, plt_num=None, x=None, y=None):
    """
    bin_df: list holding the binning result of each variable
    hspace: vertical gap between subplots
    wspace: horizontal gap between subplots
    plt_size: figure size
    plt_num: number of subplots
    x: number of rows in the subplot grid
    y: number of columns in the subplot grid
    return: WOE trend plot for each variable
    """
    plt.figure(figsize=plt_size)
    plt.subplots_adjust(hspace=hspace, wspace=wspace)
    for i, df in zip(range(1, plt_num + 1, 1), bin_df):
        col_name = df.index.name
        df = df.reset_index()
        plt.subplot(x, y, i)
        plt.title(col_name)
        sns.barplot(data=df, x=col_name, y='woe')
        plt.xlabel('')
        plt.xticks(rotation=30)
    return plt.show()
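# Usage sketch (hypothetical names): bin the numeric variables, inspect the
# per-bin WOE trend, and hand-craft bins where business breakpoints exist:
#   num_bin_df, num_iv = binning_num(data, 'label', ['annual_inc', 'dti'],
#                                    max_bin=5, min_binpct=0.05)
#   plot_woe(num_bin_df, plt_size=(10, 4), plt_num=2, x=1, y=2)
#   age_bin, age_iv = binning_self(data, 'age', 'label',
#                                  cut=[float('-inf'), 25, 35, 50, float('inf')])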
# Check whether the WOE values are monotonic
def woe_monoton(bin_df):
    """
    bin_df: list holding the binning result of each variable
    return:
    woe_notmonoton_col: variables whose WOE is not monotonic, as a list
    woe_judge_df: DataFrame with the check result per variable
    """
    woe_notmonoton_col = []
    col_list = []
    woe_judge = []
    for woe_df in bin_df:
        col_name = woe_df.index.name
        woe_list = list(woe_df.woe)
        if woe_df.shape[0] == 2:
            # two bins are always monotonic
            col_list.append(col_name)
            woe_judge.append('True')
        else:
            # flag any interior point that is a local extremum
            woe_not_monoton = [(woe_list[i] < woe_list[i + 1] and woe_list[i] < woe_list[i - 1]) or
                               (woe_list[i] > woe_list[i + 1] and woe_list[i] > woe_list[i - 1])
                               for i in range(1, len(woe_list) - 1, 1)]
            if True in woe_not_monoton:
                woe_notmonoton_col.append(col_name)
                col_list.append(col_name)
                woe_judge.append('False')
            else:
                col_list.append(col_name)
                woe_judge.append('True')
    woe_judge_df = pd.DataFrame({'col': col_list, 'judge_monoton': woe_judge})
    return woe_notmonoton_col, woe_judge_df


# Check whether any bin's WOE is 1 or larger
def woe_large(bin_df):
    """
    bin_df: list holding the binning result of each variable
    return:
    woe_large_col: variables with a bin whose WOE is 1 or larger, as a list
    woe_judge_df: DataFrame with the check result per variable
    """
    woe_large_col = []
    col_list = []
    woe_judge = []
    for woe_df in bin_df:
        col_name = woe_df.index.name
        woe_list = list(woe_df.woe)
        woe_large = list(filter(lambda x: x >= 1, woe_list))
        if len(woe_large) > 0:
            col_list.append(col_name)
            woe_judge.append('True')
            woe_large_col.append(col_name)
        else:
            col_list.append(col_name)
            woe_judge.append('False')
    woe_judge_df = pd.DataFrame({'col': col_list, 'judge_large': woe_judge})
    return woe_large_col, woe_judge_df


# Variable selection
# Selecting variables with xgboost
def select_xgboost(df, target, imp_num=None):
    """
    df: dataset
    target: name of the target column
    imp_num: number of variables to keep
    return:
    xg_fea_imp: feature importances
    xg_select_col: the selected variables
    """
    x = df.drop([target], axis=1)
    y = df[target]
    # eval_metric is set in the constructor; newer xgboost versions removed it from fit()
    xgmodel = XGBClassifier(random_state=0, eval_metric='auc')
    xgmodel = xgmodel.fit(x, y)
    xg_fea_imp = pd.DataFrame({'col': list(x.columns), 'imp': xgmodel.feature_importances_})
    xg_fea_imp = xg_fea_imp.sort_values('imp', ascending=False).reset_index(drop=True).iloc[:imp_num, :]
    xg_select_col = list(xg_fea_imp.col)
    return xg_fea_imp, xg_select_col


# Selecting variables with a random forest
def select_rf(df, target, imp_num=None):
    """
    df: dataset
    target: name of the target column
    imp_num: number of variables to keep
    return:
    rf_fea_imp: feature importances
    rf_select_col: the selected variables
    """
    x = df.drop([target], axis=1)
    y = df[target]
    rfmodel = RandomForestClassifier(random_state=0)
    rfmodel = rfmodel.fit(x, y)
    rf_fea_imp = pd.DataFrame({'col': list(x.columns), 'imp': rfmodel.feature_importances_})
    rf_fea_imp = rf_fea_imp.sort_values('imp', ascending=False).reset_index(drop=True).iloc[:imp_num, :]
    rf_select_col = list(rf_fea_imp.col)
    return rf_fea_imp, rf_select_col


# Correlation heat map
def plot_corr(df, col_list, threshold=None, plt_size=None, is_annot=True):
    """
    df: dataset
    col_list: list of variable names
    threshold: correlation threshold; weaker pairs are masked out
    plt_size: figure size
    is_annot: whether to print the correlation values
    return: correlation heat map
    """
    corr_df = df.loc[:, col_list].corr()
    plt.figure(figsize=plt_size)
    sns.heatmap(corr_df, annot=is_annot, cmap='rainbow', vmax=1, vmin=-1, mask=np.abs(corr_df) <= threshold)
    return plt.show()


# Dropping correlated variables
def forward_delete_corr(df, col_list, threshold=None):
    """
    df: dataset
    col_list: list of variable names, ordered by preference (e.g. by IV)
    threshold: correlation threshold
    return: the variables that survive the correlation filter
    """
    list_corr = col_list[:]
    for col in col_list:
        if col not in list_corr:  # already removed by an earlier, higher-ranked variable
            continue
        corr = df.loc[:, list_corr].corr()[col].drop(col)  # drop the self-correlation
        for name, value in corr.items():
            if abs(value) >= threshold:
                list_corr.remove(name)
    return list_corr


# Mapping of strongly correlated variable pairs
def corr_mapping(df, col_list, threshold=None):
    """
    df: dataset
    col_list: list of variable names
    threshold: correlation threshold
    return: mapping table of strongly correlated variable pairs
    """
    corr_df = df.loc[:, col_list].corr()
    col_a = []
    col_b = []
    corr_value = []
    for col, i in zip(col_list[:-1], range(1, len(col_list), 1)):
        high_corr_col = []
        high_corr_value = []
        corr_series = corr_df[col][i:]  # only look at the upper triangle
        for name, value in zip(corr_series.index, corr_series.values):
            if abs(value) >= threshold:
                high_corr_col.append(name)
                high_corr_value.append(value)
        col_a.extend([col] * len(high_corr_col))
        col_b.extend(high_corr_col)
        corr_value.extend(high_corr_value)
    corr_map_df = pd.DataFrame({'col_A': col_a, 'col_B': col_b, 'corr': corr_value})
    return corr_map_df
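# Usage sketch (hypothetical `data_woe` -- a WOE-transformed modelling table):
#   not_mono, mono_table = woe_monoton(num_bin_df)
#   xg_imp, xg_cols = select_xgboost(data_woe, 'label', imp_num=20)
#   rf_imp, rf_cols = select_rf(data_woe, 'label', imp_num=20)
#   plot_corr(data_woe, xg_cols, threshold=0.65, plt_size=(12, 10))
#   keep_cols = forward_delete_corr(data_woe, xg_cols, threshold=0.65)
#   corr_map = corr_mapping(data_woe, xg_cols, threshold=0.65)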
# Significance filter; run on WOE-transformed data
def forward_delete_pvalue(x_train, y_train):
    """
    x_train: training features
    y_train: training target
    return: variables that survive the significance filter
    """
    col_list = list(x_train.columns)
    pvalues_col = []
    # add variables one at a time and drop any whose p-value is not significant
    for col in col_list:
        pvalues_col.append(col)
        x_train2 = sm.add_constant(x_train.loc[:, pvalues_col])
        sm_lr = sm.Logit(y_train, x_train2)
        sm_lr = sm_lr.fit()
        for i, j in zip(sm_lr.pvalues.index[1:], sm_lr.pvalues.values[1:]):
            if j >= 0.05:
                pvalues_col.remove(i)

    x_new_train = x_train.loc[:, pvalues_col]
    x_new_train2 = sm.add_constant(x_new_train)
    lr = sm.Logit(y_train, x_new_train2)
    lr = lr.fit()
    print(lr.summary2())
    return pvalues_col


# Coefficient-sign filter for the logistic regression; run on WOE-transformed data
def forward_delete_coef(x_train, y_train):
    """
    x_train: training features
    y_train: training target
    return:
    coef_col: variables that survive the sign filter
    lr_coe: coefficient of each surviving variable
    """
    col_list = list(x_train.columns)
    coef_col = []
    # add variables one at a time; drop the newest one if any coefficient turns
    # negative (with WOE defined as ln(badattr/goodattr), coefficients should be positive)
    for col in col_list:
        coef_col.append(col)
        x_train2 = x_train.loc[:, coef_col]
        sk_lr = LogisticRegression(random_state=0).fit(x_train2, y_train)
        coef_df = pd.DataFrame({'col': coef_col, 'coef': sk_lr.coef_[0]})
        if coef_df[coef_df.coef < 0].shape[0] > 0:
            coef_col.remove(col)

    x_new_train = x_train.loc[:, coef_col]
    lr = LogisticRegression(random_state=0).fit(x_new_train, y_train)
    lr_coe = pd.DataFrame({'col': coef_col, 'coef': lr.coef_[0]})
    return coef_col, lr_coe
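# Usage sketch: both filters assume the inputs are already WOE-transformed
# (all names hypothetical):
#   x_train = data_woe.drop(['label'], axis=1)
#   y_train = data_woe['label']
#   sig_cols = forward_delete_pvalue(x_train, y_train)
#   final_cols, coef_table = forward_delete_coef(x_train[sig_cols], y_train)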
# Data preprocessing
# Missing rate per variable
def missing_cal(df):
    """
    df: dataset
    return: missing rate of each variable
    """
    missing_series = df.isnull().sum() / df.shape[0]
    missing_df = pd.DataFrame(missing_series).reset_index()
    missing_df = missing_df.rename(columns={'index': 'col', 0: 'missing_pct'})
    missing_df = missing_df.sort_values('missing_pct', ascending=False).reset_index(drop=True)
    return missing_df


# Distribution of missing rates across variables
def plot_missing_var(df, plt_size=None):
    """
    df: dataset
    plt_size: figure size
    return: distribution of missing rates (histogram)
    """
    missing_df = missing_cal(df)
    plt.figure(figsize=plt_size)
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
    x = missing_df['missing_pct']
    plt.hist(x=x, bins=np.arange(0, 1.1, 0.1), color='hotpink', ec='k', alpha=0.8)
    plt.ylabel('number of variables')
    plt.xlabel('missing rate')
    return plt.show()


# Missing-value pattern per sample
def plot_missing_user(df, plt_size=None):
    """
    df: dataset
    plt_size: figure size
    return: distribution of missing counts per sample (line chart)
    """
    missing_series = df.isnull().sum(axis=1)
    list_missing_num = sorted(list(missing_series.values))
    plt.figure(figsize=plt_size)
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.plot(range(df.shape[0]), list_missing_num)
    plt.ylabel('number of missing variables')
    plt.xlabel('samples')
    return plt.show()


# Drop variables with too many missing values
def missing_delete_var(df, threshold=None):
    """
    df: dataset
    threshold: missing-rate threshold for dropping a variable
    return: dataset with those variables dropped
    """
    df2 = df.copy()
    missing_df = missing_cal(df)
    missing_col_num = missing_df[missing_df.missing_pct >= threshold].shape[0]
    missing_col = list(missing_df[missing_df.missing_pct >= threshold].col)
    df2 = df2.drop(missing_col, axis=1)
    print('{} variables have a missing rate of {} or more'.format(missing_col_num, threshold))
    return df2


# Drop samples with too many missing values
def missing_delete_user(df, threshold=None):
    """
    df: dataset
    threshold: missing-count threshold for dropping a sample
    return: dataset with those samples dropped
    """
    df2 = df.copy()
    missing_series = df.isnull().sum(axis=1)
    missing_list = list(missing_series)
    missing_index_list = []
    for i, j in enumerate(missing_list):
        if j >= threshold:
            missing_index_list.append(i)
    # positional indices, so this assumes a default RangeIndex
    df2 = df2[~(df2.index.isin(missing_index_list))]
    print('{} users have {} or more missing variables'.format(len(missing_index_list), threshold))
    return df2


# Filling categorical variables
def fillna_cate_var(df, col_list, fill_type=None):
    """
    df: dataset
    col_list: list of variable names
    fill_type: 'mode' to fill with the mode, 'class' to treat missing as its own category
    return: dataset after filling
    """
    df2 = df.copy()
    for col in col_list:
        if fill_type == 'class':
            df2[col] = df2[col].fillna('unknown')
        if fill_type == 'mode':
            df2[col] = df2[col].fillna(df2[col].mode()[0])
    return df2


# Filling numeric variables:
# - below 5% missing: fill with the median
# - 5%-15% missing: fill with a random forest; median-fill the low-missing
#   variables first, then impute from the samples with no missing values
# - above 15% missing: treat missing as a category of its own
def fillna_num_var(df, col_list, fill_type=None, filled_df=None):
    """
    df: dataset
    col_list: list of variable names
    fill_type: 'median' / 'rf' (random forest) / 'class' (own category)
    filled_df: an already-filled dataset, used when fill_type is 'rf'
    return: dataset after filling
    """
    df2 = df.copy()
    for col in col_list:
        if fill_type == 'median':
            df2[col] = df2[col].fillna(df2[col].median())
        if fill_type == 'class':
            df2[col] = df2[col].fillna(-999)
        if fill_type == 'rf':
            rf_df = pd.concat([df2[col], filled_df], axis=1)
            known = rf_df[rf_df[col].notnull()]
            unknown = rf_df[rf_df[col].isnull()]
            x_train = known.drop([col], axis=1)
            y_train = known[col]
            x_pre = unknown.drop([col], axis=1)
            rf = RandomForestRegressor(random_state=0)
            rf.fit(x_train, y_train)
            y_pre = rf.predict(x_pre)
            df2.loc[df2[col].isnull(), col] = y_pre
    return df2


# Constant / near-constant variables
def const_delete(df, col_list, threshold=None):
    """
    df: dataset
    col_list: list of variable names
    threshold: share of the most frequent value above which the variable is dropped
    return: dataset after dropping
    """
    df2 = df.copy()
    const_col = []
    for col in col_list:
        const_pct = df2[col].value_counts().iloc[0] / df2[df2[col].notnull()].shape[0]
        if const_pct >= threshold:
            const_col.append(col)
    df2 = df2.drop(const_col, axis=1)
    print('{} near-constant variables were dropped'.format(len(const_col)))
    return df2


# Collapse rare levels of categorical variables
def descending_cate(df, col_list, threshold=None):
    """
    df: dataset
    col_list: list of variable names
    threshold: share below which a level is collapsed into 'other'
    return: dataset after collapsing
    """
    df2 = df.copy()
    for col in col_list:
        value_series = df[col].value_counts() / df[df[col].notnull()].shape[0]
        small_value = []
        for value_name, value_pct in zip(value_series.index, value_series.values):
            if value_pct <= threshold:
                small_value.append(value_name)
        df2.loc[df2[col].isin(small_value), col] = 'other'
    return df2
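# Usage sketch of a typical preprocessing pass (hypothetical `data`/`label`):
#   missing_table = missing_cal(data)
#   data = missing_delete_var(data, threshold=0.8)
#   data = missing_delete_user(data, threshold=int(data.shape[1] * 0.8))
#   data = fillna_cate_var(data, ['grade'], fill_type='class')
#   data = fillna_num_var(data, ['dti'], fill_type='median')
#   data = const_delete(data, list(data.drop(['label'], axis=1).columns), threshold=0.9)
#   data = descending_cate(data, ['purpose'], threshold=0.01)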
# Model evaluation
# AUC
def plot_roc(y_label, y_pred):
    """
    y_label: y of the test set
    y_pred: predicted probabilities on the test set
    return: ROC curve
    """
    fpr, tpr, threshold = metrics.roc_curve(y_label, y_pred)  # roc_curve returns fpr, tpr, thresholds
    AUC = metrics.roc_auc_score(y_label, y_pred)
    fig = plt.figure(figsize=(6, 4))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(fpr, tpr, color='blue', label='AUC=%.3f' % AUC)
    ax.plot([0, 1], [0, 1], 'r--')
    ax.set_ylim(0, 1)
    ax.set_xlim(0, 1)
    ax.set_title('ROC')
    ax.legend(loc='best')
    return plt.show()


# KS
def plot_model_ks(y_label, y_pred):
    """
    y_label: y of the test set
    y_pred: predicted probabilities on the test set
    return: KS curve
    """
    pred_list = list(y_pred)
    label_list = list(y_label)
    total_bad = sum(label_list)
    total_good = len(label_list) - total_bad
    items = sorted(zip(pred_list, label_list), key=lambda x: x[0])
    step = (max(pred_list) - min(pred_list)) / 200
    pred_bin = []
    good_rate = []
    bad_rate = []
    ks_list = []
    for i in range(1, 201):
        idx = min(pred_list) + i * step
        pred_bin.append(idx)
        label_bin = [x[1] for x in items if x[0] < idx]
        bad_num = sum(label_bin)
        good_num = len(label_bin) - bad_num
        goodrate = good_num / total_good
        badrate = bad_num / total_bad
        ks = abs(goodrate - badrate)
        good_rate.append(goodrate)
        bad_rate.append(badrate)
        ks_list.append(ks)
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(pred_bin, good_rate, color='green', label='good_rate')
    ax.plot(pred_bin, bad_rate, color='red', label='bad_rate')
    ax.plot(pred_bin, ks_list, color='blue', label='good-bad')
    ax.set_title('KS:{:.3f}'.format(max(ks_list)))
    ax.legend(loc='best')
    return plt.show()


# Cross-validation
def cross_verify(x, y, estimators, fold, scoring='roc_auc'):
    """
    x: feature dataset
    y: target dataset
    estimators: the model to validate
    fold: cross-validation strategy
    scoring: evaluation metric, AUC by default
    return: cross-validation results
    """
    cv_result = cross_val_score(estimator=estimators, X=x, y=y, cv=fold, n_jobs=-1, scoring=scoring)
    print('max CV AUC: {}'.format(cv_result.max()))
    print('min CV AUC: {}'.format(cv_result.min()))
    print('mean CV AUC: {}'.format(cv_result.mean()))
    plt.figure(figsize=(6, 4))
    plt.title('cross-validation metric distribution')
    plt.boxplot(cv_result, patch_artist=True, showmeans=True,
                boxprops={'color': 'black', 'facecolor': 'yellow'},
                meanprops={'marker': 'D', 'markerfacecolor': 'tomato'},
                flierprops={'marker': 'o', 'markerfacecolor': 'red', 'color': 'black'},
                medianprops={'linestyle': '--', 'color': 'orange'})
    return plt.show()


# Learning curve
def plot_learning_curve(estimator, x, y, cv=None, train_size=np.linspace(0.1, 1.0, 5), plt_size=None):
    """
    estimator: base model for the learning curve
    x: feature dataset
    y: target dataset
    cv: cross-validation strategy
    train_size: training-set split strategy
    plt_size: figure size
    return: learning curve
    """
    from sklearn.model_selection import learning_curve
    train_sizes, train_scores, test_scores = learning_curve(estimator=estimator, X=x, y=y, cv=cv,
                                                            n_jobs=-1, train_sizes=train_size)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.figure(figsize=plt_size)
    plt.xlabel('Training-example')
    plt.ylabel('score')
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean, 'o-', color='r', label='Training-score')
    plt.plot(train_sizes, test_scores_mean, 'o-', color='g', label='cross-val-score')
    plt.legend(loc='best')
    return plt.show()


# Confusion matrix / classification report
def plot_matrix_report(y_label, y_pred):
    """
    y_label: y of the test set
    y_pred: predicted class labels (not probabilities) for the test set
    return: confusion matrix
    """
    matrix_array = metrics.confusion_matrix(y_label, y_pred)
    plt.matshow(matrix_array, cmap=plt.cm.summer_r)
    plt.colorbar()
    for x in range(len(matrix_array)):
        for y in range(len(matrix_array)):
            # annotate at (column, row): the x axis shows the predicted label
            plt.annotate(matrix_array[x, y], xy=(y, x), ha='center', va='center')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    print(metrics.classification_report(y_label, y_pred))
    return plt.show()
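# Usage sketch (hypothetical split and model):
#   lr = LogisticRegression(random_state=0).fit(x_train, y_train)
#   prob = lr.predict_proba(x_test)[:, 1]
#   plot_roc(y_test, prob)
#   plot_model_ks(y_test, prob)
#   cross_verify(x_train, y_train, lr, fold=5)
#   plot_learning_curve(lr, x_train, y_train, cv=5, plt_size=(8, 6))
#   plot_matrix_report(y_test, lr.predict(x_test))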
# Scorecard construction
# Scorecard scaling
def cal_scale(score, odds, PDO, model):
    """
    odds: chosen bad:good odds
    score: the score assigned at those odds
    PDO: points to double the odds
    model: the logistic regression model
    return: A, B, base_score
    """
    # doubling the odds changes the score by PDO points,
    # so B = PDO / (ln(odds) - ln(2 * odds)) = -PDO / ln(2)
    B = PDO / (np.log(odds) - np.log(2 * odds))
    A = score - B * np.log(odds)
    base_score = A + B * model.intercept_[0]
    print('B: {:.2f}'.format(B))
    print('A: {:.2f}'.format(A))
    print('base score: {:.2f}'.format(base_score))
    return A, B, base_score


# Variable score table
def score_df_concat(woe_df, model, B):
    """
    woe_df: WOE result table
    model: the logistic regression model
    B: the scaling factor from cal_scale
    return: score table per variable and bin
    """
    coe = list(model.coef_[0])
    # assumes woe_df.col.unique() is in the same order as the model's training columns
    columns = list(woe_df.col.unique())
    scores = []
    for c, col in zip(coe, columns):
        score = []
        for w in list(woe_df[woe_df.col == col].woe):
            s = round(c * w * B, 0)
            score.append(s)
        scores.extend(score)
    woe_df['score'] = scores
    score_df = woe_df.copy()
    return score_df


# Score transformation
def score_transform(df, target, df_score):
    """
    df: dataset
    target: name of the target column
    df_score: score table
    return: dataset with variable values replaced by bin scores
    """
    df2 = df.copy()
    for col in df2.drop([target], axis=1).columns:
        x = df2[col]
        bin_map = df_score[df_score.col == col]
        bin_res = np.array([0] * x.shape[0], dtype=float)
        for i in bin_map.index:
            lower = bin_map['min_bin'][i]
            upper = bin_map['max_bin'][i]
            if lower == upper:
                # positional indexing: assumes df has a default RangeIndex
                x1 = x[np.where(x == lower)[0]]
            else:
                x1 = x[np.where((x >= lower) & (x <= upper))[0]]
            mask = np.isin(x, x1)
            bin_res[mask] = bin_map['score'][i]
        bin_res = pd.Series(bin_res, index=x.index)
        bin_res.name = x.name
        df2[col] = bin_res
    return df2


# KS of the final score
def plot_score_ks(df, score_col, target):
    """
    df: dataset
    target: name of the target column
    score_col: name of the final-score column
    """
    total_bad = df[target].sum()
    total_good = df[target].count() - total_bad
    score_list = list(df[score_col])
    target_list = list(df[target])
    items = sorted(zip(score_list, target_list), key=lambda x: x[0])
    step = (max(score_list) - min(score_list)) / 200
    score_bin = []
    good_rate = []
    bad_rate = []
    ks_list = []
    for i in range(1, 201):
        idx = min(score_list) + i * step
        score_bin.append(idx)
        target_bin = [x[1] for x in items if x[0] < idx]
        bad_num = sum(target_bin)
        good_num = len(target_bin) - bad_num
        goodrate = good_num / total_good
        badrate = bad_num / total_bad
        ks = abs(goodrate - badrate)
        good_rate.append(goodrate)
        bad_rate.append(badrate)
        ks_list.append(ks)
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(1, 1, 1)
    ax.plot(score_bin, good_rate, color='green', label='good_rate')
    ax.plot(score_bin, bad_rate, color='red', label='bad_rate')
    ax.plot(score_bin, ks_list, color='blue', label='good-bad')
    ax.set_title('KS:{:.3f}'.format(max(ks_list)))
    ax.legend(loc='best')
    return plt.show()
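# Worked scaling example: mapping bad:good odds of 1:20 to 600 points with
# PDO = 20 gives B = 20 / (ln(1/20) - ln(2/20)) = -20 / ln(2) ~ -28.85 and
# A = 600 - B * ln(1/20) ~ 513.56, so score = A + B * log-odds(bad).
# Usage sketch (hypothetical `lr`, `woe_table`, `data`):
#   A, B, base_score = cal_scale(600, 1 / 20, 20, lr)
#   score_table = score_df_concat(woe_table, lr, B)
#   data_score = score_transform(data, 'label', score_table)
#   data_score['score'] = base_score + data_score.drop(['label'], axis=1).sum(axis=1)
#   plot_score_ks(data_score, 'score', 'label')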
# PR curve
def plot_PR(df, score_col, target, plt_size=None):
    """
    df: scored dataset
    score_col: name of the score column
    target: name of the target column
    plt_size: figure size
    return: PR curve
    """
    total_bad = df[target].sum()
    score_list = list(df[score_col])
    target_list = list(df[target])
    score_unique_list = sorted(set(list(df[score_col])))
    items = sorted(zip(score_list, target_list), key=lambda x: x[0])
    precision_list = []
    tpr_list = []
    for score in score_unique_list:
        target_bin = [x[1] for x in items if x[0] <= score]
        bad_num = sum(target_bin)
        total_num = len(target_bin)
        precision = bad_num / total_num
        tpr = bad_num / total_bad
        precision_list.append(precision)
        tpr_list.append(tpr)
    plt.figure(figsize=plt_size)
    plt.title('PR curve')
    plt.xlabel('recall')
    plt.ylabel('precision')
    plt.plot(tpr_list, precision_list, color='tomato', label='PR curve')
    plt.legend(loc='best')
    return plt.show()


# Score distribution
def plot_score_hist(df, target, score_col, plt_size=None, cutoff=None):
    """
    df: dataset
    target: name of the target column
    score_col: name of the final-score column
    plt_size: figure size
    cutoff: the reject/approve cut point
    return: score distribution of good and bad users
    """
    plt.figure(figsize=plt_size)
    x1 = df[df[target] == 1][score_col]
    x2 = df[df[target] == 0][score_col]
    sns.kdeplot(x1, fill=True, label='bad users', color='hotpink')  # fill replaces the deprecated shade
    sns.kdeplot(x2, fill=True, label='good users', color='seagreen')
    plt.axvline(x=cutoff)
    plt.legend()
    return plt.show()


# Score detail table
def score_info(df, score_col, target, x=None, y=None, step=None):
    """
    df: dataset
    target: name of the target column
    score_col: name of the final-score column
    x: left edge of the lowest interval
    y: right edge of the highest interval
    step: score width of each interval
    return: score detail table
    """
    # y + step so that y itself is included as the final right edge
    df['score_bin'] = pd.cut(df[score_col], bins=np.arange(x, y + step, step), right=True)
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad
    group = df.groupby('score_bin')
    score_info_df = pd.DataFrame()
    score_info_df['users'] = group[target].count()
    score_info_df['bads'] = group[target].sum()
    score_info_df['goods'] = score_info_df['users'] - score_info_df['bads']
    score_info_df['bad_rate'] = score_info_df['bads'] / score_info_df['users']
    score_info_df['cum_users'] = score_info_df['users'].cumsum()
    score_info_df['cum_bads'] = score_info_df['bads'].cumsum()
    score_info_df['cum_goods'] = score_info_df['goods'].cumsum()
    score_info_df['cum_bad_pct'] = score_info_df['cum_bads'] / bad
    score_info_df['cum_good_pct'] = score_info_df['cum_goods'] / good
    score_info_df['cum_user_pct'] = score_info_df['cum_users'] / total
    score_info_df['cum_bad_rate'] = score_info_df['cum_bads'] / score_info_df['cum_users']
    score_info_df = score_info_df.reset_index()
    return score_info_df


# Lift chart and Lorenz curve
def plot_lifting(df, score_col, target, bins=10, plt_size=None):
    """
    df: dataset containing the final scores
    score_col: name of the final-score column
    target: name of the target column
    bins: number of equal-size score groups
    plt_size: figure size
    return: lift chart and Lorenz curve
    """
    score_list = list(df[score_col])
    label_list = list(df[target])
    items = sorted(zip(score_list, label_list), key=lambda x: x[0])
    step = round(df.shape[0] / bins, 0)
    bad = df[target].sum()
    all_badrate = float(1 / bins)
    all_badrate_list = [all_badrate] * bins
    all_badrate_cum = list(np.cumsum(all_badrate_list))
    all_badrate_cum.insert(0, 0)

    score_bin_list = []
    bad_rate_list = []
    for i in range(0, bins, 1):
        index_a = int(i * step)
        index_b = int((i + 1) * step)
        score = [x[0] for x in items[index_a:index_b]]
        tup1 = (min(score),)
        tup2 = (max(score),)
        score_bin = tup1 + tup2
        score_bin_list.append(score_bin)
        label_bin = [x[1] for x in items[index_a:index_b]]
        bin_bad = sum(label_bin)
        bin_bad_rate = bin_bad / bad  # share of all bads captured by this group
        bad_rate_list.append(bin_bad_rate)
    bad_rate_cumsum = list(np.cumsum(bad_rate_list))
    bad_rate_cumsum.insert(0, 0)

    plt.figure(figsize=plt_size)
    x = score_bin_list
    y1 = bad_rate_list
    y2 = all_badrate_list
    y3 = bad_rate_cumsum
    y4 = all_badrate_cum
    plt.subplot(1, 2, 1)
    plt.title('lift chart')
    plt.xticks(np.arange(bins) + 0.15, x, rotation=90)
    bar_width = 0.3
    plt.bar(np.arange(bins), y1, width=bar_width, color='hotpink', label='score_card')
    plt.bar(np.arange(bins) + bar_width, y2, width=bar_width, color='seagreen', label='random')
    plt.legend(loc='best')
    plt.subplot(1, 2, 2)
    plt.title('Lorenz curve')
    plt.plot(y3, color='hotpink', label='score_card')
    plt.plot(y4, color='seagreen', label='random')
    plt.xticks(np.arange(bins + 1), rotation=0)
    plt.legend(loc='best')
    return plt.show()


# Set a cutoff and measure its effectiveness
def rule_verify(df, col_score, target, cutoff):
    """
    df: dataset
    target: name of the target column
    col_score: name of the final-score column
    cutoff: the reject/approve cut point
    return: confusion matrix
    """
    df['result'] = df.apply(lambda x: 30 if x[col_score] <= cutoff else 10, axis=1)  # 30 = reject, 10 = approve
    TP = df[(df['result'] == 30) & (df[target] == 1)].shape[0]  # rejected bads
    FP = df[(df['result'] == 30) & (df[target] == 0)].shape[0]  # rejected goods
    bad = df[df[target] == 1].shape[0]
    good = df[df[target] == 0].shape[0]
    refuse = df[df['result'] == 30].shape[0]
    passed = df[df['result'] == 10].shape[0]

    acc = round(TP / refuse, 3)
    tpr = round(TP / bad, 3)
    fpr = round(FP / good, 3)
    refuse_rate = round(refuse / df.shape[0], 3)

    matrix_df = pd.pivot_table(df, index='result', columns=target,
                               aggfunc={col_score: pd.Series.count}, values=col_score)
    print('precision: {}'.format(acc))
    print('recall: {}'.format(tpr))
    print('false positive rate (goods rejected): {}'.format(fpr))
    print('rejection rate: {}'.format(refuse_rate))
    return matrix_df
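# Usage sketch (hypothetical scored sample `data_score` from the scaling step):
#   plot_PR(data_score, 'score', 'label', plt_size=(6, 4))
#   plot_score_hist(data_score, 'label', 'score', plt_size=(8, 5), cutoff=520)
#   detail = score_info(data_score, 'score', 'label', x=300, y=800, step=50)
#   plot_lifting(data_score, 'score', 'label', bins=10, plt_size=(12, 5))
#   confusion = rule_verify(data_score, 'score', 'label', cutoff=520)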
# Shift in a variable's score distribution over time
def plot_var_shift(df, day_col, score_col, plt_size=None):
    """
    df: a variable's score per bin over a period of time
    day_col: name of the date column (days)
    score_col: name of the score column
    plt_size: figure size
    return: shift plot of the variable's bin scores
    """
    day_list = sorted(set(list(df[day_col])))
    score_list = sorted(set(list(df[score_col])))
    # daily share of each score value
    prop_day_list = []
    for day in day_list:
        prop_list = []
        for score in score_list:
            prop = df[(df[day_col] == day) & (df[score_col] == score)].shape[0] / df[df[day_col] == day].shape[0]
            prop_list.append(prop)
        prop_day_list.append(prop_list)
    # reshape the shares into a plottable form
    sub_list = []
    for p in prop_day_list:
        p_cumsum = list(np.cumsum(p))
        p_cumsum = p_cumsum[:-1]
        p_cumsum.insert(0, 0)
        bar1_list = [1] * int(len(p_cumsum))
        sub = [bar1_list[i] - p_cumsum[i] for i in range(len(p_cumsum))]
        sub_list.append(sub)
    array = np.array(sub_list)
    stack_prop_list = []  # y values of the area chart
    bar_prop_list = []  # y values of the stacked bar chart
    for i in range(len(score_list)):
        bar_prop = array[:, i]
        bar_prop_list.append(bar_prop)
        stack_prop = []
        for j in bar_prop:
            # duplicate each value so the area chart spans the bar's width
            stack_prop.append(j)
            stack_prop.append(j)
        stack_prop_list.append(stack_prop)
    # x axes for plotting
    x_bar = list(range(1, len(day_list) * 2, 2))  # x values of the stacked bars
    x_stack = []  # x values of the area chart
    for i in x_bar:
        x_stack.append(i - 0.5)
        x_stack.append(i + 0.5)
    # plotting
    fig = plt.figure(figsize=plt_size)
    ax1 = fig.add_subplot(1, 1, 1)
    # blank out the default x tick labels
    ax1.xaxis.set_major_formatter(plt.FuncFormatter(''.format))
    ax1.set_xticks(range(1, len(day_list) * 2, 2))
    # format the y ticks as percentages
    def to_percent(temp, position):
        return '%1.0f' % (100 * temp) + '%'
    plt.gca().yaxis.set_major_formatter(plt.FuncFormatter(to_percent))
    # custom x tick labels
    for a, b in zip(x_bar, day_list):
        ax1.text(a, -0.08, b, ha='center', va='bottom')
    # draw the area chart and the stacked bars
    # (enumerate over score_list so every score level is drawn)
    for i, s in enumerate(score_list):
        ax1.stackplot(x_stack, stack_prop_list[i], alpha=0.25)
        ax1.bar(x_bar, bar_prop_list[i], width=1, label='score: {}'.format(s))
    # dashed horizontal grid lines
    ax1.grid(True, 'major', 'y', ls='--', lw=.5, c='black', alpha=.3)
    ax1.legend(loc='best')
    plt.show()


# PSI of the score
def score_psi(df1, df2, id_col, score_col, x, y, step=None):
    """
    df1: scores of the modelling sample (user id and score)
    df2: scores of the live sample (user id and score)
    id_col: name of the user-id column
    score_col: name of the score column
    x: left edge of the score intervals
    y: right edge of the score intervals
    step: interval width
    return: PSI table of the score
    """
    # y + step so that y itself is included as the final right edge
    df1['score_bin'] = pd.cut(df1[score_col], bins=np.arange(x, y + step, step))
    model_score_group = df1.groupby('score_bin', as_index=False)[id_col].count().assign(
        pct=lambda t: t[id_col] / t[id_col].sum()).rename(columns={id_col: 'model_cnt', 'pct': 'model_pct'})
    df2['score_bin'] = pd.cut(df2[score_col], bins=np.arange(x, y + step, step))
    online_score_group = df2.groupby('score_bin', as_index=False)[id_col].count().assign(
        pct=lambda t: t[id_col] / t[id_col].sum()).rename(columns={id_col: 'online_cnt', 'pct': 'online_pct'})
    score_compare = pd.merge(model_score_group, online_score_group, on='score_bin', how='inner')
    score_compare['pct_diff'] = score_compare['online_pct'] - score_compare['model_pct']
    score_compare['pct_weight'] = np.log(score_compare['online_pct'] / score_compare['model_pct'])
    score_compare['Index'] = score_compare['pct_diff'] * score_compare['pct_weight']
    score_compare['PSI'] = score_compare['Index'].sum()
    return score_compare


# Score distribution comparison
def plot_score_compare(df, plt_size=None):
    """
    df: the PSI table produced by score_psi
    plt_size: figure size
    return: comparison plot of the two score distributions
    """
    fig = plt.figure(figsize=plt_size)
    x = df.score_bin
    y1 = df.model_pct
    y2 = df.online_pct
    width = 0.3
    plt.title('score distribution comparison')
    plt.xlabel('score interval')
    plt.ylabel('user share')
    plt.xticks(np.arange(len(x)) + 0.15, x)
    plt.bar(np.arange(len(y1)), y1, width=width, color='seagreen', label='modelling sample')
    plt.bar(np.arange(len(y2)) + width, y2, width=width, color='hotpink', label='live sample')
    plt.legend()
    return plt.show()


# Variable stability analysis
def var_stable(score_result, df, var, id_col, score_col, bins):
    """
    score_result: the scorecard's score detail table (bin, counts, shares, score)
    var: name of the variable to analyse
    df: the live sample's scores for the variable (user id, value, score)
    id_col: name of the user-id column in df
    score_col: name of the score column in df
    bins: interval edges of the variable
    return: stability table of the variable
    """
    model_var_group = score_result.loc[score_result.col == var, ['bin', 'total', 'totalrate', 'score']].reset_index(
        drop=True).rename(columns={'total': 'model_cnt', 'totalrate': 'model_pct'})
    df['bin'] = pd.cut(df[score_col], bins=bins)
    online_var_group = df.groupby('bin', as_index=False)[id_col].count().assign(
        pct=lambda t: t[id_col] / t[id_col].sum()).rename(columns={id_col: 'online_cnt', 'pct': 'online_pct'})
    var_stable_df = pd.merge(model_var_group, online_var_group, on='bin', how='inner')
    var_stable_df = var_stable_df.iloc[:, [0, 3, 1, 2, 4, 5]]
    var_stable_df['score'] = var_stable_df['score'].astype('int64')
    var_stable_df['model_weight'] = np.abs(var_stable_df['score'] * var_stable_df['model_pct'])
    var_stable_df['online_weight'] = np.abs(var_stable_df['score'] * var_stable_df['online_pct'])
    var_stable_df['weight_gap'] = var_stable_df['online_weight'] - var_stable_df['model_weight']
    return var_stable_df
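# Usage sketch for the monitoring helpers (hypothetical frames `dev_score` /
# `prod_score`, each holding a 'user_id' and a 'score' column; `score_table`
# and `prod_var_score` are hypothetical too). As a common rule of thumb, a PSI
# below 0.1 is read as stable and above 0.25 as a significant shift:
#   psi_table = score_psi(dev_score, prod_score, 'user_id', 'score', x=300, y=800, step=50)
#   plot_score_compare(psi_table, plt_size=(8, 5))
#   stable_table = var_stable(score_table, prod_var_score, 'annual_inc', 'user_id', 'score',
#                             bins=[float('-inf'), 2e4, 5e4, 1e5, float('inf')])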