混淆矩阵按照预测(0,1)和真实(0,1)形成了2*2的矩阵,基于此计算模型的性能指标。
有一点需要注意,预测的初始值一般是类似概率的连续值,那么在什么阈值之下预测为1呢?毕竟要作出分类预测才能计算模型“准不准”。
以sklearn的规范为例,
# Illustrative names (sklearn convention): `target` holds the true 0/1
# labels, `predict_proba` the model's positive-class probabilities.
# NOTE(review): `target` / `predict_proba` are not defined in this chunk.
y = target
y_hat = predict_proba
# Combine true labels and predicted probabilities into one DataFrame.
def make_ordered_pct_df(y, y_hat):
    """Pair each prediction score with a readable class label.

    Parameters
    ----------
    y : array-like of 0/1 true labels.
    y_hat : array-like of predicted positive-class probabilities.

    Returns
    -------
    pd.DataFrame with columns 'predict' (score) and 'real'
    ('Target' when y == 1, 'NotTarget' otherwise); rows with a
    missing score or label are dropped.
    """
    tem_df = pd.DataFrame()
    tem_df['predict'] = y_hat
    tem_df['real'] = y
    # Drop missing values BEFORE labelling: the original labelled first,
    # so a NaN true label compared unequal to 1, became 'NotTarget', and
    # silently survived the final dropna().
    tem_df = tem_df.dropna()
    tem_df['real'] = tem_df['real'].apply(
        lambda x: 'Target' if x == 1 else 'NotTarget')
    return tem_df
# The confusion-matrix cells only exist once a probability cutoff is chosen.
def get_confusion_metrics_parts(ordered_pct_df, cut_proba):
    """Count TP, FP, TN, FN at threshold `cut_proba`.

    A row is predicted positive when predict >= cut_proba; labels are
    the strings 'Target' / 'NotTarget' produced by make_ordered_pct_df.
    Returns the four cell counts as (TP, FP, TN, FN).
    """
    pred_pos = ordered_pct_df.predict >= cut_proba
    pred_neg = ordered_pct_df.predict < cut_proba
    pos_label = ordered_pct_df.real == 'Target'
    neg_label = ordered_pct_df.real == 'NotTarget'
    return ((pred_pos & pos_label).sum(),
            (pred_pos & neg_label).sum(),
            (pred_neg & neg_label).sum(),
            (pred_neg & pos_label).sum())
# Precision ("efficiency") and sensitivity ("coverage") from the four cells.
def cal_confusion_metrics(TP, FP, TN, FN):
    """Return (precision, sensitivity).

    precision   = TP / (TP + FP): how efficient the positive calls are.
    sensitivity = TP / (TP + FN): how much of the target class is caught.
    Each ratio is defined as 0.0 when its denominator is 0 (no positive
    predictions / no positive cases) instead of raising ZeroDivisionError
    or yielding NaN with numpy integers.
    """
    precision = TP / (TP + FP) if (TP + FP) else 0.0
    sensitivity = TP / (TP + FN) if (TP + FN) else 0.0
    return precision, sensitivity
# Convenience wrapper: threshold -> (precision, sensitivity) in one call.
# (Note: the name keeps the original's spelling so existing callers work.)
def cal_precision_sensitiviy(ordered_pct_df, cut_proba):
    """Compute (precision, sensitivity) of `ordered_pct_df` at `cut_proba`."""
    parts = get_confusion_metrics_parts(ordered_pct_df, cut_proba)
    return cal_confusion_metrics(*parts)
# --- KS statistic -----------------------------------------------------------
# The three public functions below shared one verbatim-duplicated table
# computation; it now lives in a single private helper.
def _ecdf_table(ordered_pct_df):
    """Per-score class ECDFs plus their gap.

    Groups by unique 'predict' value (ascending index), accumulates the
    per-class counts, normalises them into empirical CDFs ('ecd0' for
    NotTarget, 'ecd1' for Target) and stores the difference in 'ks'.
    """
    counts = ordered_pct_df.groupby(['predict', 'real']).size().unstack()
    counts = counts.fillna(0)
    counts['acc0'] = np.cumsum(counts['NotTarget'])
    counts['acc1'] = np.cumsum(counts['Target'])
    counts['ecd0'] = counts.acc0 / sum(counts['NotTarget'])
    counts['ecd1'] = counts.acc1 / sum(counts['Target'])
    counts['ks'] = counts.ecd0 - counts.ecd1
    return counts


def ks(ordered_pct_df):
    """Return the KS statistic: the maximum gap between the two ECDFs."""
    return _ecdf_table(ordered_pct_df).ks.max()


def ks_suggest_proba(ordered_pct_df):
    """Return the 'predict' value where the ECDF gap peaks (suggested cutoff)."""
    return _ecdf_table(ordered_pct_df).ks.idxmax()


def ks_df(ordered_pct_df):
    """Return the two ECDF columns ('ecd0', 'ecd1'), e.g. for plotting."""
    return _ecdf_table(ordered_pct_df)[['ecd0', 'ecd1']]
# Decile-style binning helpers (author note: the functions below were
# flagged for rework).
# 1. Map a continuous (numeric) column onto bin indices.
def num_series_mapper(s, ruler):
    """Map each value of Series `s` to a bin index using `ruler`.

    `ruler` is the (left_ruler, right_ruler) pair produced by
    `num_q_ruler`; column i of the comparison matrix represents the
    half-open interval [left_ruler[i], right_ruler[i]).  Missing values
    are kept (encoded as -999); non-missing values become
    `argmax(match) - 1`.
    NOTE(review): given the ruler's column order (below-min catch-all
    first, top bin second, middle bins after), the -1 shift maps the
    catch-all column to -1 and the top bin to 0 — presumably deliberate
    sentinels; confirm against callers.
    """
    left_ruler = np.array(ruler[0])
    right_ruler = np.array(ruler[1])
    # Missing values: keep their index positions, encode them as -999.
    s_null = s[pd.isnull(s)]
    s_null = s_null.fillna(-999)
    s_notnull = s[~pd.isnull(s)]
    # Broadcast every value against every interval: (n, 1) vs (k,) -> (n, k).
    s1 = np.array(s_notnull.apply(float)).reshape(-1, 1)
    left_judge = (s1 - left_ruler) >= 0
    right_judge = (s1 - right_ruler) < 0
    result = (left_judge & right_judge)
    # Boolean argmax picks the first matching interval per row; the -1
    # re-bases the column index (see NOTE above).  A row with no match
    # would also give argmax 0 and thus -1 — TODO confirm that cannot
    # happen with rulers spanning +-1.7e300.
    new_s = pd.Series(result.argmax(axis=1) - 1)
    new_s.index = s_notnull.index
    # Reassemble missing and mapped parts (original row order not preserved).
    return pd.concat([s_null, new_s])
def num_q_ruler(s, quantile_array=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)):
    """Build the (left_ruler, right_ruler) interval lists for `num_series_mapper`.

    Parameters
    ----------
    s : pd.Series of numeric values; missing entries are ignored.
    quantile_array : quantile levels forming the interior bin edges.
        (Now a tuple: the original list default was a mutable default
        argument.)

    Returns
    -------
    (left_ruler, right_ruler): parallel edge lists ordered as
    [below-min catch-all, top bin, middle bins...], the layout that
    `num_series_mapper` expects.
    """
    s1 = s[pd.notnull(s)]
    s2 = s1.apply(float)
    # Sentinels that act as -inf/+inf while remaining finite floats.
    num_min = -1.7e300
    num_max = 1.7e300
    smin = s2.min()
    # Upper edge: the smallest float strictly above the max, so the max
    # itself stays inside the last data bin (right edges are exclusive).
    # The original added abs(mean) and its own comment flagged the
    # resulting rounding bias; nextafter realises the stated intent.
    smax = float(np.nextafter(s2.max(), np.inf))
    q_series = s2.quantile(list(quantile_array))
    # De-duplicate all candidate edges; at least three distinct values remain.
    full_set = list(set([num_max, num_min, smax, smin] + list(q_series)))
    full_set.sort()
    tem_left_ruler = full_set[:-1]
    tem_right_ruler = full_set[1:]
    # Reorder: catch-all (below smin) first, top bin second, middle bins after.
    left_ruler = [tem_left_ruler[0], tem_left_ruler[-1]] + tem_left_ruler[1:-1]
    right_ruler = [tem_right_ruler[0],
                   tem_right_ruler[-1]] + tem_right_ruler[1:-1]
    return (left_ruler, right_ruler)
def ks_bin(ordered_pct_df, bins=10):
    """KS statistic computed on `bins` equal-frequency score bins.

    Unlike `ks`, scores are first bucketed into quantile bins, so the
    empirical CDFs are evaluated on at most `bins` points.

    Parameters
    ----------
    ordered_pct_df : DataFrame with 'predict' and 'real' columns
        (as produced by make_ordered_pct_df).
    bins : number of equal-frequency bins (default 10 = deciles).

    Returns
    -------
    Maximum gap between the per-bin class ECDFs.
    """
    # Work on a copy: the original wrote the 'predict_map' column
    # straight into the caller's DataFrame (side effect on the input).
    tem_df = ordered_pct_df.copy()
    quantiles = np.linspace(0, 1, bins + 1)
    q_ruler = num_q_ruler(tem_df['predict'], quantiles[1:-1])
    tem_df['predict_map'] = num_series_mapper(tem_df['predict'], q_ruler)
    counts = tem_df.groupby(['predict_map', 'real']).size().unstack()
    counts = counts.fillna(0)
    counts['acc0'] = np.cumsum(counts['NotTarget'])
    counts['acc1'] = np.cumsum(counts['Target'])
    counts['ecd0'] = counts.acc0 / sum(counts['NotTarget'])
    counts['ecd1'] = counts.acc1 / sum(counts['Target'])
    counts['ks'] = counts.ecd0 - counts.ecd1
    return counts.ks.max()