Python一些可能用的到的函数系列2(模型性能)

基于混淆矩阵计算的模型指标

混淆矩阵按照预测(0,1)和真实(0,1)形成了2*2的矩阵,基于此计算模型的性能指标。
有一点需要注意,预测的初始值一般是类似概率的连续值,那么在什么阈值之上预测为1呢?毕竟要作出分类预测才能计算模型“准不准”。
以sklearn的规范为例,
y = target
y_hat = predict_proba


# Combine true labels and predicted probabilities into one DataFrame
def make_ordered_pct_df(y, y_hat):
    """Build the (predict, real) DataFrame used by the metric helpers.

    Parameters
    ----------
    y : sequence of 0/1 true labels (sklearn ``target``).
    y_hat : sequence of predicted probabilities (``predict_proba``).

    Returns
    -------
    pd.DataFrame with columns 'predict' (float score) and 'real'
    ('Target' for label 1, 'NotTarget' otherwise); NaN rows removed.
    """
    tem_df = pd.DataFrame({'predict': y_hat, 'real': y})
    # Drop missing rows BEFORE mapping labels: the original mapped first,
    # so a NaN label satisfied `x != 1`, became 'NotTarget', and survived
    # dropna() — silently counting unknown labels as negatives.
    tem_df = tem_df.dropna()
    tem_df['real'] = tem_df['real'].apply(
        lambda x: 'Target' if x == 1 else 'NotTarget')
    return tem_df
    
# Only once a threshold is chosen do we get hard 0/1 predictions
def get_confusion_metrics_parts(ordered_pct_df, cut_proba):
    """Count TP, FP, TN, FN at probability threshold ``cut_proba``.

    A row is predicted positive when its score is >= ``cut_proba``;
    the 'real' column holds 'Target' / 'NotTarget' strings as produced
    by ``make_ordered_pct_df``.
    """
    flagged = ordered_pct_df.predict >= cut_proba
    is_target = ordered_pct_df.real == 'Target'
    tp = (flagged & is_target).sum()
    fp = (flagged & ~is_target).sum()
    tn = (~flagged & ~is_target).sum()
    fn = (~flagged & is_target).sum()
    return tp, fp, tn, fn

# Compute precision (prediction efficiency) and sensitivity (capture rate)
def cal_confusion_metrics(TP, FP, TN, FN):
    """Derive precision and sensitivity from confusion-matrix counts.

    ``TN`` is unused but kept so the signature mirrors the full
    confusion matrix returned by ``get_confusion_metrics_parts``.

    Returns
    -------
    (precision, sensitivity); each ratio is 0.0 when its denominator is
    zero (no positive predictions / no actual positives) instead of
    raising ZeroDivisionError as the original did.
    """
    predicted_positive = TP + FP  # everything flagged as Target
    actual_positive = TP + FN     # everything that truly is Target
    precision = TP / predicted_positive if predicted_positive else 0.0
    sensitivity = TP / actual_positive if actual_positive else 0.0
    return precision, sensitivity
    
# Convenience wrapper chaining the two functions above
def cal_precision_sensitiviy(ordered_pct_df, cut_proba):
    """Threshold the scores at ``cut_proba`` and return
    (precision, sensitivity) in one call."""
    parts = get_confusion_metrics_parts(ordered_pct_df, cut_proba)
    return cal_confusion_metrics(*parts)

# Compute the KS statistic
def ks(ordered_pct_df):
    """Return the Kolmogorov-Smirnov statistic of the score distributions.

    Groups rows by score, builds the empirical CDFs of the 'NotTarget'
    and 'Target' populations along increasing score, and returns the
    maximum of (CDF_NotTarget - CDF_Target).
    """
    counts = (ordered_pct_df
              .groupby(['predict', 'real'])
              .size()
              .unstack()
              .fillna(0))
    ecdf_neg = counts['NotTarget'].cumsum() / counts['NotTarget'].sum()
    ecdf_pos = counts['Target'].cumsum() / counts['Target'].sum()
    return (ecdf_neg - ecdf_pos).max()
    
# Suggest a probability threshold from the KS curve's maximum-gap point
def ks_suggest_proba(ordered_pct_df):
    """Return the score at which (CDF_NotTarget - CDF_Target) peaks.

    Same computation as ``ks`` but returns the index (the 'predict'
    value) of the maximum instead of the maximum itself.
    """
    counts = (ordered_pct_df
              .groupby(['predict', 'real'])
              .size()
              .unstack()
              .fillna(0))
    ecdf_neg = counts['NotTarget'].cumsum() / counts['NotTarget'].sum()
    ecdf_pos = counts['Target'].cumsum() / counts['Target'].sum()
    return (ecdf_neg - ecdf_pos).idxmax()
# For plotting the KS curves
def ks_df(ordered_pct_df):
    """Return a DataFrame (indexed by score) with the two empirical CDFs.

    Columns: 'ecd0' — cumulative share of 'NotTarget' rows,
             'ecd1' — cumulative share of 'Target' rows.
    """
    counts = (ordered_pct_df
              .groupby(['predict', 'real'])
              .size()
              .unstack()
              .fillna(0))
    counts['ecd0'] = counts['NotTarget'].cumsum() / counts['NotTarget'].sum()
    counts['ecd1'] = counts['Target'].cumsum() / counts['Target'].sum()
    return counts[['ecd0', 'ecd1']]

# Decile-style binning helpers (original author's note: these need rework)
# 1. Handle a continuous (numeric) column
def num_series_mapper(s, ruler):
    """Map each value of ``s`` to the index of its (left, right) interval.

    ``ruler`` is the (left_edges, right_edges) pair built by
    ``num_q_ruler``; a value v matches interval column j when
    left[j] <= v < right[j]. NaN entries are routed to the sentinel -999.
    Returns a Series aligned to ``s``'s index with integer bin labels.
    """
    left_ruler = np.array(ruler[0])
    right_ruler = np.array(ruler[1])
    # Missing values bypass the interval logic and become the -999 bin.
    s_null = s[pd.isnull(s)]
    s_null = s_null.fillna(-999)
    s_notnull = s[~pd.isnull(s)]
    # Column vector of values so the edge comparisons broadcast over all
    # interval columns at once.
    s1 = np.array(s_notnull.apply(float)).reshape(-1, 1)
    left_judge = (s1 - left_ruler) >= 0
    right_judge = (s1 - right_ruler) < 0
    result = (left_judge & right_judge)
    # argmax finds the first matching interval column; the -1 shift then
    # relabels bins. NOTE(review): num_q_ruler reorders its edges (lowest
    # interval first, highest second, interior ones after), so the labels
    # here are not monotone in value — and a value matching no interval
    # also gets argmax 0 → label -1. Confirm this labelling is intended.
    new_s = pd.Series(result.argmax(axis=1) - 1)
    new_s.index = s_notnull.index
    return pd.concat([s_null, new_s])


def num_q_ruler(s, quantile_array=[0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]):
    """Build (left_edges, right_edges) interval rulers from quantiles of ``s``.

    NaN values are dropped before computing quantiles. The edge lists are
    consumed positionally by ``num_series_mapper``. NOTE(review): the
    mutable list default is read-only here, so it is harmless, but a
    tuple default would be safer.
    """
    s1 = s[pd.notnull(s)]
    s2 = s1.apply(float)
    # Near-extreme floats serve as open-ended outer boundaries so every
    # value falls inside some interval.
    num_min = -1.7e300
    num_max = 1.7e300
    smin = s2.min()
    smax = float(s2.max() + abs(s2.mean()))  # intended as "max plus a tiny epsilon", but the offset is not tiny, so placement bias remains (original author's note)
    q_series = s2.quantile(quantile_array)
    # set() also deduplicates edges when quantiles coincide.
    full_set = list(set([num_max, num_min, smax, smin] + list(q_series)))
    full_set.sort()  # at least three values remain
    tem_left_ruler = full_set[:-1]
    tem_right_ruler = full_set[1:]

    # Reorder: lowest interval first, highest interval second, then the
    # interior intervals — matched by the argmax-1 labelling in
    # num_series_mapper.
    left_ruler = [tem_left_ruler[0], tem_left_ruler[-1]]+tem_left_ruler[1:-1]
    right_ruler = [tem_right_ruler[0],
                   tem_right_ruler[-1]]+tem_right_ruler[1:-1]

    return (left_ruler, right_ruler)


def ks_bin(ordered_pct_df, bins=10):
    """Compute the KS statistic on quantile-binned scores.

    Scores are bucketed into ``bins`` quantile bins via ``num_q_ruler`` /
    ``num_series_mapper``, then the KS statistic (max gap between the two
    cumulative distributions) is computed over the bins.

    Parameters
    ----------
    ordered_pct_df : pd.DataFrame with 'predict' and 'real' columns
        (as produced by ``make_ordered_pct_df``).
    bins : int, number of quantile bins (default 10 — deciles).
    """
    # Work on a copy: the original wrote a 'predict_map' column straight
    # into the caller's DataFrame as a hidden side effect.
    tem_df = ordered_pct_df.copy()
    quantiles = np.linspace(0, 1, bins + 1)
    # Interior quantiles only; 0 and 1 are covered by the ruler's own
    # outer boundaries.
    q_ruler = num_q_ruler(tem_df['predict'], quantiles[1:-1])
    tem_df['predict_map'] = num_series_mapper(tem_df['predict'], q_ruler)

    counts = (tem_df
              .groupby(['predict_map', 'real'])
              .size()
              .unstack()
              .fillna(0))
    ecd0 = counts['NotTarget'].cumsum() / counts['NotTarget'].sum()
    ecd1 = counts['Target'].cumsum() / counts['Target'].sum()
    return (ecd0 - ecd1).max()

你可能感兴趣的:(Tips,python)