Python Chi-Square Binning Automation Script

import sys
import numpy as np
import pandas as pd
import math
import os
from tqdm import tqdm

pd.set_option('display.float_format', lambda x: '%.3f' % x)

"""
    1.自定义缺失值处理函数
		1.1 缺失值计算

"""


def missing_cal(df):
    """
		计算特征数据缺失占比
        :param df: 数据集
        :param threshold:
        :return: 每个变量的缺失率
    """
    missing_series = df.isnull().sum() / df.shape[0]  # 此处需要修改
    missing_df = pd.DataFrame(missing_series).reset_index()
    missing_df = missing_df.rename(columns={
     'index': 'col',
                                            0: 'missing_pct'})
    missing_df = missing_df.sort_values('missing_pct', ascending=False).reset_index(drop=True)
    return missing_df
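

"""
    A minimal usage sketch for missing_cal (the toy DataFrame below is hypothetical,
    just to show the output shape):

        demo = pd.DataFrame({'age': [25, None, 40, None], 'label': [0, 1, 0, 1]})
        print(missing_cal(demo))
        #      col  missing_pct
        # 0    age        0.500
        # 1  label        0.000
"""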


"""
	1.2 按特征(列)删除
		若字段数据缺失严重,可先检查字段特性,是业务层面设计需求,或者是数据抓取异常
		如无上述问题,建议删除缺失值占比大于设定阈值的字段
		常见阈值为90%以上或者40%~50%以上,根据特征是否对应明确的业务含义而决定是否保留
"""


def missing_delete_var(df, threshold=None):
    """
    :param df: 数据集
    :param threshold: 确实率删除的阈值
    :return: 删除后的数据集
    """
    df2 = df.copy()
    missing_df = missing_cal(df)
    missing_col_num = missing_df[missing_df.missing_pct >= threshold].shape[0]
    missing_col = list(missing_df[missing_df.missing_pct >= threshold].col)
    df2 = df2.drop(missing_col, axis=1)
    return df2


def missing_delete_user(df, threshold=None):
    """
    Drop rows (samples) whose count of missing fields reaches the threshold.
    :param df: dataset
    :param threshold: minimum number of missing fields for a row to be dropped
    :return: dataset with high-missing rows removed
    """
    df2 = df.copy()
    missing_series = df.isnull().sum(axis=1)
    # assumes a default integer RangeIndex, as used throughout this script
    missing_index_list = [i for i, j in enumerate(missing_series) if j >= threshold]
    df2 = df2[~(df2.index.isin(missing_index_list))]
    return df2
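

"""
    A sketch of the two deletion helpers on the hypothetical `demo` frame above:

        demo2 = missing_delete_var(demo, threshold=0.4)   # drops 'age' (50% missing >= 40%)
        demo3 = missing_delete_user(demo, threshold=1)    # drops every row with >= 1 missing field
"""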


"""
	2. 自定义常变量处理函数
		同值化较严重的字段,如无特殊业务含义,某一数据占比超过阈值时,建议删除
"""


def const_delete(df, col_list, threshold=None):
    """

    :param df: 数据集
    :param col_list: 变量list集合
    :param threshold: 同值化处理的阈值
    :return: 处理后的数据
    """
    df2 = df.copy()
    const_col = []
    for col in col_list:
        # share of the most frequent value among the column's non-null rows
        const_pct = df2[col].value_counts().iloc[0] / df2[df2[col].notnull()].shape[0]
        if const_pct >= threshold:
            const_col.append(col)
    df2 = df2.drop(const_col, axis=1)
    return df2
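

"""
    const_delete keeps only columns whose dominant value stays below the threshold.
    Hypothetical sketch:

        demo = pd.DataFrame({'flag': [1, 1, 1, 1, 0], 'x': [1, 2, 3, 4, 5]})
        out = const_delete(demo, ['flag', 'x'], threshold=0.8)
        # 'flag' is dropped: its top value covers 4/5 = 80% of non-null rows; 'x' is kept
"""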


"""
    3. 自定义data_processing函数,执行完整数据预处理步骤:
        1、导入数据
        2、删除缺失值(自定义函数)
        3、删除常变量(自定义函数)
          1)常变量(自定义函数)
          2)方差为0
    4、缺失值填充
          1)分类型特征填充(自定义函数)
          2)连续型特征填充(自定义函数)
"""


def data_processing(df, target):
    """
    :param df: 包含了label(target)和特征的宽表
    :param target: label(target)
    :return: 清洗后的数据集
    """
    # drop columns with too many missing values
    df = missing_delete_var(df, threshold=0.8)
    # drop rows with too many missing fields
    df = missing_delete_user(df, threshold=int(df.shape[1] * 0.8))
    col_list = [x for x in df.columns if x != target]
    # drop near-constant columns
    df = const_delete(df, col_list, threshold=0.9)
    desc = df.describe().T
    # drop zero-variance features
    std_0_col = list(desc[desc['std'] == 0].index)
    if len(std_0_col) > 0:
        df = df.drop(std_0_col, axis=1)
    df.reset_index(drop=True, inplace=True)

    # compute missing rates, then fill the remaining gaps
    miss_df = missing_cal(df)
    cate_col = list(df.select_dtypes(include=['O']).columns)
    num_col = [x for x in list(df.select_dtypes(include=['int64', 'float64']).columns) if x != target]

    # categorical features: heavy missing (>5%) -> dedicated 'unknown' level, light missing -> mode
    cate_miss_col1 = [x for x in list(miss_df[miss_df.missing_pct > 0.05]['col']) if x in cate_col]
    cate_miss_col2 = [x for x in list(miss_df[miss_df.missing_pct <= 0.05]['col']) if x in cate_col]
    # numerical features: heavy missing (>5%) -> sentinel -999, light missing -> median
    num_miss_col1 = [x for x in list(miss_df[miss_df.missing_pct > 0.05]['col']) if x in num_col]
    num_miss_col2 = [x for x in list(miss_df[miss_df.missing_pct <= 0.05]['col']) if x in num_col]
    for col in cate_miss_col1:
        df[col] = df[col].fillna('未知')  # '未知' means 'unknown'
    for col in cate_miss_col2:
        df[col] = df[col].fillna(df[col].mode()[0])
    for col in num_miss_col1:
        df[col] = df[col].fillna(-999)
    for col in num_miss_col2:
        df[col] = df[col].fillna(df[col].median())

    return df, miss_df
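

"""
    End-to-end preprocessing sketch (assumes a wide table `raw_df` with a 'label' column):

        # clean_df has high-missing columns/rows, near-constant and zero-variance
        # features removed, and all remaining gaps filled; miss_df holds the missing rates
        clean_df, miss_df = data_processing(raw_df, target='label')
"""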


###############################################################################################################
"""
	三、特征分箱
		分箱逻辑:
			1、类别型特征
			  1)类别数在5个以下,可以直接根据类别来分箱 (binning_cate)
			  2)类别数在5个以上,建议做降基处理,再根据降基后的类别做分箱
			2、数值型特征
			  1)离散型数值特征(特征value的变动幅度较小):
			    若特征value的非重复计数在5个以下,可以直接根据非重复计数值来分箱(binning_cate)
			    若特征value的非重复计数在5个以上,建议根据业务解释或者数据分布做自定义分箱(binning_self)
			  2)连续型数值特征(特征value的变动幅度较大):
			    可以用卡方分箱或自定义分箱。(binning_num,binning_self)
			    PS:一些特征用卡方分可能会报错,建议这些特征改为手动自定义分箱
			3、特征有缺失
			  1)缺失率在5%以下,可以先对缺失做填充处理再分箱(binning_num)
			  2)缺失率在5%以上,建议将缺失当作一个类别来分箱(binning_sparse_col)
			4、稀疏特征分箱
			  建议将稀疏值(一般为0)单独分为一箱,剩下的值做卡方或者自定义分箱(binning_sparse_col)
"""

"""
	1.自定义指标评估函数
		KS、precision、 tpr、 fpr
"""


def cal_ks(df, col, target):
    """

    :param df: 数据集
    :param col:输入特征
    :param target:好坏标记的字段名
    :return: KS值, precision准确率, tpr召回率, fpr打扰率
    """

    bad = df[target].sum()
    good = df[target].count() - bad
    value_list = list(df[col])
    label_list = list(df[target])
    value_count = df[col].nunique()

    items = sorted(zip(value_list, label_list), key=lambda x: x[0])

    value_bin = []
    ks_list = []
    if value_count <= 200:
        for i in sorted(set(value_list)):
            value_bin.append(i)
            label_bin = [x[1] for x in items if x[0] < i]
            badrate = sum(label_bin) / bad
            goodrate = (len(label_bin) - sum(label_bin)) / good
            ks = abs(goodrate - badrate)
            ks_list.append(ks)
    else:
        for i in range(1, 201):
            step = (max(value_list) - min(value_list)) / 200
            idx = min(value_list) + i * step
            value_bin.append(idx)
            label_bin = [x[1] for x in items if x[0] < idx]
            badrate = sum(label_bin) / bad
            goodrate = (len(label_bin) - sum(label_bin)) / good
            ks = abs(goodrate - badrate)
            ks_list.append(ks)
    ks = round(max(ks_list), 3)

    # cutoff value where KS is maximal; the metrics below are computed at that cutoff
    ks_value = [value_bin[i] for i, j in enumerate(ks_list) if j == max(ks_list)][0]
    precision = df[(df[col] <= ks_value) & (df[target] == 1)].shape[0] / df[df[col] <= ks_value].shape[0]
    tpr = df[(df[col] <= ks_value) & (df[target] == 1)].shape[0] / bad
    fpr = df[(df[col] <= ks_value) & (df[target] == 0)].shape[0] / good

    return ks, precision, tpr, fpr
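

"""
    KS is the maximum gap between the cumulative good-rate and cumulative bad-rate
    curves as the cutoff sweeps over the feature's values. Hypothetical sketch:

        demo = pd.DataFrame({'score': [1, 2, 3, 4, 5, 6],
                             'label': [1, 1, 1, 0, 0, 0]})
        ks, precision, tpr, fpr = cal_ks(demo, 'score', 'label')
        # ks == 1.0 at cutoff value 4; because the metrics use a <= cutoff, one good
        # sample sits on the boundary: precision == 0.75, tpr == 1.0, fpr ~= 0.333
"""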


"""
	2.自定义卡方分箱函数
		2.1 变量分割点
"""


def split_data(df, col, split_num):
    """
	先用卡方分箱输出变量的分割点
    :param df: 原始数据
    :param col: 需要分箱的变量
    :param split_num: 分割点的数量
    :return:
    """
    df2 = df.copy()
    count = df2.shape[0]  # 总样本数
    n = math.floor(count / split_num)  # 按照分割点数目等分后每组的样本数
    split_index = [i * n for i in range(1, split_num)]  # 分割点的索引
    values = sorted(list(df2[col]))  # 对变量的值从小到大进行排序
    split_value = [values[i] for i in split_index]  # 分割点对应的value
    split_value = sorted(list(set(split_value)))  # 分割点的value去重排序
    return split_value


def assign_group(x, split_bin):
    n = len(split_bin)
    if x <= min(split_bin):
        return min(split_bin)  # x at or below the smallest split point maps to that point
    elif x > max(split_bin):
        return 10e10  # x above the largest split point maps to a large sentinel value
    else:
        for i in range(n - 1):
            if split_bin[i] < x <= split_bin[i + 1]:  # x between two split points maps to the larger one
                return split_bin[i + 1]
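

"""
    split_data / assign_group sketch: 10 values split into 5 equal-frequency groups,
    then a raw value mapped onto its split point (hypothetical data):

        demo = pd.DataFrame({'x': list(range(10))})   # values 0..9
        points = split_data(demo, 'x', 5)             # -> [2, 4, 6, 8]
        assign_group(3, points)                       # -> 4 (maps up to the larger point)
        assign_group(9, points)                       # -> 10e10 (above the largest point)
"""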


"""
	2.2 计算违约率
"""


def bin_bad_rate(df, col, target, grantRateIndicator=0):
    """

    :param df: 原始数据
    :param col: 原始变量/变量映射后的字段
    :param target:目标变量的字段
    :param grantRateIndicator:是否输出整体违约率
    :return:
    """
    total = df.groupby([col])[target].count()
    bad = df.groupby([col])[target].sum()
    total_df = pd.DataFrame({
     'total': total})
    bad_df = pd.DataFrame({
     'bad': bad})
    regroup = pd.merge(total_df, bad_df, left_index=True, right_index=True, how='left')
    regroup = regroup.reset_index()
    regroup['bad_rate'] = regroup['bad'] / regroup['total']  # 计算根据col分组后每组的违约率
    dict_bad = dict(zip(regroup[col], regroup['bad_rate']))  # 转为字典形式
    if grantRateIndicator == 0:
        return (dict_bad, regroup)
    total_all = df.shape[0]
    bad_all = df[target].sum()
    all_bad_rate = bad_all / total_all  # 计算总体的违约率
    return (dict_bad, regroup, all_bad_rate)
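

"""
    bin_bad_rate sketch on a hypothetical two-group frame:

        demo = pd.DataFrame({'grp': ['a', 'a', 'b', 'b'], 'y': [0, 1, 1, 1]})
        dict_bad, regroup, all_rate = bin_bad_rate(demo, 'grp', 'y', grantRateIndicator=1)
        # dict_bad == {'a': 0.5, 'b': 1.0}; all_rate == 0.75
"""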


"""
	2.3 计算卡方值
"""


def cal_chi2(df, all_bad_rate):
    df2 = df.copy()
    df2['expected'] = df2['total'] * all_bad_rate  # expected number of bad users per group
    combined = zip(df2['expected'], df2['bad'])  # pair each group's expected and observed bad counts
    chi = [(i[0] - i[1]) ** 2 / i[0] for i in combined]  # each group's chi-square contribution
    chi2 = sum(chi)  # total chi-square value
    return chi2


def assign_bin(x, cutoffpoints):
    bin_num = len(cutoffpoints) + 1  # number of bins
    if x <= cutoffpoints[0]:  # x at or below the smallest cutoff maps to Bin 0
        return 'Bin 0'
    elif x > cutoffpoints[-1]:  # x above the largest cutoff maps to Bin (bin_num-1)
        return 'Bin {}'.format(bin_num - 1)
    else:
        for i in range(0, bin_num - 1):
            if cutoffpoints[i] < x <= cutoffpoints[i + 1]:  # x between two cutoffs maps to Bin (i+1)
                return 'Bin {}'.format(i + 1)
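

"""
    Worked example for cal_chi2: with an overall default rate of 0.5, a group of
    10 samples expects 5 bad; observing 8 bad contributes (5-8)^2/5 = 1.8.

        demo = pd.DataFrame({'total': [10, 10], 'bad': [8, 2]})
        cal_chi2(demo, all_bad_rate=0.5)   # (5-8)^2/5 + (5-2)^2/5 = 3.6

    assign_bin then turns cutoff points into labelled bins:

        assign_bin(3, [2, 5])   # -> 'Bin 1' (2 < 3 <= 5)
        assign_bin(9, [2, 5])   # -> 'Bin 2' (above the largest cutoff)
"""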


"""
	2.4 卡方分箱(干货)
"""


def ChiMerge(df, col, target, max_bin=5, min_binpct=0):
    col_unique = sorted(list(set(df[col])))  # sorted unique values of the variable
    n = len(col_unique)  # number of unique values
    df2 = df.copy()
    if n > 100:  # with more than 100 unique values, map x onto at most 100 split values
        split_col = split_data(df2, col, 100)  # caps the effective number of unique values at 100
        df2['col_map'] = df2[col].map(lambda x: assign_group(x, split_col))
    else:
        df2['col_map'] = df2[col]  # 100 or fewer unique values: no mapping needed
    # build the (dict_bad, regroup, all_bad_rate) tuple
    (dict_bad, regroup, all_bad_rate) = bin_bad_rate(df2, 'col_map', target, grantRateIndicator=1)
    col_map_unique = sorted(list(set(df2['col_map'])))  # deduplicate and sort the mapped values
    group_interval = [[i] for i in col_map_unique]  # wrap each mapped value in its own single-element interval

    while (len(group_interval) > max_bin):  # keep merging while there are more intervals than max_bin
        chi_list = []
        for i in range(len(group_interval) - 1):
            temp_group = group_interval[i] + group_interval[i + 1]  # candidate merged interval, e.g. [1, 3]
            chi_df = regroup[regroup['col_map'].isin(temp_group)]
            chi_value = cal_chi2(chi_df, all_bad_rate)  # chi-square value of each pair of adjacent intervals
            chi_list.append(chi_value)
        best_combined = chi_list.index(min(chi_list))  # index of the smallest chi-square value
        # merge the pair of intervals with the smallest chi-square value
        group_interval[best_combined] = group_interval[best_combined] + group_interval[best_combined + 1]
        # drop the right-hand interval by position (list.remove would match by value)
        del group_interval[best_combined + 1]
    # sort the values inside each merged interval
    group_interval = [sorted(i) for i in group_interval]
    # each interval's maximum becomes a cutoff point
    cutoffpoints = [max(i) for i in group_interval[:-1]]

    # check whether any bin contains only good or only bad samples
    df2['col_map_bin'] = df2['col_map'].apply(lambda x: assign_bin(x, cutoffpoints))  # map col_map to its bin
    # default rate of each bin
    (dict_bad, regroup) = bin_bad_rate(df2, 'col_map_bin', target)
    # smallest and largest default rates
    [min_bad_rate, max_bad_rate] = [min(dict_bad.values()), max(dict_bad.values())]
    # a minimum of 0 means a bin holds only good samples; a maximum of 1 means only bad samples
    while min_bad_rate == 0 or max_bad_rate == 1:
        bad01_index = regroup[regroup['bad_rate'].isin([0, 1])].col_map_bin.tolist()  # bins whose rate is 0 or 1
        bad01_bin = bad01_index[0]
        if bad01_bin == max(regroup.col_map_bin):
            cutoffpoints = cutoffpoints[:-1]  # offending bin is the last one: drop the largest cutoff
        elif bad01_bin == min(regroup.col_map_bin):
            cutoffpoints = cutoffpoints[1:]  # offending bin is the first one: drop the smallest cutoff
        else:
            bad01_bin_index = list(regroup.col_map_bin).index(bad01_bin)  # position of the offending bin
            prev_bin = list(regroup.col_map_bin)[bad01_bin_index - 1]  # bin before the offending one
            df3 = df2[df2.col_map_bin.isin([prev_bin, bad01_bin])]
            (dict_bad, regroup1) = bin_bad_rate(df3, 'col_map_bin', target)
            chi1 = cal_chi2(regroup1, all_bad_rate)  # chi-square of the offending bin with its predecessor
            later_bin = list(regroup.col_map_bin)[bad01_bin_index + 1]  # bin after the offending one
            df4 = df2[df2.col_map_bin.isin([later_bin, bad01_bin])]
            (dict_bad, regroup2) = bin_bad_rate(df4, 'col_map_bin', target)
            chi2 = cal_chi2(regroup2, all_bad_rate)  # chi-square of the offending bin with its successor
            if chi1 < chi2:  # merge with the predecessor: drop the cutoff before the offending bin
                del cutoffpoints[bad01_bin_index - 1]
            else:  # merge with the successor: drop the cutoff after the offending bin
                del cutoffpoints[bad01_bin_index]
        # re-map col_map to bins and recompute the min/max default rates;
        # the loop stops once no bin has a default rate of 0 or 1
        df2['col_map_bin'] = df2['col_map'].apply(lambda x: assign_bin(x, cutoffpoints))
        (dict_bad, regroup) = bin_bad_rate(df2, 'col_map_bin', target)
        [min_bad_rate, max_bad_rate] = [min(dict_bad.values()), max(dict_bad.values())]

    # enforce a minimum bin occupancy
    if min_binpct > 0:
        group_values = df2['col_map'].apply(lambda x: assign_bin(x, cutoffpoints))
        df2['col_map_bin'] = group_values  # map col_map to its bin
        group_df = group_values.value_counts().to_frame()
        group_df.columns = ['bin_total']  # normalize the count column name across pandas versions
        group_df['bin_pct'] = group_df['bin_total'] / df2.shape[0]  # each bin's share of all samples
        min_pct = group_df.bin_pct.min()  # smallest bin share
        while min_pct < min_binpct and len(cutoffpoints) > 2:  # merge while a bin is too small and >2 cutoffs remain
            # same merging logic as the good/bad-only check above
            min_pct_index = group_df[group_df.bin_pct == min_pct].index.tolist()
            min_pct_bin = min_pct_index[0]
            if min_pct_bin == max(group_df.index):
                cutoffpoints = cutoffpoints[:-1]
            elif min_pct_bin == min(group_df.index):
                cutoffpoints = cutoffpoints[1:]
            else:
                minpct_bin_index = list(group_df.index).index(min_pct_bin)
                prev_pct_bin = list(group_df.index)[minpct_bin_index - 1]
                df5 = df2[df2['col_map_bin'].isin([min_pct_bin, prev_pct_bin])]
                (dict_bad, regroup3) = bin_bad_rate(df5, 'col_map_bin', target)
                chi3 = cal_chi2(regroup3, all_bad_rate)
                later_pct_bin = list(group_df.index)[minpct_bin_index + 1]
                df6 = df2[df2['col_map_bin'].isin([min_pct_bin, later_pct_bin])]
                (dict_bad, regroup4) = bin_bad_rate(df6, 'col_map_bin', target)
                chi4 = cal_chi2(regroup4, all_bad_rate)
                if chi3 < chi4:
                    del cutoffpoints[minpct_bin_index - 1]
                else:
                    del cutoffpoints[minpct_bin_index]
            # re-bin and recompute occupancies so the loop condition reflects the merge
            group_values = df2['col_map'].apply(lambda x: assign_bin(x, cutoffpoints))
            df2['col_map_bin'] = group_values
            group_df = group_values.value_counts().to_frame()
            group_df.columns = ['bin_total']
            group_df['bin_pct'] = group_df['bin_total'] / df2.shape[0]
            min_pct = group_df.bin_pct.min()
    return cutoffpoints
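

"""
    ChiMerge usage sketch (assumes a numeric feature 'x' and a 0/1 'label' column):

        cutoffs = ChiMerge(df, 'x', 'label', max_bin=5, min_binpct=0.05)
        # cutoffs is a sorted list of interior split points; pd.cut needs the outer
        # borders added, as binning_num does below with -inf and +inf
"""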


"""
	3. 自定义变量分箱函数
		3.1 类别型特征
"""


def binning_cate(df, col, target):
    """

    :param df: 数据集
    :param col: 输入特征
    :param target: 好坏标记的字段名
    :return: bin_df 特征的评估结果
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad
    d1 = df.groupby([col], as_index=True)
    d2 = pd.DataFrame()
    # Chinese column names are kept as output labels: 样本数=count, 黑样本数=bad count,
    # 白样本数=good count, 逾期用户占比=bad rate (used here and in the binning functions below)
    d2['样本数'] = d1[target].count()
    d2['黑样本数'] = d1[target].sum()
    d2['白样本数'] = d2['样本数'] - d2['黑样本数']
    d2['逾期用户占比'] = d2['黑样本数'] / d2['样本数']
    d2['badattr'] = d2['黑样本数'] / bad
    d2['goodattr'] = d2['白样本数'] / good
    d2['WOE'] = np.log(d2['badattr'] / d2['goodattr'])
    d2['bin_iv'] = (d2['badattr'] - d2['goodattr']) * d2['WOE']
    d2['IV'] = d2['bin_iv'].sum()

    bin_df = d2.reset_index()
    bin_df.drop(['badattr', 'goodattr', 'bin_iv'], axis=1, inplace=True)
    bin_df.rename(columns={col: '分箱结果'}, inplace=True)  # '分箱结果' = bin result
    bin_df['特征名'] = col  # '特征名' = feature name
    bin_df = pd.concat([bin_df['特征名'], bin_df.iloc[:, :-1]], axis=1)  # move the feature-name column to the front
    return bin_df
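

"""
    binning_cate sketch (hypothetical data): one row per category with count,
    bad/good counts, bad rate, WOE and the feature's total IV.

        demo = pd.DataFrame({'city': ['bj', 'bj', 'bj', 'sh', 'sh', 'sh'],
                             'label': [0, 0, 1, 0, 1, 1]})
        bin_df = binning_cate(demo, 'city', 'label')
        # WOE: bj ~= -0.693, sh ~= 0.693; IV ~= 0.462
"""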


"""
	3.2 数值型特征
		3.2.1 离散型数值特征
"""


def binning_self(df, col, target, cut=None, right_border=True):
    """

    :param df: 数据集
    :param col: 输入的特征
    :param target: 好坏标记的字段名
    :param cut: 总定义划分区间的list
    :param right_border: 设置左开右闭,左闭右开
    :return: bin_df 特征的评估结果
    """

    total = df[target].count()
    bad = df[target].sum()
    good = total - bad
    bucket = pd.cut(df[col], cut, right=right_border)
    d1 = df.groupby(bucket)
    d2 = pd.DataFrame()
    d2['样本数'] = d1[target].count()
    d2['黑样本数'] = d1[target].sum()
    d2['白样本数'] = d2['样本数'] - d2['黑样本数']
    d2['逾期用户占比'] = d2['黑样本数'] / d2['样本数']
    d2['badattr'] = d2['黑样本数'] / bad
    d2['goodattr'] = d2['白样本数'] / good
    d2['WOE'] = np.log(d2['badattr'] / d2['goodattr'])
    d2['bin_iv'] = (d2['badattr'] - d2['goodattr']) * d2['WOE']
    d2['IV'] = d2['bin_iv'].sum()

    bin_df = d2.reset_index()
    bin_df.drop(['badattr', 'goodattr', 'bin_iv'], axis=1, inplace=True)
    bin_df.rename(columns={col: '分箱结果'}, inplace=True)
    bin_df['特征名'] = col
    bin_df = pd.concat([bin_df['特征名'], bin_df.iloc[:, :-1]], axis=1)

    ks, precision, tpr, fpr = cal_ks(df, col, target)
    bin_df['准确率'] = precision
    bin_df['召回率'] = tpr
    bin_df['打扰率'] = fpr
    bin_df['KS'] = ks

    return bin_df
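

"""
    binning_self sketch with explicit borders (hypothetical feature 'amt'):

        cut = [float('-inf'), 100, 500, float('inf')]
        bin_df = binning_self(df, 'amt', 'label', cut=cut, right_border=True)
        # three left-open, right-closed intervals: (-inf, 100], (100, 500], (500, inf]
"""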


"""
	3.2.2 连续型数值特征
"""


def binning_num(df, target, col, max_bin=None, min_binpct=None):
    """

    :param df: 数据集
    :param target: 好坏标记的字段名
    :param col: 输入的特征
    :param max_bin: 最大分箱个数
    :param min_binpct: 区间内样本所占总体的最小比
    :return:
    """
    total = df[target].count()
    bad = df[target].sum()
    good = total - bad
    inf = float('inf')
    ninf = float('-inf')

    cut = ChiMerge(df, col, target, max_bin=max_bin, min_binpct=min_binpct)
    cut.insert(0, ninf)
    cut.append(inf)
    bucket = pd.cut(df[col], cut)
    d1 = df.groupby(bucket)
    d2 = pd.DataFrame()
    d2['样本数'] = d1[target].count()
    d2['黑样本数'] = d1[target].sum()
    d2['白样本数'] = d2['样本数'] - d2['黑样本数']
    d2['逾期用户占比'] = d2['黑样本数'] / d2['样本数']
    d2['badattr'] = d2['黑样本数'] / bad
    d2['goodattr'] = d2['白样本数'] / good
    d2['WOE'] = np.log(d2['badattr'] / d2['goodattr'])
    d2['bin_iv'] = (d2['badattr'] - d2['goodattr']) * d2['WOE']
    d2['IV'] = d2['bin_iv'].sum()

    bin_df = d2.reset_index()
    bin_df.drop(['badattr', 'goodattr', 'bin_iv'], axis=1, inplace=True)
    bin_df.rename(columns={col: '分箱结果'}, inplace=True)
    bin_df['特征名'] = col
    bin_df = pd.concat([bin_df['特征名'], bin_df.iloc[:, :-1]], axis=1)

    ks, precision, tpr, fpr = cal_ks(df, col, target)
    bin_df['准确率'] = precision
    bin_df['召回率'] = tpr
    bin_df['打扰率'] = fpr
    bin_df['KS'] = ks

    return bin_df


"""
	3.3 特征分箱
"""


def binning_sparse_col(df, target, col, max_bin=None, min_binpct=None, sparse_value=None):
    """

    :param df: 数据集
    :param target: 好坏标记的字段名
    :param col: 输入的特征
    :param max_bin: 最大分箱个数
    :param min_binpct: 区间内样本所占总体的最小比
    :param sparse_value: 单独分为一箱的values值
    :return: 特征的评估结果
    """

    total = df[target].count()
    bad = df[target].sum()
    good = total - bad

    # put the sparse value (e.g. 0, or the missing sentinel) in its own bin
    temp1 = df[df[col] == sparse_value]
    temp2 = df[~(df[col] == sparse_value)]

    bucket_sparse = pd.cut(temp1[col], [float('-inf'), sparse_value])
    group1 = temp1.groupby(bucket_sparse)
    bin_df1 = pd.DataFrame()
    bin_df1['样本数'] = group1[target].count()
    bin_df1['黑样本数'] = group1[target].sum()
    bin_df1['白样本数'] = bin_df1['样本数'] - bin_df1['黑样本数']
    bin_df1['逾期用户占比'] = bin_df1['黑样本数'] / bin_df1['样本数']
    bin_df1['badattr'] = bin_df1['黑样本数'] / bad
    bin_df1['goodattr'] = bin_df1['白样本数'] / good
    bin_df1['WOE'] = np.log(bin_df1['badattr'] / bin_df1['goodattr'])
    bin_df1['bin_iv'] = (bin_df1['badattr'] - bin_df1['goodattr']) * bin_df1['WOE']

    bin_df1 = bin_df1.reset_index()

    # chi-square binning on the remaining values
    cut = ChiMerge(temp2, col, target, max_bin=max_bin, min_binpct=min_binpct)
    cut.insert(0, sparse_value)
    cut.append(float('inf'))

    bucket = pd.cut(temp2[col], cut)
    group2 = temp2.groupby(bucket)
    bin_df2 = pd.DataFrame()
    bin_df2['样本数'] = group2[target].count()
    bin_df2['黑样本数'] = group2[target].sum()
    bin_df2['白样本数'] = bin_df2['样本数'] - bin_df2['黑样本数']
    bin_df2['逾期用户占比'] = bin_df2['黑样本数'] / bin_df2['样本数']
    bin_df2['badattr'] = bin_df2['黑样本数'] / bad
    bin_df2['goodattr'] = bin_df2['白样本数'] / good
    bin_df2['WOE'] = np.log(bin_df2['badattr'] / bin_df2['goodattr'])
    bin_df2['bin_iv'] = (bin_df2['badattr'] - bin_df2['goodattr']) * bin_df2['WOE']

    bin_df2 = bin_df2.reset_index()

    # merge the two binning results
    bin_df = pd.concat([bin_df1, bin_df2], axis=0)
    bin_df['IV'] = bin_df['bin_iv'].sum().round(3)

    bin_df.drop(['badattr', 'goodattr', 'bin_iv'], axis=1, inplace=True)
    bin_df.rename(columns={col: '分箱结果'}, inplace=True)
    bin_df['特征名'] = col
    bin_df = pd.concat([bin_df['特征名'], bin_df.iloc[:, :-1]], axis=1)

    ks, precision, tpr, fpr = cal_ks(df, col, target)
    bin_df['准确率'] = precision
    bin_df['召回率'] = tpr
    bin_df['打扰率'] = fpr
    bin_df['KS'] = ks

    return bin_df
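

"""
    binning_sparse_col sketch: the -999 missing sentinel written by data_processing
    gets its own bin and the rest is chi-square binned (this matches the call used
    later in get_feature_result):

        bin_df = binning_sparse_col(df, 'label', 'x', max_bin=4,
                                    min_binpct=0.05, sparse_value=-999)
"""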


"""
	四. 自定义get_feature_result函数,执行完整数据预处理步骤:
		1、数据预处理,调用data_processing函数
		2、变量分箱
		  1)类别型变量分箱
		  2)数值型变量分箱
		  2)卡方分箱报错的变量分箱
		3、得到分箱结果feature_result及其评估指标
		   order_col = ['特征名', '分箱结果', '样本数', '黑样本数', '白样本数', '逾期用户占比', 'WOE', 'IV', '准确率', '召回率', '打扰率', 'KS']
"""


def get_feature_result(df, target):
    """

    :param df: 含有特征和标签的宽表
    :param target: 好坏标签字段名
    :return: 每个特征的评估结果
    """
    if target not in df.columns:

        print('请将特征文件关联样本好坏标签(字段名label)后再重新运行!')

    else:

        print('数据清洗开始')
        df, miss_df = data_processing(df, target)
        print('数据清洗完成')

        cate_col = list(df.select_dtypes(include=['O']).columns)
        num_col = [x for x in list(df.select_dtypes(include=['int64', 'float64']).columns) if x != 'label']

        # bin the categorical variables

        bin_cate_list = []
        for col in cate_col:
            bin_cate = binning_cate(df, col, target)
            bin_cate['rank'] = list(range(1, bin_cate.shape[0] + 1, 1))
            bin_cate_list.append(bin_cate)

        # bin the numerical variables; columns with a missing rate above 5% carry the -999 sentinel
        num_col1 = [x for x in list(miss_df[miss_df.missing_pct > 0.05]['col']) if x in num_col]
        num_col2 = [x for x in list(miss_df[miss_df.missing_pct <= 0.05]['col']) if x in num_col]

        print('Feature binning started')
        bin_num_list1 = []
        err_col1 = []
        for col in tqdm(num_col1):
            try:
                bin_df1 = binning_sparse_col(df, target, col, min_binpct=0.05, max_bin=4, sparse_value=-999)
                bin_df1['rank'] = list(range(1, bin_df1.shape[0] + 1, 1))
                bin_num_list1.append(bin_df1)
            except (IndexError, ZeroDivisionError):
                err_col1.append(col)

        bin_num_list2 = []
        err_col2 = []
        for col in tqdm(num_col2):
            try:
                bin_df2 = binning_num(df, target, col, min_binpct=0.05, max_bin=5)
                bin_df2['rank'] = list(range(1, bin_df2.shape[0] + 1, 1))
                bin_num_list2.append(bin_df2)
            except (IndexError, ZeroDivisionError):
                err_col2.append(col)

        # fall back to quartile-based custom bins for features where chi-square binning failed
        err_col = err_col1 + err_col2
        bin_num_list3 = []
        if len(err_col) > 0:
            for col in tqdm(err_col):
                ninf = float('-inf')
                inf = float('inf')
                q_25 = df[col].quantile(0.25)
                q_50 = df[col].quantile(0.5)
                q_75 = df[col].quantile(0.75)

                cut = list(sorted(set([ninf, q_25, q_50, q_75, inf])))

                bin_df3 = binning_self(df, col, target, cut=cut, right_border=True)
                bin_df3['rank'] = list(range(1, bin_df3.shape[0] + 1, 1))
                bin_num_list3.append(bin_df3)
        print('Feature binning finished')

        bin_all_list = bin_num_list1 + bin_num_list2 + bin_num_list3 + bin_cate_list

        feature_result = pd.concat(bin_all_list, axis=0)
        feature_result = feature_result.sort_values(['IV', 'rank'], ascending=[False, True])
        feature_result = feature_result.drop(['rank'], axis=1)
        # columns: feature name, bin, count, bad count, good count, bad rate, WOE, IV,
        # precision, recall, disturbance rate, KS
        order_col = ['特征名', '分箱结果', '样本数', '黑样本数', '白样本数', '逾期用户占比', 'WOE', 'IV', '准确率', '召回率', '打扰率', 'KS']
        feature_result = feature_result[order_col]
        return feature_result


"""
	五. 导入数据,运行函数,实现自动化特征评估功能
"""


def main():
    # if len(sys.argv) == 1:
    #     print('Please supply the feature data file:')
    #     sys.exit()
    # feature_file = sys.argv[1]
    # file_path = os.getcwd()
    # if 'xlsx' in feature_file or 'xls' in feature_file:
    #     df = pd.read_excel(file_path + '/' + feature_file, encoding='gbk')
    # else:
    #     df = pd.read_csv(file_path + '/' + feature_file, encoding='gbk')

    df = pd.read_csv('gm_model.csv')

    # drop identifier columns that are not model features
    # df_feature = df.drop(['name', 'idcard', 'mobile', 'input_timestamp'], axis=1)
    df_feature = df.drop(['id_card_no', 'card_name', 'loan_date'], axis=1)
    result_bin = get_feature_result(df_feature, 'label')
    result_bin.to_csv('estimate_result.csv', sep=',', encoding='gbk', index=False)


if __name__ == '__main__':
    main()
