python_等频分箱_等距分箱_特征无监督分桶

python_等频分箱_等距分箱

data_temp = data
# 分箱:等距等频分箱
# 等距分箱
#bins=10 分箱数
data_temp['deposit_cur_balance_bins'] = binning(df = data_temp,col = 'deposit_cur_balance',method='frequency',bins=10)
data_temp['deposit_year_total_balance_bins'] = binning(df = data_temp,col = 'deposit_year_total_balance',method='frequency',bins=10)


def binning(df,col,method,bins):
#     需要判断类型
    uniqs=df[col].nunique()
    if uniqs<=bins:
        raise KeyError('nunique is smaller than bins: '+col)
#         print('nunique is smaller than bins: '+col)
        return 
#     左开右闭
    def ff(x,fre_list):
        if x<=fre_list[0]:
            return 0
        elif x>fre_list[-1]:
            return len(fre_list)-1
        else :
            for i in range(len(fre_list)-1):
                if x>fre_list[i] and x<=fre_list[i+1]:
                    return i
# 等距分箱
    if method=='distance':
        umax=np.percentile(df[col],99.99)
        umin=np.percentile(df[col],0.01)
        step=(umax-umin)/bins
        fre_list=[umin+i*step for i in range(bins+1)]
        return df[col].map(lambda x:ff(x,fre_list))
        #等频分箱
    elif method=='frequency' :
        fre_list=[np.percentile(df[col],100/bins*i) for i in range(bins+1)]
        fre_list=sorted(list(set(fre_list)))
        return df[col].map(lambda x:ff(x,fre_list))


你可能感兴趣的:(Python)