Python 批量计算变量iv值

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
data=pd.read_excel(r'E:\lll\20200311人工客群分布\sx_all_data_0311.xlsx')#导入数据

variable=pd.read_excel(r'E:\lll\20200311人工客群分布\变量类型.xlsx')#变量列表
continue_variable=list(variable['连续变量'])#获取连续变量列表
'''
连续型变量iv计算
'''
def feature_woe_iv(x: pd.Series, y: pd.Series, nan: float = -999.) -> pd.DataFrame:
    '''
        计算变量各个分箱的WOE、IV值,返回一个DataFrame
    '''
    x = x.fillna(nan)
    boundary = optimal_binning_boundary(x, y, nan) # 获得最优分箱边界值列表
    df = pd.concat([x, y], axis=1) # 合并x、y为一个DataFrame,方便后续计算
    df.columns = ['x', 'y'] # 特征变量、目标变量字段的重命名
    df['bins'] = pd.cut(x=x, bins=boundary, right=False)  # 获得每个x值所在的分箱区间
    
    grouped =  df.groupby('bins')['y'] # 统计各分箱区间的好、坏、总客户数量
    result_df = grouped.agg([('good',  lambda y: (y == 0).sum()), 
                             ('bad',   lambda y: (y == 1).sum()),
                             ('total', 'count')])

    result_df['good_pct'] = result_df['good'] / result_df['good'].sum() #好客户占比
    result_df['bad_pct'] = result_df['bad'] / result_df['bad'].sum()# 坏客户占比
    result_df['total_pct'] = result_df['total'] / result_df['total'].sum()#总客户占
    result_df['IV'] =result_df['iv'].sum()
    result_df['变量'] =i
    result_df=result_df.reset_index() 
    return result_df
a=[]
for i in list(continue_variable['连续变量']): 
    a.append(feature_woe_iv(x=data[i],y=data['dpd30']))
'''
离散型变量iv计算
'''
de_variable=list(continue_variable['离散变量'].unique())
def de_variable(data):
	for i in de_variable: 
	    povit1=pd.pivot_table(data,index=[i],values=['contract_no','dpd30'],
	                         aggfunc={r'contract_no':'count',r'dpd30':np.sum})
	    povit1=povit1.rename(columns={'contract_no':'total','dpd30':'bad'})
	    
	    povit1['good'] = povit1['total'] - povit1['bad'].sum()# 好客户占比
	
	    povit1['good_pct'] = povit1['good'] / povit1['good'].sum() # 好客户占比
	    povit1['bad_pct'] = povit1['bad'] / povit1['bad'].sum() # 坏客户占比
	    povit1['total_pct'] = povit1['total'] / povit1['total'].sum()# 总客户占比
	
	    povit1['bad_rate'] = povit1['bad'] / povit1['total'] # 坏比率
	    povit1['woe'] = np.log(povit1['good_pct'] / povit1['bad_pct'])# WOE
	    povit1['iv'] = (povit1['good_pct'] - povit1['bad_pct']) * povit1['woe']  # IV 
	    povit1['IV'] =povit1['iv'].sum()
	    povit1['变量'] =i
	    povit1=povit1.reset_index() 
	    povit1=povit1.rename(columns={i:'bins'})
	    a.append(povit1)
iv_result=pd.concat(a,axis=0)#合并所有变量iv
iv_result.to_excel(r'E:\lusiru\20200311人工客群分布\iv_result.xlsx')

你可能感兴趣的:(python,学习)