第二次任务:对已完成预处理的数据变量,使用IV值和随机森林的特征重要性进行筛选。
目录:
1、导入数据
2、IV值计算
3、importance计算
4、特征筛选
1、导入数据
#导入需要的包
import numpy as np
import pandas as pd
import LR as lr
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
#### Load the pre-processed task data
data = pd.read_csv(r'/data/1/home/mabufa/data/task/data_task02.csv')

#### Separate the target column from the candidate features
label = data['status']
data_var = data.drop(columns=['status'])
2、IV值计算
#### Information value (IV) calculation
def calc_iv(df, feature, target, pr=False):
    """Compute the information value (IV) of one feature against a binary target.

    Every distinct value of ``df[feature]`` is treated as its own bin; missing
    values are collected in a bin labelled ``'NULL'``. Rows with
    ``df[target] == 0`` are counted as "good" and rows with
    ``df[target] == 1`` as "bad".

    Args:
        df: pandas.DataFrame containing both columns. It is not modified.
        feature: name of the independent-variable column.
        target: name of the good/bad column (0 = good, 1 = bad).
        pr: when True, print the per-bin table and the total IV.

    Returns:
        A tuple ``(iv, data)``. ``iv`` is the sum of the per-bin IV
        contributions. ``data`` is a DataFrame with one row per bin, sorted
        by ``Value``, with the columns Variable, Value, All, Good, Bad,
        Share, Bad Rate, Distribution Good, Distribution Bad, WoE and IV.
    """
    # fillna() returns a new Series, so the caller's frame is left untouched.
    flags = pd.DataFrame({
        'Value': df[feature].fillna('NULL'),
        'All': 1,
        'Good': (df[target] == 0).astype(int),
        'Bad': (df[target] == 1).astype(int),
    })

    # One pass over the data instead of re-scanning the frame per distinct value.
    data = (
        flags.groupby('Value', sort=False, observed=True)[['All', 'Good', 'Bad']]
        .sum()
        .reset_index()
    )
    data.insert(0, 'Variable', feature)

    total_all = data['All'].sum()
    total_bad = data['Bad'].sum()

    data['Share'] = data['All'] / total_all  # share of rows falling in the bin
    data['Bad Rate'] = data['Bad'] / data['All']
    # "Good" is taken as everything that is not bad (All - Bad), not the Good column.
    data['Distribution Good'] = (data['All'] - data['Bad']) / (total_all - total_bad)
    data['Distribution Bad'] = data['Bad'] / total_bad

    # Bins with no goods or no bads give log(0) or log(inf); silence the
    # warning and map those infinite WoE values to 0 so they add nothing.
    with np.errstate(divide='ignore'):
        woe = np.log(data['Distribution Good'] / data['Distribution Bad'])
    data['WoE'] = woe.replace([np.inf, -np.inf], 0)
    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True])
    data = data.reset_index(drop=True)

    iv = data['IV'].sum()
    if pr:
        print(data)
        print('IV = ', iv)

    # The table is returned once; the previous self-append doubled every row
    # and relied on DataFrame.append, which newer pandas no longer provides.
    return iv, data
## Compute the IV of every candidate feature
f_col = data_var.columns
IV_dict = {col: calc_iv(data, col, 'status')[0] for col in f_col}

# Rank the features by IV, highest first, to make later selection easier.
IV_dict_sorted = sorted(IV_dict.items(), key=lambda item: item[1], reverse=True)
IV_name = [name for name, _ in IV_dict_sorted]
IV_values = [value for _, value in IV_dict_sorted]

# Bar chart of the ranked IV values; bars are positioned by rank.
plt.figure(figsize=(20, 6))
plt.title('feature IV')
plt.bar(range(len(IV_values)), IV_values)
#### Random forest: tune n_estimators before reading the feature importances
# Coarse 5-fold search over the number of trees: 10, 60, ..., 960.
param = {'n_estimators': list(range(10, 1001, 50))}
g = GridSearchCV(
    estimator=RandomForestClassifier(random_state=2019),
    param_grid=param,
    cv=5,
)
g.fit(data_var, label)
g.best_estimator_

# Finer 5-fold search over 770, 780, ..., 860.
# NOTE(review): this range is hard-coded; presumably it was picked from the
# coarse search result above -- confirm if the data changes.
param = {'n_estimators': list(range(770, 870, 10))}
forest_grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=2019),
    param_grid=param,
    cv=5,
)
forest_grid.fit(data_var, label)
rnd_clf = forest_grid.best_estimator_
rnd_clf
## Feature importances of the tuned forest
importances = rnd_clf.feature_importances_
# argsort() is ascending; reversing it yields column positions from the most
# to the least important feature.
indices = np.argsort(importances)[::-1]
f_importance = {f_col[pos]: importances[pos] for pos in indices}

# Rank the features by importance, highest first, to make later selection easier.
im_dict_sorted = sorted(f_importance.items(), key=lambda item: item[1], reverse=True)
im_name = [name for name, _ in im_dict_sorted]
im_values = [value for _, value in im_dict_sorted]

# Bar chart of the ranked importances; bars are positioned by rank.
plt.figure(figsize=(20, 6))
plt.title('feature importances')
plt.bar(range(len(im_values)), im_values)
## Put the ranked IV values and importances side by side, one row per feature
df_iv = pd.DataFrame(IV_dict_sorted, columns=['vars', 'iv'])
df_im = pd.DataFrame(im_dict_sorted, columns=['vars', 'importances'])
df_iv_im = pd.merge(df_iv, df_im[['vars', 'importances']], on=['vars'], how='left')

## Feature selection: keep features with IV above 0.1 (treated as fairly strong
## predictors) and importance above the simple threshold of 0.015.
df_iv_im = df_iv_im[(df_iv_im['iv'] > 0.1) & (df_iv_im['importances'] > 0.015)]