Importing common libraries
# Import the required libraries
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import model_selection, metrics # additional sklearn functions
from sklearn.model_selection import GridSearchCV # performing grid search
from sklearn.svm import SVC
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, normalize # standardization helpers
from sklearn.metrics import precision_score, f1_score, recall_score, accuracy_score, average_precision_score # Evaluation
from collections import Counter
import lightgbm as lgb
# Oversampling
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN
pd.set_option("display.max_rows",1000)
pd.set_option("display.max_columns",20)
pd.set_option('precision',7)
pd.set_option('large_repr', 'truncate')
Data import
# Load the training and test data
train = pd.read_csv('cs-training.csv')
test = pd.read_csv('cs-test.csv')
# train.info()
# train.describe().T
# test.info()
# test.describe().T
Data description
# Data description
target = 'SeriousDlqin2yrs' # target column name
IDcol = 'CustomerID' # ID column name
predictors = [x for x in train.columns if x not in [target, IDcol]] # original predictor list
print('train shape ',train.shape)
print(train.dtypes.value_counts())
train.info()
train.describe().T
train.head()
train = train.apply(pd.to_numeric, errors='ignore') # auto-convert column dtypes where possible
train.info()
test.info()
test = test.apply(pd.to_numeric, errors='ignore') # auto-convert column dtypes where possible
test.info()
train[predictors].describe().T
train.hist(bins=60,figsize = (12,12))
test.hist(bins=60,figsize = (12,12))
Data preprocessing: adding a new variable
# Add a new variable
train_add = train.copy()
train_add['Debt'] = np.multiply(train_add['DebtRatio'], train_add['MonthlyIncome']) # monthly debt = debt ratio x monthly income
train_add.hist(bins=60,figsize = (12,12))
#---------------------------------------------------------
# Add the same new variable on the test set
test_add = test.copy()
test_add['Debt'] = np.multiply(test_add['DebtRatio'], test_add['MonthlyIncome']) # monthly debt = debt ratio x monthly income
test_add.hist(bins=60,figsize = (12,12))
Data preprocessing: binarization
# Numeric feature processing
# Binarization (only moderately effective; for columns spanning a wide range, log(x+1) smoothing is recommended instead; a sketch follows the test-set block below)
train_0 = train_add.copy()
train_0.info()
train_0['Is_30-59DaysPastDueNotWorse'] = np.where(train_0['NumberOfTime30-59DaysPastDueNotWorse'].isnull(),\
np.nan,(np.where(train_0['NumberOfTime30-59DaysPastDueNotWorse']>0,1,0)))
train_0['Is_60-89DaysPastDueNotWorse'] = np.where(train_0['NumberOfTime60-89DaysPastDueNotWorse'].isnull(),\
np.nan,(np.where(train_0['NumberOfTime60-89DaysPastDueNotWorse']>0,1,0)))
train_0['Is_90DaysLate'] = np.where(train_0['NumberOfTimes90DaysLate'].isnull(),\
np.nan,(np.where(train_0['NumberOfTimes90DaysLate']>0,1,0)))
train_0['Is_RealEstateLoansOrLines'] = np.where(train_0['NumberRealEstateLoansOrLines'].isnull(),\
np.nan,(np.where(train_0['NumberRealEstateLoansOrLines']>=2,1,0)))
train_0['Is_Dependents'] = np.where(train_0['NumberOfDependents'].isnull(),\
np.nan,(np.where(train_0['NumberOfDependents']>=2,1,0)))
train_0['Is_UnsecuredLines'] = np.where(train_0['RevolvingUtilizationOfUnsecuredLines'].isnull(),\
np.nan,(np.where(train_0['RevolvingUtilizationOfUnsecuredLines']>1,1,0)))
train_0.info()
# Numeric feature processing
# Binarization (same treatment on the test set)
test_0 = test_add.copy()
test_0.info()
test_0['Is_30-59DaysPastDueNotWorse'] = np.where(test_0['NumberOfTime30-59DaysPastDueNotWorse'].isnull(),\
np.nan,(np.where(test_0['NumberOfTime30-59DaysPastDueNotWorse']>0,1,0)))
test_0['Is_60-89DaysPastDueNotWorse'] = np.where(test_0['NumberOfTime60-89DaysPastDueNotWorse'].isnull(),\
np.nan,(np.where(test_0['NumberOfTime60-89DaysPastDueNotWorse']>0,1,0)))
test_0['Is_90DaysLate'] = np.where(test_0['NumberOfTimes90DaysLate'].isnull(),\
np.nan,(np.where(test_0['NumberOfTimes90DaysLate']>0,1,0)))
test_0['Is_RealEstateLoansOrLines'] = np.where(test_0['NumberRealEstateLoansOrLines'].isnull(),\
np.nan,(np.where(test_0['NumberRealEstateLoansOrLines']>=2,1,0)))
test_0['Is_Dependents'] = np.where(test_0['NumberOfDependents'].isnull(),\
np.nan,(np.where(test_0['NumberOfDependents']>=2,1,0)))
test_0['Is_UnsecuredLines'] = np.where(test_0['RevolvingUtilizationOfUnsecuredLines'].isnull(),\
np.nan,(np.where(test_0['RevolvingUtilizationOfUnsecuredLines']>1,1,0)))
test_0.info()
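As flagged in the comment above, log(x+1) smoothing is an alternative to binarization for wide-ranging columns. A minimal sketch, assuming the three columns below are the heavy-tailed ones worth smoothing (the log_ column names are illustrative, not part of the original pipeline):
# Hypothetical log(x+1) smoothing for heavy-tailed, non-negative columns
for col in ['MonthlyIncome', 'DebtRatio', 'RevolvingUtilizationOfUnsecuredLines']:
    train_0['log_' + col] = np.log1p(train_0[col])
    test_0['log_' + col] = np.log1p(test_0[col])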
Outlier handling
# Replace outliers with NaN
def cap_nan(x, quantile=[0.25, 0.75]):
    """Capping method: values outside the IQR fences become NaN.
    Args:
        x: pd.Series, a continuous variable
        quantile: lower/upper quantiles used to build the fences
    """
    # Compute the quantiles
    Q25, Q75 = x.quantile(quantile).values.tolist()
    low = Q25 - 1.5 * (Q75 - Q25)
    up = Q75 + 1.5 * (Q75 - Q25)
    # Replace values outside the fences with NaN
    if low > x.min():
        x = x.copy()
        x.loc[x < low] = np.nan
    if up < x.max():
        x = x.copy()
        x.loc[x > up] = np.nan
    return x
# Clip outliers to the fences instead
def cap_low_up(x, quantile=[0.25, 0.75]):
    """Capping method: values outside the IQR fences are clipped to the fences.
    Args:
        x: pd.Series, a continuous variable
        quantile: lower/upper quantiles used to build the fences
    """
    # Compute the quantiles
    Q25, Q75 = x.quantile(quantile).values.tolist()
    low = Q25 - 1.5 * (Q75 - Q25)
    up = Q75 + 1.5 * (Q75 - Q25)
    # Clip values outside the fences to the fence values
    if low > x.min():
        x = x.copy()
        x.loc[x < low] = low
    if up < x.max():
        x = x.copy()
        x.loc[x > up] = up
    return x
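A quick sanity check of the two capping functions on a toy Series (illustrative values only): the IQR fences here work out to [-1.5, 8.5], so 100 is flagged as an outlier.
s = pd.Series([1, 2, 3, 4, 5, 100])
print(cap_nan(s).tolist())     # 100 becomes NaN
print(cap_low_up(s).tolist())  # 100 is clipped to the upper fence (8.5)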
train_14 = train_0.copy()
train_14[predictors] = train_14[predictors].apply(cap_nan)
train_14.hist(bins=30, figsize=(12, 12))
# Apply the same capping to the test set
test_14 = test_0.copy()
test_14[predictors] = test_14[predictors].apply(cap_nan)
train_fp = train_14.copy().apply(pd.to_numeric, errors='ignore') # auto-convert column dtypes where possible
train_fp.info()
train_fp.describe().T
test_fp = test_14.copy().apply(pd.to_numeric, errors='ignore') # auto-convert column dtypes where possible
test_fp.info()
Filling outliers and missing values
# Fill the NaNs (capped outliers and original gaps) with a sentinel value
train_fp = train_fp.fillna(-9999)
train_fp.info()
train_fp.describe().T
#--------------------------------------------------------------------------------------------
test_fp = test_fp.fillna(-9999)
test_fp.info()
train_fp.to_csv('case1_nosmote.csv')
Data preparation before modeling
# Define the column lists
drop_list = [target,IDcol,'MonthlyIncome','Is_30-59DaysPastDueNotWorse', 'Is_60-89DaysPastDueNotWorse', 'Is_90DaysLate', 'Is_RealEstateLoansOrLines', 'Is_Dependents', 'Is_UnsecuredLines']
predictors2 = [x for x in train_fp.columns if x not in drop_list] # final predictor list
print(predictors2)
Feature selection
# Feature importance (fit a quick model to inspect it)
X_train = train_fp[predictors2]
y_train = train_fp[target]
# Feature Importance
from sklearn import ensemble
model = ensemble.ExtraTreesRegressor(n_estimators=200, max_depth=20, max_features=0.5, n_jobs=-1, random_state=0)
model.fit(X_train, y_train)
## plot the importances ##
feat_names = np.array(predictors2)
importances = model.feature_importances_
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
indices = np.argsort(importances)[::-1][:20]
plt.figure(figsize=(12,12))
plt.title("Feature importances")
plt.bar(range(len(indices)), importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(range(len(indices)), feat_names[indices], rotation='vertical')
plt.xlim([-1, len(indices)])
plt.show()
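If the plot suggests pruning, the importances computed above can be turned into a shorter predictor list. A sketch, with an arbitrary 0.01 threshold (predictors_top is not part of the original pipeline):
# Keep only features whose importance clears the threshold
keep_mask = importances > 0.01
predictors_top = feat_names[keep_mask].tolist()
print(len(predictors_top), predictors_top)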
Variable correlation analysis
# Variable correlation analysis
corr = train_fp[predictors2].corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr, annot=True, fmt=".2g")
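To read off the strongly correlated pairs behind the heatmap, one option is to stack the upper triangle of the matrix (the 0.6 cutoff is an arbitrary illustration):
# Upper triangle of the correlation matrix, stacked into (pair, value) rows
mask = np.triu(np.ones(corr.shape), k=1).astype(bool)
high_corr = corr.where(mask).stack()
print(high_corr[high_corr.abs() > 0.6])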
Class imbalance: oversampling
# Oversample with SMOTE
sm = SMOTE() # let's reduce the imbalance
X_train = train_fp[predictors2]
print(predictors2)
y_train = train_fp[target]
counter = Counter(y_train)
print(counter)
X_res, y_res = sm.fit_resample(X_train, y_train) # fit_sample was renamed fit_resample in newer imbalanced-learn
counter = Counter(y_res)
print(counter)
train = pd.concat([X_res,y_res],axis=1)
train.info()
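SMOTEENN was imported above but never used; it combines SMOTE oversampling with Edited Nearest Neighbours cleaning of the synthetic boundary. A drop-in sketch (slower than plain SMOTE; the random_state is an arbitrary choice):
sme = SMOTEENN(random_state=0)
X_res2, y_res2 = sme.fit_resample(X_train, y_train)
print(Counter(y_res2))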
XGBoost modeling
Determine the learning rate and the number of estimators for tree-based parameter tuning
# Step 1: fix the learning rate and find the number of estimators
# Change the target variable here if your data differs
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50, target='SeriousDlqin2yrs'):
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(xgb_param, xgtrain, num_boost_round=alg.get_params()['n_estimators'], nfold=cv_folds,
                          metrics='auc', early_stopping_rounds=early_stopping_rounds)
        # metrics can also be 'aucpr'
        alg.set_params(n_estimators=cvresult.shape[0])
    # Fit the algorithm on the data
    alg.fit(dtrain[predictors], dtrain[target], eval_metric='auc')
    # Predict on the training set
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # Print the model report
    print("\nModel Report")
    print("cv n_estimators: %d" % cvresult.shape[0])
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))
    print("f1 (Train): %f" % metrics.f1_score(dtrain[target], dtrain_predprob.round()))
    print("f1 (macro): %f" % metrics.f1_score(dtrain[target], dtrain_predprob.round(), average='macro'))
    print("precision (Train): %f" % metrics.precision_score(dtrain[target], dtrain_predprob.round()))
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
#Choose all predictors except target & IDcols
# predictors = [x for x in predictors2 if x not in [target,IDcol]]
xgb1 = XGBClassifier(
    learning_rate=0.1,   # 0.1
    n_estimators=81,     # 1000
    max_depth=5,
    min_child_weight=3,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,  # use when classes are imbalanced; speeds up convergence
    # reg_alpha=1,
    # reg_lambda=10,
    seed=27)
# Change the training data here if needed
modelfit(xgb1, train, predictors2, target=target)
Hyperparameter selection
# Tune max_depth and min_child_weight
# Adjust the scoring parameter and the target value as needed
param_test1 = {
'max_depth':range(3,10,2),
'min_child_weight':range(1,6,2)
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=81, max_depth=5, min_child_weight=1,
                                                gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test1, scoring='f1_macro', n_jobs=4, cv=5)
# scoring can be 'roc_auc', 'precision', 'recall', 'f1', 'f1_macro'
gsearch1.fit(train_fp[predictors2], train_fp[target])
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_
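The comment above labels this step 1 of the usual XGBoost tuning recipe; the next step follows the same pattern, plugging in the best max_depth and min_child_weight and sweeping gamma. A sketch (the gamma grid is a conventional choice, not from the original):
param_test2 = {'gamma': [i / 10.0 for i in range(0, 5)]}
gsearch2 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=81,
                                                max_depth=gsearch1.best_params_['max_depth'],
                                                min_child_weight=gsearch1.best_params_['min_child_weight'],
                                                gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test2, scoring='f1_macro', n_jobs=4, cv=5)
gsearch2.fit(train_fp[predictors2], train_fp[target])
print(gsearch2.best_params_, gsearch2.best_score_)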
Results on the test data
# Predictions
test_fp.info()
y_pred = xgb1.predict(test_fp[predictors2])
y_prob = xgb1.predict_proba(test_fp[predictors2])[:, 1]
ans = pd.read_csv('cs-test v2.csv')
ans.info()
y_true = ans[target]
# Compute the metrics
print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))
print("roc_auc_score : %.4g" % metrics.roc_auc_score(y_true, y_prob)) # AUC should use predicted probabilities, not hard labels
print("f1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("precision_score: %.4g" % metrics.precision_score(y_true, y_pred))