CDA level 2 案例1实现

案例

相关代码:

常用库导入

#  导入相关库
import numpy as np
import pandas as pd
import seaborn as sns
sns.set_style("whitegrid") 
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4

import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import model_selection, metrics   #Additional     scklearn functions
from sklearn.model_selection import GridSearchCV   #Perforing grid search
from sklearn.svm import SVC
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, normalize  # Helps us in Standardizing the data
from sklearn.metrics import precision_score, f1_score, recall_score, accuracy_score, average_precision_score # Evaluation
from collections import Counter
import lightgbm as lgb

# 过采样
import imblearn
from imblearn.over_sampling import SMOTE
from sklearn.impute import SimpleImputer
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
from imblearn.combine import SMOTEENN

# Notebook display settings.
# Fully qualified option names are used on purpose: pandas resolves short
# names by pattern matching and raises OptionError when a short name such as
# 'precision' matches more than one registered option.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 20)
pd.set_option("display.precision", 7)
pd.set_option("display.large_repr", "truncate")

数据导入

# Data import: read the training and test sets from CSV files that are
# expected in the current working directory (training file first).
train, test = (
    pd.read_csv(csv_path) for csv_path in ('cs-training.csv', 'cs-test.csv')
)

数据描述

# Data description: name the key columns, then inspect shape, dtypes and
# distributions of both data sets.
# NOTE: bare expressions such as `train.describe().T` only render output when
# they are the last statement of a notebook cell.
target = 'SeriousDlqin2yrs' # name of the target (label) column
IDcol = 'CustomerID' # name of the ID column
predictors = [x for x in train.columns if x not in [target,IDcol]] # original predictors: every train column except target and ID
print('train shape  ',train.shape)
print(train.dtypes.value_counts())
train.info()
train.describe().T
train.head()  

train.info()
train = train.apply(pd.to_numeric, errors='ignore') # column-wise numeric conversion; a column that cannot be parsed is left unchanged
train.info()
test.info()
test = test.apply(pd.to_numeric, errors='ignore') # same conversion for the test set
test.info()
train[predictors].describe().T
# Histograms (60 bins) of every numeric column in the training set
train.hist(bins=60,figsize = (12,12))

# Histograms (60 bins) of every numeric column in the test set
test.hist(bins=60,figsize = (12,12))

数据预处理——增加新变量


# Add a derived feature: Debt = DebtRatio * MonthlyIncome.
# The previous version multiplied MonthlyIncome by itself, which produced
# squared income rather than a debt amount.
# NOTE(review): assumes DebtRatio is debt expressed relative to monthly
# income, so the product is an absolute amount -- confirm against the data
# dictionary of the data set.
train_add = train.copy()
train_add['Debt'] = np.multiply(train_add['DebtRatio'], train_add['MonthlyIncome'])
train_add.hist(bins=60, figsize=(12, 12))
#---------------------------------------------------------
# Same derived feature for the test set.
test_add = test.copy()
test_add['Debt'] = np.multiply(test_add['DebtRatio'], test_add['MonthlyIncome'])
test_add.hist(bins=60, figsize=(12, 12))

数据预处理——二值化

# Numeric feature processing: binarisation.
# (Author's note: the effect is modest; when a feature spans a wide range of
# magnitudes, smoothing with log(x + 1) is suggested instead.)
#
# The rules live in one table and are applied by one helper so that the
# training and test frames always receive exactly the same transformation.

# new indicator column -> (source column, rule returning a boolean mask)
_FLAG_RULES = {
    'Is_30-59DaysPastDueNotWorse': ('NumberOfTime30-59DaysPastDueNotWorse', lambda s: s > 0),
    'Is_60-89DaysPastDueNotWorse': ('NumberOfTime60-89DaysPastDueNotWorse', lambda s: s > 0),
    'Is_90DaysLate': ('NumberOfTimes90DaysLate', lambda s: s > 0),
    'Is_RealEstateLoansOrLines': ('NumberRealEstateLoansOrLines', lambda s: s >= 2),
    'Is_Dependents': ('NumberOfDependents', lambda s: s >= 2),
    'Is_UnsecuredLines': ('RevolvingUtilizationOfUnsecuredLines', lambda s: s > 1),
}


def _add_binary_flags(frame):
    """Return a copy of ``frame`` with the indicator columns of _FLAG_RULES.

    Each indicator is 1 where the rule holds, 0 where it does not, and NaN
    where the source value is missing. ``frame.info()`` is printed before and
    after the columns are added.
    """
    flagged = frame.copy()
    flagged.info()
    for flag_col, (source_col, rule) in _FLAG_RULES.items():
        source = flagged[source_col]
        # The outer np.where keeps missing source values missing instead of
        # letting them fall into the 0 bucket.
        flagged[flag_col] = np.where(source.isnull(), np.nan,
                                     np.where(rule(source), 1, 0))
    flagged.info()
    return flagged


train_0 = _add_binary_flags(train_add)
test_0 = _add_binary_flags(test_add)

异常值处理

# Outlier handling: replace outliers with NaN
def cap_nan(x, quantile=(0.25, 0.75)):
    """Replace IQR outliers of a numeric Series with NaN.

    The fences are ``Q_low - 1.5 * IQR`` and ``Q_high + 1.5 * IQR``, where
    ``Q_low`` / ``Q_high`` are the two requested quantiles and ``IQR`` is
    their difference. Values strictly outside the fences become NaN.

    Args:
        x: pd.Series holding a continuous (numeric) variable.
        quantile: pair of quantile levels (lower, upper) used for the fences.

    Returns:
        A new Series; the input is never modified. The result is upcast to a
        float dtype when NaN has to be written into an integer Series.
    """
    lower_q, upper_q = x.quantile(list(quantile)).tolist()
    spread = upper_q - lower_q
    low = lower_q - 1.5 * spread
    up = upper_q + 1.5 * spread
    # Comparisons against NaN are False, so values that are already missing
    # (and columns whose quantiles are NaN) pass through unchanged.
    outliers = (x < low) | (x > up)
    # mask() handles the dtype upcast itself, unlike in-place assignment of
    # NaN into an integer Series.
    return x.mask(outliers)

# Outlier handling: clamp outliers to the lower / upper fence
def cap_low_up(x, quantile=(0.25, 0.75)):
    """Cap IQR outliers of a numeric Series at the fences.

    The fences are ``Q_low - 1.5 * IQR`` and ``Q_high + 1.5 * IQR``, where
    ``Q_low`` / ``Q_high`` are the two requested quantiles and ``IQR`` is
    their difference. Values below the lower fence are set to it, values
    above the upper fence are set to it; missing values stay missing.

    Args:
        x: pd.Series holding a continuous (numeric) variable.
        quantile: pair of quantile levels (lower, upper) used for the fences.

    Returns:
        A new Series; the input is never modified.
    """
    lower_q, upper_q = x.quantile(list(quantile)).tolist()
    spread = upper_q - lower_q
    low = lower_q - 1.5 * spread
    up = upper_q + 1.5 * spread
    # Empty or all-NaN input yields NaN fences: nothing can be capped.
    if np.isnan(low) or np.isnan(up):
        return x.copy()
    return x.clip(lower=low, upper=up)
# Replace IQR outliers in the original predictor columns with NaN.
train_14 = train_0.copy()
train_14[predictors] = train_14[predictors].apply(cap_nan)

# Give the test set the same treatment. `test_14` was previously never
# defined, so building `test_fp` below failed with a NameError.
# NOTE(review): assumes the test frame contains every column in `predictors`;
# the fences are computed from the test distribution itself -- confirm that
# this, rather than reusing the training fences, is intended.
test_14 = test_0.copy()
test_14[predictors] = test_14[predictors].apply(cap_nan)

train_14.hist(bins=30, figsize=(12, 12))

# Column-wise numeric conversion; columns that cannot be parsed stay as-is.
train_fp = train_14.copy().apply(pd.to_numeric, errors='ignore')
train_fp.info()
train_fp.describe().T
test_fp = test_14.copy().apply(pd.to_numeric, errors='ignore')
test_fp.info()

缺失值填充(异常值已在上一步被置为 NaN,此处统一用 -9999 填充)

# Replace every remaining NaN in the training frame with the sentinel -9999,
# then print the frame summary.
train_fp = train_fp.fillna(-9999)
train_fp.info()
train_fp.describe().T
#--------------------------------------------------------------------------------------------
# Same sentinel fill for the test frame.
test_fp = test_fp.fillna(-9999)
test_fp.info()
# Save the pre-SMOTE training data; the DataFrame index is written as the
# first CSV column because `index=False` is not passed.
train_fp.to_csv('case1_nosmote.csv')

建模前数据准备

# Define the modelling columns.
# Excluded from the feature list: the target, the ID, the raw MonthlyIncome
# column and the binary indicator columns.
drop_list = [
    target,
    IDcol,
    'MonthlyIncome',
    'Is_30-59DaysPastDueNotWorse',
    'Is_60-89DaysPastDueNotWorse',
    'Is_90DaysLate',
    'Is_RealEstateLoansOrLines',
    'Is_Dependents',
    'Is_UnsecuredLines',
]
# Remaining columns of train_fp, in their original order.
predictors2 = list(filter(lambda column: column not in drop_list, train_fp.columns))
print(predictors2)

特征选择

# Feature importance, estimated by fitting an extra-trees model on the
# selected predictors.
from sklearn import ensemble

X_train, y_train = train_fp[predictors2], train_fp[target]
model = ensemble.ExtraTreesRegressor(
    n_estimators=200,
    max_depth=20,
    max_features=0.5,
    n_jobs=-1,
    random_state=0,
)
model.fit(X_train, y_train)

# Plot the importances: mean importance per feature with the standard
# deviation across the individual trees as error bars, top 20 features only.
feat_names = np.array(predictors2)
importances = model.feature_importances_
std = np.std(
    [estimator.feature_importances_ for estimator in model.estimators_],
    axis=0,
)
indices = np.argsort(importances)[::-1][:20]

bar_positions = range(len(indices))
plt.figure(figsize=(12, 12))
plt.title("Feature importances")
plt.bar(bar_positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(bar_positions, feat_names[indices], rotation='vertical')
plt.xlim([-1, len(indices)])
plt.show()

变量相关性分析

# Correlation analysis of the selected predictors
import seaborn as sns
# Pairwise correlation matrix of the predictor columns
corr = train_fp[predictors2].corr()
plt.figure(figsize=(10,8))
# Heatmap with each coefficient printed in its cell (2 significant digits)
sns.heatmap(corr, annot=True, fmt=".2g")  

数据不平衡——过采样

# Oversampling: reduce the class imbalance of the training data with SMOTE.
sm = SMOTE()
X_train = train_fp[predictors2]

print(predictors2)
y_train = train_fp[target]

# Class counts before resampling
counter = Counter(y_train)
print(counter)

# `fit_sample` was removed from imbalanced-learn; `fit_resample` is the
# supported name for the same operation.
X_res, y_res = sm.fit_resample(X_train, y_train)

# Class counts after resampling
counter = Counter(y_res)
print(counter)

# NOTE: `train` is rebound here to the resampled predictors plus target; the
# raw training frame is no longer reachable under this name afterwards.
train = pd.concat([X_res, y_res], axis=1)
train.info()

XGBOOST建模
确定学习速率和tree_based 参数调优的估计器数目

# 第一步:确定学习速率和tree_based 参数调优的估计器数目
# 需要修改目标变量
def modelfit(alg, dtrain, predictors, useTrainCV=True, cv_folds=5, early_stopping_rounds=50, target='SeriousDlqin2yrs'):
    """Fit ``alg`` on ``dtrain`` and print training-set metrics.

    When ``useTrainCV`` is true, ``xgb.cv`` is run first (AUC metric, early
    stopping) and ``alg``'s ``n_estimators`` is overwritten with the number
    of boosting rounds that cross-validation kept.

    Args:
        alg: XGBoost scikit-learn classifier. It is modified in place
            (``n_estimators``, ``eval_metric``) and fitted.
        dtrain: DataFrame containing the predictor columns and the target.
        predictors: list of feature column names.
        useTrainCV: whether to choose ``n_estimators`` by cross-validation.
        cv_folds: number of folds passed to ``xgb.cv``.
        early_stopping_rounds: early-stopping patience passed to ``xgb.cv``.
        target: name of the label column in ``dtrain``.

    Returns:
        None. A report is printed and a feature-importance bar chart is drawn.
        All metrics are computed on the training data itself.
    """
    # Stays None when cross-validation is skipped, so the report below does
    # not reference an undefined CV result.
    cv_rounds = None
    if useTrainCV:
        xgb_param = alg.get_xgb_params()
        xgtrain = xgb.DMatrix(dtrain[predictors].values, label=dtrain[target].values)
        cvresult = xgb.cv(
            xgb_param,
            xgtrain,
            num_boost_round=alg.get_params()['n_estimators'],
            nfold=cv_folds,
            metrics='auc',
            early_stopping_rounds=early_stopping_rounds,
        )
        # One row per boosting round kept after early stopping.
        cv_rounds = cvresult.shape[0]
        alg.set_params(n_estimators=cv_rounds)

    # Fit the algorithm on the data. The metric is configured on the
    # estimator because recent XGBoost releases no longer accept
    # `eval_metric` as a `fit` argument.
    alg.set_params(eval_metric='auc')
    alg.fit(dtrain[predictors], dtrain[target])

    # Predict on the training set: hard labels and positive-class probability.
    dtrain_predictions = alg.predict(dtrain[predictors])
    dtrain_predprob = alg.predict_proba(dtrain[predictors])[:, 1]
    # Probabilities rounded to 0/1, used for the F1 and precision figures.
    rounded_prob = dtrain_predprob.round()

    # Print model report
    print("\nModel Report")
    if cv_rounds is not None:
        print("cv n_estimators: %f" % cv_rounds)
    print("Accuracy : %.4g" % metrics.accuracy_score(dtrain[target].values, dtrain_predictions))
    print("AUC Score (Train): %f" % metrics.roc_auc_score(dtrain[target], dtrain_predprob))
    print("f1 (Train): %f" % metrics.f1_score(dtrain[target], rounded_prob))
    print("f1 (macro): %f" % metrics.f1_score(dtrain[target], rounded_prob, average='macro'))
    print("precision (Train): %f" % metrics.precision_score(dtrain[target], rounded_prob))

    # Bar chart of the booster's per-feature scores, highest first.
    feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
    feat_imp.plot(kind='bar', title='Feature Importances')
    plt.ylabel('Feature Importance Score')
# Baseline XGBoost model, trained on `predictors2` (which already excludes
# the target and ID columns).
xgb1 = XGBClassifier(
    learning_rate=0.1,
    n_estimators=81,  # upper bound on boosting rounds (1000 was also tried)
    max_depth=5,
    min_child_weight=3,
    gamma=0,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='binary:logistic',
    nthread=4,
    scale_pos_weight=1,  # raise when classes are imbalanced; speeds up convergence
    # reg_alpha=1 and reg_lambda=10 were candidate settings left disabled
    seed=27,
)
# Train on the current `train` frame. `target` must be passed by keyword:
# the fourth positional parameter of modelfit is `useTrainCV`, so passing it
# positionally handed the column name to the wrong parameter.
modelfit(xgb1, train, predictors2, target=target)

超参数选择

# Tune max_depth and min_child_weight.
# Change `scoring` to optimise a different objective; other candidates:
# 'roc_auc', 'precision', 'recall', 'f1', 'f1_macro'.
param_test1 = {
    'max_depth': range(3, 10, 2),
    'min_child_weight': range(1, 6, 2),
}
gsearch1 = GridSearchCV(
    estimator=XGBClassifier(
        learning_rate=0.1,
        n_estimators=81,
        max_depth=5,
        min_child_weight=1,
        gamma=0,
        subsample=0.8,
        colsample_bytree=0.8,
        objective='binary:logistic',
        nthread=4,
        scale_pos_weight=1,
        seed=27,
    ),
    param_grid=param_test1,
    scoring='f1_macro',
    n_jobs=4,
    # `iid` is no longer passed: the parameter was removed from scikit-learn,
    # and fold scores are averaged directly, which is what iid=False meant.
    cv=5,
)
# Search over the same feature set (`predictors2`) that the XGBoost model is
# trained and evaluated on, instead of the original `predictors` list.
gsearch1.fit(train_fp[predictors2], train_fp[target])
gsearch1.cv_results_, gsearch1.best_params_, gsearch1.best_score_

测试数据结果

# Predictions on the test set
test_fp.info()
test_features = test_fp[predictors2]
y_pred = xgb1.predict(test_features)
# Positive-class probability: ROC AUC ranks scores, so it must be computed
# from probabilities rather than from the thresholded 0/1 labels.
y_score = xgb1.predict_proba(test_features)[:, 1]
# Ground-truth labels for the test set.
# NOTE(review): assumes the rows of this file are in the same order as the
# rows of `test_fp` -- confirm.
ans = pd.read_csv('cs-test v2.csv')
ans.info()
y_true = ans[target]
# Evaluation metrics
print("Accuracy : %.4g" % metrics.accuracy_score(y_true, y_pred))
print("roc_auc_score : %.4g" % metrics.roc_auc_score(y_true, y_score))
print("f1_score : %.4g" % metrics.f1_score(y_true, y_pred))
print("precision_score: %.4g" % metrics.precision_score(y_true, y_pred))

你可能感兴趣的:(CDA数据分析)