This hands-on project is based on the Lending Club dataset (dataset: https://github.com/H-Freax/lendingclub_analyse).
The project runs in a Colab environment.
This data-analysis project is split into two parts. Part one introduced a LightGBM baseline and three ways of adding derived variables, identifying four groups of derived variables that improve performance. This second part covers analysis with machine learning and deep learning methods, and puts into practice both the ensembling of machine learning models and the fusion of deep learning networks with machine learning methods.
# df_train / df_test are the feature-engineered DataFrames from part one
import numpy as np
import pandas as pd

train_ML = df_train.copy()
test_ML = df_test.copy()
train_ML.fillna(0, inplace=True)   # fill missing values with 0
test_ML.fillna(0, inplace=True)
X_train = train_ML.drop(columns=['loan_status']).values
Y_train = train_ML['loan_status'].values.astype(int)
X_test = test_ML.drop(columns=['loan_status']).values
Y_test = test_ML['loan_status'].values.astype(int)
from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators = 100,random_state = 20)
rnd_clf.fit(X_train,Y_train)
rnd_clf.score(X_test,Y_test)
0.9164
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=20)  # random_state for reproducibility
sgd_clf.fit(X_train,Y_train)
sgd_clf.score(X_test,Y_test)
0.8639
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(random_state = 20)
lr_clf.fit(X_train,Y_train)
lr_clf.score(X_test,Y_test)
0.9111
from sklearn.ensemble import GradientBoostingClassifier
gdbt_clf = GradientBoostingClassifier(random_state = 20)
gdbt_clf.fit(X_train,Y_train)
gdbt_clf.score(X_test,Y_test)
0.91772
from sklearn.model_selection import cross_val_predict
y_train_pred=cross_val_predict(gdbt_clf,X_train,Y_train,cv=3)
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
conf_mx = confusion_matrix(Y_train, y_train_pred)
conf_mx
array([[ 8271,  1941],
       [ 2098, 37690]])
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()
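Rows of the matrix are the true classes and columns the predicted classes; assuming loan_status = 1 is the positive class, the model makes 1,941 false positives and 2,098 false negatives. Precision and recall can be read off directly:
from sklearn.metrics import precision_score, recall_score
precision_score(Y_train, y_train_pred)  # 37690 / (37690 + 1941) ≈ 0.951
recall_score(Y_train, y_train_pred)     # 37690 / (37690 + 2098) ≈ 0.947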
from sklearn.svm import SVC
svm_clf = SVC()
svm_clf.fit(X_train,Y_train)
svm_clf.score(X_test,Y_test)
0.80448
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier()
ada_clf.fit(X_train,Y_train)
ada_clf.score(X_test,Y_test)
0.91604
from lightgbm import LGBMClassifier
lgbm_clf = LGBMClassifier()
lgbm_clf.fit(X_train,Y_train)
lgbm_clf.score(X_test,Y_test)
0.91768
from xgboost import XGBClassifier  # XGBoost classifier
xgb_clf = XGBClassifier()
xgb_clf.fit(X_train,Y_train)
xgb_clf.score(X_test,Y_test)
0.91712
from sklearn.naive_bayes import GaussianNB
nby_clf = GaussianNB()
nby_clf.fit(X_train,Y_train)
nby_clf.score(X_test,Y_test)
0.90478
from sklearn.neighbors import KNeighborsClassifier
knc_clf = KNeighborsClassifier()
knc_clf.fit(X_train,Y_train)
knc_clf.score(X_test,Y_test)
0.84852
from sklearn.ensemble import VotingClassifier  # voting classifier
voting_clf = VotingClassifier(
    estimators=[('rf', rnd_clf), ('gdbt', gdbt_clf), ('ada', ada_clf),
                ('lgbm', lgbm_clf), ('xgb', xgb_clf)],  # estimators: the base classifiers
    voting='hard')  # voting: the voting scheme, either 'hard' or 'soft'
# Train each model and print its accuracy
from sklearn.metrics import accuracy_score
for clf in (lr_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, Y_train)
    y_pre = clf.predict(X_test)
    print(clf.__class__, accuracy_score(y_pre, Y_test))
Output:
<class 'sklearn.linear_model._logistic.LogisticRegression'> 0.91108
<class 'sklearn.ensemble._forest.RandomForestClassifier'> 0.9164
<class 'sklearn.svm._classes.SVC'> 0.80448
<class 'sklearn.ensemble._voting.VotingClassifier'> 0.91814
If every classifier can estimate class probabilities (i.e., in sklearn they all have a predict_proba() method), the class probabilities can be averaged, and the voting classifier predicts the class with the highest average probability. This is called soft voting. Only two changes are needed in the code: the SVM's probability parameter must be set to True so that it can predict class probabilities, and the voting classifier's voting parameter must be set to 'soft'.
# soft voting
svm_clf1 = SVC(probability=True)
voting_clf = VotingClassifier(estimators=[('lr', lr_clf), ('svc', svm_clf1), ('rf', rnd_clf)],
                              voting='soft')
for clf in (lr_clf, rnd_clf, svm_clf1, voting_clf):
    clf.fit(X_train, Y_train)
    y_pre = clf.predict(X_test)
    print(clf.__class__, accuracy_score(y_pre, Y_test))
Output:
<class 'sklearn.linear_model._logistic.LogisticRegression'> 0.91108
<class 'sklearn.ensemble._forest.RandomForestClassifier'> 0.9164
<class 'sklearn.svm._classes.SVC'> 0.80448
<class 'sklearn.ensemble._voting.VotingClassifier'> 0.91664
Soft voting usually improves on hard voting, but in this ensemble the accuracy actually dropped (0.91664 versus 0.91814 for hard voting). What voting='soft' computes can be checked by hand, as the sketch below shows.
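A minimal sketch of the soft-voting computation (this mirrors, rather than calls, VotingClassifier's internals): average the fitted base models' predicted class probabilities with equal weights and take the argmax. Since the SVC's probability calibration is not seeded, the result can differ slightly across runs.
import numpy as np
probas = [clf.predict_proba(X_test) for clf in (lr_clf, svm_clf1, rnd_clf)]
avg_proba = np.mean(probas, axis=0)         # equal-weight average of the class probabilities
manual_pred = avg_proba.argmax(axis=1)      # predict the class with the highest mean probability
print(accuracy_score(manual_pred, Y_test))  # ≈ the soft-voting score above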
Stacking is an ensemble learning technique that uses the predictions of multiple models (e.g., decision tree, kNN, or SVM) to build a new model, which then makes the predictions on the test set. A simple stacking ensemble, step by step:
1. Split the training set into 10 folds.
2. A base model (e.g., a decision tree) is trained on 9 of the folds and predicts the 10th; rotating the held-out fold yields out-of-fold predictions for the entire training set.
3. The base model is then fitted on the whole training set.
4. This model is used to make predictions on the test set.
5. Steps 2 to 4 are repeated for another base model (e.g., kNN), producing another set of predictions for the training set and the test set.
6. The out-of-fold predictions on the training set are used as features to build a new (meta) model.
7. The meta model makes the final predictions on the test set.
(In the implementation below, the per-fold models' test-set predictions are averaged instead of refitting on the full training set.)
from sklearn.model_selection import StratifiedKFold

def Stacking(model, train, y, test, n_fold):
    # shuffle=True is required when passing random_state to StratifiedKFold
    folds = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=1)
    # out-of-fold predictions, written back in the original row order of `train`
    train_pred = np.zeros((train.shape[0], 1), float)
    # one column of test-set predictions per fold (start with zero columns)
    test_pred = np.empty((test.shape[0], 0), float)
    for train_indices, val_indices in folds.split(train, y.values):
        x_tr, x_val = train.iloc[train_indices], train.iloc[val_indices]
        y_tr = y.iloc[train_indices]
        model.fit(X=x_tr, y=y_tr)
        train_pred[val_indices, 0] = model.predict(x_val)
        test_pred = np.column_stack((test_pred, model.predict(test)))
    test_pred_a = np.mean(test_pred, axis=1)  # average the fold predictions row-wise
    return test_pred_a.reshape(-1, 1), train_pred
Stack gdbt and lgbm as level 0:
x_train = train_ML.drop(columns=['loan_status'])
x_test = test_ML.drop(columns=['loan_status'])
y_train = train_ML['loan_status']
test_pred1, train_pred1 = Stacking(model=gdbt_clf, n_fold=10, train=x_train, test=x_test, y=y_train)
print(test_pred1.size)
train_pred1 = pd.DataFrame(train_pred1)
test_pred1 = pd.DataFrame(test_pred1)
test_pred2, train_pred2 = Stacking(model=lgbm_clf, n_fold=10, train=x_train, test=x_test, y=y_train)
print(test_pred2.size)
train_pred2 = pd.DataFrame(train_pred2)
test_pred2 = pd.DataFrame(test_pred2)
Use a random forest as level 1 (the meta model):
dff = pd.concat([train_pred1, train_pred2], axis=1)
dff_test = pd.concat([test_pred1, test_pred2], axis=1)
rnd_clf.fit(dff,y_train)
rnd_clf.score(dff_test, Y_test)
0.91798
Blending follows the same idea as stacking, but uses only a hold-out (validation) set carved out of the training set to make predictions. In other words, unlike stacking, the base models' predictions are made only on the hold-out set; the hold-out set and its predictions are then used to build the meta model, which is tested on the test set. The blending process in detail:
1. Split the original training set into a training set and a validation (hold-out) set.
2. Fit the base models on the training set.
3. Make predictions on the validation set and on the test set.
4. The validation set and its predictions are used as features to build a new model.
5. This model makes the final predictions on the test set using the test-set meta-features.
Same order as before: first gdbt and lgbm as base models, then a random forest as the meta model. (The code below simplifies step 1 by reusing the full training set as the hold-out set; a sketch with an explicit hold-out split follows the result.)
x_train = train_ML.drop(columns=['loan_status'])
x_test = test_ML.drop(columns=['loan_status'])
y_train = train_ML['loan_status']
gdbt_clf.fit(x_train, y_train)   # refit the base models on the full training set (step 2)
val_pred1 = pd.DataFrame(gdbt_clf.predict(x_train), columns=['gdbt_pred'])
test_pred1 = pd.DataFrame(gdbt_clf.predict(x_test), columns=['gdbt_pred'])
lgbm_clf.fit(x_train, y_train)
val_pred2 = pd.DataFrame(lgbm_clf.predict(x_train), columns=['lgbm_pred'])
test_pred2 = pd.DataFrame(lgbm_clf.predict(x_test), columns=['lgbm_pred'])
df2_val = pd.concat([x_train, val_pred1, val_pred2], axis=1)   # original features + base-model predictions
df2_test = pd.concat([x_test, test_pred1, test_pred2], axis=1)
rnd_clf.fit(df2_val, y_train)
rnd_clf.score(df2_test, Y_test)
0.91668
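For completeness, here is a hedged sketch of blending with an explicit hold-out split, matching step 1 above; the 80/20 split and the column names are illustrative assumptions, so the score will differ from the one just reported:
from sklearn.model_selection import train_test_split
x_tr, x_val, y_tr, y_val = train_test_split(x_train, y_train, test_size=0.2,
                                            stratify=y_train, random_state=20)
gdbt_clf.fit(x_tr, y_tr)   # the base models see only the 80% split
lgbm_clf.fit(x_tr, y_tr)
val_feats = pd.DataFrame({'gdbt_pred': gdbt_clf.predict(x_val),
                          'lgbm_pred': lgbm_clf.predict(x_val)}, index=x_val.index)
test_feats = pd.DataFrame({'gdbt_pred': gdbt_clf.predict(x_test),
                           'lgbm_pred': lgbm_clf.predict(x_test)}, index=x_test.index)
rnd_clf.fit(pd.concat([x_val, val_feats], axis=1), y_val)   # meta model trained on the hold-out set
rnd_clf.score(pd.concat([x_test, test_feats], axis=1), Y_test)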
Now for the deep learning methods. Prepare the data again and one-hot encode the labels for a Keras feed-forward network:
train_DL = df_train.copy()
test_DL = df_test.copy()
train_DL.fillna(0, inplace=True)
test_DL.fillna(0, inplace=True)
X_train = train_DL.drop(columns=['loan_status']).values
Y_train = train_DL['loan_status'].values.astype(int)
X_test = test_DL.drop(columns=['loan_status']).values
Y_test = test_DL['loan_status'].values.astype(int)
from tensorflow.keras.utils import to_categorical
Y_test=to_categorical(Y_test,2).astype(int)
Y_train=to_categorical(Y_train,2).astype(int)
import keras as K
from keras.layers.core import Dropout
init = K.initializers.glorot_uniform(seed=1)  # seeded Glorot initialization for reproducibility
model = K.models.Sequential()
model.add(K.layers.Dense(units=146, input_dim=145, kernel_initializer=init, activation='relu'))  # 145 input features
model.add(K.layers.Dense(units=147, kernel_initializer=init, activation='relu'))
model.add(K.layers.Dense(units=2, kernel_initializer=init, activation='softmax'))
model.compile(loss='categorical_crossentropy', metrics=['accuracy'])  # optimizer left at the Keras default (RMSprop)
b_size = 128
max_epochs = 100
print("Starting training ")
h = model.fit(X_train, Y_train, batch_size=b_size, epochs=max_epochs, shuffle=True, verbose=1)
print("Training finished \n")
eval = model.evaluate(X_test, Y_test, verbose=0)
print("Evaluation on test data: loss = %0.6f accuracy = %0.2f%% \n" \
% (eval[0], eval[1] * 100) )
Evaluation on test data: loss = 0.244760 accuracy = 90.52%
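The History object returned by fit() is stored in h but not used above; as a small aside (assuming the matplotlib import from earlier; in tf.keras the metric key is 'accuracy', older standalone Keras uses 'acc'), it can be used to plot the training curves:
plt.plot(h.history['loss'], label='loss')
plt.plot(h.history['accuracy'], label='accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()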
train_DL = df_train.copy()
test_DL = df_test.copy()
train_DL.fillna(0,inplace=True)
test_DL.fillna(0,inplace=True)
X_train = train_DL.drop(columns=['loan_status']).values
Y_train = train_DL['loan_status'].values.astype(int)
X_test = test_DL.drop(columns=['loan_status']).values
Y_test = test_DL['loan_status'].values.astype(int)
from tensorflow.keras.utils import to_categorical
Y_test=to_categorical(Y_test,2).astype(int)
Y_train=to_categorical(Y_train,2).astype(int)
import keras as K
from keras.layers.core import Dropout
init = K.initializers.glorot_uniform(seed=1)
simple_adam = K.optimizers.Adam()  # trick: use the Adam optimizer explicitly
model = K.models.Sequential()
model.add(K.layers.Dense(units=146, input_dim=145, kernel_initializer=init, activation='relu'))
# model.add(Dropout(0.1))  # dropout did not improve results here
model.add(K.layers.Dense(units=147, kernel_initializer=init, activation='relu'))
# model.add(Dropout(0.9))
model.add(K.layers.Dense(units=2, kernel_initializer=init, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=simple_adam, metrics=['accuracy'])
b_size = 128
max_epochs = 100
print("Starting training ")
h = model.fit(X_train, Y_train, batch_size=b_size, epochs=max_epochs, shuffle=True, verbose=1)
print("Training finished \n")
eval = model.evaluate(X_test, Y_test, verbose=0)
print("Evaluation on test data: loss = %0.6f accuracy = %0.2f%% \n" \
% (eval[0], eval[1] * 100) )
Evaluation on test data: loss = 0.214410 accuracy = 91.21%
train_DL = df_train.copy()
test_DL = df_test.copy()
train_DL.fillna(0,inplace=True)
test_DL.fillna(0,inplace=True)
X_train = train_DL.drop(columns=['loan_status']).values
Y_train = train_DL['loan_status'].values.astype(int)
X_test = test_DL.drop(columns=['loan_status']).values
Y_test = test_DL['loan_status'].values.astype(int)
from tensorflow.keras.utils import to_categorical
Y_test=to_categorical(Y_test,2).astype(int)
Y_train=to_categorical(Y_train,2).astype(int)
import keras as K
from keras.layers.core import Dropout
init = K.initializers.glorot_uniform(seed=1)
simple_rmsprop = K.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=1e-06)  # trick: use the RMSprop optimizer
model = K.models.Sequential()
model.add(K.layers.Dense(units=146, input_dim=145, kernel_initializer=init, activation='relu'))
# model.add(Dropout(0.1))  # dropout did not improve results here
model.add(K.layers.Dense(units=147, kernel_initializer=init, activation='relu'))
# model.add(Dropout(0.9))
model.add(K.layers.Dense(units=2, kernel_initializer=init, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer=simple_rmsprop, metrics=['accuracy'])
b_size = 128
max_epochs = 100
print("Starting training ")
h = model.fit(X_train, Y_train, batch_size=b_size, epochs=max_epochs, shuffle=True, verbose=1)
print("Training finished \n")
eval = model.evaluate(X_test, Y_test, verbose=0)
print("Evaluation on test data: loss = %0.6f accuracy = %0.2f%% \n" \
% (eval[0], eval[1] * 100) )
Evaluation on test data: loss = 0.237782 accuracy = 91.39%
!pip install pytorch-tabnet
train_DL = df_train.copy()
test_DL = df_test.copy()
train_DL.fillna(0,inplace=True)
test_DL.fillna(0,inplace=True)
X_train = train_DL.drop(columns=['loan_status']).values
Y_train = train_DL['loan_status'].values.astype(int)
X_test = test_DL.drop(columns=['loan_status']).values
Y_test = test_DL['loan_status'].values.astype(int)
from pytorch_tabnet.tab_model import TabNetClassifier, TabNetRegressor
clf = TabNetClassifier()  # TabNetRegressor() for regression tasks
clf.fit(X_train, Y_train)
preds = clf.predict(X_test)
accuracy_score(Y_test, preds)
0.9115
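TabNet trains with its library defaults here. As a hedged sketch (the parameter values are illustrative, not from the original run, and a proper validation split would be preferable to monitoring the test set), fit() accepts an eval set for monitoring and early stopping:
clf = TabNetClassifier(seed=20)          # seed for reproducibility (assumed value)
clf.fit(X_train, Y_train,
        eval_set=[(X_test, Y_test)],     # monitored during training
        eval_metric=['accuracy'],
        max_epochs=100, patience=10)     # stop if no improvement for 10 epochs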
Finally, fuse deep learning with the machine learning models: the stacked level-0 predictions from gdbt and lgbm become the inputs of a small Keras network.
train_DL = df_train.copy()
test_DL = df_test.copy()
train_DL.fillna(0, inplace=True)
test_DL.fillna(0, inplace=True)
X_train = train_DL.drop(columns=['loan_status']).values
Y_train = train_DL['loan_status'].values.astype(int)
X_test = test_DL.drop(columns=['loan_status']).values
Y_test = test_DL['loan_status'].values.astype(int)
from tensorflow.keras.utils import to_categorical
Y_test=to_categorical(Y_test,2).astype(int)
Y_train=to_categorical(Y_train,2).astype(int)
import keras as K
from keras.layers.core import Dropout
init = K.initializers.glorot_uniform(seed=1)
model = K.models.Sequential()
model.add(K.layers.Dense(units=146, input_dim=2, kernel_initializer=init, activation='relu'))  # input_dim=2: the two stacked prediction columns
model.add(K.layers.Dense(units=147, kernel_initializer=init, activation='relu'))
model.add(K.layers.Dense(units=2, kernel_initializer=init, activation='softmax'))
model.compile(loss='categorical_crossentropy', metrics=['accuracy'])
x_train = train_ML.drop(columns=['loan_status'])
x_test = test_ML.drop(columns=['loan_status'])
y_train = train_ML['loan_status']
test_pred1, train_pred1 = Stacking(model=gdbt_clf, n_fold=10, train=x_train, test=x_test, y=y_train)
print(test_pred1.size)
train_pred1 = pd.DataFrame(train_pred1)
test_pred1 = pd.DataFrame(test_pred1)
test_pred2, train_pred2 = Stacking(model=lgbm_clf, n_fold=10, train=x_train, test=x_test, y=y_train)
print(test_pred2.size)
train_pred2 = pd.DataFrame(train_pred2)
test_pred2 = pd.DataFrame(test_pred2)
dff = pd.concat([train_pred1, train_pred2], axis=1)        # the two prediction columns form the network input
dff_test = pd.concat([test_pred1, test_pred2], axis=1)
model.fit(dff, Y_train)  # use the one-hot Y_train to match the categorical_crossentropy loss
eval = model.evaluate(dff_test, Y_test, verbose=0)
print("Evaluation on test data: loss = %0.6f accuracy = %0.2f%% \n" \
% (eval[0], eval[1] * 100) )
Output:
1563/1563 [==============================] - 4s 2ms/step - loss: 0.2892 - accuracy: 0.9029
Evaluation on test data: loss = 0.261336 accuracy = 91.83%
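As a final check (a small sketch using the model and the one-hot Y_test from above), the softmax outputs can be turned back into hard labels and compared directly:
pred_labels = model.predict(dff_test).argmax(axis=1)  # class with the highest softmax probability
true_labels = Y_test.argmax(axis=1)                   # undo the one-hot encoding
print((pred_labels == true_labels).mean())            # should match the reported ~91.83% accuracy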