# coding: utf-8
# Table of Contents
#
# In[70]:
import numpy as np
import dill
filename = 'Titanic.pkl'
dill.dump_session(filename)  ## save all variables in the current session to disk
# In[1]:
import numpy as np
import dill
filename = 'Titanic.pkl'
dill.load_session(filename)  ## restore the saved session variables
# ## 1. Data Overview
# In[2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  ## plotting
# In[60]:
train_data = pd.read_csv("/Users/lkl/Gtrain.csv")
test_data = pd.read_csv("/Users/lkl/Gtest.csv")
train_data = train_data.drop(['row.names(tT)'], axis=1)
test_data = test_data.drop(['row.names(tT)'], axis=1)
# In[49]:
train_data
# In[47]:
train_data.info()
# In[10]:
train_data['Regulated'].value_counts().plot.pie(autopct = '%1.2f%%')
# In[11]:
plt.figure(figsize=(12,5))
plt.subplot(121)  ## 1 row, 2 columns of subplots; this is the left panel
train_data['B'].hist(bins=70)  ## histogram of the B statistic with 70 bins
plt.xlabel('B')
plt.ylabel('Num')
plt.subplot(122)
train_data.boxplot(column='B')
plt.show()
# In[32]:
train_df_org = train_data.copy()
test_df_org = test_data.copy()  ## copy, so the placeholder label below does not overwrite the original test labels
titanic_train_Y = train_data['Regulated']
test_df_org['Regulated'] = 0  ## the test labels must not be used for training: writing 0 keeps the column as a placeholder so the combined frame has a consistent set of features, and its values are never used
combined_train_test = pd.concat([train_df_org, test_df_org])  ## DataFrame.append was removed in pandas 2.0; pd.concat is the current idiom
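# In[ ]:
## Sketch (not in the original notebook): after any joint feature processing on
## the combined frame, it can be split back into train and test by row count,
## since the concatenation above keeps the train rows first.
n_train = train_df_org.shape[0]
train_part = combined_train_test.iloc[:n_train]
test_part = combined_train_test.iloc[n_train:].drop(['Regulated'], axis=1)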
# ### 1) Correlation analysis between features
# In[16]:
# Take only the main features here and inspect the correlations between them
Correlation = pd.DataFrame(combined_train_test[['logFC','t', 'B', 'adj.P.Val','AveExpr', 'P.Value']])
colormap = plt.cm.viridis
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features', y = 1.05, size = 15)
sns.heatmap(Correlation.astype(float).corr(), linewidths=0.1, vmax=1.0, square=True, cmap=colormap, linecolor='white', annot = True)
# As the heatmap shows, the main features are only weakly correlated, i.e. they carry nearly orthogonal information, so keeping all of them should work well
# ## 7. Model Ensembling and Testing
# Although the final test scores are similar, different learners prefer different features, so combining models when selecting features is necessary (see the sketch below)
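# In[ ]:
## A minimal sketch (assumption, not the notebook's original code) of how a
## top-N feature list such as the `feature_top_n` referenced below could be
## built: fit the tree-based learners, average their feature importances, and
## keep the N best-ranked features. `fitted_estimators` and `top_n` are
## illustrative names.
def get_top_n_features(fitted_estimators, feature_names, top_n=10):
    importance_df = pd.DataFrame(
        {name: est.feature_importances_ for name, est in fitted_estimators.items()},
        index=feature_names)
    ## rank features by their mean importance across all learners
    mean_importance = importance_df.mean(axis=1).sort_values(ascending=False)
    return list(mean_importance.head(top_n).index)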
# ### 2) Build new training and test sets from the selected features
# In[95]:
titanic_train_X = train_data.drop(['Regulated'], axis=1)
titanic_train_Y = train_data['Regulated']
## titanic_train_data_Y = pd.DataFrame(titanic_train_Y[feature_top_n])  -- this earlier line was wrong: Y holds the labels and is not affected by feature selection
titanic_train_data_X = pd.DataFrame(train_data.drop(['Regulated'], axis = 1))
titanic_test_data_X = test_data.drop(['Regulated'], axis=1)  ## indexing a DataFrame with a list of columns also returns them in that new order
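# In[ ]:
## Illustration of the column-ordering note above: indexing a DataFrame with a
## list of column names returns the columns in exactly that order.
demo_df = pd.DataFrame({'a': [1], 'b': [2]})
print(demo_df[['b', 'a']].columns.tolist())  ## -> ['b', 'a']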
# ### 3) Model ensembling
# In[96]:
## Stacking ensemble:
## Level 1 uses 7 base learners, each producing 7-fold out-of-fold predictions; those predictions become the input to Level 2, which uses XGBoost.
## Level 1: SVM, GBDT, RF, ET, AdaBoost, KNN, DT
from sklearn.model_selection import KFold
ntrain = titanic_train_data_X.shape[0]
ntest = titanic_test_data_X.shape[0]
NFold = 7
kf = KFold(n_splits=NFold, shuffle=False)  ## the rows are already shuffled, so no further shuffling; random_state is omitted because it has no effect when shuffle=False
def get_out_fold(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))  ## np.zeros(shape) takes a shape tuple, hence the double parentheses; here a 1-D array of length ntrain
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFold, ntest))  ## uninitialized (NFold, ntest) array, filled row by row below
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):  ## kf.split(X) yields the index arrays of each fold; enumerate numbers the folds
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        clf.fit(x_tr, y_tr)
        oof_train[test_index] = clf.predict(x_te)  ## this fold's predictions on its held-out part; over all folds every training sample is predicted exactly once, by a model that never saw it, so the out-of-fold outputs are not overfit
        oof_test_skf[i, :] = clf.predict(x_test)  ## this fold's predictions on the test set, stored as one row; NFold folds give NFold rows
    oof_test[:] = oof_test_skf.mean(axis=0)  ## column-wise mean: each column holds one test sample's NFold predictions, so this averages the fold models' test predictions
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)  ## reshape(-1, 1) turns each 1-D array into a column vector
## returns the out-of-fold predictions on the training set and the fold-averaged predictions on the test set
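# In[ ]:
## Tiny numeric illustration of the averaging step in get_out_fold: each row of
## oof_test_skf holds one fold model's predictions for every test sample, so
## mean(axis=0) produces one averaged prediction per sample.
demo = np.array([[1., 0., 1.],
                 [1., 1., 1.],
                 [0., 0., 1.]])
print(demo.mean(axis=0))  ## -> [0.667 0.333 1.0]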
# In[97]:
## Build the base learners
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier  ## random forest
from sklearn.ensemble import GradientBoostingClassifier  ## GBDT
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier  ## extra trees
from sklearn.tree import DecisionTreeClassifier
from sklearn import ensemble  ## ensemble learning utilities
from sklearn import model_selection
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
rf_est = RandomForestClassifier()
rf_grid_param = {'n_estimators':range(300,500,3), 'warm_start':[True,False],'min_samples_split':range(2,10,3),'min_samples_leaf':range(1,15,3),
'max_depth':range(6,10,3)}
rf_grid = model_selection.GridSearchCV(rf_est, rf_grid_param, n_jobs=-1,verbose=0)
ada_est = AdaBoostClassifier()
ada_grid_param = {'n_estimators':range(300,500,3), 'learning_rate': np.logspace(-2, -1, 10)}
ada_grid = model_selection.GridSearchCV(ada_est,ada_grid_param)
gb_est = GradientBoostingClassifier()
gb_grid_param = {'n_estimators':range(300,500,3),'learning_rate':np.logspace(-3,-1,20),'min_samples_split':range(2,10,3),'min_samples_leaf':range(1,15,3),'max_depth':range(3,10,3)}
gb_grid = model_selection.GridSearchCV(gb_est, gb_grid_param, n_jobs=-1, verbose=0)
svm_est = SVC()
svm_grid_param = {'kernel':['linear','rbf'], 'C':np.logspace(-4,-1,4)}
svm_grid = model_selection.GridSearchCV(svm_est,svm_grid_param)
et_est = ExtraTreesClassifier()
et_grid_param = {'n_estimators':range(300,500,3), 'min_samples_leaf':range(2,6,3), 'max_depth':range(8,20,3)}
et_grid = model_selection.GridSearchCV(et_est, et_grid_param)
knn_est = KNeighborsClassifier()
knn_grid_param = {'n_neighbors':range(2,4,3)}
knn_grid = model_selection.GridSearchCV(knn_est, knn_grid_param)
dt_est = DecisionTreeClassifier()
dt_grid_param = {'max_depth':range(8,20,3)}
dt_grid = model_selection.GridSearchCV(dt_est, dt_grid_param)
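# In[ ]:
## The GridSearchCV objects above are defined but never fitted in this section.
## A minimal usage sketch, shown on the cheap KNN grid (the tree/boosting grids
## are fitted the same way but are far more expensive to search):
knn_grid.fit(titanic_train_data_X.values, titanic_train_Y.values)
print(knn_grid.best_params_)
print(knn_grid.best_score_)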
# In[98]:
## Define the base learners with their tuned hyperparameters
rf = RandomForestClassifier(n_estimators=500, warm_start=True, max_features='sqrt',max_depth=6,
min_samples_split=3, min_samples_leaf=2, n_jobs=-1, verbose=0)
ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.1)
et = ExtraTreesClassifier(n_estimators=500, n_jobs=-1, max_depth=8, min_samples_leaf=2, verbose=0)
gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.008, min_samples_split=3, min_samples_leaf=2, max_depth=5, verbose=0)
dt = DecisionTreeClassifier(max_depth=8)
knn = KNeighborsClassifier(n_neighbors = 2)
svm = SVC(kernel='linear', C=0.025)
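# In[ ]:
## Optional sanity check (not in the original notebook): cross-validate each
## base learner before stacking, to spot any learner that clearly underperforms.
from sklearn.model_selection import cross_val_score
for name, clf in [('rf', rf), ('ada', ada), ('et', et), ('gb', gb),
                  ('dt', dt), ('knn', knn), ('svm', svm)]:
    scores = cross_val_score(clf, titanic_train_data_X.values, titanic_train_Y.values, cv=5)
    print('%s: %.3f +/- %.3f' % (name, scores.mean(), scores.std()))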
# In[99]:
x_train = titanic_train_data_X.values  ## convert the DataFrames to numpy arrays, since get_out_fold indexes them numpy-style
x_test = titanic_test_data_X.values
y_train = titanic_train_Y.values
# In[100]:
print(x_train.shape)
print(x_test.shape)
print(y_train.shape)
# In[101]:
rf_oof_train, rf_oof_test = get_out_fold(rf, x_train, y_train, x_test)
ada_oof_train, ada_oof_test = get_out_fold(ada,x_train, y_train, x_test)
et_oof_train, et_oof_test = get_out_fold(et,x_train, y_train, x_test)
gb_oof_train, gb_oof_test = get_out_fold(gb,x_train, y_train, x_test)
dt_oof_train, dt_oof_test = get_out_fold(dt, x_train, y_train, x_test)
knn_oof_train, knn_oof_test = get_out_fold(knn, x_train, y_train, x_test)
svm_oof_train, svm_oof_test = get_out_fold(svm, x_train, y_train, x_test)
print('Training is complete')
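# In[ ]:
## Sanity check (illustrative): each base learner's out-of-fold output is a
## column vector over the training set, and its test output a column vector
## averaged over the NFold fold models.
print(rf_oof_train.shape)  ## expected: (ntrain, 1)
print(rf_oof_test.shape)   ## expected: (ntest, 1)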
# In[102]:
## Level 2: XGBoost
x_train = np.concatenate((rf_oof_train, ada_oof_train, et_oof_train, gb_oof_train, dt_oof_train, knn_oof_train, svm_oof_train), axis=1)  ## place the seven level-1 prediction columns side by side (axis=1), one column per base learner
x_test = np.concatenate((rf_oof_test,ada_oof_test,et_oof_test,gb_oof_test,dt_oof_test,knn_oof_test,svm_oof_test), axis = 1)
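# In[ ]:
## Diagnostic sketch (not in the original notebook): stacking helps most when
## the base learners' predictions are not too highly correlated, so inspect the
## correlation between the level-1 out-of-fold outputs.
base_predictions = pd.DataFrame(x_train, columns=['rf', 'ada', 'et', 'gb', 'dt', 'knn', 'svm'])
print(base_predictions.corr())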
# In[103]:
from xgboost import XGBClassifier
gbm = XGBClassifier(n_estimators=2000, max_depth=4, min_child_weight=2, gamma=0.9, subsample=0.8, colsample_bytree=0.8, objective='binary:logistic', n_jobs=-1, scale_pos_weight=1).fit(x_train, y_train)  ## n_jobs replaces the deprecated nthread parameter
prediction = gbm.predict(x_test)
# In[109]:
gbm.score(x_train,y_train)
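# In[ ]:
## The score above is computed on the same stacked features the level-2 model
## was trained on, so it is optimistic. A less biased estimate (sketch, same
## hyperparameters as above):
from sklearn.model_selection import cross_val_score
cv_scores = cross_val_score(
    XGBClassifier(n_estimators=2000, max_depth=4, min_child_weight=2, gamma=0.9,
                  subsample=0.8, colsample_bytree=0.8, objective='binary:logistic',
                  n_jobs=-1, scale_pos_weight=1),
    x_train, y_train, cv=5)
print('CV accuracy: %.3f +/- %.3f' % (cv_scores.mean(), cv_scores.std()))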
# ## Final: Generate Predictions
# In[106]:
prediction
y_true = test_data['Regulated']  ## the true test labels (preserved because test_df_org was a copy)
## prediction finished; evaluate against the true labels
from sklearn.metrics import classification_report
print(classification_report(y_true=y_true, y_pred=prediction))
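# In[ ]:
## A confusion matrix complements the classification report (illustrative):
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_true, prediction))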