Titanic Prediction Based on Machine Learning

# coding: utf-8

# In[70]:

import numpy as np
import dill  ## dill can serialize an entire interpreter session

## Save all variables of the current session to disk
filename = 'Titanic.pkl'
dill.dump_session(filename)


# In[1]:

import numpy as np
import dill

## Restore the saved session
filename = 'Titanic.pkl'
dill.load_session(filename)


# ## 1. Data Overview

# In[2]:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  ## plotting libraries


# In[60]:

train_data = pd.read_csv("/Users/lkl/Gtrain.csv")
test_data = pd.read_csv("/Users/lkl/Gtest.csv")
train_data = train_data.drop(['row.names(tT)'], axis=1)
test_data = test_data.drop(['row.names(tT)'], axis=1)


# In[49]:

train_data


# In[47]:

train_data.info()


# In[10]:

train_data['Regulated'].value_counts().plot.pie(autopct='%1.2f%%')


# In[11]:

plt.figure(figsize=(12, 5))
plt.subplot(121)  ## left panel: full height, half width
train_data['B'].hist(bins=70)  ## histogram with 70 bins
plt.xlabel('B')
plt.ylabel('Num')
plt.subplot(122)
train_data.boxplot(column='B')
plt.show()


# In[32]:

train_df_org = train_data
test_df_org = test_data.copy()  ## copy, so the placeholder below does not overwrite the true test labels in test_data
titanic_train_Y = train_data['Regulated']
## The test-set labels must not take part in training; carrying them into the
## pipeline would effectively let the test set be trained on. Fill the column
## with 0 as a placeholder so the combined frame keeps the same schema during
## feature processing; the values themselves are never used.
test_df_org['Regulated'] = 0
combined_train_test = pd.concat([train_df_org, test_df_org])  ## DataFrame.append was removed in pandas 2.0


# ### 1) Feature correlation analysis

# In[16]:

## Take only the main features and inspect how they relate to one another
Correlation = pd.DataFrame(combined_train_test[['logFC', 't', 'B', 'adj.P.Val', 'AveExpr', 'P.Value']])
colormap = plt.cm.viridis
plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(Correlation.astype(float).corr(), linewidths=0.1, vmax=1.0,
            square=True, cmap=colormap, linecolor='white', annot=True)

# The heatmap shows that the main features are only weakly correlated with one
# another, i.e. they carry roughly orthogonal information, so using them
# together should work well.

# ## 7. Model Fusion and Testing

# As the figure from the (omitted) feature-selection section shows, although
# the final test scores are similar, different learners prefer different
# features when ranking them, so fusing several models for feature selection
# is necessary.

# ### 2) Build the new training and test sets from the selected features

# In[95]:

titanic_train_X = train_data.drop(['Regulated'], axis=1)
titanic_train_Y = train_data['Regulated']
## titanic_train_data_Y = pd.DataFrame(titanic_train_Y[feature_top_n])  <- wrong: Y must not change
titanic_train_data_X = pd.DataFrame(train_data.drop(['Regulated'], axis=1))
titanic_test_data_X = test_data.drop(['Regulated'], axis=1)
## Passing a list of column names to df[...] also reorders the columns


# ### 3) Model fusion

# In[96]:

## Stacking fusion:
## Level 1 uses 7 base learners, each producing 7-fold out-of-fold predictions;
## those predictions become the inputs to Level 2, which uses XGBoost.
## Level 1: SVM, GBDT, RF, ET, AdaBoost, KNN, DT
from sklearn.model_selection import KFold

ntrain = titanic_train_data_X.shape[0]
ntest = titanic_test_data_X.shape[0]
Seed = 0  ## random seed
NFold = 7
## The rows are already shuffled, so no need to shuffle again; note that passing
## random_state together with shuffle=False raises an error in recent
## scikit-learn, so the seed is omitted here.
kf = KFold(n_splits=NFold, shuffle=False)

def get_out_fold(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))  ## np.zeros((m,)) is a length-m vector: one slot per training row
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFold, ntest))  ## one row of test predictions per fold

    ## kf.split(x_train) yields each fold's (train_index, test_index) pair;
    ## enumerate numbers the folds so each gets its own row in oof_test_skf
    for i, (train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        clf.fit(x_tr, y_tr)
        ## Each fold predicts only its held-out slice. Every row serves as
        ## validation data exactly once, so the whole training set ends up with
        ## predictions made by models that never saw the row being predicted:
        ## the level-2 inputs are fully populated without overfitting leakage.
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)  ## fold i's test-set predictions, one row; 7 folds give 7 rows

    ## Column-wise mean: each column holds one test sample's 7 fold predictions,
    ## so this averages the 7 fold models over the test set
    oof_test[:] = oof_test_skf.mean(axis=0)
    ## reshape(-1, 1) turns each length-n vector into a single-column matrix.
    ## Returns: the K-fold OOF predictions for the training set, and the
    ## fold-averaged predictions for the test set.
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)
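
# As a cross-check, the training-set half of get_out_fold computes exactly what
# scikit-learn's cross_val_predict offers out of the box. A minimal sketch on a
# synthetic dataset (illustration only, not the Gtrain/Gtest data):

## Sketch: out-of-fold predictions via cross_val_predict on toy data
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.tree import DecisionTreeClassifier

X_demo, y_demo = make_classification(n_samples=70, n_features=6, random_state=0)
demo_kf = KFold(n_splits=7, shuffle=False)
oof_demo = cross_val_predict(DecisionTreeClassifier(max_depth=3, random_state=0),
                             X_demo, y_demo, cv=demo_kf)
print(oof_demo.shape)  ## (70,): one prediction per row, each made by a model
                       ## that never saw that row during training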
# In[97]:

## Build the base learners
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier  ## random forest
from sklearn.ensemble import GradientBoostingClassifier  ## GBDT
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier  ## extra trees
from sklearn.tree import DecisionTreeClassifier
from sklearn import model_selection

rf_est = RandomForestClassifier()
rf_grid_param = {'n_estimators': range(300, 500, 3), 'warm_start': [True, False],
                 'min_samples_split': range(2, 10, 3), 'min_samples_leaf': range(1, 15, 3),
                 'max_depth': range(6, 10, 3)}
rf_grid = model_selection.GridSearchCV(rf_est, rf_grid_param, n_jobs=-1, verbose=0)

ada_est = AdaBoostClassifier()
ada_grid_param = {'n_estimators': range(300, 500, 3), 'learning_rate': np.logspace(-2, -1, 10)}
ada_grid = model_selection.GridSearchCV(ada_est, ada_grid_param)

gb_est = GradientBoostingClassifier()
gb_grid_param = {'n_estimators': range(300, 500, 3), 'learning_rate': np.logspace(-3, -1, 20),
                 'min_samples_split': range(2, 10, 3), 'min_samples_leaf': range(1, 15, 3),
                 'max_depth': range(3, 10, 3)}
gb_grid = model_selection.GridSearchCV(gb_est, gb_grid_param, n_jobs=-1, verbose=0)

svm_est = SVC()
svm_grid_param = {'kernel': ['linear', 'rbf'], 'C': np.logspace(-4, -1, 4)}
svm_grid = model_selection.GridSearchCV(svm_est, svm_grid_param)

et_est = ExtraTreesClassifier()
et_grid_param = {'n_estimators': range(300, 500, 3), 'min_samples_leaf': range(2, 6, 3),
                 'max_depth': range(8, 20, 3)}
et_grid = model_selection.GridSearchCV(et_est, et_grid_param)

knn_est = KNeighborsClassifier()
knn_grid_param = {'n_neighbors': range(2, 4, 3)}
knn_grid = model_selection.GridSearchCV(knn_est, knn_grid_param)

dt_est = DecisionTreeClassifier()
dt_grid_param = {'max_depth': range(8, 20, 3)}
dt_grid = model_selection.GridSearchCV(dt_est, dt_grid_param)


# In[98]:

## Base learners with fixed hyperparameters for the stacking run
rf = RandomForestClassifier(n_estimators=500, warm_start=True, max_features='sqrt',
                            max_depth=6, min_samples_split=3, min_samples_leaf=2,
                            n_jobs=-1, verbose=0)
ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.1)
et = ExtraTreesClassifier(n_estimators=500, n_jobs=-1, max_depth=8,
                          min_samples_leaf=2, verbose=0)
gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.008,
                                min_samples_split=3, min_samples_leaf=2,
                                max_depth=5, verbose=0)
dt = DecisionTreeClassifier(max_depth=8)
knn = KNeighborsClassifier(n_neighbors=2)
svm = SVC(kernel='linear', C=0.025)


# In[99]:

## Convert the pandas frames to NumPy arrays: get_out_fold indexes its inputs
## with integer arrays, which requires plain ndarrays
x_train = titanic_train_data_X.values
x_test = titanic_test_data_X.values
y_train = titanic_train_Y.values


# In[100]:

print(x_train.shape)
print(x_test.shape)
print(y_train.shape)


# In[101]:

rf_oof_train, rf_oof_test = get_out_fold(rf, x_train, y_train, x_test)
ada_oof_train, ada_oof_test = get_out_fold(ada, x_train, y_train, x_test)
et_oof_train, et_oof_test = get_out_fold(et, x_train, y_train, x_test)
gb_oof_train, gb_oof_test = get_out_fold(gb, x_train, y_train, x_test)
dt_oof_train, dt_oof_test = get_out_fold(dt, x_train, y_train, x_test)
knn_oof_train, knn_oof_test = get_out_fold(knn, x_train, y_train, x_test)
svm_oof_train, svm_oof_test = get_out_fold(svm, x_train, y_train, x_test)
print('Training is complete')


# In[102]:

## Level 2: XGBoost
## Stack the seven OOF prediction columns side by side (axis=1) into the
## level-2 feature matrix: one column per base learner
x_train = np.concatenate((rf_oof_train, ada_oof_train, et_oof_train, gb_oof_train,
                          dt_oof_train, knn_oof_train, svm_oof_train), axis=1)
x_test = np.concatenate((rf_oof_test, ada_oof_test, et_oof_test, gb_oof_test,
                         dt_oof_test, knn_oof_test, svm_oof_test), axis=1)
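
# The same two-level scheme also exists as a single estimator in scikit-learn.
# A minimal sketch of the equivalent wiring (stack_method='predict' matches the
# hard class predictions used by get_out_fold; one design difference is that
# StackingClassifier refits each base learner on the full training set for
# test-time prediction, whereas get_out_fold averages the 7 fold models):

## Sketch: the stacking architecture above expressed via StackingClassifier
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier

stack = StackingClassifier(
    estimators=[('rf', rf), ('ada', ada), ('et', et), ('gb', gb),
                ('dt', dt), ('knn', knn), ('svm', svm)],
    final_estimator=XGBClassifier(n_estimators=2000, max_depth=4),
    cv=7, stack_method='predict')
## stack.fit(titanic_train_data_X.values, titanic_train_Y.values)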
# In[103]:

from xgboost import XGBClassifier

gbm = XGBClassifier(n_estimators=2000, max_depth=4, min_child_weight=2, gamma=0.9,
                    subsample=0.8, colsample_bytree=0.8, objective='binary:logistic',
                    nthread=-1, scale_pos_weight=1).fit(x_train, y_train)
prediction = gbm.predict(x_test)


# In[109]:

## Accuracy on the level-2 training features; as a training-set score this is
## an optimistic estimate of generalization
gbm.score(x_train, y_train)


# ## Final. Generate the prediction results

# In[106]:

prediction
y_true = test_data['Regulated']  ## the true test labels; prediction is finished, so they may now be used for evaluation
from sklearn.metrics import classification_report
print(classification_report(y_true=y_true, y_pred=prediction))
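
# Because gbm.score above is measured on the training rows, a cross-validated
# estimate on the same meta-features gives a more conservative number. A rough
# sketch (not fully leak-free, since the OOF meta-features were built from the
# whole training set, but closer to an honest estimate than the training score):

## Sketch: 5-fold cross-validated accuracy of the level-2 learner
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(
    XGBClassifier(n_estimators=2000, max_depth=4, min_child_weight=2, gamma=0.9,
                  subsample=0.8, colsample_bytree=0.8, objective='binary:logistic'),
    x_train, y_train, cv=5, scoring='accuracy')
print(cv_scores.mean(), cv_scores.std())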
