#3.1 Import the basic libraries; this code is meant to be run in a Jupyter notebook
import sys #access to system parameters https://docs.python.org/3/library/sys.html
import pandas as pd #collection of functions for data processing and analysis modeled after R dataframes with SQL like features
import matplotlib #collection of functions for scientific and publication-ready visualization
import numpy as np #foundational package for scientific computing
import scipy as sp #collection of functions for scientific computing and advance mathematics
import IPython
from IPython import display #pretty printing of dataframes in Jupyter notebook
import sklearn #collection of machine learning algorithms
#misc libraries
import random
import time
#ignore warnings
import warnings
warnings.filterwarnings('ignore')
print('-'*25)
#3.2 Import the model libraries
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier
#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics
#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix #moved from pandas.tools.plotting in newer pandas versions
#Configure Visualization Defaults
#%matplotlib inline = show plots in Jupyter Notebook browser
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8
#3.3 A First Look at the Data
data_raw = pd.read_csv('../input/train.csv')
data_val = pd.read_csv('../input/test.csv')
data1 = data_raw.copy(deep = True)
#the list below holds references to both dataframes, which is convenient because we can clean both datasets at once
data_cleaner = [data1, data_val]
print (data_raw.info())#overall information about the training data
data_raw.sample(10)#preview ten random rows; the output is not reproduced here (the earlier introductory Titanic post shows it), it is straightforward
1. The *Survived* variable is our outcome or dependent variable. It is a binary nominal datatype: 1 for survived and 0 for did not survive. All other variables are potential predictor or independent variables. **It's important to note that more predictor variables do not make a better model; the right variables do.**
2. The *PassengerId* and *Ticket* variables are assumed to be random unique identifiers that have no impact on the outcome variable. Therefore, they will be excluded from the analysis.
3. The *Pclass* variable is an ordinal datatype for the ticket class, a proxy for socio-economic status (SES), where 1 = upper class, 2 = middle class, and 3 = lower class.
4. The *Name* variable is a nominal datatype. It could be used in feature engineering to derive gender from the title, family size from the surname, and SES from titles like doctor or master. Since those variables already exist, we will use it to see whether title, such as master, makes a difference.
5. The *Sex* and *Embarked* variables are nominal datatypes. They will be converted to dummy variables for mathematical calculations.
6. The *Age* and *Fare* variables are continuous quantitative datatypes.
7. *SibSp* represents the number of related siblings/spouses aboard and *Parch* represents the number of related parents/children aboard. Both are discrete quantitative datatypes. They can be used in feature engineering to create a family-size variable and an is-alone variable.
8. The *Cabin* variable is a nominal datatype that could be used in feature engineering for the approximate position on the ship when the incident occurred and for SES from the deck level. However, since it has many null values, it does not add value and is excluded from the analysis.
#3.4 The 4 C's of data cleaning: Correcting, Completing, Creating, Converting
In this stage, we will clean the data by 1) correcting aberrant values and outliers, 2) completing missing information, 3) creating new features for analysis, and 4) converting fields to the correct format for calculation and presentation.
1. **Correcting:** Reviewing the data, there do not appear to be any aberrant or unacceptable inputs. We do see potential outliers in Age and Fare, but since they are reasonable values we will wait until after the exploratory analysis to decide whether to include or exclude them (a quick outlier check appears after the null-value summary below). If they were unreasonable values, for example an age of 800 instead of 80, it would be safe to fix them now. However, we want to be cautious when modifying data away from its original values, because accuracy matters when building a precise model.
2. **Completing:** There are null or missing values in the Age, Cabin, and Embarked fields. Missing values can be a problem because some algorithms do not know how to handle them and fail, while others, such as decision trees, can cope with them. Since we will compare and contrast several models, it is important to fix this before we start modeling. There are two common approaches: delete the record, or fill the missing value with a reasonable input. Deleting records, especially a large share of them, is not recommended unless they truly represent incomplete records; it is usually better to impute. A basic method for qualitative data is to impute with the mode. A basic method for quantitative data is to impute with the mean, the median, or the mean plus a randomized standard deviation. An intermediate method applies the basic method within specific subgroups, such as the median age by ticket class, or the embark port by fare and SES. More complex methods exist, but before deploying them they should be compared against the baseline to determine whether the extra complexity truly adds value. For this dataset, Age will be imputed with the median, the Cabin attribute will be dropped, and Embarked will be imputed with the mode. Subsequent model iterations may revisit this decision to see whether it improves accuracy. A sketch of the intermediate, group-based approach appears right after this list.
3. **Creating:** Feature engineering is when we use existing features to create new ones, to determine whether they provide new signal for predicting our outcome. For this dataset, we will create a Title feature to determine whether it played a role in survival.
4. **Converting:** Last, but not least, we will deal with formatting. Categorical data is imported as objects, which makes mathematical calculation difficult. For this dataset, we will convert object datatypes to categorical dummy variables.
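As a hedged aside (not part of the original notebook), the intermediate, group-based imputation mentioned under Completing could look like the sketch below. It assumes data_raw has been loaded as in section 3.3 and simply fills each missing age with the median age of its Sex/Pclass group; it works on a copy so the notebook's own cleaning below is unaffected.
example = data_raw.copy() #work on a copy for illustration only
group_median_age = example.groupby(['Sex', 'Pclass'])['Age'].transform('median') #median age within each Sex/Pclass group
example['Age'] = example['Age'].fillna(group_median_age) #fill each missing age with its group median
print(example['Age'].isnull().sum()) #0 -- all missing ages are now filled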
print('Train columns with null values:\n', data1.isnull().sum())
print("-"*10)
print('Test/Validation columns with null values:\n', data_val.isnull().sum())
print("-"*10)
data_raw.describe(include = 'all')
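Related to the Correcting note above about potential Age and Fare outliers, here is a quick, hedged check using the common 1.5*IQR rule (not in the original notebook); it only reports counts and does not modify the data.
q1, q3 = data_raw['Fare'].quantile([.25, .75]) #first and third quartiles of Fare
iqr = q3 - q1
fare_outliers = data_raw[data_raw['Fare'] > q3 + 1.5*iqr] #upper-side outliers by the 1.5*IQR rule
print('Potential Fare outliers by the 1.5*IQR rule: {} rows, max fare = {:.2f}'.format(len(fare_outliers), data_raw['Fare'].max()))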
#3.4.1 Clean Data
for dataset in data_cleaner:
    #complete missing age with median
    dataset['Age'].fillna(dataset['Age'].median(), inplace = True)
    #complete embarked with mode (most common port)
    dataset['Embarked'].fillna(dataset['Embarked'].mode()[0], inplace = True)
    #complete missing fare with median
    dataset['Fare'].fillna(dataset['Fare'].median(), inplace = True)

#delete the cabin feature/column and others previously stated to exclude in train dataset
drop_column = ['PassengerId','Cabin', 'Ticket']
data1.drop(drop_column, axis=1, inplace = True)

#check again that the missing values have been filled
print(data1.isnull().sum())
print("-"*10)
print(data_val.isnull().sum())
###CREATE: Feature Engineering for train and test/validation dataset
for dataset in data_cleaner:
    #Discrete variables
    dataset['FamilySize'] = dataset['SibSp'] + dataset['Parch'] + 1

    dataset['IsAlone'] = 1 #initialize to yes/1 is alone
    dataset.loc[dataset['FamilySize'] > 1, 'IsAlone'] = 0 #now update to no/0 if family size is greater than 1

    #quick and dirty code to split title from name: http://www.pythonforbeginners.com/dictionary/python-split
    dataset['Title'] = dataset['Name'].str.split(", ", expand=True)[1].str.split(".", expand=True)[0]

    #Continuous variable bins; qcut vs cut: https://stackoverflow.com/questions/30211923/what-is-the-difference-between-pandas-qcut-and-pandas-cut
    #Fare Bins/Buckets using qcut or frequency bins: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.qcut.html
    dataset['FareBin'] = pd.qcut(dataset['Fare'], 4) #bin the continuous variables

    #Age Bins/Buckets using cut or value bins: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.cut.html
    dataset['AgeBin'] = pd.cut(dataset['Age'].astype(int), 5)

#cleanup rare title names
#print(data1['Title'].value_counts())
stat_min = 10 #while small is arbitrary, we'll use the common minimum in statistics: http://nicholasjjackson.com/2012/03/08/sample-size-is-10-a-magic-number/
title_names = (data1['Title'].value_counts() < stat_min) #this will create a true/false series with title name as index

#apply and lambda functions are quick and dirty code to find and replace with fewer lines of code: https://community.modeanalytics.com/python/tutorial/pandas-groupby-and-python-lambda-functions/
data1['Title'] = data1['Title'].apply(lambda x: 'Misc' if title_names.loc[x] == True else x)
print(data1['Title'].value_counts())
print("-"*10)
#preview data again
data1.info()
data_val.info()
data1.sample(10)
#Convert Formats
#We will convert categorical data to dummy variables for mathematical analysis. There are multiple ways to encode categorical variables; we will use the sklearn and pandas functions.
#code categorical data
label = LabelEncoder()
for dataset in data_cleaner:
    dataset['Sex_Code'] = label.fit_transform(dataset['Sex'])
    dataset['Embarked_Code'] = label.fit_transform(dataset['Embarked'])
    dataset['Title_Code'] = label.fit_transform(dataset['Title'])
    dataset['AgeBin_Code'] = label.fit_transform(dataset['AgeBin'])
    dataset['FareBin_Code'] = label.fit_transform(dataset['FareBin'])
#define y variable aka target/outcome
Target = ['Survived']
#define x variables for original features aka feature selection
data1_x = ['Sex','Pclass', 'Embarked', 'Title','SibSp', 'Parch', 'Age', 'Fare', 'FamilySize', 'IsAlone'] #pretty name/values for charts
data1_x_calc = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code','SibSp', 'Parch', 'Age', 'Fare'] #coded for algorithm calculation
data1_xy = Target + data1_x
print('Original X Y: ', data1_xy, '\n')
#define x variables for original w/bin features to remove continuous variables
data1_x_bin = ['Sex_Code','Pclass', 'Embarked_Code', 'Title_Code', 'FamilySize', 'AgeBin_Code', 'FareBin_Code']
data1_xy_bin = Target + data1_x_bin
print('Bin X Y: ', data1_xy_bin, '\n')
#define x and y variables for dummy features original
data1_dummy = pd.get_dummies(data1[data1_x])
data1_x_dummy = data1_dummy.columns.tolist()
data1_xy_dummy = Target + data1_x_dummy
print('Dummy X Y: ', data1_xy_dummy, '\n')
data1_dummy.head()
#check the cleaned data again
print('Train columns with null values: \n', data1.isnull().sum())
print("-"*10)
print (data1.info())
print("-"*10)
print('Test/Validation columns with null values: \n', data_val.isnull().sum())
print("-"*10)
print (data_val.info())
print("-"*10)
data_raw.describe(include = 'all')
Split the training and test data (train_test_split defaults to a 75/25 split)
train1_x, test1_x, train1_y, test1_y = model_selection.train_test_split(data1[data1_x_calc], data1[Target], random_state = 0)
train1_x_bin, test1_x_bin, train1_y_bin, test1_y_bin = model_selection.train_test_split(data1[data1_x_bin], data1[Target] , random_state = 0)
train1_x_dummy, test1_x_dummy, train1_y_dummy, test1_y_dummy = model_selection.train_test_split(data1_dummy[data1_x_dummy], data1[Target], random_state = 0)
print("Data1 Shape: {}".format(data1.shape))
print("Train1 Shape: {}".format(train1_x.shape))
print("Test1 Shape: {}".format(test1_x.shape))
train1_x_bin.head()
6. Step 4: Exploratory Data Analysis
Now that our data is cleaned, we will explore it with descriptive and graphical statistics to describe and summarize the variables. At this stage you will find yourself categorizing features and determining their correlation with the target variable and with each other.
for x in data1_x:
    if data1[x].dtype != 'float64' :
        print('Survival Correlation by:', x)
        print(data1[[x, Target[0]]].groupby(x, as_index=False).mean())
        print('-'*10, '\n')

#using crosstabs: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.crosstab.html
print(pd.crosstab(data1['Title'],data1[Target[0]]))
#a partial display of the results follows (figures not reproduced here)
plt.figure(figsize=[16,12])
plt.subplot(231)
plt.boxplot(x=data1['Fare'], showmeans = True, meanline = True)
plt.title('Fare Boxplot')
plt.ylabel('Fare ($)')
plt.subplot(232)
plt.boxplot(data1['Age'], showmeans = True, meanline = True)
plt.title('Age Boxplot')
plt.ylabel('Age (Years)')
plt.subplot(233)
plt.boxplot(data1['FamilySize'], showmeans = True, meanline = True)
plt.title('Family Size Boxplot')
plt.ylabel('Family Size (#)')
plt.subplot(234)
plt.hist(x = [data1[data1['Survived']==1]['Fare'], data1[data1['Survived']==0]['Fare']],
stacked=True, color = ['g','r'],label = ['Survived','Dead'])
plt.title('Fare Histogram by Survival')
plt.xlabel('Fare ($)')
plt.ylabel('# of Passengers')
plt.legend()
plt.subplot(235)
plt.hist(x = [data1[data1['Survived']==1]['Age'], data1[data1['Survived']==0]['Age']],
stacked=True, color = ['g','r'],label = ['Survived','Dead'])
plt.title('Age Histogram by Survival')
plt.xlabel('Age (Years)')
plt.ylabel('# of Passengers')
plt.legend()
plt.subplot(236)
plt.hist(x = [data1[data1['Survived']==1]['FamilySize'], data1[data1['Survived']==0]['FamilySize']],
stacked=True, color = ['g','r'],label = ['Survived','Dead'])
plt.title('Family Size Histogram by Survival')
plt.xlabel('Family Size (#)')
plt.ylabel('# of Passengers')
plt.legend()
fig, saxis = plt.subplots(2, 3,figsize=(16,12))
sns.barplot(x = 'Embarked', y = 'Survived', data=data1, ax = saxis[0,0])
sns.barplot(x = 'Pclass', y = 'Survived', order=[1,2,3], data=data1, ax = saxis[0,1])
sns.barplot(x = 'IsAlone', y = 'Survived', order=[1,0], data=data1, ax = saxis[0,2])
sns.pointplot(x = 'FareBin', y = 'Survived', data=data1, ax = saxis[1,0])
sns.pointplot(x = 'AgeBin', y = 'Survived', data=data1, ax = saxis[1,1])
sns.pointplot(x = 'FamilySize', y = 'Survived', data=data1, ax = saxis[1,2])
fig, (axis1,axis2,axis3) = plt.subplots(1,3,figsize=(14,12))
sns.boxplot(x = 'Pclass', y = 'Fare', hue = 'Survived', data = data1, ax = axis1)
axis1.set_title('Pclass vs Fare Survival Comparison')
sns.violinplot(x = 'Pclass', y = 'Age', hue = 'Survived', data = data1, split = True, ax = axis2)
axis2.set_title('Pclass vs Age Survival Comparison')
sns.boxplot(x = 'Pclass', y ='FamilySize', hue = 'Survived', data = data1, ax = axis3)
axis3.set_title('Pclass vs Family Size Survival Comparison')
fig, qaxis = plt.subplots(1,3,figsize=(14,12))
sns.barplot(x = 'Sex', y = 'Survived', hue = 'Embarked', data=data1, ax = qaxis[0])
qaxis[0].set_title('Sex vs Embarked Survival Comparison')
sns.barplot(x = 'Sex', y = 'Survived', hue = 'Pclass', data=data1, ax = qaxis[1])
qaxis[1].set_title('Sex vs Pclass Survival Comparison')
sns.barplot(x = 'Sex', y = 'Survived', hue = 'IsAlone', data=data1, ax = qaxis[2])
qaxis[2].set_title('Sex vs IsAlone Survival Comparison')
fig, (maxis1, maxis2) = plt.subplots(1, 2,figsize=(14,12))
#how does family size factor with sex & survival compare
sns.pointplot(x="FamilySize", y="Survived", hue="Sex", data=data1,
palette={"male": "blue", "female": "pink"},
markers=["*", "o"], linestyles=["-", "--"], ax = maxis1)
#how does class factor with sex & survival compare
sns.pointplot(x="Pclass", y="Survived", hue="Sex", data=data1,
palette={"male": "blue", "female": "pink"},
markers=["*", "o"], linestyles=["-", "--"], ax = maxis2)
e = sns.FacetGrid(data1, col = 'Embarked')
e.map(sns.pointplot, 'Pclass', 'Survived', 'Sex', ci=95.0, palette = 'deep')
e.add_legend()
a = sns.FacetGrid( data1, hue = 'Survived', aspect=4 )
a.map(sns.kdeplot, 'Age', shade= True )
a.set(xlim=(0 , data1['Age'].max()))
a.add_legend()
h = sns.FacetGrid(data1, row = 'Sex', col = 'Pclass', hue = 'Survived')
h.map(plt.hist, 'Age', alpha = .75)
h.add_legend()
pp = sns.pairplot(data1, hue = 'Survived', palette = 'deep', size=1.2, diag_kind = 'kde', diag_kws=dict(shade=True), plot_kws=dict(s=10) )
pp.set(xticklabels=[])
#helper to plot the Pearson correlation coefficients as a heatmap
def correlation_heatmap(df):
    _ , ax = plt.subplots(figsize =(14, 12))
    colormap = sns.diverging_palette(220, 10, as_cmap = True)

    _ = sns.heatmap(
        df.corr(),
        cmap = colormap,
        square=True,
        cbar_kws={'shrink':.9 },
        ax=ax,
        annot=True,
        linewidths=0.1, vmax=1.0, linecolor='white',
        annot_kws={'fontsize':12 }
    )

    plt.title('Pearson Correlation of Features', y=1.05, size=15)

correlation_heatmap(data1)
7. Step 5: Model the Data
- Since we do not have a perfect solution to this problem, we must experiment with multiple prediction algorithms and pick the one that performs best.
MLA = [ #list of the classifiers we will compare
#Ensemble Methods
ensemble.AdaBoostClassifier(),
ensemble.BaggingClassifier(),
ensemble.ExtraTreesClassifier(),
ensemble.GradientBoostingClassifier(),
ensemble.RandomForestClassifier(),
#Gaussian Processes
gaussian_process.GaussianProcessClassifier(),
#GLM
linear_model.LogisticRegressionCV(),
linear_model.PassiveAggressiveClassifier(),
linear_model.RidgeClassifierCV(),
linear_model.SGDClassifier(),
linear_model.Perceptron(),
#Naive Bayes
naive_bayes.BernoulliNB(),
naive_bayes.GaussianNB(),
#Nearest Neighbor
neighbors.KNeighborsClassifier(),
#SVM
svm.SVC(probability=True),
svm.NuSVC(probability=True),
svm.LinearSVC(),
#Trees
tree.DecisionTreeClassifier(), #decision tree
tree.ExtraTreeClassifier(),
#Discriminant Analysis
discriminant_analysis.LinearDiscriminantAnalysis(),
discriminant_analysis.QuadraticDiscriminantAnalysis(),
#xgboost: http://xgboost.readthedocs.io/en/latest/model.html
XGBClassifier()
]
#split dataset in cross-validation with this splitter class: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html#sklearn.model_selection.ShuffleSplit
#note: this is an alternative to train_test_split
cv_split = model_selection.ShuffleSplit(n_splits = 10, test_size = .3, train_size = .6, random_state = 0 ) # run model 10x with 60/30 split intentionally leaving out 10%
#i.e. 60% of the data for training, 30% for testing, 10% left out in each split
#create table to compare MLA metrics
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', 'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)#DataFrame holding the comparison metrics for each algorithm
#create table to compare MLA predictions
MLA_predict = data1[Target].copy() #start from the true labels; each algorithm's in-sample predictions are appended as columns
#index through MLA and save performance to table
row_index = 0
for alg in MLA:
    #iterate over every algorithm
    #set name and parameters
    MLA_name = alg.__class__.__name__ #algorithm name
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params()) #algorithm parameters
    #score model with cross validation: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html#sklearn.model_selection.cross_validate
    cv_results = model_selection.cross_validate(alg, data1[data1_x_bin], data1[Target], cv = cv_split) #cross validation
    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
    #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean should statistically capture 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3 #let's know the worst that can happen!
    #save MLA predictions - see section 6 for usage
    alg.fit(data1[data1_x_bin], data1[Target]) #fit on the full training data
    MLA_predict[MLA_name] = alg.predict(data1[data1_x_bin]) #store the in-sample predictions
    row_index+=1 #move to the next row
#print and sort table: https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.sort_values.html
MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)#sort by test accuracy, descending
MLA_compare
#MLA_predict
#visualize the mean test accuracy of each algorithm
sns.barplot(x='MLA Test Accuracy Mean', y = 'MLA Name', data = MLA_compare, color = 'm')
#prettify using pyplot: https://matplotlib.org/api/pyplot_api.html
plt.title('Machine Learning Algorithm Accuracy Score \n')
plt.xlabel('Accuracy Score (%)')
plt.ylabel('Algorithm')
- 8. Evaluate Model Performance
- 8.1 Let's recap: with some basic data cleaning, analysis, and machine learning algorithms (MLA), we are able to predict passenger survival with about 82% accuracy. Not bad for a few lines of code. But the question we always ask is, can we do better and, more importantly, get a worthwhile return on the time we invest? For example, if we only gain one tenth of a percent in accuracy, is it really worth three months of development? If you work in research, maybe; if you work in business, almost certainly not. Keep that in mind when improving your model.
- 8.2 Determine a baseline accuracy
- Before we decide how to make our model better, let's determine whether our model is even worth keeping. To do that, we have to go back to the basics of data science. We know this is a binary problem, because there are only two possible outcomes: a passenger either survived or died. So think of it like a coin flip. If you have a fair coin and guess heads or tails, you have a 50-50 chance of guessing correctly. So let's set 50% as the worst model performance, because anything lower than that means I might as well just flip a coin.
Okay, so with no information about the dataset, we can always get 50% on a binary problem. But we do have information about the dataset, so we should be able to do better. We know that 1,502 of 2,224 people, or 67.5%, died. Therefore, if we simply predicted the most frequent outcome, that 100% of passengers died, we would be right 67.5% of the time. So let's set 68% as bad model performance, because anything lower than that means I might as well just predict the most frequent outcome. A minimal sketch of this majority-class baseline follows below.
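As a hedged illustration of this majority-class baseline (not in the original notebook), scikit-learn's DummyClassifier can score it directly; the sketch assumes data1, data1_x_bin, Target, and cv_split are defined as in the sections above.
from sklearn.dummy import DummyClassifier
#'most_frequent' always predicts the majority class, i.e. did not survive
baseline = DummyClassifier(strategy = 'most_frequent')
baseline_cv = model_selection.cross_validate(baseline, data1[data1_x_bin], data1[Target], cv = cv_split)
print('Majority-class baseline test accuracy: {:.2f}%'.format(baseline_cv['test_score'].mean()*100))
#roughly 62% on this training sample, versus the 67.5% quoted above for the full ship's population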
- 8.3 How to create your own model
The next few questions will help us build a hand-made decision tree model.
Question 1: Were you on the Titanic? If yes, then the majority (62%) died. Note that this sample survival rate comes from the training data we split off earlier. If we assume everyone died, our sample accuracy is 62%.
Question 2: Are you male or female? Male: the majority (81%) died. Female: the majority (74%) survived. That gives us 79% accuracy.
Question 3A (continuing down the female branch, count = 314): Are you in Pclass 1, 2, or 3? Pclass 1: the majority (97%) survived; Pclass 2: the majority (92%) survived. Since the dead subgroup is smaller than 10, we will stop going down this branch. Pclass 3 is a 50-50 split between survived and died, so there is no new information to improve our model.
Question 4A (continuing down the female, Pclass 3 branch, count = 144): Did you embark from port C, Q, or S? We gain a little information. For C and Q the majority still survived, so there is no change; also, the dead subgroup is smaller than 10, so we will stop. For S, the majority (63%) died. So, for females in Pclass 3 who embarked at S, we change the prediction from survived to died. Our model accuracy increases to 81%.
Question 5A (continuing down the female, Pclass 3, embarked S branch, count = 88): So far it looks like we made good decisions, and adding another level does not seem to gain much more information. This group has 55 dead and 33 survived; since the majority died, we need a signal that identifies the 33, or a subgroup of them, to switch from died to survived and improve our model's accuracy. We can play with our features. One I found is a fare of 0-8: the majority survived. It's a small 11-to-9 sample, but one often used in statistics. We improve our accuracy slightly, but not enough to move past 82%, so we will stop here.
Question 3B (the male branch, count = 577): Going back to question 2, we know the majority of males died, so we are looking for a feature that identifies a subgroup where the majority survived. Surprisingly, Pclass and even Embarked don't matter the way they did for females, but Title does and gets us to 82%. Guessing and checking other features doesn't seem to push us past 82%, so we will stop here for now.
You did it: with very little information, we reached 82% accuracy. On a worst, bad, good, better, best scale, we will call 82% good, since it's a simple model that yields decent results. But the question remains: can we do better than our hand-made model?
Before we do, let's code what we just described. Please note, this is a manual process created by "hand". You won't have to do this yourself, but it's important to understand it before you start working with MLA. Think of MLA like a TI-89 calculator on a calculus exam: it's very powerful and takes care of a lot of the grunt work, but if you don't know what you're doing on the exam, the calculator, even a TI-89, won't help you pass. So study the next section wisely.
#below: first a random (coin-flip) baseline, then my hand-made decision tree model derived from the analysis above
for index, row in data1.iterrows():
    #random number generator: https://docs.python.org/2/library/random.html
    if random.random() > .5:    # Random float x, 0.0 <= x < 1.0
        data1.loc[index, 'Random_Predict'] = 1 #predict survived/1 (set_value was removed in newer pandas; use .loc instead)
    else:
        data1.loc[index, 'Random_Predict'] = 0 #predict died/0
#score random guess of survival. Use shortcut 1 = Right Guess and 0 = Wrong Guess
#the mean of the column will then equal the accuracy
#random guess of survived vs. died, then score its accuracy
data1['Random_Score'] = 0 #assume prediction wrong
data1.loc[(data1['Survived'] == data1['Random_Predict']), 'Random_Score'] = 1 #set to 1 for correct prediction
print('Coin Flip Model Accuracy: {:.2f}%'.format(data1['Random_Score'].mean()*100))
#we can also use scikit-learn's accuracy_score function to save us a few lines of code
#http://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html#sklearn.metrics.accuracy_score
print('Coin Flip Model Accuracy w/SciKit: {:.2f}%'.format(metrics.accuracy_score(data1['Survived'], data1['Random_Predict'])*100))
#group the data like a pivot table to see survival rates by subgroup
pivot_female = data1[data1.Sex=='female'].groupby(['Sex','Pclass', 'Embarked','FareBin'])['Survived'].mean()
print('Survival Decision Tree w/Female Node: \n',pivot_female)
pivot_male = data1[data1.Sex=='male'].groupby(['Sex','Title'])['Survived'].mean()
print('\n\nSurvival Decision Tree w/Male Node: \n',pivot_male)
#hand-made model; a Microsoft Excel pivot table could compute these group rates just as quickly
def mytree(df):

    #initialize table to store predictions
    Model = pd.DataFrame(data = {'Predict':[]})
    male_title = ['Master'] #survived titles

    for index, row in df.iterrows():

        #Question 1: Were you on the Titanic; majority died
        Model.loc[index, 'Predict'] = 0

        #Question 2: Are you female; majority survived
        if (df.loc[index, 'Sex'] == 'female'):
            Model.loc[index, 'Predict'] = 1

        #Question 3A Female - Class and Question 4 Embarked gain minimum information

        #Question 5B Female - FareBin; set anything less than .5 in female node decision tree back to 0
        if ((df.loc[index, 'Sex'] == 'female') &
            (df.loc[index, 'Pclass'] == 3) &
            (df.loc[index, 'Embarked'] == 'S') &
            (df.loc[index, 'Fare'] > 8)
           ):
            Model.loc[index, 'Predict'] = 0

        #Question 3B Male: Title; set anything greater than .5 to 1 for majority survived
        if ((df.loc[index, 'Sex'] == 'male') &
            (df.loc[index, 'Title'] in male_title)
           ):
            Model.loc[index, 'Predict'] = 1

    return Model
#model data
Tree_Predict = mytree(data1)
print('Decision Tree Model Accuracy/Precision Score: {:.2f}%\n'.format(metrics.accuracy_score(data1['Survived'], Tree_Predict)*100))
#Accuracy Summary Report with http://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report
#Where recall score = (true positives)/(true positive + false negative) w/1 being best:http://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html#sklearn.metrics.recall_score
#And F1 score = weighted average of precision and recall w/1 being best: http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
print(metrics.classification_report(data1['Survived'], Tree_Predict))
import itertools
#custom helper to display the confusion matrix, with optional normalization
'''
cm: the confusion matrix data
classes: the class labels
normalize: whether to normalize each row
title: plot title
cmap: colormap
'''
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize: #normalize each row by its sum
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
# Compute confusion matrix
cnf_matrix = metrics.confusion_matrix(data1['Survived'], Tree_Predict)
np.set_printoptions(precision=4)#precision used when printing the matrix
class_names = ['Dead', 'Survived']
# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names,
title='Confusion matrix, without normalization')
# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
title='Normalized confusion matrix')
9. Hyperparameter Tuning
Since a decision tree exposes parameters such as the minimum number of samples required at a leaf node and the maximum tree depth, we can tune these hyperparameters so the model fits and generalizes better.
dtree = tree.DecisionTreeClassifier(random_state = 0)
base_results = model_selection.cross_validate(dtree, data1[data1_x_bin], data1[Target], cv = cv_split)
dtree.fit(data1[data1_x_bin], data1[Target])
print('BEFORE DT Parameters: ', dtree.get_params())
print("BEFORE DT Training w/bin score mean: {:.2f}". format(base_results['train_score'].mean()*100))
print("BEFORE DT Test w/bin score mean: {:.2f}". format(base_results['test_score'].mean()*100))
print("BEFORE DT Test w/bin score 3*std: +/- {:.2f}". format(base_results['test_score'].std()*100*3))
#print("BEFORE DT Test w/bin set score min: {:.2f}". format(base_results['test_score'].min()*100))
print('-'*10)
#tune hyper-parameters: http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
param_grid = {'criterion': ['gini', 'entropy'], #scoring methodology; two supported formulas for calculating information gain - default is gini
#'splitter': ['best', 'random'], #splitting methodology; two supported strategies - default is best
'max_depth': [2,4,6,8,10,None], #max depth tree can grow; default is none
#'min_samples_split': [2,5,10,.03,.05], #minimum subset size BEFORE new split (fraction is % of total); default is 2
#'min_samples_leaf': [1,5,10,.03,.05], #minimum subset size AFTER new split split (fraction is % of total); default is 1
#'max_features': [None, 'auto'], #max features to consider when performing split; default none or all
'random_state': [0] #seed or control random number generator: https://www.quora.com/What-is-seed-in-random-number-generation
}
#print(list(model_selection.ParameterGrid(param_grid)))
#choose best model with grid_search: #http://scikit-learn.org/stable/modules/grid_search.html#grid-search
#http://scikit-learn.org/stable/auto_examples/model_selection/plot_grid_search_digits.html
tune_model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
tune_model.fit(data1[data1_x_bin], data1[Target])
#print(tune_model.cv_results_.keys())
#print(tune_model.cv_results_['params'])
print('AFTER DT Parameters: ', tune_model.best_params_)
#print(tune_model.cv_results_['mean_train_score'])
print("AFTER DT Training w/bin score mean: {:.2f}". format(tune_model.cv_results_['mean_train_score'][tune_model.best_index_]*100))
#print(tune_model.cv_results_['mean_test_score'])
print("AFTER DT Test w/bin score mean: {:.2f}". format(tune_model.cv_results_['mean_test_score'][tune_model.best_index_]*100))
print("AFTER DT Test w/bin score 3*std: +/- {:.2f}". format(tune_model.cv_results_['std_test_score'][tune_model.best_index_]*100*3))
print('-'*10)
10. Feature Selection
#two ways to improve a model: 1) tune its hyperparameters -- the model has adjustable parameters, and we can sweep several candidate values to find the best combination
#2) re-select the features, e.g. with RFE (recursive feature elimination)
print('BEFORE DT RFE Training Shape Old: ', data1[data1_x_bin].shape)
print('BEFORE DT RFE Training Columns Old: ', data1[data1_x_bin].columns.values)
print("BEFORE DT RFE Training w/bin score mean: {:.2f}". format(base_results['train_score'].mean()*100))
print("BEFORE DT RFE Test w/bin score mean: {:.2f}". format(base_results['test_score'].mean()*100))
print("BEFORE DT RFE Test w/bin score 3*std: +/- {:.2f}". format(base_results['test_score'].std()*100*3))
print('-'*10)
#feature selection with recursive feature elimination (RFECV)
dtree_rfe = feature_selection.RFECV(dtree, step = 1, scoring = 'accuracy', cv = cv_split)
dtree_rfe.fit(data1[data1_x_bin], data1[Target])
#transform x&y to reduced features and fit new model
#alternative: can use pipeline to reduce fit and transform steps: http://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
#retrain on the reduced feature set
X_rfe = data1[data1_x_bin].columns.values[dtree_rfe.get_support()]
rfe_results = model_selection.cross_validate(dtree, data1[X_rfe], data1[Target], cv = cv_split)
#print(dtree_rfe.grid_scores_)
#print the training results after feature selection
print('AFTER DT RFE Training Shape New: ', data1[X_rfe].shape)
print('AFTER DT RFE Training Columns New: ', X_rfe)
print("AFTER DT RFE Training w/bin score mean: {:.2f}". format(rfe_results['train_score'].mean()*100))
print("AFTER DT RFE Test w/bin score mean: {:.2f}". format(rfe_results['test_score'].mean()*100))
print("AFTER DT RFE Test w/bin score 3*std: +/- {:.2f}". format(rfe_results['test_score'].std()*100*3))
print('-'*10)
#tune the rfe model: hyperparameter optimization again, after feature selection
rfe_tune_model = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
rfe_tune_model.fit(data1[X_rfe], data1[Target])
#print(rfe_tune_model.cv_results_.keys())
#print(rfe_tune_model.cv_results_['params'])
print('AFTER DT RFE Tuned Parameters: ', rfe_tune_model.best_params_)
#print(rfe_tune_model.cv_results_['mean_train_score'])
print("AFTER DT RFE Tuned Training w/bin score mean: {:.2f}". format(rfe_tune_model.cv_results_['mean_train_score'][tune_model.best_index_]*100))
#print(rfe_tune_model.cv_results_['mean_test_score'])
print("AFTER DT RFE Tuned Test w/bin score mean: {:.2f}". format(rfe_tune_model.cv_results_['mean_test_score'][tune_model.best_index_]*100))
print("AFTER DT RFE Tuned Test w/bin score 3*std: +/- {:.2f}". format(rfe_tune_model.cv_results_['std_test_score'][tune_model.best_index_]*100*3))
print('-'*10)
#visualize the decision tree with graphviz
import graphviz
dot_data = tree.export_graphviz(dtree, out_file=None,
feature_names = data1_x_bin, class_names = True,
filled = True, rounded = True)
graph = graphviz.Source(dot_data)
graph
11. Step 6: Validate and Implement
#compare how similar the algorithms' predictions are: 1.0 means identical, 0 means uncorrelated
correlation_heatmap(MLA_predict)
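As a hedged helper (not in the original notebook), the two pruning criteria mentioned in the comments below can also be checked programmatically; this sketch assumes the MLA list and the MLA_predict table built in section 7.
#models lacking predict_proba cannot take part in soft voting
no_proba = [alg.__class__.__name__ for alg in MLA if not hasattr(alg, 'predict_proba')]
print('Models without predict_proba:', no_proba)
#pairs of models whose in-sample predictions are (almost) perfectly correlated are redundant in a vote
pred_corr = MLA_predict.corr()
for i, name_i in enumerate(pred_corr.columns):
    for name_j in pred_corr.columns[i+1:]:
        if pred_corr.loc[name_i, name_j] >= 0.99:
            print('{} and {} predictions are {:.0%} correlated'.format(name_i, name_j, pred_corr.loc[name_i, name_j]))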
#why choose just one model when you can pick them all with a voting classifier?
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.VotingClassifier.html
#removed models without the 'predict_proba' attribute required for soft voting, and models with a 1.0 correlation to another model
vote_est = [
#Ensemble Methods: http://scikit-learn.org/stable/modules/ensemble.html
('ada', ensemble.AdaBoostClassifier()),
('bc', ensemble.BaggingClassifier()),
('etc',ensemble.ExtraTreesClassifier()),
('gbc', ensemble.GradientBoostingClassifier()),
('rfc', ensemble.RandomForestClassifier()),
#Gaussian Processes: http://scikit-learn.org/stable/modules/gaussian_process.html#gaussian-process-classification-gpc
('gpc', gaussian_process.GaussianProcessClassifier()),
#GLM: http://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
('lr', linear_model.LogisticRegressionCV()),
#Navies Bayes: http://scikit-learn.org/stable/modules/naive_bayes.html
('bnb', naive_bayes.BernoulliNB()),
('gnb', naive_bayes.GaussianNB()),
#Nearest Neighbor: http://scikit-learn.org/stable/modules/neighbors.html
('knn', neighbors.KNeighborsClassifier()),
#SVM: http://scikit-learn.org/stable/modules/svm.html
('svc', svm.SVC(probability=True)),
#xgboost: http://xgboost.readthedocs.io/en/latest/model.html
('xgb', XGBClassifier())
]
#Hard Vote or majority rules
vote_hard = ensemble.VotingClassifier(estimators = vote_est , voting = 'hard')
#cross-validate, then fit the hard-voting ensemble
vote_hard_cv = model_selection.cross_validate(vote_hard, data1[data1_x_bin], data1[Target], cv = cv_split)
vote_hard.fit(data1[data1_x_bin], data1[Target])
print("Hard Voting Training w/bin score mean: {:.2f}". format(vote_hard_cv['train_score'].mean()*100))
print("Hard Voting Test w/bin score mean: {:.2f}". format(vote_hard_cv['test_score'].mean()*100))
print("Hard Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_hard_cv['test_score'].std()*100*3))
print('-'*10)
#Soft Vote or weighted probabilities
vote_soft = ensemble.VotingClassifier(estimators = vote_est , voting = 'soft')
vote_soft_cv = model_selection.cross_validate(vote_soft, data1[data1_x_bin], data1[Target], cv = cv_split)
vote_soft.fit(data1[data1_x_bin], data1[Target])
print("Soft Voting Training w/bin score mean: {:.2f}". format(vote_soft_cv['train_score'].mean()*100))
print("Soft Voting Test w/bin score mean: {:.2f}". format(vote_soft_cv['test_score'].mean()*100))
print("Soft Voting Test w/bin score 3*std: +/- {:.2f}". format(vote_soft_cv['test_score'].std()*100*3))
print('-'*10)
#WARNING: Running is very computational intensive and time expensive.
#hyperparameter tuning: grid-search the best parameters for each algorithm below; the full sweep takes roughly half an hour
#Code is written for experimental/developmental purposes and not production ready!
#Hyperparameter Tune with GridSearchCV: http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
grid_n_estimator = [10, 50, 100, 300]
grid_ratio = [.1, .25, .5, .75, 1.0]
grid_learn = [.01, .03, .05, .1, .25]
grid_max_depth = [2, 4, 6, 8, 10, None]
grid_min_samples = [5, 10, .03, .05, .10]
grid_criterion = ['gini', 'entropy']
grid_bool = [True, False]
grid_seed = [0]
grid_param = [
[{
#AdaBoostClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
'n_estimators': grid_n_estimator, #default=50
'learning_rate': grid_learn, #default=1
#'algorithm': ['SAMME', 'SAMME.R'], #default=’SAMME.R
'random_state': grid_seed
}],
[{
#BaggingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html#sklearn.ensemble.BaggingClassifier
'n_estimators': grid_n_estimator, #default=10
'max_samples': grid_ratio, #default=1.0
'random_state': grid_seed
}],
[{
#ExtraTreesClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html#sklearn.ensemble.ExtraTreesClassifier
'n_estimators': grid_n_estimator, #default=10
'criterion': grid_criterion, #default=”gini”
'max_depth': grid_max_depth, #default=None
'random_state': grid_seed
}],
[{
#GradientBoostingClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html#sklearn.ensemble.GradientBoostingClassifier
#'loss': ['deviance', 'exponential'], #default=’deviance’
'learning_rate': [.05], #default=0.1 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
'n_estimators': [300], #default=100 -- 12/31/17 set to reduce runtime -- The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 264.45 seconds.
#'criterion': ['friedman_mse', 'mse', 'mae'], #default=”friedman_mse”
'max_depth': grid_max_depth, #default=3
'random_state': grid_seed
}],
[{
#RandomForestClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
'n_estimators': grid_n_estimator, #default=10
'criterion': grid_criterion, #default=”gini”
'max_depth': grid_max_depth, #default=None
'oob_score': [True], #default=False -- 12/31/17 set to reduce runtime -- The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 146.35 seconds.
'random_state': grid_seed
}],
[{
#GaussianProcessClassifier
'max_iter_predict': grid_n_estimator, #default: 100
'random_state': grid_seed
}],
[{
#LogisticRegressionCV - http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html#sklearn.linear_model.LogisticRegressionCV
'fit_intercept': grid_bool, #default: True
#'penalty': ['l1','l2'],
'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'], #default: lbfgs
'random_state': grid_seed
}],
[{
#BernoulliNB - http://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html#sklearn.naive_bayes.BernoulliNB
'alpha': grid_ratio, #default: 1.0
}],
#GaussianNB -
[{}],
[{
#KNeighborsClassifier - http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
'n_neighbors': [1,2,3,4,5,6,7], #default: 5
'weights': ['uniform', 'distance'], #default = ‘uniform’
'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}],
[{
#SVC - http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
#http://blog.hackerearth.com/simple-tutorial-svm-parameter-tuning-python-r
#'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
'C': [1,2,3,4,5], #default=1.0
'gamma': grid_ratio, #default: auto
'decision_function_shape': ['ovo', 'ovr'], #default:ovr
'probability': [True],
'random_state': grid_seed
}],
[{
#XGBClassifier - http://xgboost.readthedocs.io/en/latest/parameter.html
'learning_rate': grid_learn, #default: .3
'max_depth': [1,2,4,6,8,10], #default 2
'n_estimators': grid_n_estimator,
'seed': grid_seed
}]
]
start_total = time.perf_counter() #https://docs.python.org/3/library/time.html#time.perf_counter
for clf, param in zip (vote_est, grid_param): #https://docs.python.org/3/library/functions.html#zip

    #print(clf[1]) #vote_est is a list of tuples, index 0 is the name and index 1 is the algorithm
    #print(param)

    start = time.perf_counter()
    best_search = model_selection.GridSearchCV(estimator = clf[1], param_grid = param, cv = cv_split, scoring = 'roc_auc')
    best_search.fit(data1[data1_x_bin], data1[Target])
    run = time.perf_counter() - start

    best_param = best_search.best_params_
    print('The best parameter for {} is {} with a runtime of {:.2f} seconds.'.format(clf[1].__class__.__name__, best_param, run))
    clf[1].set_params(**best_param) #update each estimator in vote_est with its best parameters

run_total = time.perf_counter() - start_total #total optimization time
print('Total optimization time was {:.2f} minutes.'.format(run_total/60))

print('-'*10)
#results from one run:
'''The best parameter for AdaBoostClassifier is {'learning_rate': 0.1, 'n_estimators': 300, 'random_state': 0} with a runtime of 33.46 seconds.
The best parameter for BaggingClassifier is {'max_samples': 0.25, 'n_estimators': 300, 'random_state': 0} with a runtime of 30.92 seconds.
The best parameter for ExtraTreesClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'random_state': 0} with a runtime of 66.61 seconds.
The best parameter for GradientBoostingClassifier is {'learning_rate': 0.05, 'max_depth': 2, 'n_estimators': 300, 'random_state': 0} with a runtime of 35.08 seconds.
The best parameter for RandomForestClassifier is {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'oob_score': True, 'random_state': 0} with a runtime of 78.74 seconds.
The best parameter for GaussianProcessClassifier is {'max_iter_predict': 10, 'random_state': 0} with a runtime of 1217.29 seconds. #by far the most time-consuming
The best parameter for LogisticRegressionCV is {'fit_intercept': True, 'random_state': 0, 'solver': 'liblinear'} with a runtime of 9.48 seconds.
The best parameter for BernoulliNB is {'alpha': 0.1} with a runtime of 0.23 seconds.
The best parameter for GaussianNB is {} with a runtime of 0.07 seconds.
The best parameter for KNeighborsClassifier is {'algorithm': 'brute', 'n_neighbors': 7, 'weights': 'uniform'} with a runtime of 5.59 seconds.
The best parameter for SVC is {'C': 2, 'decision_function_shape': 'ovo', 'gamma': 0.1, 'probability': True, 'random_state': 0} with a runtime of 30.53 seconds.
The best parameter for XGBClassifier is {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 300, 'seed': 0} with a runtime of 53.97 seconds.
Total optimization time was 26.03 minutes.'''
#Hard Vote or majority rules w/Tuned Hyperparameters
grid_hard = ensemble.VotingClassifier(estimators = vote_est , voting = 'hard')
grid_hard_cv = model_selection.cross_validate(grid_hard, data1[data1_x_bin], data1[Target], cv = cv_split)
grid_hard.fit(data1[data1_x_bin], data1[Target])
print("Hard Voting w/Tuned Hyperparameters Training w/bin score mean: {:.2f}". format(grid_hard_cv['train_score'].mean()*100))
print("Hard Voting w/Tuned Hyperparameters Test w/bin score mean: {:.2f}". format(grid_hard_cv['test_score'].mean()*100))
print("Hard Voting w/Tuned Hyperparameters Test w/bin score 3*std: +/- {:.2f}". format(grid_hard_cv['test_score'].std()*100*3))
print('-'*10)
#Soft Vote or weighted probabilities w/Tuned Hyperparameters
grid_soft = ensemble.VotingClassifier(estimators = vote_est , voting = 'soft')
grid_soft_cv = model_selection.cross_validate(grid_soft, data1[data1_x_bin], data1[Target], cv = cv_split)
grid_soft.fit(data1[data1_x_bin], data1[Target])
print("Soft Voting w/Tuned Hyperparameters Training w/bin score mean: {:.2f}". format(grid_soft_cv['train_score'].mean()*100))
print("Soft Voting w/Tuned Hyperparameters Test w/bin score mean: {:.2f}". format(grid_soft_cv['test_score'].mean()*100))
print("Soft Voting w/Tuned Hyperparameters Test w/bin score 3*std: +/- {:.2f}". format(grid_soft_cv['test_score'].std()*100*3))
print('-'*10)
#prepare the validation (test) data for submission
print(data_val.info())
print("-"*10)
#data_val.sample(10)
#handmade decision tree - submission score = 0.77990
data_val['Survived'] = mytree(data_val).astype(int)
#decision tree w/full dataset modeling submission score: defaults= 0.76555, tuned= 0.77990
#submit_dt = tree.DecisionTreeClassifier()
#submit_dt = model_selection.GridSearchCV(tree.DecisionTreeClassifier(), param_grid=param_grid, scoring = 'roc_auc', cv = cv_split)
#submit_dt.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_dt.best_params_) #Best Parameters: {'criterion': 'gini', 'max_depth': 4, 'random_state': 0}
#data_val['Survived'] = submit_dt.predict(data_val[data1_x_bin])
#bagging w/full dataset modeling submission score: defaults= 0.75119, tuned= 0.77990
#submit_bc = ensemble.BaggingClassifier()
#submit_bc = model_selection.GridSearchCV(ensemble.BaggingClassifier(), param_grid= {'n_estimators':grid_n_estimator, 'max_samples': grid_ratio, 'oob_score': grid_bool, 'random_state': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_bc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_bc.best_params_) #Best Parameters: {'max_samples': 0.25, 'n_estimators': 500, 'oob_score': True, 'random_state': 0}
#data_val['Survived'] = submit_bc.predict(data_val[data1_x_bin])
#extra tree w/full dataset modeling submission score: defaults= 0.76555, tuned= 0.77990
#submit_etc = ensemble.ExtraTreesClassifier()
#submit_etc = model_selection.GridSearchCV(ensemble.ExtraTreesClassifier(), param_grid={'n_estimators': grid_n_estimator, 'criterion': grid_criterion, 'max_depth': grid_max_depth, 'random_state': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_etc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_etc.best_params_) #Best Parameters: {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'random_state': 0}
#data_val['Survived'] = submit_etc.predict(data_val[data1_x_bin])
#random forest w/full dataset modeling submission score: defaults= 0.71291, tuned= 0.73205
#submit_rfc = ensemble.RandomForestClassifier()
#submit_rfc = model_selection.GridSearchCV(ensemble.RandomForestClassifier(), param_grid={'n_estimators': grid_n_estimator, 'criterion': grid_criterion, 'max_depth': grid_max_depth, 'random_state': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_rfc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_rfc.best_params_) #Best Parameters: {'criterion': 'entropy', 'max_depth': 6, 'n_estimators': 100, 'random_state': 0}
#data_val['Survived'] = submit_rfc.predict(data_val[data1_x_bin])
#ada boosting w/full dataset modeling submission score: defaults= 0.74162, tuned= 0.75119
#submit_abc = ensemble.AdaBoostClassifier()
#submit_abc = model_selection.GridSearchCV(ensemble.AdaBoostClassifier(), param_grid={'n_estimators': grid_n_estimator, 'learning_rate': grid_ratio, 'algorithm': ['SAMME', 'SAMME.R'], 'random_state': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_abc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_abc.best_params_) #Best Parameters: {'algorithm': 'SAMME.R', 'learning_rate': 0.1, 'n_estimators': 300, 'random_state': 0}
#data_val['Survived'] = submit_abc.predict(data_val[data1_x_bin])
#gradient boosting w/full dataset modeling submission score: defaults= 0.75119, tuned= 0.77033
#submit_gbc = ensemble.GradientBoostingClassifier()
#submit_gbc = model_selection.GridSearchCV(ensemble.GradientBoostingClassifier(), param_grid={'learning_rate': grid_ratio, 'n_estimators': grid_n_estimator, 'max_depth': grid_max_depth, 'random_state':grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_gbc.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_gbc.best_params_) #Best Parameters: {'learning_rate': 0.25, 'max_depth': 2, 'n_estimators': 50, 'random_state': 0}
#data_val['Survived'] = submit_gbc.predict(data_val[data1_x_bin])
#extreme boosting w/full dataset modeling submission score: defaults= 0.73684, tuned= 0.77990
#submit_xgb = XGBClassifier()
#submit_xgb = model_selection.GridSearchCV(XGBClassifier(), param_grid= {'learning_rate': grid_learn, 'max_depth': [0,2,4,6,8,10], 'n_estimators': grid_n_estimator, 'seed': grid_seed}, scoring = 'roc_auc', cv = cv_split)
#submit_xgb.fit(data1[data1_x_bin], data1[Target])
#print('Best Parameters: ', submit_xgb.best_params_) #Best Parameters: {'learning_rate': 0.01, 'max_depth': 4, 'n_estimators': 300, 'seed': 0}
#data_val['Survived'] = submit_xgb.predict(data_val[data1_x_bin])
#hard voting classifier w/full dataset modeling submission score: defaults= 0.75598, tuned = 0.77990
#data_val['Survived'] = vote_hard.predict(data_val[data1_x_bin])
data_val['Survived'] = grid_hard.predict(data_val[data1_x_bin])
#soft voting classifier w/full dataset modeling submission score: defaults= 0.73684, tuned = 0.74162
#data_val['Survived'] = vote_soft.predict(data_val[data1_x_bin])
#data_val['Survived'] = grid_soft.predict(data_val[data1_x_bin])
#write the submission file
submit = data_val[['PassengerId','Survived']]
submit.to_csv("../working/submit.csv", index=False)
print('Validation Data Distribution: \n', data_val['Survived'].value_counts(normalize = True))
submit.sample(10)
Results:
'''
Validation Data Distribution:
0 0.633971
1 0.366029
Name: Survived, dtype: float64
'''
12. Step 7: Optimize and Strategize
The final results appear to converge around a submission score of 0.77990. Tuning other decision-tree-based implementations on the same dataset (AdaBoost, random forest, gradient boosting, XGBoost, etc.) did not push the submission accuracy above 0.77990. The plain decision tree had the best default submission score, and tuning reached the same best accuracy.
Preprocessing and feature engineering are the most important parts of working with the data, and they are what we need to practice most.
Full source code: https://www.kaggle.com/aishanhang/titanic-study-2-10/