Titanic

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

I. Data Overview

#1. Data overview
path = 'E:\\data\\titanic\\'
train_data = pd.read_csv(path+'train.csv')
test_data = pd.read_csv(path+'test.csv')

sns.set_style('whitegrid')
train_data.head()
#Survived == 1 means the passenger survived
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
#Summary of column types and non-null counts
train_data.info()
#Age, Cabin and Embarked have missing values

RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
#train_data['Fare'].value_counts()
#Fare is 0 for 15 passengers; check their Pclass, since a fare of 0 might actually be legitimate
train_data[train_data.Fare==0]
#They are all male and come from different Pclass values, so a fare of 0 looks wrong; fill these with the median fare of the corresponding Pclass
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
179 180 0 3 Leonard, Mr. Lionel male 36.0 0 0 LINE 0.0 NaN S
263 264 0 1 Harrison, Mr. William male 40.0 0 0 112059 0.0 B94 S
271 272 1 3 Tornquist, Mr. William Henry male 25.0 0 0 LINE 0.0 NaN S
277 278 0 2 Parkes, Mr. Francis "Frank" male NaN 0 0 239853 0.0 NaN S
302 303 0 3 Johnson, Mr. William Cahoone Jr male 19.0 0 0 LINE 0.0 NaN S
413 414 0 2 Cunningham, Mr. Alfred Fleming male NaN 0 0 239853 0.0 NaN S
466 467 0 2 Campbell, Mr. William male NaN 0 0 239853 0.0 NaN S
481 482 0 2 Frost, Mr. Anthony Wood "Archie" male NaN 0 0 239854 0.0 NaN S
597 598 0 3 Johnson, Mr. Alfred male 49.0 0 0 LINE 0.0 NaN S
633 634 0 1 Parr, Mr. William Henry Marsh male NaN 0 0 112052 0.0 NaN S
674 675 0 2 Watson, Mr. Ennis Hastings male NaN 0 0 239856 0.0 NaN S
732 733 0 2 Knight, Mr. Robert J male NaN 0 0 239855 0.0 NaN S
806 807 0 1 Andrews, Mr. Thomas Jr male 39.0 0 0 112050 0.0 A36 S
815 816 0 1 Fry, Mr. Richard male NaN 0 0 112058 0.0 B102 S
822 823 0 1 Reuchlin, Jonkheer. John George male 38.0 0 0 19972 0.0 NaN S
#Plot the proportion of survivors
train_data['Survived'].value_counts().plot.pie(autopct = '%1.2f%%')
#Roughly 38% of passengers survived; the relationship with sex is examined below

II. Handling Missing Values: Fare, Age, Embarked, Cabin

1. If the dataset is large and only a few values are missing, the incomplete rows can simply be dropped (a quick sketch follows item 2 below).

2. If the attribute is not very important to the model, it can be filled with the mean, median, or mode. For example, the port of embarkation (Embarked) takes only three values, so the mode is a natural fill.
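A minimal sketch of option 1, dropping the incomplete rows (illustration only; this notebook keeps those rows and fills them instead):

#drop the two rows whose Embarked value is missing
train_data_dropped = train_data.dropna(subset=['Embarked'])
train_data_dropped.shape  #(889, 12), two rows fewer than the original 891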

train_data['Embarked'].value_counts()
S    644
C    168
Q     77
Name: Embarked, dtype: int64
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])
#The two missing values are filled with the mode, 'S'

3. A nominal attribute is one whose values are symbols or names of things, so we can assign it a representative value. A missing Cabin could genuinely mean the passenger had no cabin, but here the missing values span every class and fare level, and we still want to inspect the correlation with the target later, so for now fill them with the placeholder 'U0'.

train_data['Cabin'] = train_data.Cabin.fillna('U0')
train_data['Cabin'].head()
0      U0
1     C85
2      U0
3    C123
4      U0
Name: Cabin, dtype: object

4. Fare is 0 for 15 passengers who come from different classes and are all male; fill these zeros with the median fare of the corresponding Pclass.

for i in range(3):
    train_data.loc[(train_data.Fare == 0) & (train_data.Pclass ==i+1),'Fare'] = train_data[(train_data.Fare !=0) & (train_data.Pclass ==i+1)].Fare.median()    

5. Fill the missing Age values with a random forest regressor.

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score
#Age is essentially continuous, so use regression
for_age = train_data[['Survived','Sex','SibSp','Parch','Fare','Age']]
for_age['Sex'] = pd.Categorical(for_age.Sex).codes #encode Sex as integer codes

train_age = for_age[for_age.Age.notnull()]
x_train, x_test, y_train, y_test = train_test_split(train_age.iloc[:,:-1], train_age.iloc[:,-1:], test_size=0.2, random_state=2)
test_age = for_age[for_age.Age.isnull()]
x = test_age.iloc[:,:-1]
rfr = RandomForestRegressor(n_estimators=50, max_depth=6, n_jobs=-1)# 
rfr.fit(x_train,y_train)
#check model error on the hold-out split
y_test_hat = rfr.predict(x_test)
print(mean_squared_error(y_test,y_test_hat))
y_hat = rfr.predict(x)
train_data.loc[train_data.Age.isnull(), 'Age'] = y_hat


146.06308221585957

III. Analysing Feature-Target Relationships: with preprocessing done, examine how each feature relates to the target

1. Sex and survival

train_data.groupby(['Sex','Survived'])['Survived'].count()
Sex     Survived
female  0            81
        1           233
male    0           468
        1           109
Name: Survived, dtype: int64
train_data.groupby(['Sex'])['Survived'].count()
Sex
female    314
male      577
Name: Survived, dtype: int64
train_data[['Sex','Survived']].head()
Sex Survived
0 male 0
1 female 1
2 female 1
3 female 1
4 male 0
train_data[['Sex','Survived']].groupby(['Sex']).mean().plot.bar()
#e.g. with 100 female samples, 30 not survived (0) and 70 survived (1), the mean is 0.7, i.e. the survival rate
#The female survival rate is far higher than the male rate, reflecting the 'ladies first' evacuation

2. Passenger class (Pclass) and survival

train_data[['Pclass','Survived']].groupby(['Pclass']).count()
Survived
Pclass
1 216
2 184
3 491
train_data[['Pclass','Survived']].groupby(['Pclass']).mean().plot.bar()
#The higher the class, the better the chance of being rescued

train_data[['Pclass','Survived','Sex']].groupby(['Pclass','Sex']).mean().plot.bar()
#add Sex to the grouping

train_data.groupby(['Sex','Pclass','Survived'])['Survived'].count()
#Overall the survival rate rises with class; the female rate is already high and rises further in the upper classes
Sex     Pclass  Survived
female  1       0             3
                1            91
        2       0             6
                1            70
        3       0            72
                1            72
male    1       0            77
                1            45
        2       0            91
                1            17
        3       0           300
                1            47
Name: Survived, dtype: int64

3. Age and survival

#Age distribution versus survival, split by Pclass and by Sex
fig, ax = plt.subplots(1,2, figsize=(18,8))
sns.violinplot('Pclass','Age',hue='Survived', data=train_data, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(1,110,10))

sns.violinplot('Sex', 'Age', hue='Survived', data=train_data, split=True, ax=ax[1])
ax[1].set_title('sex and age vs Survived')
ax[1].set_yticks(range(1,110,10))

plt.show()

#Age histogram and box plot
plt.figure(figsize=(12,5))
plt.subplot(121)
train_data['Age'].hist(bins=30)
plt.xlabel('age')
plt.ylabel('num')

plt.subplot(122)
train_data.boxplot(column='Age', showfliers=False)
plt.show()

#Survival distribution across ages
facet = sns.FacetGrid(train_data, hue='Survived', aspect=4)
facet.map(sns.kdeplot,'Age',shade=True)
facet.set(xlim=(0, train_data['Age'].max()))
facet.add_legend()

#Survival rate at each age
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
train_data['Age_int'] = train_data['Age'].astype(int)
avg_age = train_data[['Age_int','Survived']].groupby(['Age_int'],as_index=False).mean()
sns.barplot(x='Age_int', y='Survived',data=avg_age)

train_data['Age'].describe()
count    891.000000
mean      29.720748
std       13.384343
min        0.420000
25%       22.000000
50%       29.000000
75%       36.000000
max       80.000000
Name: Age, dtype: float64
#Split into four groups: children, teenagers, adults and the elderly
bins=[0,12,18,65,100]
train_data['Age_group'] = pd.cut(train_data['Age'],bins)
by_age = train_data[['Age_group','Survived']].groupby(['Age_group']).mean()
by_age
Survived
Age_group
(0, 12] 0.560000
(12, 18] 0.430556
(18, 65] 0.364130
(65, 100] 0.125000
by_age.plot.bar()
#The older the age group, the lower the survival rate

4. Title (extracted from Name) and survival

train_data['Title'] = train_data['Name'].str.extract('([A-Za-z]+)\.',expand=False)
pd.crosstab(train_data['Title'],train_data['Sex'])
Sex female male
Title
Capt 0 1
Col 0 2
Countess 1 0
Don 0 1
Dr 1 6
Jonkheer 0 1
Lady 1 0
Major 0 2
Master 0 40
Miss 182 0
Mlle 2 0
Mme 1 0
Mr 0 517
Mrs 125 0
Ms 1 0
Rev 0 6
Sir 0 1
train_data[['Title','Survived']].groupby(['Title']).mean().plot.bar()
#Survival rate for each title

fig, axist = plt.subplots(1,1, figsize=(18,4))
train_data['Name_len'] = train_data['Name'].apply(len)
name_len = train_data[['Name_len','Survived']].groupby(['Name_len'],as_index=False).mean()
sns.barplot(x='Name_len',y='Survived',data=name_len)
#The data suggest name length does have some relationship with survival

5. Siblings/spouses aboard (SibSp) and survival

sibsp = train_data[train_data['SibSp'] != 0]
no_sibsp = train_data[train_data['SibSp'] == 0]

plt.figure(figsize=(10,5))
plt.subplot(121)
sibsp['Survived'].value_counts().plot.pie(labels=['No Survived','Survived'],autopct='%1.1f%%')
plt.xlabel('sibsp')

plt.subplot(122)
no_sibsp['Survived'].value_counts().plot.pie(labels=['No Survived','Survived'],autopct='%1.1f%%')
plt.xlabel('no_sibsp')
#Passengers with siblings or spouses aboard have a somewhat higher survival rate, perhaps because they could help each other
Text(0.5,0,'no_sibsp')

6. Parents/children aboard (Parch) and survival

parch = train_data[train_data['Parch'] != 0]
no_parch = train_data[train_data['Parch'] == 0]

plt.figure(figsize=(10,5))
plt.subplot(121)
parch['Survived'].value_counts().plot.pie(labels=['No Survived','Survived'],autopct='%1.1f%%')
plt.xlabel('parch')

plt.subplot(122)
no_parch['Survived'].value_counts().plot.pie(labels=['No Survived','Survived'],autopct='%1.1f%%')
plt.xlabel('no_parch')
plt.show()
#The conclusion is similar to the SibSp case

7. Number of relatives aboard (SibSp & Parch) and survival

fig,ax = plt.subplots(1,2,figsize=(18,8))
train_data[['SibSp','Survived']].groupby(['SibSp']).mean().plot.bar(ax=ax[0])
ax[0].set_title('SibSp and Survived')

train_data[['Parch','Survived']].groupby(['Parch']).mean().plot.bar(ax=ax[1])
ax[1].set_title('Parch and Survived')
plt.show()
#Too many relatives seems to become a burden

train_data['people num'] = train_data['SibSp'] + train_data['Parch'] +1
train_data[['people num','Survived']].groupby(['people num']).mean().plot.bar()
#Travelling alone lowers the survival rate, and very large families also have lower survival rates

8. Fare distribution and survival

plt.figure(figsize=(10,5))
train_data['Fare'].hist(bins=100)

train_data.boxplot(column='Fare', by='Pclass',showfliers=False)
plt.show()

train_data['Fare'].describe()
count    891.000000
mean      32.689318
std       49.611639
min        4.012500
25%        7.925000
50%       14.500000
75%       31.275000
max      512.329200
Name: Fare, dtype: float64
fns = train_data['Fare'][train_data['Survived']==0]
fs = train_data['Fare'][train_data['Survived']==1]

avg_fare = pd.DataFrame([fns.mean(),fs.mean()])
std_fare = pd.DataFrame([fns.std(),fs.std()])

avg_fare.plot.bar(yerr=std_fare, legend=False)
#Survivors paid a higher average fare than non-survivors

9. Cabin and survival

#With this many missing values the feature could simply be dropped; here we only look at whether a cabin is recorded
train_data['has cabin'] = train_data['Cabin'].apply(lambda x:0 if x=='U0' else 1)
plt.figure(figsize=(10,7))
train_data[['has cabin','Survived']].groupby(['Survived']).mean().plot.bar()
train_data[['has cabin','Survived']].groupby(['has cabin']).mean().plot.bar()
#In the first plot about 12% of non-survivors have a recorded cabin versus nearly 40% of survivors; in the second, survival is about 0.3 without a cabin and nearly 0.7 with one





#Analyse the different cabin deck letters
train_data['cat_cabin'] = train_data['Cabin'].map(lambda x:re.compile('([a-zA-Z]+)').search(x).group())
#map the deck letters to integer codes
train_data['cat_cabin'] = pd.factorize(train_data['cat_cabin'])[0]
train_data[['cat_cabin','Survived']].groupby(['cat_cabin']).mean().plot.bar()

train_data[['cat_cabin','Survived']].groupby(['cat_cabin']).mean().plot.bar()
#This is essentially the same plot as above, just a matter of ordering: survival rates differ a little across decks but show no clear pattern, so the feature could be dropped

10. Port of embarkation (Embarked) and survival

#The Titanic sailed from Southampton, England, calling at Cherbourg, France and Queenstown, Ireland; passengers who boarded before Queenstown could have disembarked at Cherbourg or Queenstown and so never faced the disaster.
sns.countplot('Embarked',hue='Survived',data=train_data)
plt.title('Embarked and Survived')
plt.show()

sns.factorplot('Embarked','Survived',data=train_data, size=3, aspect=3)
plt.title('Embarked and Survived rate')
plt.show()
#Passengers who embarked at S have the lowest survival rate and those from C the highest

IV. Qualitative and Quantitative Transformations

#One-hot encoding suits categorical or binary variables; too many categories is undesirable because it adds too many feature columns
embark_dummies = pd.get_dummies(train_data['Embarked'])
train_data = train_data.join(embark_dummies)
embark_dummies = train_data[['S','C','Q']]
#For high-cardinality variables such as Cabin, encode with pd.factorize
#fill the missing values first
train_data.loc[train_data['Cabin'].isnull(), 'Cabin'] = 'U0'
#extract the deck letter, then encode it
train_data['cat_cabin'] = train_data['Cabin'].map(lambda x: re.compile('([a-zA-Z]+)').search(x).group())
train_data['cat_cabin_code'] = pd.factorize(train_data['cat_cabin'])[0]
Series([], Name: Cabin, dtype: object)

Quantitative transformations

1. Scaling: standardization / normalization

from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
train_data['age_s'] = scaler.fit_transform(train_data['Age'].values.reshape(-1,1))
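For reference, StandardScaler computes a z-score. A quick hand-rolled equivalent, plus min-max normalization (the other common scaling), would look roughly like this; illustration only, the scaled column above is what the notebook actually uses:

#manual z-score, which is what StandardScaler does (it uses the population std, ddof=0)
age_z = (train_data['Age'] - train_data['Age'].mean()) / train_data['Age'].std(ddof=0)
#min-max normalization, rescaling to the [0, 1] range, for comparison
age_mm = (train_data['Age'] - train_data['Age'].min()) / (train_data['Age'].max() - train_data['Age'].min())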
#bin Fare into quantiles
train_data['Fare_bin'] = pd.qcut(train_data['Fare'],5)
#encode the bins
train_data['Fare_bin_id'] = pd.factorize(train_data['Fare_bin'])[0]

#One-hot encoding would also work, but it produces several extra columns here
fare_one_hot = pd.get_dummies(train_data['Fare_bin']).rename(columns=lambda x: 'Fare'+ str(x))
#concatenate; join also works
a = train_data.join(fare_one_hot)
#pd.concat works as well
b = pd.concat([train_data,fare_one_hot],axis=1)

V. Feature Engineering

#Process the training and test sets together so that both get the same data types and distributions
train_df_org = pd.read_csv('E:\\data\\titanic\\train.csv')
test_df_org = pd.read_csv('E:\\data\\titanic\\test.csv')
test_df_org['Survived'] = None
combined_data = train_df_org.append(test_df_org)
PassengerId = test_df_org['PassengerId']

1.Embarked

#This feature has only a few missing values, so fill them with the mode
combined_data['Embarked'].fillna(combined_data['Embarked'].mode().iloc[0],inplace=True)
#or, equivalently:
# combined_data[combined_data['Embarked'].isnull()]['Embarked'] = combined_data['Embarked'].mode()[0]
#Either pd.get_dummies or pd.factorize can then be used; the latter is chosen here because it simplifies handling the other variables later
combined_data['Embarked'] = pd.factorize(combined_data['Embarked'])[0]
# pd.get_dummies would give the one-hot encoding instead:
# emb_dummies_df = pd.get_dummies(combined_data['Embarked'], prefix=combined_data[['Embarked']].columns[0])
# combined_data = pd.concat([combined_data, emb_dummies_df], axis=1)

2.Sex

#One-hot encoding would normally be used, but factorize is applied here as well to keep later analysis of the other variables simple
combined_data['Sex'] = pd.factorize(combined_data['Sex'])[0]
#one-hot alternative:
#sex_one_hot = pd.get_dummies(combined_data['Sex'],prefix=combined_data[['Sex']].columns[0]) #prefix renames the new columns
#pd.concat([combined_data, sex_one_hot],axis=1)

3.Name

#Extract the title from the name; the space in the pattern is required, otherwise the next step fails
combined_data['Title'] = combined_data['Name'].map(lambda x: re.compile(", (.*?)\.").findall(x)[0])
title_Dict = {}
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
title_Dict.update(dict.fromkeys(['Don', 'Sir', 'the Countess', 'Dona', 'Lady'], 'Royalty'))
title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs'], 'Mrs'))
title_Dict.update(dict.fromkeys(['Mlle', 'Miss'], 'Miss'))
title_Dict.update(dict.fromkeys(['Mr'], 'Mr'))
title_Dict.update(dict.fromkeys(['Master','Jonkheer'], 'Master'))

combined_data['Title'] = combined_data['Title'].map(title_Dict)
#encode the titles
combined_data['Title'] = pd.factorize(combined_data['Title'])[0]
#add a name-length feature
combined_data['Name_len'] = combined_data['Name'].map(len)

4.Fare

# First fill the missing fares with the mean fare of the corresponding Pclass, then smooth the fares that are 0
combined_data['Fare'] = combined_data['Fare'].fillna(combined_data.groupby('Pclass')['Fare'].transform('mean'))
for i in range(3):
    combined_data.loc[(combined_data['Pclass'] == i+1) & (combined_data['Fare']==0),'Fare'] = combined_data[(combined_data['Pclass'] == i+1) & (combined_data['Fare']!=0)]['Fare'].min()/10
#Many tickets are shared (group tickets), so the ticket fare needs to be split across the passengers travelling on it
#Group Fare by Ticket and record, for each row, how many passengers share that ticket
combined_data['Group_Ticket'] = combined_data['Fare'].groupby(by=combined_data['Ticket']).transform('count')
combined_data['Fare'] = combined_data['Fare'] / combined_data['Group_Ticket']
combined_data.drop(['Group_Ticket'], axis=1, inplace=True)
#Bin the fares, then encode the bins
combined_data['Fare_bin'] = pd.qcut(combined_data['Fare'],5)
combined_data['Fare_id'] = pd.factorize(combined_data['Fare_bin'])[0]
combined_data.drop(['Fare_bin'],axis=1,inplace=True)

5.Pclass

#Pclass usually needs no extra processing, but within each class the fare level may also relate to evacuation order, so split each class into a high-fare and a low-fare category
from sklearn.preprocessing import LabelEncoder

pc1_mean = combined_data[['Pclass','Fare']].groupby(['Pclass']).mean().values[0][0]
pc2_mean = combined_data[['Pclass','Fare']].groupby(['Pclass']).mean().values[1][0]
pc3_mean = combined_data[['Pclass','Fare']].groupby(['Pclass']).mean().values[2][0]

def Pclass_fare_category(df,pc1_mean,pc2_mean,pc3_mean):
    if df['Pclass'] == 1:
        if df['Fare'] > pc1_mean:
            return 'pc1_high'
        else:
            return 'pc1_low'
    
    elif df['Pclass'] == 2:
        if df['Fare'] > pc2_mean:
            return 'pc2_high'
        else:
            return 'pc2_low'
    
    elif df['Pclass'] == 3:
        if df['Fare'] > pc3_mean:
            return 'pc3_high'
        else:
            return 'pc3_low'

combined_data['Pclass_Fare_Category'] = combined_data.apply(Pclass_fare_category,args=(pc1_mean,pc2_mean,pc3_mean),axis=1)
combined_data['Pclass_Fare_Category'] = LabelEncoder().fit_transform(combined_data['Pclass_Fare_Category'])
#encode Pclass as well while we are at it
combined_data['Pclass'] = LabelEncoder().fit_transform(combined_data['Pclass'])

6. SibSp and Parch: combine into a family size feature (adding 1 for the passenger themselves)

def Family_size_category(family_size):
    if family_size == 1:
        return '1'
    elif family_size <= 4:
        return '2'
    else:
        return '3'

combined_data['Family_size'] = combined_data['SibSp'] + combined_data['Parch'] + 1
combined_data['Family_size_categroy'] = combined_data['Family_size'].map(Family_size_category)
combined_data['Family_size_categroy'] = LabelEncoder().fit_transform(combined_data['Family_size_categroy'])

7. Age: too many values are missing for a simple fill, so predict them with a small model ensemble

age = combined_data[['Age','Sex','Embarked','Pclass','Pclass_Fare_Category','Title','Name_len','Fare','Fare_id','Family_size','Family_size_categroy']]
age_train = age[age['Age'].notnull()]
age_test = age[age['Age'].isnull()]
#build several models for the prediction
from sklearn import ensemble
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

def fill_age(age_train,age_test):
    age_train_x = age_train.drop(['Age'],axis=1)
    age_train_y = age_train['Age']
    age_test_x = age_test.drop(['Age'],axis=1)
    
    #model 1 gbm
    print('==========Model GBM==========')
    gbm_reg = GradientBoostingRegressor(random_state=41)
    gbm_param = {'n_estimators':[2000], 'max_depth':[4], 'learning_rate':[0.01], 'max_features':[3]}
    gbm_grid = model_selection.GridSearchCV(gbm_reg, gbm_param, cv=10, n_jobs=25, verbose=1, scoring='neg_mean_squared_error')
    gbm_grid.fit(age_train_x, age_train_y)
    print('GBM best features for age params'+str(gbm_grid.best_params_))
    print('GBM best features for age scores'+str(gbm_grid.best_score_))
    print('GBM train error for age feature regressor'+str(gbm_grid.score(age_train_x, age_train_y)))
    print(age_train_x.shape,age_test_x.shape)
    age_test.loc[:, 'Age_gbm'] = gbm_grid.predict(age_test_x)
    print(age_test['Age_gbm'][:4])
    
    #model 2 rf
    print('==========Model RF==========')
    rf_reg = RandomForestRegressor()
    rf_param = {'n_estimators':[200], 'max_depth':[5], 'random_state':[0]}
    rf_grid = model_selection.GridSearchCV(rf_reg,rf_param, cv=10, n_jobs=25, verbose=1, scoring='neg_mean_squared_error')
    rf_grid.fit(age_train_x,age_train_y)
    print('RF best features for age params'+ str(rf_grid.best_params_))
    print('RF best features for age score'+ str(rf_grid.best_score_))
    print('RF train error for age feature regressor'+ str(rf_grid.score(age_train_x,age_train_y)))
    
    age_test.loc[:, 'Age_rf'] = rf_grid.predict(age_test_x)
    print(age_test['Age_rf'][:4])
    
    #merge model
    print('==========Merge Model==========')
    print('shape',age_test['Age'].shape,age_test[['Age_gbm','Age_rf']].mode(axis=1).shape)
    
    age_test.loc[:,'Age'] = np.mean([age_test['Age_gbm'], age_test['Age_rf']], axis=0) #element-wise average of the two models' predictions
    print('merge age:\n',age_test['Age'][:4])
    age_test.drop(['Age_gbm','Age_rf'],axis=1,inplace=True)
    
    return age_test
combined_data.loc[combined_data.Age.isnull(), 'Age'] = fill_age(age_train,age_test)['Age']
==========Model GBM==========
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=25)]: Done   5 out of  10 | elapsed:   35.2s remaining:   35.2s
[Parallel(n_jobs=25)]: Done  10 out of  10 | elapsed:   47.2s finished


GBM best features for age params{'learning_rate': 0.01, 'max_depth': 4, 'max_features': 3, 'n_estimators': 2000}
GBM best features for age scores-130.07827342623582
GBM train error for age feature regressor-63.488346095073126
(1046, 10) (263, 10)
5     36.266057
17    29.768299
19    37.534189
26    27.857612
Name: Age_gbm, dtype: float64
==========Model RF==========
Fitting 10 folds for each of 1 candidates, totalling 10 fits


[Parallel(n_jobs=25)]: Done   5 out of  10 | elapsed:   37.7s remaining:   37.7s
[Parallel(n_jobs=25)]: Done  10 out of  10 | elapsed:   47.6s finished


RF best features for age params{'max_depth': 5, 'n_estimators': 200, 'random_state': 0}
RF best features for age score-120.22123963994939
RF train error for age feature regressor-96.82435399344224
5     32.667672
17    31.516429
19    31.493906
26    27.854183
Name: Age_rf, dtype: float64
==========Merge Model==========
shape (263,) (263, 2)
merge age:
 5     29.841298
17    29.841298
19    29.841298
26    29.841298
Name: Age, dtype: float64

8. Ticket: the values mix letters and digits, so separate the letter prefix from the number

combined_data['Ticket_letter'] = combined_data['Ticket'].str.split().str[0]
combined_data['Ticket_letter'] = combined_data['Ticket_letter'].apply(lambda x:'U0' if x.isnumeric() else x)
#encode the ticket prefix
combined_data['Ticket_letter'] = LabelEncoder().fit_transform(combined_data['Ticket_letter'])

9. Cabin: too many missing values, so reduce it to whether a cabin is recorded or not

combined_data.loc[combined_data['Cabin'].isnull(),'Cabin'] = 'U0'
combined_data['Cabin'] = combined_data['Cabin'].apply(lambda x:0 if x == 'U0' else 1)
combined_data['Cabin'].value_counts()
0    1014
1     295
Name: Cabin, dtype: int64
combined_data.rename(columns={'Family_size_categroy':'Family_size_category'},inplace=True)

VI. Correlation Between Features

correlation = combined_data[['Age','Cabin','Embarked','Fare','Pclass','Sex','Title','Name_len','Fare_id','Pclass_Fare_Category','Family_size','Family_size_category','Ticket_letter']]
colormap = plt.cm.viridis
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features',y=1.05,size=15)
sns.heatmap(correlation.astype(float).corr(),lw=0.1, vmax=1.0, square=True, cmap = colormap, linecolor = 'white', annot=True)
plt.show()

#Pairwise distribution plots of the features
g = sns.pairplot(combined_data[[u'Survived', u'Pclass', u'Sex', u'Age', u'Fare', u'Embarked',u'Family_size', u'Title', u'Ticket_letter']], hue='Survived', palette='seismic', size=1.2, diag_kind='kde', diag_kws=dict(shade=True), plot_kws=dict(s=10))
g.set(xticklabels=[])

VII. Final Processing Before Modelling

#Standardize Age, Fare and Name_len
combined_data[['Age','Fare','Name_len']] = preprocessing.StandardScaler().fit_transform(combined_data[['Age','Fare','Name_len']])
#back up
combined_data_backup = combined_data
#restore from the backup
combined_data = combined_data_backup
#optionally set PassengerId as the index
# combined_data.set_index('PassengerId',inplace=True)
#Ticket_letter has many categories; it could be binned before one-hot encoding
pd.cut(combined_data['Ticket_letter'],5)

combined_data.shape
(1309, 40)
#One-hot encode the categorical variables in one pass and drop the columns that will not enter the model
get_dummies = combined_data[['Cabin','Embarked','Pclass','Sex','Title','Fare_id','Pclass_Fare_Category','Family_size_category']]
ohe_list = ['Cabin','Embarked','Pclass','Sex','Title','Fare_id','Pclass_Fare_Category','Family_size_category','Ticket_letter']
drop_list = ohe_list+['Name','Parch','SibSp','Ticket']

for i in ohe_list:
    ohe = pd.get_dummies(combined_data[i]).rename(columns=lambda x: i+str(x))
    combined_data = pd.concat([combined_data,ohe],axis=1)
combined_data.drop(drop_list,axis=1,inplace=True)
#Split back into training and test sets
train_data = combined_data[combined_data.Survived.notnull()]
test_data = combined_data[combined_data.Survived.isnull()]

feature_train_data_x = train_data.drop('Survived',axis=1)
feature_train_data_y = train_data['Survived'].astype('int')
feature_test_data_x = test_data.drop('Survived',axis=1)

VIII. Model Ensembling and Testing

#1. Use several different models to rank the features and keep the more important ones
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,ExtraTreesClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

def get_top_n_features(train_x,train_y,get_feature_num):
    
    #With Extra Trees, n_jobs=25 raised an error here, so n_jobs=1 is used instead
    print('========== Extra Trees ==========')
    et_est = ExtraTreesClassifier(random_state=0)
    et_param = {'n_estimators':[500],'min_samples_split':[3,4],'max_depth':[20]}
    et_grid = model_selection.GridSearchCV(et_est, et_param, n_jobs=1, cv=10, verbose=1)
    et_grid.fit(train_x,train_y)
    print('best ET params for top n features'+str(et_grid.best_params_))
    print('best ET score for top n features'+str(et_grid.best_score_))
    print('best ET training score for top n features'+str(et_grid.score(train_x,train_y)))
    top_n_features_et_sorted = pd.DataFrame({'feature':list(train_x),'importance':et_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_et = top_n_features_et_sorted.head(get_feature_num)['feature']
    print('Sample 10 features from Extra Trees')
    print(str(top_n_features_et[:10]))
    
    print('========== Gradient Boosting ==========')
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param = {'n_estimators':[500],'learning_rate':[0.01,0.1],'max_depth':[20]}
    gb_grid = model_selection.GridSearchCV(gb_est, gb_param, n_jobs=25, cv=10, verbose=1)
    gb_grid.fit(train_x,train_y)
    print('best GB params for top n features'+str(gb_grid.best_params_))
    print('best GB score for top n features'+str(gb_grid.best_score_))
    print('best GB training score for top n features'+str(gb_grid.score(train_x,train_y)))
    top_n_features_gb_sorted = pd.DataFrame({'feature':list(train_x),'importance':gb_grid.best_estimator_.feature_importances_}).sort_values('importance',ascending=False)
    top_n_features_gb = top_n_features_gb_sorted.head(get_feature_num)['feature']
    print('Sample 10 features from Gradient Boosting')
    print(str(top_n_features_gb[:10]))
    
    print('========== Decision Tree ==========')
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param = {'min_samples_split':[2,4],'max_depth':[20]}
    dt_grid = model_selection.GridSearchCV(dt_est, dt_param, n_jobs=25, cv=10, verbose=1)
    dt_grid.fit(train_x,train_y)
    print('best DT params for top n features'+str(dt_grid.best_params_))
    print('best DT score for top n features'+str(dt_grid.best_score_))
    print('best DT training score for top n features'+str(dt_grid.score(train_x, train_y)))
    top_n_features_dt_sorted = pd.DataFrame({'feature':list(train_x),'importance':dt_grid.best_estimator_.feature_importances_}).sort_values('importance',ascending=False)
    top_n_features_dt = top_n_features_dt_sorted.head(get_feature_num)['feature']
    print('Sample 10 features from Decision Tree')
    print(str(top_n_features_dt[:10]))

    print('========== Random Forest ==========')
    rf_est = RandomForestClassifier(random_state=0)
    rf_param = {'n_estimators':[500],'min_samples_split':[2,3],'max_depth':[20]}
    rf_grid = model_selection.GridSearchCV(rf_est, rf_param, n_jobs=25, cv=10, verbose=1)
    rf_grid.fit(train_x,train_y)
    print('best RF params for top n features'+str(rf_grid.best_params_))
    print('best RF score for top n features'+str(rf_grid.best_score_))
    print('best RF training score for top n features'+str(rf_grid.score(train_x,train_y)))
    top_n_features_rf_sorted = pd.DataFrame({'feature':list(train_x),'importance':rf_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_rf = top_n_features_rf_sorted.head(get_feature_num)['feature']
    print('Sample 10 Features from Random Forest')
    print(str(top_n_features_rf[:10]))
        
    print('========== AdaBoost ==========')
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param = {'n_estimators':[500], 'learning_rate':[0.01,0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est, ada_param, n_jobs=25, cv=10, verbose=1)
    ada_grid.fit(train_x, train_y)
    print('best Ada params for top n features'+str(ada_grid.best_params_))
    print('best Ada score for top n features'+str(ada_grid.best_score_))
    print('best Ada training score for top n features'+str(ada_grid.score(train_x,train_y)))
    top_n_features_ada_sorted = pd.DataFrame({'feature':list(train_x),'importance':ada_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_ada = top_n_features_ada_sorted.head(get_feature_num)['feature']
    print('Sample 10 Features from AdaBoost')
    print(str(top_n_features_ada[:10]))
    
    
    
    #model merge
    top_n_feature = pd.concat([top_n_features_rf,top_n_features_ada,top_n_features_et,top_n_features_gb,top_n_features_dt],ignore_index=True).drop_duplicates()
    features_importance = pd.concat([top_n_features_rf_sorted,top_n_features_ada_sorted,top_n_features_et_sorted,top_n_features_gb_sorted,top_n_features_dt_sorted],ignore_index=True)
    
    return top_n_feature, features_importance
#2. Combine the models' rankings to get the best features
get_features_num = 30
top_n_feature, features_importance = get_top_n_features(feature_train_data_x, feature_train_data_y, get_features_num)
========== Extra Trees ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   16.7s finished


best ET params for top n features{'max_depth': 20, 'min_samples_split': 4, 'n_estimators': 500}
best ET score for top n features0.8271604938271605
best ET training score for top n features0.9652076318742986
Sample 10 features from Extra Trees
14      Title0
12        Sex0
13        Sex1
2     Name_len
0          Age
1         Fare
11     Pclass2
16      Title2
15      Title1
4       Cabin0
Name: feature, dtype: object
========== Gradient Boosting ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=25)]: Done  13 out of  20 | elapsed:  1.3min remaining:   42.1s
[Parallel(n_jobs=25)]: Done  20 out of  20 | elapsed:  1.6min finished


best GB params for top n features{'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 500}
best GB score for top n features0.7654320987654321
best GB training score for top n features0.9977553310886644
Sample 10 features from Gradient Boosting
0                       Age
2                  Name_len
1                      Fare
14                   Title0
10                  Pclass1
16                   Title2
13                     Sex1
3               Family_size
28    Pclass_Fare_Category3
12                     Sex0
Name: feature, dtype: object
========== Decision Tree ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=25)]: Done  13 out of  20 | elapsed:  1.1min remaining:   36.3s
[Parallel(n_jobs=25)]: Done  20 out of  20 | elapsed:  1.5min finished


best DT params for top n features{'max_depth': 20, 'min_samples_split': 4}
best DT score for top n features0.7643097643097643
best DT training score for top n features0.9618406285072951
Sample 10 features from Decision Tree
14                   Title0
1                      Fare
0                       Age
2                  Name_len
3               Family_size
19                   Title5
29    Pclass_Fare_Category4
10                  Pclass1
22                 Fare_id2
4                    Cabin0
Name: feature, dtype: object
========== Random Forest ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=25)]: Done  13 out of  20 | elapsed:  1.3min remaining:   41.9s
[Parallel(n_jobs=25)]: Done  20 out of  20 | elapsed:  1.8min finished


best RF params for top n features{'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 500}
best RF score for top n features0.8294051627384961
best RF training score for top n features0.9809203142536476
Sample 10 Features from Random Forest
2        Name_len
0             Age
1            Fare
13           Sex1
14         Title0
12           Sex0
3     Family_size
11        Pclass2
16         Title2
15         Title1
Name: feature, dtype: object
========== AdaBoost ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits


[Parallel(n_jobs=25)]: Done  13 out of  20 | elapsed:  1.4min remaining:   43.8s
[Parallel(n_jobs=25)]: Done  20 out of  20 | elapsed:  1.7min finished


best Ada params for top n features{'learning_rate': 0.01, 'n_estimators': 500}
best Ada score for top n features0.8148148148148148
best Ada training score for top n features0.8170594837261503
Sample 10 Features from AdaBoost
14                   Title0
1                      Fare
33    Family_size_category2
3               Family_size
11                  Pclass2
12                     Sex0
13                     Sex1
4                    Cabin0
5                    Cabin1
2                  Name_len
Name: feature, dtype: object
#Rebuild the modelling datasets with only the selected features
train_data_x = pd.DataFrame(feature_train_data_x[top_n_feature])
test_data_x = pd.DataFrame(feature_test_data_x[top_n_feature])

2. Plot some of the selected feature importances

#Plot part of the selected features; features_importance was built by concatenating each model's 39-feature importance table in order
et_feature_imp = features_importance[:10]
gb_feature_imp = features_importance[39:39+10].reset_index(drop=True)

#One column holds the feature name and one the importance; take the importance column and rescale it as a percentage of the maximum
et_feature_importance = 100.0 * (et_feature_imp['importance']/et_feature_imp['importance'].max())
gb_feature_importance = 100.0 * (gb_feature_imp['importance']/gb_feature_imp['importance'].max())

# Get the indexes of all features over the importance threshold
et_important_idx = np.where(et_feature_importance)[0]
gb_important_idx = np.where(gb_feature_importance)[0]

pos = np.arange(et_important_idx.shape[0]) + .5
plt.figure(1, figsize = (18, 8))

plt.subplot(121)
plt.barh(pos, et_feature_importance[et_important_idx][::-1])
plt.yticks(pos, et_feature_imp['feature'][::-1])
plt.xlabel('Relative Importance')
plt.title('Extra Trees Importance')

plt.subplot(122)
plt.barh(pos, gb_feature_importance[gb_important_idx][::-1])
plt.yticks(pos, gb_feature_imp['feature'][::-1])
plt.xlabel('Relative Importance')
plt.title('Gradient Boosting Importance')

plt.show()

3. Model ensembling: the common approaches are Bagging, Boosting, Stacking and Blending
(3-1): Bagging

Bagging combines several base learners by simple weighted averaging or voting of their predictions. Its advantage is that the base learners can be trained in parallel; Random Forest is built on the bagging idea.

(3-2): Boosting

Boosting works a bit like learning from one's mistakes: each base learner is built on top of the previous one and tries to correct its errors. AdaBoost and Gradient Boosting, both used below, follow this idea.

(3-3): Stacking

Stacking trains a new second-level learner to combine the outputs of the first-level base learners. If Bagging is a linear combination of base classifiers, Stacking is a non-linear one; learners can be stacked layer upon layer into a network-like structure.

Compared with the previous two, a Stacking framework usually gives some additional accuracy, so Stacking is the method used for the model fusion below; a short sketch of the idea follows.
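For reference only, recent scikit-learn releases (0.22 and later, newer than the version this notebook was written against) ship a StackingClassifier that wraps the same idea; a minimal sketch might look like this:

#minimal stacking sketch with scikit-learn's built-in StackingClassifier (assumes sklearn >= 0.22)
#the base models' out-of-fold predictions (cv=5) become the features of the final estimator
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

stack = StackingClassifier(
    estimators=[('rf', RandomForestClassifier(n_estimators=200, random_state=0)),
                ('ada', AdaBoostClassifier(random_state=0))],
    final_estimator=LogisticRegression(),
    cv=5)
#stack.fit(train_data_x, feature_train_data_y) would train both levels in one call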

(3-4): Blending

Blending is very similar to Stacking, but it guards against information leakage by training the meta-model on a separate hold-out split rather than on out-of-fold predictions; a minimal sketch follows.
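A minimal blending sketch using the variables defined above (illustration only; the notebook uses the hand-rolled stacking below instead). The base models are fitted on one split and the meta-model on their predictions for a disjoint hold-out split, so no training label leaks into the meta-model:

#blending sketch: meta-model trained on base-model predictions for a hold-out split
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression

x_base, x_hold, y_base, y_hold = train_test_split(
    train_data_x, feature_train_data_y, test_size=0.3, random_state=0)
base_models = [RandomForestClassifier(n_estimators=200, random_state=0),
               AdaBoostClassifier(random_state=0)]
#each column holds one base model's predicted survival probability on the hold-out split
holdout_preds = np.column_stack(
    [m.fit(x_base, y_base).predict_proba(x_hold)[:, 1] for m in base_models])
blender = LogisticRegression().fit(holdout_preds, y_hold)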

# Stacking-based fusion:
# Two levels are used here. Level 1: RandomForest, AdaBoost, ExtraTrees, GBDT, DecisionTree, KNN and SVM, seven models in total. Level 2: XGBoost, which takes the first-level predictions as features and predicts the final result.

# Level 1:
# Stacking feeds the predictions of the base classifiers as training input to the second-level model. However, we cannot simply train the base models on the full training data, predict on it, and pass those predictions to the second level: training on the training data and then predicting on that same data would leak the labels. To avoid this leakage, each base learner uses K-fold cross-validation, and the K sets of predictions on the validation folds are stitched together as the input to the next level.

# So we define a helper that returns the out-of-fold predictions:
from sklearn.model_selection import KFold

#some basic parameters
ntrain = train_data_x.shape[0]
ntest = test_data_x.shape[0]
random_state = 0
n_fold = 7
kf = KFold(n_splits=n_fold, random_state=random_state, shuffle=False)

def get_out_fold(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest))
    oof_test_skf = np.empty((n_fold, ntest))
    #Split the data into 7 folds: each fold in turn is held out for prediction while the rest is used for training; i is the fold index and train_index holds that fold's training indices
    for i,(train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        clf.fit(x_tr,y_tr)
        oof_train[test_index] = clf.predict(x_te) #store the out-of-fold predictions for this fold
        oof_test_skf[i,:] = clf.predict(x_test) #predict the test set with the model trained on this fold
        
    oof_test[:] = oof_test_skf.mean(axis=0) #average the 7 folds' test-set predictions
    return oof_train.reshape(-1,1), oof_test.reshape(-1,1)
        
#Use the five base classifiers from the feature-selection step (Random Forest, AdaBoost, Extra Trees, Decision Tree, Gradient Boosting) plus KNN and SVM, seven learners in total
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


rf = RandomForestClassifier(n_estimators=500, warm_start=True, max_features='sqrt', max_depth=6, min_samples_split=3, min_samples_leaf=2, n_jobs=-1, verbose=0)

ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.1)

et = ExtraTreesClassifier(n_estimators=500, n_jobs=-1, max_depth=8, min_samples_leaf=2, verbose=0)

gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.008, min_samples_split=3, min_samples_leaf=2, verbose=0)

dt = DecisionTreeClassifier(max_depth=8)

knn = KNeighborsClassifier(n_neighbors=2)

svm = SVC(kernel='linear', C=0.025)

#convert to numpy arrays
x_train = train_data_x.values
x_test = test_data_x.values
y_train = feature_train_data_y.values

# Create our OOF train and test predictions. These base results will be used as new features
rf_oof_train, rf_oof_test = get_out_fold(rf, x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_out_fold(ada, x_train, y_train, x_test) # AdaBoost 
et_oof_train, et_oof_test = get_out_fold(et, x_train, y_train, x_test) # Extra Trees
gb_oof_train, gb_oof_test = get_out_fold(gb, x_train, y_train, x_test) # Gradient Boost
dt_oof_train, dt_oof_test = get_out_fold(dt, x_train, y_train, x_test) # Decision Tree
knn_oof_train, knn_oof_test = get_out_fold(knn, x_train, y_train, x_test) # KNeighbors
svm_oof_train, svm_oof_test = get_out_fold(svm, x_train, y_train, x_test) # Support Vector
print('Training is complete')

Training is complete

4. Use XGBoost with the level-1 outputs as features to predict the final result and produce the submission file

#Stack each model's training-set predictions and test-set predictions side by side
x_train = np.concatenate((rf_oof_train, ada_oof_train, et_oof_train, gb_oof_train, dt_oof_train, knn_oof_train, svm_oof_train),axis=1)
x_test = np.concatenate((rf_oof_test, ada_oof_test, et_oof_test, gb_oof_test, dt_oof_test, knn_oof_test, svm_oof_test),axis=1) #concatenate horizontally

from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=2000, max_depth=4, min_child_weight=2, gamma=0.9, subsample=0.8, 
                    colsample_bytree=0.8, objective='binary:logistic',nthread=-1, scale_pos_weight=1)
xgb.fit(x_train, y_train) #this effectively re-trains a model that uses the 7 base predictions as features against y_train
predictions = xgb.predict(x_test) #then predict the test set with the trained meta-model
predictions
array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1])
#Save and submit the results: 91.1% accuracy, submission score 0.77
Stacking_Submission = pd.DataFrame({'PassengerId':PassengerId,'Survived':predictions})
Stacking_Submission.to_csv('E:\\data\\titanic\\StackingSubmission.csv', index=False, sep=',')

IX. Validation: Learning Curves

from sklearn.model_selection import learning_curve

def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(0.1,1.0,5), verbose=0):
    
    """
    Generate a simple plot of the test and training learning curve.

    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.

    title : string
        Title for the chart.

    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.

    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.

    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.

    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see
        sklearn.cross_validation module for the list of possible objects

    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
        
        
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores,axis=1)
    train_scores_std = np.std(train_scores,axis=1)
    test_scores_mean = np.mean(test_scores,axis=1)
    test_scores_std = np.std(test_scores,axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean,'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean,'o-', color='g', label='Cross-validation score')

    plt.legend(loc='best')
    return plt
X = x_train
Y = y_train

# RandomForest
rf_parameters = {'n_jobs': -1, 'n_estimators': 500, 'warm_start': True, 'max_depth': 6, 'min_samples_leaf': 2, 
                 'max_features' : 'sqrt','verbose': 0}

# AdaBoost
ada_parameters = {'n_estimators':500, 'learning_rate':0.1}

# ExtraTrees
et_parameters = {'n_jobs': -1, 'n_estimators':500, 'max_depth': 8, 'min_samples_leaf': 2, 'verbose': 0}

# GradientBoosting
gb_parameters = {'n_estimators': 500, 'max_depth': 5, 'min_samples_leaf': 2, 'verbose': 0}

# DecisionTree
dt_parameters = {'max_depth':8}

# KNeighbors
knn_parameters = {'n_neighbors':2}

# SVM
svm_parameters = {'kernel':'linear', 'C':0.025}

# XGB
gbm_parameters = {'n_estimators': 2000, 'max_depth': 4, 'min_child_weight': 2, 'gamma':0.9, 'subsample':0.8, 
                  'colsample_bytree':0.8, 'objective': 'binary:logistic', 'nthread':-1, 'scale_pos_weight':1}


title = 'learning Curves'
plot_learning_curve(RandomForestClassifier(**rf_parameters), title, X, Y, cv=None,  n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])

plt.show()

title = 'learning Curves'
plot_learning_curve(XGBClassifier(**gbm_parameters), title, X, Y, cv=None,  n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])
plt.show()

title = 'learning Curves'
plot_learning_curve(AdaBoostClassifier(**ada_parameters), title, X, Y, cv=None,  n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])
plt.show()

title = 'learning Curves'
plot_learning_curve(SVC(**svm_parameters), title, X, Y, cv=None,  n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])
plt.show()

title = 'learning Curves'
plot_learning_curve(ExtraTreesClassifier(**et_parameters), title, X, Y, cv=None,  n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])
plt.show()

plot_learning_curve(GradientBoostingClassifier(**gb_parameters), title, X, Y, cv=None,  n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])


from sklearn.metrics import roc_auc_score,roc_curve,auc

y = pd.read_csv('E:\\data\\titanic\\gender_submission.csv')
y = y['Survived'].values
fpr, tpr, thresholds = roc_curve(y, predictions, pos_label=1)
plt.plot(fpr,tpr)
plt.show()

roc_auc_score(y,predictions)
0.9050751879699248
tpr
array([0.        , 0.88157895, 1.        ])
