import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
I. Data Overview
#1. Data overview
path = 'E:\\data\\titanic\\'
train_data = pd.read_csv(path+'train.csv')
test_data = pd.read_csv(path+'test.csv')
sns.set_style('whitegrid')
train_data.head()
#Survived = 1 means the passenger survived
  | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|---|---|---|---
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
#Summary of the dataset
train_data.info()
#Age, Cabin, and Embarked have missing values
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
#train_data['Fare'].value_counts()
#Fare is 0 for 15 passengers; check their Pclass, since a fare of 0 might actually be legitimate
train_data[train_data.Fare==0]
#They are all male and span several Pclass values, so a fare of 0 looks implausible; these fares will be filled with the median fare of the corresponding class
  | PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked
---|---|---|---|---|---|---|---|---|---|---|---|---
179 | 180 | 0 | 3 | Leonard, Mr. Lionel | male | 36.0 | 0 | 0 | LINE | 0.0 | NaN | S |
263 | 264 | 0 | 1 | Harrison, Mr. William | male | 40.0 | 0 | 0 | 112059 | 0.0 | B94 | S |
271 | 272 | 1 | 3 | Tornquist, Mr. William Henry | male | 25.0 | 0 | 0 | LINE | 0.0 | NaN | S |
277 | 278 | 0 | 2 | Parkes, Mr. Francis "Frank" | male | NaN | 0 | 0 | 239853 | 0.0 | NaN | S |
302 | 303 | 0 | 3 | Johnson, Mr. William Cahoone Jr | male | 19.0 | 0 | 0 | LINE | 0.0 | NaN | S |
413 | 414 | 0 | 2 | Cunningham, Mr. Alfred Fleming | male | NaN | 0 | 0 | 239853 | 0.0 | NaN | S |
466 | 467 | 0 | 2 | Campbell, Mr. William | male | NaN | 0 | 0 | 239853 | 0.0 | NaN | S |
481 | 482 | 0 | 2 | Frost, Mr. Anthony Wood "Archie" | male | NaN | 0 | 0 | 239854 | 0.0 | NaN | S |
597 | 598 | 0 | 3 | Johnson, Mr. Alfred | male | 49.0 | 0 | 0 | LINE | 0.0 | NaN | S |
633 | 634 | 0 | 1 | Parr, Mr. William Henry Marsh | male | NaN | 0 | 0 | 112052 | 0.0 | NaN | S |
674 | 675 | 0 | 2 | Watson, Mr. Ennis Hastings | male | NaN | 0 | 0 | 239856 | 0.0 | NaN | S |
732 | 733 | 0 | 2 | Knight, Mr. Robert J | male | NaN | 0 | 0 | 239855 | 0.0 | NaN | S |
806 | 807 | 0 | 1 | Andrews, Mr. Thomas Jr | male | 39.0 | 0 | 0 | 112050 | 0.0 | A36 | S |
815 | 816 | 0 | 1 | Fry, Mr. Richard | male | NaN | 0 | 0 | 112058 | 0.0 | B102 | S |
822 | 823 | 0 | 1 | Reuchlin, Jonkheer. John George | male | 38.0 | 0 | 0 | 19972 | 0.0 | NaN | S |
#Plot the survival proportion
train_data['Survived'].value_counts().plot.pie(autopct = '%1.2f%%')
#About 38% of the passengers survived (342 of 891)
II. Handling Missing Values: Fare, Age, Embarked, Cabin
1. If the dataset is large and only a few values are missing, the rows with missing values can simply be dropped.
2. If the attribute is not very important to the model, it can be filled with the mean, mode, or median; Embarked, for example, has only three possible ports, so the mode is a reasonable fill (a quick sketch of both options follows).
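A minimal sketch of these two options on train_data (left commented out, since the pipeline below uses the mode fill for Embarked and a model-based fill for Age):
#Option 1: drop the rows with missing values (only sensible when few rows are affected)
#train_data.dropna(subset=['Embarked'])
#Option 2: fill with a central value, e.g. the median for a numeric column or the mode for a categorical one
#train_data['Age'].fillna(train_data['Age'].median())
#train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])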
train_data['Embarked'].value_counts()
S 644
C 168
Q 77
Name: Embarked, dtype: int64
train_data['Embarked'] = train_data['Embarked'].fillna(train_data['Embarked'].mode()[0])
#The two missing values are filled with the mode, 'S'
3. A nominal attribute takes symbolic values (the names of things). A representative placeholder can be assigned: a missing Cabin might simply mean the passenger had no cabin, but in this dataset Cabin is missing across all classes and fare levels, so its relation to the target still needs to be checked later. For now fill it with 'U0'.
train_data['Cabin'] = train_data.Cabin.fillna('U0')
train_data['Cabin'].head()
0 U0
1 C85
2 U0
3 C123
4 U0
Name: Cabin, dtype: object
4. Fare has 15 zero values spread across different classes (all male passengers); fill them with the median fare of the corresponding Pclass.
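#For each Pclass, replace zero fares with the median of that class's non-zero fares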
for i in range(3):
    train_data.loc[(train_data.Fare == 0) & (train_data.Pclass ==i+1),'Fare'] = train_data[(train_data.Fare !=0) & (train_data.Pclass ==i+1)].Fare.median()
5. Fill the missing Age values with a random forest regressor.
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, roc_auc_score
#Age is roughly continuous, so use regression
for_age = train_data[['Survived','Sex','SibSp','Parch','Fare','Age']]
for_age['Sex'] = pd.Categorical(for_age.Sex).codes #encode Sex as integer codes
train_age = for_age[for_age.Age.notnull()]
x_train, x_test, y_train, y_test = train_test_split(train_age.iloc[:,:-1], train_age.iloc[:,-1:], test_size=0.2, random_state=2)
test_age = for_age[for_age.Age.isnull()]
x = test_age.iloc[:,:-1]
rfr = RandomForestRegressor(n_estimators=50, max_depth=6, n_jobs=-1)
rfr.fit(x_train,y_train)
#Evaluate the model (MSE on the held-out split)
y_test_hat = rfr.predict(x_test)
print(mean_squared_error(y_test,y_test_hat))
y_hat = rfr.predict(x)
train_data.loc[train_data.Age.isnull(), 'Age'] = y_hat
146.06308221585957
III. Analyzing Feature-Target Relationships: with preprocessing done, examine how each feature relates to survival
1. Sex and survival
train_data.groupby(['Sex','Survived'])['Survived'].count()
Sex Survived
female 0 81
1 233
male 0 468
1 109
Name: Survived, dtype: int64
train_data.groupby(['Sex'])['Survived'].count()
Sex
female 314
male 577
Name: Survived, dtype: int64
train_data[['Sex','Survived']].head()
  | Sex | Survived
---|---|---
0 | male | 0 |
1 | female | 1 |
2 | female | 1 |
3 | female | 1 |
4 | male | 0 |
train_data[['Sex','Survived']].groupby(['Sex']).mean().plot.bar()
#E.g. for 100 female samples with 30 not survived (0) and 70 survived (1), the mean is 0.7, i.e. the survival rate
#The female survival rate is far higher than the male rate, reflecting "women first"
2. Pclass (passenger class) and survival
train_data[['Pclass','Survived']].groupby(['Pclass']).count()
Pclass | Survived
---|---
1 | 216 |
2 | 184 |
3 | 491 |
train_data[['Pclass','Survived']].groupby(['Pclass']).mean().plot.bar()
#The higher the class, the higher the chance of survival
train_data[['Pclass','Survived','Sex']].groupby(['Pclass','Sex']).mean().plot.bar()
#Break it down by sex as well
train_data.groupby(['Sex','Pclass','Survived'])['Survived'].count()
#Overall, higher class means a higher survival rate; female survival is high to begin with and even higher in the upper classes
Sex Pclass Survived
female 1 0 3
1 91
2 0 6
1 70
3 0 72
1 72
male 1 0 77
1 45
2 0 91
1 17
3 0 300
1 47
Name: Survived, dtype: int64
3. Age and survival
#Age distribution vs. survival, broken down by class and by sex
fig, ax = plt.subplots(1,2, figsize=(18,8))
sns.violinplot('Pclass','Age',hue='Survived', data=train_data, split=True, ax=ax[0])
ax[0].set_title('Pclass and Age vs Survived')
ax[0].set_yticks(range(1,110,10))
sns.violinplot('Sex', 'Age', hue='Survived', data=train_data, split=True, ax=ax[1])
ax[1].set_title('sex and age vs Survived')
ax[1].set_yticks(range(1,110,10))
plt.show()
#Age histogram and box plot
plt.figure(figsize=(12,5))
plt.subplot(121)
train_data['Age'].hist(bins=30)
plt.xlabel('age')
plt.ylabel('num')
plt.subplot(122)
train_data.boxplot(column='Age', showfliers=False)
plt.show()
#Survival by age (density plot)
facet = sns.FacetGrid(train_data, hue='Survived', aspect=4)
facet.map(sns.kdeplot,'Age',shade=True)
facet.set(xlim=(0, train_data['Age'].max()))
facet.add_legend()
#Survival rate at each (integer) age
fig, axis1 = plt.subplots(1,1,figsize=(18,4))
train_data['Age_int'] = train_data['Age'].astype(int)
avg_age = train_data[['Age_int','Survived']].groupby(['Age_int'],as_index=False).mean()
sns.barplot(x='Age_int', y='Survived',data=avg_age)
train_data['Age'].describe()
count 891.000000
mean 29.720748
std 13.384343
min 0.420000
25% 22.000000
50% 29.000000
75% 36.000000
max 80.000000
Name: Age, dtype: float64
#Split into four groups: children, teenagers, adults, the elderly
bins=[0,12,18,65,100]
train_data['Age_group'] = pd.cut(train_data['Age'],bins)
by_age = train_data[['Age_group','Survived']].groupby(['Age_group']).mean()
by_age
Age_group | Survived
---|---
(0, 12] | 0.560000 |
(12, 18] | 0.430556 |
(18, 65] | 0.364130 |
(65, 100] | 0.125000 |
by_age.plot.bar()
#The survival rate drops as age increases
4. Title and survival
train_data['Title'] = train_data['Name'].str.extract('([A-Za-z]+)\.',expand=False)
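#e.g. 'Heikkinen, Miss. Laina' -> 'Miss': the pattern captures the letters immediately before a period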
pd.crosstab(train_data['Title'],train_data['Sex'])
Title | female | male
---|---|---
Capt | 0 | 1 |
Col | 0 | 2 |
Countess | 1 | 0 |
Don | 0 | 1 |
Dr | 1 | 6 |
Jonkheer | 0 | 1 |
Lady | 1 | 0 |
Major | 0 | 2 |
Master | 0 | 40 |
Miss | 182 | 0 |
Mlle | 2 | 0 |
Mme | 1 | 0 |
Mr | 0 | 517 |
Mrs | 125 | 0 |
Ms | 1 | 0 |
Rev | 0 | 6 |
Sir | 0 | 1 |
train_data[['Title','Survived']].groupby(['Title']).mean().plot.bar()
#Survival rate by title
fig, axist = plt.subplots(1,1, figsize=(18,4))
train_data['Name_len'] = train_data['Name'].apply(len)
name_len = train_data[['Name_len','Survived']].groupby(['Name_len'],as_index=False).mean()
sns.barplot(x='Name_len',y='Survived',data=name_len)
#The data suggest that name length does have some relationship with survival
5. SibSp (siblings/spouses aboard) and survival
sibsp = train_data[train_data['SibSp'] != 0]
no_sibsp = train_data[train_data['SibSp'] == 0]
plt.figure(figsize=(10,5))
plt.subplot(121)
sibsp['Survived'].value_counts().plot.pie(labels=['No Survived','Survived'],autopct='%1.1f%%')
plt.xlabel('sibsp')
plt.subplot(122)
no_sibsp['Survived'].value_counts().plot.pie(labels=['No Survived','Survived'],autopct='%1.1f%%')
plt.xlabel('no_sibsp')
#Passengers with siblings or a spouse aboard have a somewhat higher survival rate, perhaps because they could help each other
Text(0.5,0,'no_sibsp')
6. Parch (parents/children aboard) and survival
parch = train_data[train_data['Parch'] != 0]
no_parch = train_data[train_data['Parch'] == 0]
plt.figure(figsize=(10,5))
plt.subplot(121)
parch['Survived'].value_counts().plot.pie(labels=['No Survived','Survived'],autopct='%1.1f%%')
plt.xlabel('parch')
plt.subplot(122)
no_parch['Survived'].value_counts().plot.pie(labels=['No Survived','Survived'],autopct='%1.1f%%')
plt.xlabel('no_parch')
plt.show()
#The conclusion is similar to SibSp
7. Family size (SibSp & Parch) and survival
fig,ax = plt.subplots(1,2,figsize=(18,8))
train_data[['SibSp','Survived']].groupby(['SibSp']).mean().plot.bar(ax=ax[0])
ax[0].set_title('SibSp and Survived')
train_data[['Parch','Survived']].groupby(['Parch']).mean().plot.bar(ax=ax[1])
ax[1].set_title('Parch and Survived')
plt.show()
#Too many relatives may become a burden
train_data['people num'] = train_data['SibSp'] + train_data['Parch'] +1
train_data[['people num','Survived']].groupby(['people num']).mean().plot.bar()
#Travelling alone lowers the survival rate, and very large families also survive less often
8. Fare distribution and survival
plt.figure(figsize=(10,5))
train_data['Fare'].hist(bins=100)
train_data.boxplot(column='Fare', by='Pclass',showfliers=False)
plt.show()
train_data['Fare'].describe()
count 891.000000
mean 32.689318
std 49.611639
min 4.012500
25% 7.925000
50% 14.500000
75% 31.275000
max 512.329200
Name: Fare, dtype: float64
fns = train_data['Fare'][train_data['Survived']==0]
fs = train_data['Fare'][train_data['Survived']==1]
avg_fare = pd.DataFrame([fns.mean(),fs.mean()])
std_fare = pd.DataFrame([fns.std(),fs.std()])
avg_fare.plot.bar(yerr=std_fare, legend=False)
#Survivors paid a higher average fare than non-survivors
9. Cabin and survival
#Cabin has too many missing values and could simply be dropped; here we only look at whether a cabin is recorded at all
train_data['has cabin'] = train_data['Cabin'].apply(lambda x:0 if x=='U0' else 1)
plt.figure(figsize=(10,7))
train_data[['has cabin','Survived']].groupby(['Survived']).mean().plot.bar()
train_data[['has cabin','Survived']].groupby(['has cabin']).mean().plot.bar()
#In the first plot, about 12% of non-survivors have a recorded cabin vs. nearly 40% of survivors; in the second, survival is about 0.3 without a cabin and close to 0.7 with one
#Analyze the different cabin letters
train_data['cat_cabin'] = train_data['Cabin'].map(lambda x:re.compile('([a-zA-Z]+)').search(x).group())
#Map the cabin letters to integer codes
train_data['cat_cabin'] = pd.factorize(train_data['cat_cabin'])[0]
train_data[['cat_cabin','Survived']].groupby(['cat_cabin']).mean().plot.bar()
#Survival rates differ somewhat across cabin letters, but not by much and with no clear pattern, so this feature could be dropped
10. Embarked (port of embarkation) and survival
#The Titanic departed from Southampton, England, calling at Cherbourg, France, and Queenstown, Ireland; passengers who boarded at the earlier ports could in principle have disembarked at Cherbourg or Queenstown and thus never faced the disaster.
sns.countplot('Embarked',hue='Survived',data=train_data)
plt.title('Embarked and Survived')
plt.show()
sns.factorplot('Embarked','Survived',data=train_data, size=3, aspect=3)
plt.title('Embarked and Survived rate')
plt.show()
#Passengers who embarked at S have the lowest survival rate, those at C the highest
IV. Variable and Quantitative Transformations
#One-hot encoding for categorical or binary variables; too many categories is undesirable because it adds too many features
embark_dummies = pd.get_dummies(train_data['Embarked'])
train_data = train_data.join(embark_dummies)
embark_dummies = train_data[['S','C','Q']]
#For a multi-category variable such as Cabin, encode with pd.factorize
#Fill the missing values first
train_data.loc[train_data['Cabin'].isnull(), 'Cabin'] = 'U0'
#Extract the cabin letter first, then encode it
train_data['cat_cabin'] = train_data['Cabin'].map(lambda x:re.compile('([a-zA-Z]+)').search(x).group())
train_data['cat_cabin_code'] = pd.factorize(train_data['cat_cabin'])[0]
Series([], Name: Cabin, dtype: object)
Quantitative transformations
1. Scaling (standardization / normalization)
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
train_data['age_s'] = scaler.fit_transform(train_data['Age'].values.reshape(-1,1))
#Bin Fare into quantiles
train_data['Fare_bin'] = pd.qcut(train_data['Fare'],5)
#Encode the bins
train_data['Fare_bin_id'] = pd.factorize(train_data['Fare_bin'])[0]
#One-hot encoding would also work, but there are quite a few bins here
fare_one_hot = pd.get_dummies(train_data['Fare_bin']).rename(columns=lambda x: 'Fare'+ str(x))
#Concatenate; join works too
a = train_data.join(fare_one_hot)
#pd.concat also works
b = pd.concat([train_data,fare_one_hot],axis=1)
V. Feature Engineering
#Process the training and test sets together so that they end up with the same data types and distributions
train_df_org = pd.read_csv('E:\\data\\titanic\\train.csv')
test_df_org = pd.read_csv('E:\\data\\titanic\\test.csv')
test_df_org['Survived'] = None
combined_data = train_df_org.append(test_df_org)
PassengerId = test_df_org['PassengerId']
1.Embarked
#This feature has few missing values, so fill them with the mode
combined_data['Embarked'].fillna(combined_data['Embarked'].mode().iloc[0],inplace=True)
#Or:
# combined_data[combined_data['Embarked'].isnull()]['Embarked'] = combined_data['Embarked'].mode()[0]
#Then encode with pd.get_dummies or pd.factorize; the latter is used here because it is convenient for the later steps
combined_data['Embarked'] = pd.factorize(combined_data['Embarked'])[0]
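#factorize assigns codes in order of first appearance, so here S -> 0, C -> 1, Q -> 2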
# Use pd.get_dummies to obtain a one-hot encoding instead:
# emb_dummies_df = pd.get_dummies(combined_data['Embarked'], prefix=combined_data[['Embarked']].columns[0])
# combined_data = pd.concat([combined_data, emb_dummies_df], axis=1)
2.Sex
#One-hot would be more appropriate for Sex, but factorize is used here as well for convenience in the later analysis
combined_data['Sex'] = pd.factorize(combined_data['Sex'])[0]
#one hot
#sex_one_hot = pd.get_dummies(combined_data['Sex'],prefix=combined_data[['Sex']].columns[0]) #prefix renames the dummy columns
#pd.concat([combined_data, sex_one_hot],axis=1)
3.Name
#Extract the title from the name; the space after the comma in the pattern is required, otherwise the mapping in the next step fails
combined_data['Title'] = combined_data['Name'].map(lambda x: re.compile(", (.*?)\.").findall(x)[0])
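#e.g. 'Braund, Mr. Owen Harris' -> 'Mr' and 'Futrelle, Mrs. Jacques Heath (Lily May Peel)' -> 'Mrs'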
title_Dict = {}
title_Dict.update(dict.fromkeys(['Capt', 'Col', 'Major', 'Dr', 'Rev'], 'Officer'))
title_Dict.update(dict.fromkeys(['Don', 'Sir', 'the Countess', 'Dona', 'Lady'], 'Royalty'))
title_Dict.update(dict.fromkeys(['Mme', 'Ms', 'Mrs'], 'Mrs'))
title_Dict.update(dict.fromkeys(['Mlle', 'Miss'], 'Miss'))
title_Dict.update(dict.fromkeys(['Mr'], 'Mr'))
title_Dict.update(dict.fromkeys(['Master','Jonkheer'], 'Master'))
combined_data['Title'] = combined_data['Title'].map(title_Dict)
#Encode the titles
combined_data['Title'] = pd.factorize(combined_data['Title'])[0]
#Add a name-length feature
combined_data['Name_len'] = combined_data['Name'].map(len)
4.Fare
# First fill the missing fares with the mean fare of the corresponding class, then smooth the zero fares
combined_data['Fare'] = combined_data[['Fare']].fillna(combined_data.groupby('Pclass').transform(np.mean))
for i in range(3):
    combined_data.loc[(combined_data['Pclass'] == i+1) & (combined_data['Fare']==0),'Fare'] = combined_data[(combined_data['Pclass'] == i+1) & (combined_data['Fare']!=0)]['Fare'].min()/10
#Many tickets are shared (group tickets), so the fare should be split among the passengers on the same ticket
#Group Fare by Ticket and record, for each row, how many passengers share that ticket
combined_data['Group_Ticket'] = combined_data['Fare'].groupby(by=combined_data['Ticket']).transform('count')
combined_data['Fare'] = combined_data['Fare'] / combined_data['Group_Ticket'] #split the fare evenly across the ticket group
combined_data.drop(['Group_Ticket'],axis=1,inplace=True)
#Bin the fare, then encode the bins
combined_data['Fare_bin'] = pd.qcut(combined_data['Fare'],5)
combined_data['Fare_id'] = pd.factorize(combined_data['Fare_bin'])[0]
combined_data.drop(['Fare_bin'],axis=1,inplace=True)
5.Pclass
#Pclass usually needs no further processing, but within each class the fare level may also relate to evacuation order, so split each class into a high-fare and a low-fare group
from sklearn.preprocessing import LabelEncoder
pc1_mean = combined_data[['Pclass','Fare']].groupby(['Pclass']).mean().values[0][0]
pc2_mean = combined_data[['Pclass','Fare']].groupby(['Pclass']).mean().values[1][0]
pc3_mean = combined_data[['Pclass','Fare']].groupby(['Pclass']).mean().values[2][0]
def Pclass_fare_category(df,pc1_mean,pc2_mean,pc3_mean):
    if df['Pclass'] == 1:
        if df['Fare'] > pc1_mean:
            return 'pc1_high'
        else:
            return 'pc1_low'
    elif df['Pclass'] == 2:
        if df['Fare'] > pc2_mean:
            return 'pc2_high'
        else:
            return 'pc2_low'
    elif df['Pclass'] == 3:
        if df['Fare'] > pc3_mean:
            return 'pc3_high'
        else:
            return 'pc3_low'
combined_data['Pclass_Fare_Category'] = combined_data.apply(Pclass_fare_category,args=(pc1_mean,pc2_mean,pc3_mean),axis=1)
combined_data['Pclass_Fare_Category'] = LabelEncoder().fit_transform(combined_data['Pclass_Fare_Category'])
#Encode Pclass as well while we're at it
combined_data['Pclass'] = LabelEncoder().fit_transform(combined_data['Pclass'])
6. SibSp and Parch: combine them into a family size (plus the passenger themself)
def Family_size_category(family_size):
    if family_size == 1:
        return '1'
    elif family_size <= 4:
        return '2'
    else:
        return '3'
combined_data['Family_size'] = combined_data['SibSp'] + combined_data['Parch'] + 1
combined_data['Family_size_categroy'] = combined_data['Family_size'].map(Family_size_category)
combined_data['Family_size_categroy'] = LabelEncoder().fit_transform(combined_data['Family_size_categroy'])
7. Age: too many values are missing for a simple fill, so predict them with a blend of models
age = combined_data[['Age','Sex','Embarked','Pclass','Pclass_Fare_Category','Title','Name_len','Fare','Fare_id','Family_size','Family_size_categroy']]
age_train = age[age['Age'].notnull()]
age_test = age[age['Age'].isnull()]
#Build multiple models for the prediction
from sklearn import ensemble
from sklearn import model_selection
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
def fill_age(age_train,age_test):
    age_train_x = age_train.drop(['Age'],axis=1)
    age_train_y = age_train['Age']
    age_test_x = age_test.drop(['Age'],axis=1)
    #model 1: GBM
    print('==========Model GBM==========')
    gbm_reg = GradientBoostingRegressor(random_state=41)
    gbm_param = {'n_estimators':[2000], 'max_depth':[4], 'learning_rate':[0.01], 'max_features':[3]}
    gbm_grid = model_selection.GridSearchCV(gbm_reg, gbm_param, cv=10, n_jobs=25, verbose=1, scoring='neg_mean_squared_error')
    gbm_grid.fit(age_train_x, age_train_y)
    print('GBM best features for age params'+str(gbm_grid.best_params_))
    print('GBM best features for age scores'+str(gbm_grid.best_score_))
    print('GBM train error for age feature regressor'+str(gbm_grid.score(age_train_x, age_train_y)))
    print(age_train_x.shape,age_test_x.shape)
    age_test.loc[:, 'Age_gbm'] = gbm_grid.predict(age_test_x)
    print(age_test['Age_gbm'][:4])
    #model 2: random forest
    print('==========Model RF==========')
    rf_reg = RandomForestRegressor()
    rf_param = {'n_estimators':[200], 'max_depth':[5], 'random_state':[0]}
    rf_grid = model_selection.GridSearchCV(rf_reg,rf_param, cv=10, n_jobs=25, verbose=1, scoring='neg_mean_squared_error')
    rf_grid.fit(age_train_x,age_train_y)
    print('RF best features for age params'+ str(rf_grid.best_params_))
    print('RF best features for age score'+ str(rf_grid.best_score_))
    print('RF train error for age feature regressor'+ str(rf_grid.score(age_train_x,age_train_y)))
    age_test.loc[:, 'Age_rf'] = rf_grid.predict(age_test_x)
    print(age_test['Age_rf'][:4])
    #merge the two models
    print('==========Merge Model==========')
    print('shape',age_test['Age'].shape,age_test[['Age_gbm','Age_rf']].mode(axis=1).shape)
    age_test.loc[:,'Age'] = np.mean([age_test['Age_gbm'], age_test['Age_rf']], axis=0) #row-wise average of the two predictions
    print('merge age:\n',age_test['Age'][:4])
    age_test.drop(['Age_gbm','Age_rf'],axis=1,inplace=True)
    return age_test
combined_data.loc[(combined_data.Age.isnull()),'Age'] = fill_age(age_train,age_test)
==========Model GBM==========
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[Parallel(n_jobs=25)]: Done 5 out of 10 | elapsed: 35.2s remaining: 35.2s
[Parallel(n_jobs=25)]: Done 10 out of 10 | elapsed: 47.2s finished
GBM best features for age params{'learning_rate': 0.01, 'max_depth': 4, 'max_features': 3, 'n_estimators': 2000}
GBM best features for age scores-130.07827342623582
GBM train error for age feature regressor-63.488346095073126
(1046, 10) (263, 10)
5 36.266057
17 29.768299
19 37.534189
26 27.857612
Name: Age_gbm, dtype: float64
==========Model RF==========
Fitting 10 folds for each of 1 candidates, totalling 10 fits
[Parallel(n_jobs=25)]: Done 5 out of 10 | elapsed: 37.7s remaining: 37.7s
[Parallel(n_jobs=25)]: Done 10 out of 10 | elapsed: 47.6s finished
RF best features for age params{'max_depth': 5, 'n_estimators': 200, 'random_state': 0}
RF best features for age score-120.22123963994939
RF train error for age feature regressor-96.82435399344224
5 32.667672
17 31.516429
19 31.493906
26 27.854183
Name: Age_rf, dtype: float64
==========Merge Model==========
shape (263,) (263, 2)
merge age:
5 29.841298
17 29.841298
19 29.841298
26 29.841298
Name: Age, dtype: float64
8. Ticket: some tickets have a letter prefix and some are purely numeric; split the prefix out
combined_data['Ticket_letter'] = combined_data['Ticket'].str.split().str[0]
combined_data['Ticket_letter'] = combined_data['Ticket_letter'].apply(lambda x:'U0' if x.isnumeric() else x)
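#e.g. 'A/5 21171' keeps the prefix 'A/5', while a purely numeric ticket such as '373450' becomes 'U0'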
#Encode the prefix
combined_data['Ticket_letter'] = LabelEncoder().fit_transform(combined_data['Ticket_letter'])
9. Cabin: too many missing values, so reduce it to has-cabin / no-cabin
combined_data.loc[combined_data['Cabin'].isnull(),'Cabin'] = 'U0'
combined_data['Cabin'] = combined_data['Cabin'].apply(lambda x:0 if x == 'U0' else 1)
combined_data['Cabin'].value_counts()
0 1014
1 295
Name: Cabin, dtype: int64
combined_data.rename(columns={'Family_size_categroy':'Family_size_category'},inplace=True)
VI. Feature Correlation Analysis
correlation = combined_data[['Age','Cabin','Embarked','Fare','Pclass','Sex','Title','Name_len','Fare_id','Pclass_Fare_Category','Family_size','Family_size_category','Ticket_letter']]
colormap = plt.cm.viridis
plt.figure(figsize=(14,12))
plt.title('Pearson Correlation of Features',y=1.05,size=15)
sns.heatmap(correlation.astype(float).corr(),lw=0.1, vmax=1.0, square=True, cmap = colormap, linecolor = 'white', annot=True)
plt.show()
#Pairwise feature distributions
g = sns.pairplot(combined_data[[u'Survived', u'Pclass', u'Sex', u'Age', u'Fare', u'Embarked',u'Family_size', u'Title', u'Ticket_letter']], hue='Survived', palette='seismic', size=1.2, diag_kind='kde', diag_kws=dict(shade=True), plot_kws=dict(s=10))
g.set(xticklabels=[])
VII. Pre-modeling Processing
#Standardize Age, Fare, and Name_len
combined_data[['Age','Fare','Name_len']] = preprocessing.StandardScaler().fit_transform(combined_data[['Age','Fare','Name_len']])
#Back up the data
combined_data_backup = combined_data
#Restore from the backup
combined_data = combined_data_backup
#Optionally set PassengerId as the index
# combined_data.set_index('PassengerId',inplace=True)
#Ticket_letter is one-hot encoded in the loop below; note the pd.cut result here is not assigned to anything
pd.cut(combined_data['Ticket_letter'],5)
combined_data.shape
(1309, 40)
#One-hot encode the categorical features in one pass and drop the columns that will not enter the model
get_dummies = combined_data[['Cabin','Embarked','Pclass','Sex','Title','Fare_id','Pclass_Fare_Category','Family_size_category']]
ohe_list = ['Cabin','Embarked','Pclass','Sex','Title','Fare_id','Pclass_Fare_Category','Family_size_category','Ticket_letter']
drop_list = ohe_list+['Name','Parch','SibSp','Ticket']
for i in ohe_list:
    ohe = pd.get_dummies(combined_data[i]).rename(columns=lambda x: i+str(x))
    combined_data = pd.concat([combined_data,ohe],axis=1)
combined_data.drop(drop_list,axis=1,inplace=True)
#Split back into training and test sets
train_data = combined_data[combined_data.Survived.notnull()]
test_data = combined_data[combined_data.Survived.isnull()]
feature_train_data_x = train_data.drop('Survived',axis=1)
feature_train_data_y = train_data['Survived'].astype('int')
feature_test_data_x = test_data.drop('Survived',axis=1)
VIII. Model Ensembling and Testing
#1. Use several different models to rank the features and pick out the more important ones
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier,ExtraTreesClassifier,AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
def get_top_n_features(train_x,train_y,get_feature_num):
    #The Extra Trees grid search errors out with n_jobs=25, so n_jobs=1 is used for it
    print('========== Extra Trees ==========')
    et_est = ExtraTreesClassifier(random_state=0)
    et_param = {'n_estimators':[500],'min_samples_split':[3,4],'max_depth':[20]}
    et_grid = model_selection.GridSearchCV(et_est, et_param, n_jobs=1, cv=10, verbose=1)
    et_grid.fit(train_x,train_y)
    print('best ET params for top n features'+str(et_grid.best_params_))
    print('best ET score for top n features'+str(et_grid.best_score_))
    print('best ET training score for top n features'+str(et_grid.score(train_x,train_y)))
    top_n_features_et_sorted = pd.DataFrame({'feature':list(train_x),'importance':et_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_et = top_n_features_et_sorted.head(get_feature_num)['feature']
    print('Sample 10 features from Extra Trees')
    print(str(top_n_features_et[:10]))
    print('========== Gradient Boosting ==========')
    gb_est = GradientBoostingClassifier(random_state=0)
    gb_param = {'n_estimators':[500],'learning_rate':[0.01,0.1],'max_depth':[20]}
    gb_grid = model_selection.GridSearchCV(gb_est, gb_param, n_jobs=25, cv=10, verbose=1)
    gb_grid.fit(train_x,train_y)
    print('best GB params for top n features'+str(gb_grid.best_params_))
    print('best GB score for top n features'+str(gb_grid.best_score_))
    print('best GB training score for top n features'+str(gb_grid.score(train_x,train_y)))
    top_n_features_gb_sorted = pd.DataFrame({'feature':list(train_x),'importance':gb_grid.best_estimator_.feature_importances_}).sort_values('importance',ascending=False)
    top_n_features_gb = top_n_features_gb_sorted.head(get_feature_num)['feature']
    print('Sample 10 features from Gradient Boosting')
    print(str(top_n_features_gb[:10]))
    print('========== Decision Tree ==========')
    dt_est = DecisionTreeClassifier(random_state=0)
    dt_param = {'min_samples_split':[2,4],'max_depth':[20]}
    dt_grid = model_selection.GridSearchCV(dt_est, dt_param, n_jobs=25, cv=10, verbose=1)
    dt_grid.fit(train_x,train_y)
    print('best DT params for top n features'+str(dt_grid.best_params_))
    print('best DT score for top n features'+str(dt_grid.best_score_))
    print('best DT training score for top n features'+str(dt_grid.score(train_x, train_y)))
    top_n_features_dt_sorted = pd.DataFrame({'feature':list(train_x),'importance':dt_grid.best_estimator_.feature_importances_}).sort_values('importance',ascending=False)
    top_n_features_dt = top_n_features_dt_sorted.head(get_feature_num)['feature']
    print('Sample 10 features from Decision Tree')
    print(str(top_n_features_dt[:10]))
    print('========== Random Forest ==========')
    rf_est = RandomForestClassifier(random_state=0)
    rf_param = {'n_estimators':[500],'min_samples_split':[2,3],'max_depth':[20]}
    rf_grid = model_selection.GridSearchCV(rf_est, rf_param, n_jobs=25, cv=10, verbose=1)
    rf_grid.fit(train_x,train_y)
    print('best RF params for top n features'+str(rf_grid.best_params_))
    print('best RF score for top n features'+str(rf_grid.best_score_))
    print('best RF training score for top n features'+str(rf_grid.score(train_x,train_y)))
    top_n_features_rf_sorted = pd.DataFrame({'feature':list(train_x),'importance':rf_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_rf = top_n_features_rf_sorted.head(get_feature_num)['feature']
    print('Sample 10 Features from Random Forest')
    print(str(top_n_features_rf[:10]))
    print('========== AdaBoost ==========')
    ada_est = AdaBoostClassifier(random_state=0)
    ada_param = {'n_estimators':[500], 'learning_rate':[0.01,0.1]}
    ada_grid = model_selection.GridSearchCV(ada_est, ada_param, n_jobs=25, cv=10, verbose=1)
    ada_grid.fit(train_x, train_y)
    print('best Ada params for top n features'+str(ada_grid.best_params_))
    print('best Ada score for top n features'+str(ada_grid.best_score_))
    print('best Ada training score for top n features'+str(ada_grid.score(train_x,train_y)))
    top_n_features_ada_sorted = pd.DataFrame({'feature':list(train_x),'importance':ada_grid.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
    top_n_features_ada = top_n_features_ada_sorted.head(get_feature_num)['feature']
    print('Sample 10 Features from AdaBoost')
    print(str(top_n_features_ada[:10]))
    #merge the five rankings
    top_n_feature = pd.concat([top_n_features_rf,top_n_features_ada,top_n_features_et,top_n_features_gb,top_n_features_dt],ignore_index=True).drop_duplicates()
    features_importance = pd.concat([top_n_features_rf_sorted,top_n_features_ada_sorted,top_n_features_et_sorted,top_n_features_gb_sorted,top_n_features_dt_sorted],ignore_index=True)
    return top_n_feature, features_importance
#2. Use the combined rankings to select the final feature set
get_features_num = 30
top_n_feature, features_importance = get_top_n_features(feature_train_data_x, feature_train_data_y, get_features_num)
========== Extra Trees ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=1)]: Done 20 out of 20 | elapsed: 16.7s finished
best ET params for top n features{'max_depth': 20, 'min_samples_split': 4, 'n_estimators': 500}
best ET score for top n features0.8271604938271605
best ET training score for top n features0.9652076318742986
Sample 10 features from Extra Trees
14 Title0
12 Sex0
13 Sex1
2 Name_len
0 Age
1 Fare
11 Pclass2
16 Title2
15 Title1
4 Cabin0
Name: feature, dtype: object
========== Gradient Boosting ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 1.3min remaining: 42.1s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 1.6min finished
best GB params for top n features{'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 500}
best GB score for top n features0.7654320987654321
best GB training score for top n features0.9977553310886644
Sample 10 features from Gradient Boosting
0 Age
2 Name_len
1 Fare
14 Title0
10 Pclass1
16 Title2
13 Sex1
3 Family_size
28 Pclass_Fare_Category3
12 Sex0
Name: feature, dtype: object
========== Decision Tree ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 1.1min remaining: 36.3s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 1.5min finished
best DT params for top n features{'max_depth': 20, 'min_samples_split': 4}
best DT score for top n features0.7643097643097643
best DT training score for top n features0.9618406285072951
Sample 10 features from Decision Tree
14 Title0
1 Fare
0 Age
2 Name_len
3 Family_size
19 Title5
29 Pclass_Fare_Category4
10 Pclass1
22 Fare_id2
4 Cabin0
Name: feature, dtype: object
========== Random Forest ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 1.3min remaining: 41.9s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 1.8min finished
best RF params for top n features{'max_depth': 20, 'min_samples_split': 3, 'n_estimators': 500}
best RF score for top n features0.8294051627384961
best RF training score for top n features0.9809203142536476
Sample 10 Features from Random Forest
2 Name_len
0 Age
1 Fare
13 Sex1
14 Title0
12 Sex0
3 Family_size
11 Pclass2
16 Title2
15 Title1
Name: feature, dtype: object
========== AdaBoost ==========
Fitting 10 folds for each of 2 candidates, totalling 20 fits
[Parallel(n_jobs=25)]: Done 13 out of 20 | elapsed: 1.4min remaining: 43.8s
[Parallel(n_jobs=25)]: Done 20 out of 20 | elapsed: 1.7min finished
best Ada params for top n features{'learning_rate': 0.01, 'n_estimators': 500}
best Ada score for top n features0.8148148148148148
best Ada training score for top n features0.8170594837261503
Sample 10 Features from AdaBoost
14 Title0
1 Fare
33 Family_size_category2
3 Family_size
11 Pclass2
12 Sex0
13 Sex1
4 Cabin0
5 Cabin1
2 Name_len
Name: feature, dtype: object
#Rebuild the modeling datasets with the selected features
train_data_x = pd.DataFrame(feature_train_data_x[top_n_feature])
test_data_x = pd.DataFrame(feature_test_data_x[top_n_feature])
2. Plot part of the selected feature importances
#features_importance concatenates each model's 39 importances in the order RF, AdaBoost, ET, GB, DT, so index into the matching chunk for each model
et_feature_imp = features_importance[2*39:2*39+10].reset_index(drop=True)
gb_feature_imp = features_importance[3*39:3*39+10].reset_index(drop=True)
#One column holds the feature name, the other its importance; take the importance column and rescale it to a 0-100 range
et_feature_importance = 100.0 * (et_feature_imp['importance']/et_feature_imp['importance'].max())
gb_feature_importance = 100.0 * (gb_feature_imp['importance']/gb_feature_imp['importance'].max())
# Get the indexes of all features over the importance threshold
et_important_idx = np.where(et_feature_importance)[0]
gb_important_idx = np.where(gb_feature_importance)[0]
pos = np.arange(et_important_idx.shape[0]) + .5
plt.figure(1, figsize = (18, 8))
plt.subplot(121)
plt.barh(pos, et_feature_importance[et_important_idx][::-1])
plt.yticks(pos, et_feature_imp['feature'][::-1])
plt.xlabel('Relative Importance')
plt.title('Extra Trees Importance')
plt.subplot(122)
plt.barh(pos, gb_feature_importance[gb_important_idx][::-1])
plt.yticks(pos, gb_feature_imp['feature'][::-1])
plt.xlabel('Relative Importance')
plt.title('Gradient Boosting Importance')
plt.show()
3. Model ensembling: common approaches include Bagging, Boosting, Stacking, and Blending
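For contrast with the stacking approach used below, a minimal sketch of a simple soft-voting ensemble, one of the other families just mentioned; the estimator choices and parameters here are illustrative assumptions, not part of the original pipeline:
# from sklearn.ensemble import VotingClassifier
# from sklearn.model_selection import cross_val_score
# voting = VotingClassifier(estimators=[('rf', RandomForestClassifier(n_estimators=200, random_state=0)),
#                                       ('ada', AdaBoostClassifier(n_estimators=200, random_state=0))],
#                           voting='soft')
# cross_val_score(voting, train_data_x, feature_train_data_y, cv=5).mean()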
#1. Stacking
# Stacking-based ensemble:
# Two levels of model fusion are used. Level 1 uses RandomForest, AdaBoost, ExtraTrees, GBDT, DecisionTree, KNN and SVM, seven models in total; Level 2 uses XGBoost, which takes the first level's predictions as features to predict the final result.
# Level 1:
# Stacking feeds the base classifiers' predictions into the second-level model as its training input. We cannot simply train each base model on all of the training data, predict on that same data, and feed the output to the second level: training on the train data and then predicting on the train data causes label leakage. To avoid this, each base learner is run with K-fold cross-validation, and the K models' predictions on their validation folds are stitched together as the input to the next level.
# So we first build a helper that produces these out-of-fold predictions:
from sklearn.model_selection import KFold
#A few parameters
ntrain = train_data_x.shape[0]
ntest = test_data_x.shape[0]
random_state = 0
n_fold = 7
kf = KFold(n_splits=n_fold, random_state=random_state, shuffle=False)
def get_out_fold(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((n_fold, ntest))
    #Split the data into 7 folds; each fold in turn is the validation set and the rest the training set.
    #i is the fold number, train_index the indices of this fold's training rows.
    for i,(train_index, test_index) in enumerate(kf.split(x_train)):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]
        clf.fit(x_tr,y_tr)
        oof_train[test_index] = clf.predict(x_te) #store the out-of-fold predictions from each iteration
        oof_test_skf[i,:] = clf.predict(x_test) #predict the test set with the model trained in this fold
    oof_test[:] = oof_test_skf.mean(axis=0) #average the 7 test-set predictions
    return oof_train.reshape(-1,1), oof_test.reshape(-1,1)
#Use the five base classifiers from the feature-selection step (Random Forest, AdaBoost, Extra Trees, Decision Tree, Gradient Boosting) plus KNN and SVM, seven learners in total
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
rf = RandomForestClassifier(n_estimators=500, warm_start=True, max_features='sqrt', max_depth=6, min_samples_split=3, min_samples_leaf=2, n_jobs=-1, verbose=0)
ada = AdaBoostClassifier(n_estimators=500, learning_rate=0.1)
et = ExtraTreesClassifier(n_estimators=500, n_jobs=-1, max_depth=8, min_samples_leaf=2, verbose=0)
gb = GradientBoostingClassifier(n_estimators=500, learning_rate=0.008, min_samples_split=3, min_samples_leaf=2, verbose=0)
dt = DecisionTreeClassifier(max_depth=8)
knn = KNeighborsClassifier(n_neighbors=2)
svm = SVC(kernel='linear', C=0.025)
#Convert to numpy arrays
x_train = train_data_x.values
x_test = test_data_x.values
y_train = feature_train_data_y.values
# Create our OOF train and test predictions. These base results will be used as new features
rf_oof_train, rf_oof_test = get_out_fold(rf, x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_out_fold(ada, x_train, y_train, x_test) # AdaBoost
et_oof_train, et_oof_test = get_out_fold(et, x_train, y_train, x_test) # Extra Trees
gb_oof_train, gb_oof_test = get_out_fold(gb, x_train, y_train, x_test) # Gradient Boost
dt_oof_train, dt_oof_test = get_out_fold(dt, x_train, y_train, x_test) # Decision Tree
knn_oof_train, knn_oof_test = get_out_fold(knn, x_train, y_train, x_test) # KNeighbors
svm_oof_train, svm_oof_test = get_out_fold(svm, x_train, y_train, x_test) # Support Vector
print('Training is complete')
Training is complete
4. Use XGBoost on the Level-1 outputs as features to predict the final result and generate the submission file
#Concatenate each model's train-set predictions, and likewise the test-set predictions
x_train = np.concatenate((rf_oof_train, ada_oof_train, et_oof_train, gb_oof_train, dt_oof_train, knn_oof_train, svm_oof_train),axis=1)
x_test = np.concatenate((rf_oof_test, ada_oof_test, et_oof_test, gb_oof_test, dt_oof_test, knn_oof_test, svm_oof_test),axis=1)#concatenate horizontally
from xgboost import XGBClassifier
xgb = XGBClassifier(n_estimators=2000, max_depth=4, min_child_weight=2, gamma=0.9, subsample=0.8,
colsample_bytree=0.8, objective='binary:logistic',nthread=-1, scale_pos_weight=1)
xgb.fit(x_train, y_train)#this step effectively refits a model using the 7 base predictions as features against y_train
predictions = xgb.predict(x_test)#then use the fitted model to predict on the test meta-features
predictions
array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0,
1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1,
1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0,
1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0,
0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0,
1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1])
#Save the results for submission: about 91.1% accuracy, Kaggle score 0.77
Stacking_Submission = pd.DataFrame({'PassengerId':PassengerId,'Survived':predictions})
Stacking_Submission.to_csv('E:\\data\\titanic\\StackingSubmission.csv', index=False, sep=',')
IX. Validation: Learning Curves
from sklearn.model_selection import learning_curve
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, n_jobs=1, train_sizes=np.linspace(0.1,1.0,5), verbose=0):
    """
    Generate a simple plot of the test and training learning curve.
    Parameters
    ----------
    estimator : object type that implements the "fit" and "predict" methods
        An object of that type which is cloned for each validation.
    title : string
        Title for the chart.
    X : array-like, shape (n_samples, n_features)
        Training vector, where n_samples is the number of samples and
        n_features is the number of features.
    y : array-like, shape (n_samples) or (n_samples, n_features), optional
        Target relative to X for classification or regression;
        None for unsupervised learning.
    ylim : tuple, shape (ymin, ymax), optional
        Defines minimum and maximum yvalues plotted.
    cv : integer, cross-validation generator, optional
        If an integer is passed, it is the number of folds (defaults to 3).
        Specific cross-validation objects can be passed, see
        sklearn.model_selection module for the list of possible objects
    n_jobs : integer, optional
        Number of jobs to run in parallel (default 1).
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel('Training examples')
    plt.ylabel('Score')
    train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores,axis=1)
    train_scores_std = np.std(train_scores,axis=1)
    test_scores_mean = np.mean(test_scores,axis=1)
    test_scores_std = np.std(test_scores,axis=1)
    plt.grid()
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1, color='r')
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color='g')
    plt.plot(train_sizes, train_scores_mean,'o-', color='r', label='Training score')
    plt.plot(train_sizes, test_scores_mean,'o-', color='g', label='Cross-validation score')
    plt.legend(loc='best')
    return plt
X = x_train
Y = y_train
# RandomForest
rf_parameters = {'n_jobs': -1, 'n_estimators': 500, 'warm_start': True, 'max_depth': 6, 'min_samples_leaf': 2,
'max_features' : 'sqrt','verbose': 0}
# AdaBoost
ada_parameters = {'n_estimators':500, 'learning_rate':0.1}
# ExtraTrees
et_parameters = {'n_jobs': -1, 'n_estimators':500, 'max_depth': 8, 'min_samples_leaf': 2, 'verbose': 0}
# GradientBoosting
gb_parameters = {'n_estimators': 500, 'max_depth': 5, 'min_samples_leaf': 2, 'verbose': 0}
# DecisionTree
dt_parameters = {'max_depth':8}
# KNeighbors
knn_parameters = {'n_neighbors':2}
# SVM
svm_parameters = {'kernel':'linear', 'C':0.025}
# XGB
gbm_parameters = {'n_estimators': 2000, 'max_depth': 4, 'min_child_weight': 2, 'gamma':0.9, 'subsample':0.8,
'colsample_bytree':0.8, 'objective': 'binary:logistic', 'nthread':-1, 'scale_pos_weight':1}
title = 'learning Curves'
plot_learning_curve(RandomForestClassifier(**rf_parameters), title, X, Y, cv=None, n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])
plt.show()
title = 'learning Curves'
plot_learning_curve(XGBClassifier(**gbm_parameters), title, X, Y, cv=None, n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])
plt.show()
title = 'learning Curves'
plot_learning_curve(AdaBoostClassifier(**ada_parameters), title, X, Y, cv=None, n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])
plt.show()
title = 'learning Curves'
plot_learning_curve(SVC(**svm_parameters), title, X, Y, cv=None, n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])
plt.show()
title = 'learning Curves'
plot_learning_curve(ExtraTreesClassifier(**et_parameters), title, X, Y, cv=None, n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])
plt.show()
plot_learning_curve(GradientBoostingClassifier(**gb_parameters), title, X, Y, cv=None, n_jobs=4, train_sizes=[50, 100, 150, 200, 250, 350, 400, 450, 500])
from sklearn.metrics import roc_auc_score,roc_curve,auc
y = pd.read_csv('E:\\data\\titanic\\gender_submission.csv')
y = y['Survived'].values
fpr, tpr, thresholds = roc_curve(y, predictions, pos_label=1)
plt.plot(fpr,tpr)
plt.show()
roc_auc_score(y,predictions)
0.9050751879699248
tpr
array([0. , 0.88157895, 1. ])