比赛地址
kaggle泰坦尼克号比赛说明
泰坦尼克号的沉没是历史上最著名的沉船之一。1912年4月15日,在她的首航中,泰坦尼克号在与冰山相撞后沉没,在2224名乘客和机组人员中造成1502人死亡。这场耸人听闻的悲剧震惊了国际社会,并促进了更严格的船舶安全规定产生。
造成海难失事的原因之一是乘客和机组人员没有足够的救生艇。尽管幸存下沉有一些运气因素,但有些人比其他人更容易生存,比如女人,孩子和上流社会。
在这个挑战中,我们要求您完成对哪些人可能存活的分析。特别是,我们要求您运用机器学习工具来预测哪些乘客在悲剧中幸存下来。
import os
import pandas as pd
import numpy as np
import warnings
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import model_selection
excl = lambda x: os.popen(x).readlines()
%matplotlib inline
warnings.filterwarnings('ignore')
train = pd.read_csv('./titanic_datas/train.csv')
train.head()
|
PassengerId |
Survived |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
1 |
0 |
3 |
Braund, Mr. Owen Harris |
male |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
1 |
2 |
1 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
female |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
2 |
3 |
1 |
3 |
Heikkinen, Miss. Laina |
female |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
3 |
4 |
1 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
female |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
4 |
5 |
0 |
3 |
Allen, Mr. William Henry |
male |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
test = pd.read_csv('./titanic_datas/test.csv')
test.head()
|
PassengerId |
Pclass |
Name |
Sex |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
0 |
892 |
3 |
Kelly, Mr. James |
male |
34.5 |
0 |
0 |
330911 |
7.8292 |
NaN |
Q |
1 |
893 |
3 |
Wilkes, Mrs. James (Ellen Needs) |
female |
47.0 |
1 |
0 |
363272 |
7.0000 |
NaN |
S |
2 |
894 |
2 |
Myles, Mr. Thomas Francis |
male |
62.0 |
0 |
0 |
240276 |
9.6875 |
NaN |
Q |
3 |
895 |
3 |
Wirz, Mr. Albert |
male |
27.0 |
0 |
0 |
315154 |
8.6625 |
NaN |
S |
4 |
896 |
3 |
Hirvonen, Mrs. Alexander (Helga E Lindqvist) |
female |
22.0 |
1 |
1 |
3101298 |
12.2875 |
NaN |
S |
train.info()
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
test.info()
RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId 418 non-null int64
Pclass 418 non-null int64
Name 418 non-null object
Sex 418 non-null object
Age 418 non-null float64
SibSp 418 non-null int64
Parch 418 non-null int64
Ticket 418 non-null object
Fare 418 non-null float64
Cabin 91 non-null object
Embarked 418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
train.describe()
|
PassengerId |
Survived |
Pclass |
Age |
SibSp |
Parch |
Fare |
count |
891.000000 |
891.000000 |
891.000000 |
714.000000 |
891.000000 |
891.000000 |
891.000000 |
mean |
446.000000 |
0.383838 |
2.308642 |
29.699118 |
0.523008 |
0.381594 |
32.204208 |
std |
257.353842 |
0.486592 |
0.836071 |
14.526497 |
1.102743 |
0.806057 |
49.693429 |
min |
1.000000 |
0.000000 |
1.000000 |
0.420000 |
0.000000 |
0.000000 |
0.000000 |
25% |
223.500000 |
0.000000 |
2.000000 |
20.125000 |
0.000000 |
0.000000 |
7.910400 |
50% |
446.000000 |
0.000000 |
3.000000 |
28.000000 |
0.000000 |
0.000000 |
14.454200 |
75% |
668.500000 |
1.000000 |
3.000000 |
38.000000 |
1.000000 |
0.000000 |
31.000000 |
max |
891.000000 |
1.000000 |
3.000000 |
80.000000 |
8.000000 |
6.000000 |
512.329200 |
test.describe()
|
PassengerId |
Pclass |
Age |
SibSp |
Parch |
Fare |
count |
418.000000 |
418.000000 |
418.000000 |
418.000000 |
418.000000 |
418.000000 |
mean |
1100.500000 |
2.265550 |
30.154603 |
0.447368 |
0.392344 |
35.619000 |
std |
120.810458 |
0.841838 |
12.636666 |
0.896760 |
0.981429 |
55.840751 |
min |
892.000000 |
1.000000 |
0.170000 |
0.000000 |
0.000000 |
0.000000 |
25% |
996.250000 |
1.000000 |
23.000000 |
0.000000 |
0.000000 |
7.895800 |
50% |
1100.500000 |
3.000000 |
29.699118 |
0.000000 |
0.000000 |
14.454200 |
75% |
1204.750000 |
3.000000 |
35.750000 |
1.000000 |
0.000000 |
31.500000 |
max |
1309.000000 |
3.000000 |
76.000000 |
8.000000 |
9.000000 |
512.329200 |
fare_mean = train["Fare"].mean()
test.loc[pd.isnull(test.Fare),'Fare'] = fare_mean
embarked_mode = train['Embarked'].mode()
train.loc[pd.isnull(train.Embarked),['Embarked']] = embarked_mode[0]
age_mean = train['Age'].mean()
train.loc[pd.isnull(train.Age),['Age']] = age_mean
test.loc[pd.isnull(test.Age),['Age']] = age_mean
label = train['Survived']
train.drop('Survived',axis=1,inplace=True)
X_train,X_test,Y_train,Y_test = train_test_split(train,label,test_size = 0.3,random_state = 1)
X_train['Survived'] = Y_train
X_test['Survived'] = Y_test
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Sex', 'Survived', data=X_train, ax=axis1)
sns.barplot('Sex', 'Survived', data=X_test, ax=axis2)
train['Sex'] = train['Sex'].apply(lambda x: 1 if x == 'male' else 0)
test['Sex'] = test['Sex'].apply(lambda x: 1 if x == 'male' else 0)
train = pd.get_dummies(data= train,columns=['Sex'])
test = pd.get_dummies(data= test,columns=['Sex'])
def Name_Title_Code(x):
if x == 'Mr.':
return 1
if (x == 'Mrs.') or (x=='Ms.') or (x=='Lady.') or (x == 'Mlle.') or (x =='Mme'):
return 2
if x == 'Miss':
return 3
if x == 'Rev.':
return 4
return 5
X_train['Name_Title'] = X_train['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split()[0])
X_test['Name_Title'] = X_test['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split()[0])
X_train.groupby('Name_Title')['Survived'].count()
Name_Title
Capt. 1
Col. 2
Don. 1
Dr. 4
Lady. 1
Major. 1
Master. 27
Miss. 126
Mlle. 1
Mme. 1
Mr. 365
Mrs. 87
Rev. 5
the 1
Name: Survived, dtype: int64
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Name_Title', 'Survived', data=X_train.sort_values('Name_Title'), ax=axis1)
sns.barplot('Name_Title', 'Survived', data=X_test.sort_values('Name_Title'), ax=axis2)
def Name_Title_Code(x):
if x == 'Mr.':
return 1
if (x == 'Mrs.') or (x=='Ms.') or (x=='Lady.') or (x == 'Mlle.') or (x =='Mme'):
return 2
if x == 'Miss':
return 3
if x == 'Rev.':
return 4
return 5
train['Name_Title'] = train['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split()[0])
test['Name_Title'] = test['Name'].apply(lambda x: x.split(',')[1]).apply(lambda x: x.split()[0])
train['Name_Title'] = train['Name_Title'].apply(Name_Title_Code)
test['Name_Title'] = test['Name_Title'].apply(Name_Title_Code)
train = pd.get_dummies(columns = ['Name_Title'], data = train)
test = pd.get_dummies(columns = ['Name_Title'], data = test)
train.head()
|
PassengerId |
Pclass |
Name |
Age |
SibSp |
Parch |
Ticket |
Fare |
Cabin |
Embarked |
Sex_0 |
Sex_1 |
Name_Title_1 |
Name_Title_2 |
Name_Title_4 |
Name_Title_5 |
0 |
1 |
3 |
Braund, Mr. Owen Harris |
22.0 |
1 |
0 |
A/5 21171 |
7.2500 |
NaN |
S |
0 |
1 |
1 |
0 |
0 |
0 |
1 |
2 |
1 |
Cumings, Mrs. John Bradley (Florence Briggs Th... |
38.0 |
1 |
0 |
PC 17599 |
71.2833 |
C85 |
C |
1 |
0 |
0 |
1 |
0 |
0 |
2 |
3 |
3 |
Heikkinen, Miss. Laina |
26.0 |
0 |
0 |
STON/O2. 3101282 |
7.9250 |
NaN |
S |
1 |
0 |
0 |
0 |
0 |
1 |
3 |
4 |
1 |
Futrelle, Mrs. Jacques Heath (Lily May Peel) |
35.0 |
1 |
0 |
113803 |
53.1000 |
C123 |
S |
1 |
0 |
0 |
1 |
0 |
0 |
4 |
5 |
3 |
Allen, Mr. William Henry |
35.0 |
0 |
0 |
373450 |
8.0500 |
NaN |
S |
0 |
1 |
1 |
0 |
0 |
0 |
X_train['Name_len'] = X_train['Name'].apply(lambda x: len(x))
X_test['Name_len'] = X_test['Name'].apply(lambda x: len(x))
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(20,10))
sns.barplot('Name_len', 'Survived', data=X_train.sort_values(['Name_len']), ax=axis1)
sns.barplot('Name_len', 'Survived', data=X_test.sort_values(['Name_len']), ax=axis2)
train['Name_len'] = train['Name'].apply(lambda x: len(x))
test['Name_len'] = test['Name'].apply(lambda x: len(x))
def Ticket_First_Let(x):
return x[0]
X_train['Ticket_First_Letter'] = X_train['Ticket'].apply(Ticket_First_Let)
X_test['Ticket_First_Letter'] = X_test['Ticket'].apply(Ticket_First_Let)
X_train.groupby('Ticket_First_Letter')['Survived'].count()
Ticket_First_Letter
1 87
2 129
3 225
4 10
5 2
6 6
7 6
8 1
9 1
A 20
C 32
F 3
L 3
P 49
S 40
W 9
Name: Survived, dtype: int64
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Ticket_First_Letter', 'Survived', data=X_train.sort_values('Ticket_First_Letter'), ax=axis1)
sns.barplot('Ticket_First_Letter', 'Survived', data=X_test.sort_values('Ticket_First_Letter'), ax=axis2)
def Ticket_First_Letter_Code(x):
if (x == '1'):
return 1
if x == '3':
return 2
if x == '4':
return 3
if x == 'C':
return 4
if x == 'S':
return 5
if x == 'P':
return 6
if x == '6':
return 7
if x == '7':
return 8
if x == 'A':
return 9
if x == 'W':
return 10
return 11
train['Ticket_First_Letter'] = train['Ticket'].apply(Ticket_First_Let)
test['Ticket_First_Letter'] = test['Ticket'].apply(Ticket_First_Let)
train['Ticket_First_Letter'].unique()
array(['A', 'P', 'S', '1', '3', '2', 'C', '7', 'W', '4', 'F', 'L', '9',
'6', '5', '8'], dtype=object)
test['Ticket_First_Letter'].unique()
array(['3', '2', '7', 'A', '6', 'W', 'S', 'P', 'C', '1', 'F', '4', '9',
'L'], dtype=object)
train['Ticket_First_Letter'] = train['Ticket_First_Letter'].apply(Ticket_First_Letter_Code)
test['Ticket_First_Letter'] = test['Ticket_First_Letter'].apply(Ticket_First_Letter_Code)
X_train['Cabin'] = X_train['Cabin'].fillna('Missing')
X_test['Cabin'] = X_test['Cabin'].fillna('Missing')
def Cabin_First_Letter(x):
if x == 'Missing':
return 'XX'
return x[0]
X_train['Cabin_First_Letter'] = X_train['Cabin'].apply(Cabin_First_Letter)
X_test['Cabin_First_Letter'] = X_test['Cabin'].apply(Cabin_First_Letter)
X_train.groupby('Cabin_First_Letter')['Survived'].count()
Cabin_First_Letter
A 12
B 28
C 41
D 21
E 22
F 8
G 3
XX 488
Name: Survived, dtype: int64
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Cabin_First_Letter', 'Survived', data=X_train.sort_values('Cabin_First_Letter'), ax=axis1)
sns.barplot('Cabin_First_Letter', 'Survived', data=X_test.sort_values('Cabin_First_Letter'), ax=axis2)
def Cabin_First_Letter_Code(x):
if x == 'XX':
return 1
if x == 'B':
return 2
if x == 'C':
return 3
if x == 'D':
return 4
return 5
train['Cabin'] = train['Cabin'].fillna('Missing')
test['Cabin'] = test['Cabin'].fillna('Missing')
train['Cabin_First_Letter'] = train['Cabin'].apply(Cabin_First_Letter)
test['Cabin_First_Letter'] = test['Cabin'].apply(Cabin_First_Letter)
train['Cabin_First_Letter'] = train['Cabin_First_Letter'].apply(Cabin_First_Letter_Code)
test['Cabin_First_Letter'] = test['Cabin_First_Letter'].apply(Cabin_First_Letter_Code)
train = pd.get_dummies(columns = ['Cabin_First_Letter'], data = train)
test = pd.get_dummies(columns = ['Cabin_First_Letter'], data = test)
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Embarked', 'Survived', data=X_train.sort_values('Embarked'), ax=axis1)
sns.barplot('Embarked', 'Survived', data=X_test.sort_values('Embarked'), ax=axis2)
train = pd.get_dummies(train,columns = ['Embarked'])
test = pd.get_dummies(test,columns = ['Embarked'])
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('SibSp', 'Survived', data=X_train.sort_values('SibSp'), ax=axis1)
sns.barplot('SibSp', 'Survived', data=X_test.sort_values('SibSp'), ax=axis2)
X_train['Fam_Size'] = X_train['SibSp'] + X_train['Parch']
X_test['Fam_Size'] = X_test['SibSp'] + X_test['Parch']
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Fam_Size', 'Survived', data=X_train.sort_values('Parch'), ax=axis1)
sns.barplot('Fam_Size', 'Survived', data=X_test.sort_values('Parch'), ax=axis2)
def Family_feature(train, test):
for i in [train, test]:
i['Fam_Size'] = np.where((i['SibSp']+i['Parch']) == 0 , 'Solo',
np.where((i['SibSp']+i['Parch']) <= 3,'Nuclear', 'Big'))
del i['SibSp']
del i['Parch']
return train, test
train, test = Family_feature(train, test)
train = pd.get_dummies(train,columns = ['Fam_Size'])
test = pd.get_dummies(test,columns = ['Fam_Size'])
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.barplot('Pclass', 'Survived', data=X_train.sort_values('Pclass'), ax=axis1)
sns.barplot('Pclass', 'Survived', data=X_test.sort_values('Pclass'), ax=axis2)
train['Pclass_1'] = np.int32(train['Pclass'] == 1)
train['Pclass_2'] = np.int32(train['Pclass'] == 2)
train['Pclass_3'] = np.int32(train['Pclass'] == 3)
test['Pclass_1'] = np.int32(test['Pclass'] == 1)
test['Pclass_2'] = np.int32(test['Pclass'] == 2)
test['Pclass_3'] = np.int32(test['Pclass'] == 3)
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.distplot(X_train[X_train.Survived==1]['Age'].dropna().values, bins=range(0, 81, 6),color='red', ax=axis1)
sns.distplot(X_train[X_train.Survived==0]['Age'].dropna().values, bins=range(0, 81, 6),color = 'blue', ax=axis1)
sns.distplot(X_test[X_test.Survived==1]['Age'].dropna().values, bins=range(0, 81, 6),color='red', ax=axis2)
sns.distplot(X_test[X_test.Survived==0]['Age'].dropna().values, bins=range(0, 81, 6),color = 'blue', ax=axis2)
train['Small_Age'] = np.int32(train['Age'] <= 5)
train['Old_Age'] = np.int32(train['Age'] >= 65)
train['Middle_Age'] = np.int32((train['Age'] >= 15) & (train['Age'] <= 25))
test['Small_Age'] = np.int32(test['Age'] <= 5)
test['Old_Age'] = np.int32(test['Age'] >= 65)
test['Middle_Age'] = np.int32((test['Age'] >= 15) & (test['Age'] <= 25))
X_train['Fare'] = X_train['Fare'] + 1
X_test['Fare'] = X_test['Fare'] + 1
X_train['Fare'] = X_train['Fare'].apply(np.log)
X_test['Fare'] = X_test['Fare'].apply(np.log)
fig, (axis1,axis2) = plt.subplots(1,2,figsize=(15,5))
sns.distplot(X_train[X_train.Survived==1]['Fare'].dropna().values, bins=range(0, 10, 1),color='red', ax=axis1)
sns.distplot(X_train[X_train.Survived==0]['Fare'].dropna().values, bins=range(0, 10, 1),color = 'blue', ax=axis1)
sns.distplot(X_test[X_test.Survived==1]['Fare'].dropna().values, bins=range(0, 10, 1),color='red', ax=axis2)
sns.distplot(X_test[X_test.Survived==0]['Fare'].dropna().values, bins=range(0, 10, 1),color = 'blue', ax=axis2)
train['Fare'] = train['Fare'] + 1
test['Fare'] = test['Fare'] + 1
train['Fare'] = train['Fare'].apply(np.log)
test['Fare'] = test['Fare'].apply(np.log)
train['Fare_0_2'] = np.int32(train['Fare'] <= 2)
train['Fare_2_3'] = np.int32((train['Fare'] > 2) & (train['Fare'] <= 3) )
train['Fare_3_4'] = np.int32((train['Fare'] > 3) & (train['Fare'] <= 4) )
train['Fare_4_5'] = np.int32((train['Fare'] > 4) & (train['Fare'] <= 5))
train['Fare_5_'] = np.int32(train['Fare'] > 5)
test['Fare_0_2'] = np.int32(test['Fare'] <= 2)
test['Fare_2_3'] = np.int32((test['Fare'] > 2) & (test['Fare'] <= 3) )
test['Fare_3_4'] = np.int32((test['Fare'] > 3) & (test['Fare'] <= 4) )
test['Fare_4_5'] = np.int32((test['Fare'] > 4) & (test['Fare'] <= 5))
test['Fare_5_'] = np.int32(test['Fare'] > 5)
train.drop(['Ticket','PassengerId','Name','Age','Cabin','Pclass'],axis = 1, inplace=True)
test.drop( ['PassengerId','Ticket','Name','Age','Cabin','Pclass'],axis =1, inplace=True)
X_train_ = train.loc[X_train.index]
X_test_ = train.loc[X_test.index]
Y_train_ = label.loc[X_train.index]
Y_test_ = label.loc[X_test.index]
X_test_ = X_test_[X_train_.columns]
pd.set_option('display.max_columns',50)
train.head()
|
Fare |
Sex_0 |
Sex_1 |
Name_Title_1 |
Name_Title_2 |
Name_Title_4 |
Name_Title_5 |
Name_len |
Ticket_First_Letter |
Cabin_First_Letter_1 |
Cabin_First_Letter_2 |
Cabin_First_Letter_3 |
Cabin_First_Letter_4 |
Cabin_First_Letter_5 |
Embarked_C |
Embarked_Q |
Embarked_S |
Fam_Size_Big |
Fam_Size_Nuclear |
Fam_Size_Solo |
Pclass_1 |
Pclass_2 |
Pclass_3 |
Small_Age |
Old_Age |
Middle_Age |
Fare_0_2 |
Fare_2_3 |
Fare_3_4 |
Fare_4_5 |
Fare_5_ |
0 |
2.110213 |
0 |
1 |
1 |
0 |
0 |
0 |
23 |
9 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
0 |
1 |
4.280593 |
1 |
0 |
0 |
1 |
0 |
0 |
51 |
6 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
1 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
2 |
2.188856 |
1 |
0 |
0 |
0 |
0 |
1 |
22 |
5 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
3 |
3.990834 |
1 |
0 |
0 |
1 |
0 |
0 |
44 |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
1 |
0 |
1 |
0 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
4 |
2.202765 |
0 |
1 |
1 |
0 |
0 |
0 |
24 |
2 |
1 |
0 |
0 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
1 |
0 |
0 |
0 |
0 |
1 |
0 |
0 |
0 |
test = test[train.columns]
rf_ = RandomForestClassifier(criterion='gini',
n_estimators=700,
min_samples_split=16,
min_samples_leaf=1,
max_features='auto',
random_state=10,
n_jobs=-1)
rf_.fit(X_train_,Y_train_)
rf_.score(X_test_,Y_test_)
0.7910447761194029
rf_.fit(train,label)
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None, max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=16,
min_weight_fraction_leaf=0.0, n_estimators=700, n_jobs=-1,
oob_score=False, random_state=10, verbose=0, warm_start=False)
pd.concat((pd.DataFrame(train.columns, columns = ['variable']),
pd.DataFrame(rf_.feature_importances_, columns = ['importance'])),
axis = 1).sort_values(by='importance', ascending = False)[:20]
|
variable |
importance |
1 |
Sex_0 |
0.136334 |
3 |
Name_Title_1 |
0.125036 |
2 |
Sex_1 |
0.118254 |
0 |
Fare |
0.096483 |
7 |
Name_len |
0.089186 |
6 |
Name_Title_5 |
0.055360 |
22 |
Pclass_3 |
0.050127 |
8 |
Ticket_First_Letter |
0.045200 |
9 |
Cabin_First_Letter_1 |
0.034312 |
17 |
Fam_Size_Big |
0.033951 |
4 |
Name_Title_2 |
0.033745 |
20 |
Pclass_1 |
0.022517 |
18 |
Fam_Size_Nuclear |
0.021219 |
21 |
Pclass_2 |
0.015824 |
23 |
Small_Age |
0.014996 |
27 |
Fare_2_3 |
0.013717 |
16 |
Embarked_S |
0.012581 |
19 |
Fam_Size_Solo |
0.011034 |
29 |
Fare_4_5 |
0.010546 |
14 |
Embarked_C |
0.008645 |
excl("ls titanic_datas")
['gender_submission.csv\n', 'test.csv\n', 'train.csv\n']
submit = pd.read_csv('./titanic_datas/gender_submission.csv')
submit.set_index('PassengerId',inplace=True)
res_rf = rf_.predict(test)
submit['Survived'] = res_rf
submit['Survived'] = submit['Survived'].apply(int)
submit.to_csv('./titanic_datas/submit.csv')
excl("ls titanic_datas")
['gender_submission.csv\n', 'submit.csv\n', 'test.csv\n', 'train.csv\n']
Your Best Entry
Your submission scored 0.81339