Kaggle-titanic-v0

# ----------------Load lib and import data---------------------
import pandas as pd
import numpy as np

%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.linear_model import Perceptron
from sklearn import tree
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier


from sklearn.model_selection import cross_val_score
In [2]:
# import data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
In [3]:
# show the data 
train.head()
Out[3]:
  PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
In [4]:
test.head()
Out[4]:
  PassengerId Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 892 3 Kelly, Mr. James male 34.5 0 0 330911 7.8292 NaN Q
1 893 3 Wilkes, Mrs. James (Ellen Needs) female 47.0 1 0 363272 7.0000 NaN S
2 894 2 Myles, Mr. Thomas Francis male 62.0 0 0 240276 9.6875 NaN Q
3 895 3 Wirz, Mr. Albert male 27.0 0 0 315154 8.6625 NaN S
4 896 3 Hirvonen, Mrs. Alexander (Helga E Lindqvist) female 22.0 1 1 3101298 12.2875 NaN S
In [5]:
# -----------------Understand each feature of your data------------------
train.describe()
Out[5]:
  PassengerId Survived Pclass Age SibSp Parch Fare
count 891.000000 891.000000 891.000000 714.000000 891.000000 891.000000 891.000000
mean 446.000000 0.383838 2.308642 29.699118 0.523008 0.381594 32.204208
std 257.353842 0.486592 0.836071 14.526497 1.102743 0.806057 49.693429
min 1.000000 0.000000 1.000000 0.420000 0.000000 0.000000 0.000000
25% 223.500000 0.000000 2.000000 20.125000 0.000000 0.000000 7.910400
50% 446.000000 0.000000 3.000000 28.000000 0.000000 0.000000 14.454200
75% 668.500000 1.000000 3.000000 38.000000 1.000000 0.000000 31.000000
max 891.000000 1.000000 3.000000 80.000000 8.000000 6.000000 512.329200
In [6]:
print(train.isnull().sum(), train.info())
# then we know that, there's missing in Age, Cabin and Enbarked. Moreover, Cabin lost most of data

RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
(PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64, None)
In [7]:
# check the data condition
train_suv = train[train['Survived']==1]
train_unsuv = train[train['Survived']==0]
suv_num = len(train_suv)
unsuv_num = len(train_unsuv)
print ("survived: %i (%.1f percent) and unsurvived: %i (%.1f percent)" \
       %(suv_num, float(suv_num)/len(train)*100, unsuv_num, float(unsuv_num)/len(train)*100))
survived: 342 (38.4 percent) and unsurvived: 549 (61.6 percent)
In [8]:
plt.figure(figsize=[15, 5])
plt.subplot(111)
age_train = train[['Age','Survived']].groupby(['Age'], as_index=False)
avg_age = age_train.mean()
sns.barplot(x='Age', y='Survived', data=avg_age) 

plt.figure(figsize=[15, 10])
# plt.subplot(331)
# sns.distplot(train_suv['Age'].dropna().values, color='g', label='suv', \
# axlabel='Age', kde_kws={'label': 'suv'}, bins=range(0,81,1))
# sns.distplot(train_unsuv['Age'].dropna().values, color='r', label='unsuv', \
# axlabel='Age', kde_kws={'label': 'unsuv'}, bins=range(0,81,1))
plt.subplot(231)
sns.barplot(x='Pclass', y='Survived', data=train)
plt.subplot(232)
sns.barplot(x='Sex', y='Survived', data=train, order=['female', 'male'])
plt.subplot(233)
sns.barplot(x='SibSp', y='Survived', data=train)
plt.subplot(234)
sns.barplot(x='Parch', y='Survived', data=train)
plt.subplot(235)
sns.barplot(x='Embarked', y='Survived', data=train)
plt.subplot(236)
sns.distplot(train_suv['Fare'].dropna().values, color='g', label='suv', \
 axlabel='Fare', kde_kws={'label': 'suv'})
sns.distplot(train_unsuv['Fare'].dropna().values, color='r', label='unsuv', \
 axlabel='Fare', kde_kws={'label': 'unsuv'})
Out[8]:

from the figs, we can see:

[discrete variables] Pclass: class1 and class2 are more probable to be survived, as what we expect Sex: famele is more probable to be survived, as what we expect SibSp and Parch: not so clear, may need more work to make the data useful Embarked: C is more probable to be survived, don't know why, interesting

[continious variables] Age: children are more probable to be survived, as what we expect Fare: the survived persons's fare is a little larger, as what we expect

[not analysis variables] Ticket: need to process before using Cabin: need to decide if use or not

In [9]:
# sepecial process for Cabin, Ticket and Name
# Cabin
# get the first letter to mark cabin
for item in train['Cabin'].dropna().index:
    train.loc[item, 'Cabin'] = train.loc[item, 'Cabin'][0]
# check if cabin info useful
cabin_mean = train[['Survived', 'Cabin']].iloc[train['Cabin'].dropna().index].groupby(['Cabin'], as_index=False).mean()
plt.figure(figsize=[10, 5])
plt.subplot(121)
sns.barplot(x='Cabin', y='Survived', data=cabin_mean)
# check the difference between cabin known and cabin unknown
cabin_known_mean = train['Survived'][~pd.isnull(train['Cabin'])].mean()
cabin_unknown_mean = train['Survived'][pd.isnull(train['Cabin'])].mean()
cabin_mean = pd.DataFrame([cabin_known_mean, cabin_unknown_mean])
cabin_mean.columns = ['Survived']
# print(cabin_known_mean, cabin_unknown_mean)
plt.subplot(122)
sns.barplot(x=['Cabin_known','Cabin_unknown'],y='Survived', data=cabin_mean) # wow, can be this way
# although cabin known has more possibility to be survived, however, this should be unfair to use this to judge if \
# survive or not, because test data may has no this cabin number. why test, just because don not know the result
Out[9]:
In [11]:
# Family, subjectivily speaking, we think if one person is alone, he may have high risk to be unsurvived
train_family = pd.DataFrame(train['Parch'] + train['SibSp'])
train_family.columns = ['Family']
train_family['Survived'] = train['Survived']
train_family['Family'][train_family['Family']>1] = 1
train_family
plt.figure(figsize=[5, 5])
plt.subplot(111)
sns.barplot(x='Family', y='Survived', data=train_family)
Out[11]:
In [13]:
# i am not sure if it's useful to figure each peer features like the example. However, should make the sex to int\
# and re-calculate
# why has no sex ---------------> my god...
train['Sex'][train['Sex']=='female'] = np.int64(1)
train['Sex'][train['Sex']=='male'] = np.int64(0)

#print(train.head())
#print(type(train.loc[1,'Pclass']))
#plt.figure(figsize=[15, 14])
#plt.subplot(111)
#sns.heatmap(train.drop(['PassengerId'], axis=1).corr(), annot=True)

# -----------------Fill the missing values for both train and test------------------
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:5: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
In [14]:
cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
r = sns.pairplot(data=train.dropna(), vars=cols, hue='Survived', palette=['r', 'b'])
r.set(xticklabels=[])
Out[14]:

From the picture, we can see: 1, "the avg survived probablity is higher" seems because pclass one is high percentage for C. 2, can not explain why Q is better than S, seems other strong features influence this, check sex

In [16]:
tab = pd.crosstab(train['Embarked'], train['Sex'])
print(tab)
tab = tab.div(tab.sum(1).astype('float'), axis=0)
tab.plot(kind='bar', stacked=True)
Sex         0    1
Embarked          
C          95   73
Q          41   36
S         441  203
Out[16]:

seems female probability is higher for Q, which can explain why Q is better. till now, seems this feature is not that important

In [17]:
print(train.isnull().sum(), train.info())
# then we know that, there's missing in Age, Cabin and Enbarked. Moreover, Cabin lost most of data

RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
(PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64, None)
In [18]:
print(test.isnull().sum(), test.info())
# there missing values for age, Cabin, Fare.

RangeIndex: 418 entries, 0 to 417
Data columns (total 11 columns):
PassengerId    418 non-null int64
Pclass         418 non-null int64
Name           418 non-null object
Sex            418 non-null object
Age            332 non-null float64
SibSp          418 non-null int64
Parch          418 non-null int64
Ticket         418 non-null object
Fare           417 non-null float64
Cabin          91 non-null object
Embarked       418 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 36.0+ KB
(PassengerId      0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64, None)

what we want is reducing the influence of the missing values, so how to fill the missing values

In [19]:
# --------------------------------Fill missing valuse for data------------------------------------
# filling age values, both for train and test data
# choose avg+_std as filling values
avg_age_train = train['Age'].mean()
std_age_train = train['Age'].std()
print(avg_age_train, std_age_train)
num_null_age_train = sum(train['Age'].isnull())
print(num_null_age_train)
rand_fill_age = np.random.randint(avg_age_train-std_age_train, avg_age_train+std_age_train,size=num_null_age_train)
train['Age'][train['Age'].isnull()] = rand_fill_age

avg_age_test = test['Age'].mean()
std_age_test = test['Age'].std()
print(avg_age_test, std_age_test)
num_null_age_test = sum(test['Age'].isnull())
print(num_null_age_test)
rand_fill_age = np.random.randint(avg_age_test-std_age_test, avg_age_test+std_age_test,size=num_null_age_test)
test['Age'][test['Age'].isnull()] = rand_fill_age
(29.69911764705882, 14.526497332334042)
177
(30.272590361445783, 14.181209235624422)
86
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:10: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:18: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [20]:
# filling fare values
test['Fare'][test['Fare'].isnull()] = test['Fare'].median()
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
In [21]:
# --------------------------------Engineered features-----------------------------------------
data = pd.concat([train.drop(['Survived'], axis=1), test])
print(len(train), len(test), len(data))
(891, 418, 1309)
In [22]:
# remove PassengerId, Name, Embarked
data= data.drop(['PassengerId', 'Name', 'Embarked'], axis=1)
print(data.columns)
# add family, remove SibSp and Parch
data['Family'] = data['SibSp'] + data['Parch']
data['Family'][data['Family']>1] = 1
data = data.drop(['SibSp', 'Parch'], axis=1)
print(data.head())
# change ticket to Tshare 
data_share_index = []
data_no_share_index = []
data_ticket_group = data.groupby(['Ticket'], as_index=False)
for ticket, group in data_ticket_group:
    if len(data_ticket_group.get_group(ticket)) > 1:
        #print(data_ticket_group.get_group(ticket))
        data_share_index.extend(data_ticket_group.get_group(ticket).index)
    else:
        data_no_share_index.extend(data_ticket_group.get_group(ticket).index)
data['TShare'] = 0
for item in data_share_index:
    data.loc[item, 'TShare'] = 1
data=data.drop(['Ticket'], axis=1)
# change cabin to 0 and 1
cabin_null = pd.isnull(data['Cabin'])
cabin_no_null = ~pd.isnull(data['Cabin'])
data['Cabin'][cabin_null] = 0
data['Cabin'][cabin_no_null] = 1
data['Cabin'] = data['Cabin'].astype(int)
# change sex to 0 and 1
data['Sex'][data['Sex']=='female'] = np.int64(1)
data['Sex'][data['Sex']=='male'] = np.int64(0)
Index([u'Pclass', u'Sex', u'Age', u'SibSp', u'Parch', u'Ticket', u'Fare',
       u'Cabin'],
      dtype='object')
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:6: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
   Pclass Sex   Age            Ticket     Fare Cabin  Family
0       3   0  22.0         A/5 21171   7.2500   NaN       1
1       1   1  38.0          PC 17599  71.2833     C       1
2       3   1  26.0  STON/O2. 3101282   7.9250   NaN       0
3       1   1  35.0            113803  53.1000     C       1
4       3   0  35.0            373450   8.0500   NaN       0
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:26: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:27: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:30: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:31: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
In [23]:
# test remove col
# data=data.drop(['Tshare'], axis=1)
data_train = data[:len(train)]
data_train['Survived'] = train['Survived']
data_test = data[len(train):]
print(len(train), data_train.head())
print('\n')
print(len(test), data_test.head())
x_data_train = data_train.drop(['Survived'], axis=1)
y_data_train = data_train['Survived']
x_data_test = data_test
(891,    Pclass Sex   Age     Fare  Cabin  Family  TShare  Survived
0       3   0  22.0   7.2500      0       1       0         0
1       1   1  38.0  71.2833      1       1       1         1
2       3   1  26.0   7.9250      0       0       0         1
3       1   1  35.0  53.1000      1       1       1         1
4       3   0  35.0   8.0500      0       0       1         0)


(418,    Pclass Sex   Age     Fare  Cabin  Family  TShare
0       3   0  34.5   7.8292      0       0       0
1       3   1  47.0   7.0000      0       1       1
2       2   0  62.0   9.6875      0       0       0
3       3   0  27.0   8.6625      0       0       1
4       3   1  22.0  12.2875      0       1       1)
/usr/local/lib/python2.7/site-packages/ipykernel_launcher.py:4: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
In [24]:
# logistic regression
logreg = LogisticRegressionCV(max_iter=100)
logreg.fit(x_data_train, y_data_train)
print(logreg.get_params())
print(logreg.score(x_data_train, y_data_train))
logreg_score = cross_val_score(logreg, x_data_train, y_data_train, cv=5).mean()
print(logreg_score)
{'scoring': None, 'n_jobs': 1, 'verbose': 0, 'intercept_scaling': 1.0, 'fit_intercept': True, 'max_iter': 100, 'refit': True, 'penalty': 'l2', 'multi_class': 'ovr', 'random_state': None, 'dual': False, 'tol': 0.0001, 'Cs': 10, 'solver': 'lbfgs', 'cv': None, 'class_weight': None}
0.803591470258
0.791247282153
In [25]:
# perceptron
ptron = Perceptron(max_iter=40)
ptron.fit(x_data_train, y_data_train)
print(ptron.get_params())
print(ptron.score(x_data_train, y_data_train))
ptron_score = cross_val_score(ptron, x_data_train, y_data_train, cv=5).mean()
print(ptron_score)
{'warm_start': False, 'n_iter': None, 'n_jobs': 1, 'eta0': 1.0, 'verbose': 0, 'shuffle': True, 'fit_intercept': True, 'max_iter': 40, 'penalty': None, 'random_state': 0, 'tol': None, 'alpha': 0.0001, 'class_weight': None}
0.675645342312
0.720622905205
In [26]:
# KNN
kNN = KNeighborsClassifier(n_neighbors=4, weights='distance')
kNN.fit(x_data_train, y_data_train)
print(kNN.get_params())
print(kNN.score(x_data_train, y_data_train))
kNN_score = cross_val_score(kNN, x_data_train, y_data_train, cv=5).mean()
print(kNN_score)
{'n_neighbors': 4, 'n_jobs': 1, 'algorithm': 'auto', 'metric': 'minkowski', 'metric_params': None, 'p': 2, 'weights': 'distance', 'leaf_size': 30}
0.991021324355
0.676940776023
In [38]:
# SVM
svm_kernel = svm.SVC(class_weight='balanced')
svm_kernel.fit(x_data_train, y_data_train)
print(svm_kernel.get_params())
print(svm_kernel.score(x_data_train, y_data_train))
svm_score = cross_val_score(svm_kernel, x_data_train, y_data_train, cv=5).mean()
print(svm_score)
{'kernel': 'rbf', 'C': 1.0, 'verbose': False, 'probability': False, 'degree': 3, 'shrinking': True, 'max_iter': -1, 'decision_function_shape': 'ovr', 'random_state': None, 'tol': 0.001, 'cache_size': 200, 'coef0': 0.0, 'gamma': 'auto', 'class_weight': 'balanced'}
0.882154882155
0.691535318093
In [101]:
# Decision tree
dec_tree = tree.DecisionTreeClassifier(min_weight_fraction_leaf=0.01)
dec_tree.fit(x_data_train, y_data_train)
print(dec_tree.get_params())
print(dec_tree.score(x_data_train, y_data_train))
dec_score = cross_val_score(dec_tree, x_data_train, y_data_train, cv=10).mean()
print(dec_score)
{'presort': False, 'splitter': 'best', 'min_impurity_decrease': 0.0, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.01, 'criterion': 'gini', 'random_state': None, 'min_impurity_split': None, 'max_features': None, 'max_depth': None, 'class_weight': None}
0.855218855219
0.803653104074
In [214]:
# random forest
rand_forest = RandomForestClassifier(n_estimators=100, min_weight_fraction_leaf=0.01)
rand_forest.fit(x_data_train, y_data_train)
print(rand_forest.get_params())
print(rand_forest.score(x_data_train, y_data_train))
rand_forest_score = cross_val_score(rand_forest, x_data_train, y_data_train, cv=10).mean()
print(rand_forest_score)
{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'min_impurity_decrease': 0.0, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': True, 'min_samples_leaf': 1, 'n_estimators': 100, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.01, 'criterion': 'gini', 'random_state': None, 'min_impurity_split': None, 'max_features': 'auto', 'max_depth': None, 'class_weight': None}
0.868686868687
0.818348371354

random forest performance seems just a little better than decision tree

In [181]:
# bagging
bagging = BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(min_weight_fraction_leaf=0.01), n_estimators=20)
bagging.fit(x_data_train, y_data_train)
print(bagging.get_params())
print(bagging.score(x_data_train, y_data_train))
bagging_score = cross_val_score(bagging, x_data_train, y_data_train, cv=5).mean()
print(bagging_score)
{'base_estimator__min_samples_split': 2, 'n_jobs': 1, 'verbose': 0, 'base_estimator': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.01, presort=False,
            random_state=None, splitter='best'), 'max_features': 1.0, 'base_estimator__min_samples_leaf': 1, 'max_samples': 1.0, 'base_estimator__max_depth': None, 'base_estimator__max_features': None, 'base_estimator__min_weight_fraction_leaf': 0.01, 'random_state': None, 'base_estimator__class_weight': None, 'oob_score': False, 'base_estimator__splitter': 'best', 'base_estimator__random_state': None, 'base_estimator__presort': False, 'base_estimator__criterion': 'gini', 'n_estimators': 20, 'base_estimator__max_leaf_nodes': None, 'bootstrap_features': False, 'warm_start': False, 'bootstrap': True, 'base_estimator__min_impurity_decrease': 0.0, 'base_estimator__min_impurity_split': None}
0.868686868687
0.808132883796
In [177]:
# extra tree
extra_tree = ExtraTreesClassifier(n_estimators=20, min_samples_split=20)
extra_tree.fit(x_data_train, y_data_train)
print(extra_tree.get_params())
print(extra_tree.score(x_data_train, y_data_train))
extra_tree_score = cross_val_score(extra_tree, x_data_train, y_data_train, cv=5).mean()
print(extra_tree_score)
{'warm_start': False, 'oob_score': False, 'n_jobs': 1, 'min_impurity_decrease': 0.0, 'verbose': 0, 'max_leaf_nodes': None, 'bootstrap': False, 'min_samples_leaf': 1, 'n_estimators': 20, 'min_samples_split': 20, 'min_weight_fraction_leaf': 0.0, 'criterion': 'gini', 'random_state': None, 'min_impurity_split': None, 'max_features': 'auto', 'max_depth': None, 'class_weight': None}
0.859708193042
0.81264630981
In [182]:
# Gradient boosting
grad_boost = GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(x_data_train, y_data_train)
print(grad_boost.get_params())
print(grad_boost.score(x_data_train, y_data_train))
grad_boost_score = cross_val_score(grad_boost, x_data_train, y_data_train, cv=5).mean()
print(grad_boost_score)
{'presort': 'auto', 'loss': 'deviance', 'min_impurity_decrease': 0.0, 'verbose': 0, 'subsample': 1.0, 'max_leaf_nodes': None, 'learning_rate': 0.1, 'warm_start': False, 'min_samples_leaf': 1, 'n_estimators': 50, 'min_samples_split': 2, 'init': None, 'min_weight_fraction_leaf': 0.0, 'criterion': 'friedman_mse', 'random_state': None, 'min_impurity_split': None, 'max_features': None, 'max_depth': 3}
0.863075196409
0.818239179059
In [171]:
# adaboost
adaboost = AdaBoostClassifier(n_estimators=50)
adaboost.fit(x_data_train, y_data_train)
print(adaboost.get_params())
print(adaboost.score(x_data_train, y_data_train))
adaboost_score = cross_val_score(adaboost, x_data_train, y_data_train, cv=5).mean()
print(adaboost_score)
{'n_estimators': 50, 'base_estimator': None, 'random_state': None, 'learning_rate': 1.0, 'algorithm': 'SAMME.R'}
0.829405162738
0.792364387807
In [34]:
# Bayes
gauss = GaussianNB()
gauss.fit(x_data_train, y_data_train)
print(gauss.get_params())
print(gauss.score(x_data_train, y_data_train))
gauss_score = cross_val_score(gauss, x_data_train, y_data_train, cv=5).mean()
print(gauss_score)
{'priors': None}
0.742985409652
0.740835850367
In [35]:
# voting
'''
voting = VotingClassifier()
voting.fit(x_data_train, y_data_train)
print(voting.score(x_data_train, y_data_train))
voting_score = cross_val_score(voting, x_data_train, y_data_train, cv=5).mean()
print(voting_score)
'''
Out[35]:
'\nvoting = VotingClassifier()\nvoting.fit(x_data_train, y_data_train)\nprint(voting.score(x_data_train, y_data_train))\nvoting_score = cross_val_score(voting, x_data_train, y_data_train, cv=5).mean()\nprint(voting_score)\n'
In [216]:
clf = rand_forest
pred_test = clf.predict(data_test)
submit = pd.DataFrame({'PassengerId': test.loc[:, 'PassengerId'],'Survived':pred_test.T})
print(submit.head())
submit.to_csv("output.csv", index=False)
   PassengerId  Survived
0          892         0
1          893         0
2          894         0
3          895         0
4          896         1

你可能感兴趣的:(Kaggle,kaggle,titanic)