数据分析
train.csv的属性有:
属性名 | 定义 | 取值 |
---|---|---|
PassengerId | 乘客编号 | 1-891 |
Survived | 生还情况 | 0, 1 |
Pclass | 票的等级 | 1,2,3 |
Name | 乘客姓名 | Braund, Mr. Owen Harris |
Sex | 性别 | male,female |
Age | 年龄 | 数字,有缺失值 |
SibSp | 兄弟姐妹/配偶在船上 | 0-8 |
Parch | 父母/子女在船上 | 0-6 |
Ticket | 船票编号 | A/5 21171 |
Fare | 票价 | 7.25 |
Cabin | 船舱号 | C85,有缺失值 |
Embarked | 登船港 | S,C,Q,有缺失值 |
test.csv缺少Survived字段,也是需要我们预测的
数据预处理
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
预览数据
# Load the training and test sets from the working directory.
train, test = (pd.read_csv(path) for path in ("train.csv", "test.csv"))
# Print dtypes and non-null counts per column to spot missing values.
train.info()
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
定义dummies函数,将某个离散型特征的所有取值变为特征
def dummies(col, train, test):
    """One-hot encode a categorical column in both the training and test frames.

    Each distinct value of ``train[col]`` becomes its own indicator column and
    the original column is removed. The test indicators are aligned to the
    training ones, so both returned frames expose the same dummy columns even
    when a category is absent from, or only present in, the test data.

    Args:
        col: Name of the categorical column to expand.
        train: Training DataFrame containing ``col``.
        test: Test DataFrame containing ``col``.

    Returns:
        A ``(train, test)`` tuple of new DataFrames; the inputs are not
        modified.
    """
    train_dum = pd.get_dummies(train[col])
    # Categories missing from the test set become all-zero columns; categories
    # never seen in training are dropped so both frames keep the same layout.
    test_dum = pd.get_dummies(test[col]).reindex(
        columns=train_dum.columns, fill_value=0)
    train = pd.concat([train, train_dum], axis=1).drop(col, axis=1)
    test = pd.concat([test, test_dum], axis=1).drop(col, axis=1)
    return train, test
# Identifier-like columns are not used as features; remove them from both sets.
dropping = ['PassengerId', 'Name', 'Ticket']
for frame in (train, test):
    frame.drop(dropping, axis=1, inplace=True)
Pclass处理
观察Pclass和survived的关系,等级越高,生还率越大
将Pclass分解为1,2,3三个特征
# Passenger count per ticket class.
print(train['Pclass'].value_counts())
# Plot Survived against Pclass, with the classes in ascending order.
sns.factorplot(x="Pclass", y='Survived', data=train, order=[1, 2, 3])
# Replace Pclass with one indicator column per class value.
train, test = dummies('Pclass', train, test)
3 491
1 216
2 184
Name: Pclass, dtype: int64
Sex处理
观察Sex和Survived的关系,女性生还率显著高于男性
分解Sex为male,female,并删除原特征
# Passenger count per sex, including any missing values.
print(train['Sex'].value_counts(dropna=False))
sns.factorplot(x='Sex', y='Survived', data=train)
# Expand Sex into 'female' / 'male' indicator columns ...
train, test = dummies('Sex', train, test)
# ... and keep only 'female'; 'male' is dropped from both frames.
for frame in (train, test):
    frame.drop('male', axis=1, inplace=True)
male 577
female 314
Name: Sex, dtype: int64
Age处理
处理缺失值,计算平均值和标准差,在(平均值-标准差, 平均值+标准差)范围内随机取整数对缺失值进行填充
观察Age和Survived的关系,在15到30区间对结果影响较大,增加两个特征:under15(Age小于15)和young(Age大于等于15且小于30),删除Age
# Impute missing ages in the training set with random integers drawn between
# (mean - std) and (mean + std) of the observed training ages.
nan_num = train['Age'].isnull().sum()
age_mean = train['Age'].mean()
age_std = train['Age'].std()
filling = np.random.randint(age_mean-age_std, age_mean+age_std, size=nan_num)
# Use .loc so the assignment targets the frame itself; chained indexing
# (train['Age'][mask] = ...) may write to a temporary copy instead.
train.loc[train['Age'].isnull(), 'Age'] = filling
# Re-count the remaining missing ages in the training set.
nan_num = train['Age'].isnull().sum()
# dealing the missing val in test
nan_num = test['Age'].isnull().sum()
# 86 null
# The test set is imputed from its own mean and standard deviation.
age_mean = test['Age'].mean()
age_std = test['Age'].std()
filling = np.random.randint(age_mean-age_std, age_mean+age_std, size=nan_num)
test.loc[test['Age'].isnull(), 'Age'] = filling
# Re-count the remaining missing ages in the test set.
nan_num = test['Age'].isnull().sum()
# Draw kernel density plots of Age on one grid, coloured by Survived value.
s = sns.FacetGrid(train,hue='Survived',aspect=2)
s.map(sns.kdeplot,'Age',shade=True)
# Limit the x axis to the range from 0 to the largest age in the training set.
s.set(xlim=(0,train['Age'].max()))
s.add_legend()
def under15(row):
    """Return 1.0 when the given age is below 15, otherwise 0.0.

    Args:
        row: A single age value (one element of the Age column).

    Returns:
        1.0 or 0.0 as a float indicator.
    """
    return 1.0 if row < 15 else 0.0
def young(row):
    """Return 1.0 when the given age is in the range [15, 30), otherwise 0.0.

    Args:
        row: A single age value (one element of the Age column).

    Returns:
        1.0 or 0.0 as a float indicator.
    """
    return 1.0 if 15 <= row < 30 else 0.0
# Derive the two age-band indicators on both frames, then discard raw Age.
for frame in (train, test):
    frame['under15'] = frame['Age'].apply(under15)
    frame['young'] = frame['Age'].apply(young)
    frame.drop('Age', axis=1, inplace=True)
SibSp和Parch处理
发现两者值越大,生还率越低
生成组合特征family = SibSp+Parch,删除原特征
# Value counts of both family-related columns, including missing values.
print(train.SibSp.value_counts(dropna=False))
print(train.Parch.value_counts(dropna=False))
sns.factorplot('SibSp','Survived',data=train,size=5)
sns.factorplot('Parch','Survived',data=train,size=5)
# Combine SibSp and Parch into a single 'family' feature.
train['family'] = train['SibSp'] + train['Parch']
test['family'] = test['SibSp'] + test['Parch']
sns.factorplot('family','Survived',data=train,size=5)
# The combined feature replaces the two original columns.
train.drop(['SibSp','Parch'],axis=1,inplace=True)
test.drop(['SibSp','Parch'],axis=1,inplace=True)
0 608
1 209
2 28
4 18
3 16
8 7
5 5
Name: SibSp, dtype: int64
0 678
1 118
2 80
5 5
3 5
4 4
6 1
Name: Parch, dtype: int64
Fare处理
票价高的生还率较大,test里有一个缺失值,用中位数填充
# Count missing fares in each set (bare expressions; nothing is printed).
train.Fare.isnull().sum()
test.Fare.isnull().sum()
sns.factorplot('Survived','Fare',data=train,size=4)
# Draw kernel density plots of Fare on one grid, coloured by Survived value.
s = sns.FacetGrid(train,hue='Survived',aspect=2)
s.map(sns.kdeplot,'Fare',shade=True)
s.set(xlim=(0,train['Fare'].max()))
s.add_legend()
# Fill missing test fares with the test-set median. Assigning the result back
# avoids an inplace call on a column selection, which may act on a copy.
test['Fare'] = test['Fare'].fillna(test['Fare'].median())
Cabin处理
缺失值过多,删除该特征
#Cabin
# Number of missing Cabin values in each set.
print(train.Cabin.isnull().sum())
print(test.Cabin.isnull().sum())
# Remove the Cabin column from both frames.
train.drop('Cabin',axis=1,inplace=True)
test.drop('Cabin',axis=1,inplace=True)
687
327
Embarked处理
训练集有两个缺失值,S出现最多,用S进行填充
观察发现C港口的乘客生还率较高,分解Embarked为S, Q, C
删除S,Q,Embarked. 保留C作为新特征
#Embarked
# Number of missing Embarked values in each set, then the value distribution.
print(train.Embarked.isnull().sum())
print(test.Embarked.isnull().sum())
print(train['Embarked'].value_counts(dropna=False))
# Fill missing training values with 'S'. Assigning the result back avoids an
# inplace call on a column selection, which may act on a copy.
train['Embarked'] = train['Embarked'].fillna('S')
sns.factorplot('Embarked','Survived',data=train,size=5)
# Expand Embarked into indicator columns and keep only 'C'.
train,test = dummies('Embarked',train,test)
train.drop(['S','Q'],axis=1,inplace=True)
test.drop(['S','Q'],axis=1,inplace=True)
2
0
S 644
C 168
Q 77
NaN 2
Name: Embarked, dtype: int64
训练模型
模型选择
主要用逻辑回归,随机森林,支持向量机和k近邻
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, KFold
def modeling(clf, ft, target):
    """Cross-validate one classifier and record its mean score.

    Relies on two module-level names that must be set before the call:
    ``kf`` (passed to ``cross_val_score`` as ``cv``) and ``acc_lst`` (the list
    that receives the mean score).

    Args:
        clf: Estimator to evaluate.
        ft: Feature DataFrame.
        target: Target labels aligned with ``ft``.

    Returns:
        None. The mean cross-validation score is appended to ``acc_lst``.
    """
    fold_scores = cross_val_score(clf, ft, target, cv=kf)
    acc_lst.append(fold_scores.mean())
# One entry per experiment; each entry is that experiment's list of mean scores.
accuracy = []


def ml(ft, target, time):
    """Evaluate four classifiers on one feature set and print logistic coefficients.

    The module-level ``acc_lst`` is registered in ``accuracy`` first and then
    filled in place by ``modeling`` in this order: logistic regression, random
    forest, SVC, k-nearest neighbours.

    Args:
        ft: Feature DataFrame for this experiment.
        target: Target labels aligned with ``ft``.
        time: Label of the experiment; not used inside the function.

    Returns:
        None. Scores go to ``acc_lst`` / ``accuracy``; coefficients are printed.
    """
    # Register the list before it is filled: modeling() appends to this same
    # list object, so the entry in `accuracy` sees the scores.
    accuracy.append(acc_lst)

    logreg = LogisticRegression()
    candidates = [
        logreg,
        RandomForestClassifier(n_estimators=50, min_samples_split=4,
                               min_samples_leaf=2),
        SVC(),
        KNeighborsClassifier(n_neighbors=3),
    ]
    for estimator in candidates:
        modeling(estimator, ft, target)

    # see the coefficient: refit logistic regression on all rows of `ft`.
    logreg.fit(ft, target)
    feature = pd.DataFrame(ft.columns)
    feature.columns = ['Features']
    feature["Coefficient Estimate"] = pd.Series(logreg.coef_[0])
    print(feature)
使用不同特征组合方案
1.使用全部特征
#test1
# Experiment 1: use every feature except the target.
train_ft = train.drop('Survived',axis=1)
train_y = train['Survived']
# Three unshuffled folds. random_state is omitted: it has no effect without
# shuffle=True, and newer scikit-learn versions reject that combination.
kf = KFold(n_splits=3)
# modeling() appends into the global acc_lst, so start each run with a new list.
acc_lst = []
ml(train_ft,train_y,'test_1')
Features Coefficient Estimate
0 Fare 0.004240
1 1 0.389135
2 2 -0.211795
3 3 -1.210494
4 female 2.689013
5 under15 1.658023
6 young 0.030681
7 family -0.310545
8 C 0.374100
2.删除young
# testing 2, lose young
train_ft_2=train.drop(['Survived','young'],axis=1)
test_2 = test.drop('young',axis=1)
# ml
# Three unshuffled folds. random_state is omitted: it has no effect without
# shuffle=True, and newer scikit-learn versions reject that combination.
kf = KFold(n_splits=3)
# modeling() appends into the global acc_lst, so start each run with a new list.
acc_lst=[]
ml(train_ft_2,train_y,'test_2')
Features Coefficient Estimate
0 Fare 0.004285
1 1 0.386195
2 2 -0.207867
3 3 -1.202922
4 female 2.690898
5 under15 1.645827
6 family -0.311682
7 C 0.376629
3.删除young,C
#test3, lose young, c
train_ft_3=train.drop(['Survived','young','C'],axis=1)
test_3 = test.drop(['young','C'],axis=1)
# ml
# Three unshuffled folds. random_state is omitted: it has no effect without
# shuffle=True, and newer scikit-learn versions reject that combination.
kf = KFold(n_splits=3)
# modeling() appends into the global acc_lst, so start each run with a new list.
acc_lst = []
ml(train_ft_3,train_y,'test_3')
Features Coefficient Estimate
0 Fare 0.004920
1 1 0.438557
2 2 -0.225821
3 3 -1.194444
4 female 2.694665
5 under15 1.679459
6 family -0.322922
4.删除Fare
# test4, no FARE
train_ft_4=train.drop(['Survived','Fare'],axis=1)
test_4 = test.drop(['Fare'],axis=1)
# ml
# Three unshuffled folds. random_state is omitted: it has no effect without
# shuffle=True, and newer scikit-learn versions reject that combination.
kf = KFold(n_splits=3)
# modeling() appends into the global acc_lst, so start each run with a new list.
acc_lst = []
ml(train_ft_4,train_y,'test_4')
Features Coefficient Estimate
0 1 0.564754
1 2 -0.242384
2 3 -1.287715
3 female 2.699738
4 under15 1.629584
5 young 0.058133
6 family -0.269146
7 C 0.436600
5.删除C
# test5, get rid of c
train_ft_5=train.drop(['Survived','C'],axis=1)
test_5 = test.drop('C',axis=1)
# ml
# Three unshuffled folds. random_state is omitted: it has no effect without
# shuffle=True, and newer scikit-learn versions reject that combination.
kf = KFold(n_splits=3)
# modeling() appends into the global acc_lst, so start each run with a new list.
acc_lst = []
ml(train_ft_5,train_y,'test_5')
Features Coefficient Estimate
0 Fare 0.004841
1 1 0.442430
2 2 -0.232150
3 3 -1.207308
4 female 2.691465
5 under15 1.700077
6 young 0.052091
7 family -0.320831
6.删除Fare和young
# test6, lose Fare and young
train_ft_6=train.drop(['Survived','Fare','young'],axis=1)
test_6 = test.drop(['Fare','young'],axis=1)
# ml
# Three unshuffled folds. random_state is omitted: it has no effect without
# shuffle=True, and newer scikit-learn versions reject that combination.
kf = KFold(n_splits=3)
# modeling() appends into the global acc_lst, so start each run with a new list.
acc_lst = []
ml(train_ft_6,train_y,'test_6')
Features Coefficient Estimate
0 1 0.562814
1 2 -0.235606
2 3 -1.274657
3 female 2.702955
4 under15 1.604597
5 family -0.270284
6 C 0.442288
结果汇总
# Rows are the six experiments; columns are the classifiers in the order
# ml() evaluates them.
accuracy_df = pd.DataFrame(
    accuracy,
    index=['test1', 'test2', 'test3', 'test4', 'test5', 'test6'],
    columns=['logistic', 'rf', 'svc', 'knn'],
)
accuracy_df
确定模型和特征
综合来看,test_4和支持向量机的表现最好,所以用该模型进行预测
# Fit the chosen model on the experiment-4 features (all features except Fare).
svc = SVC()
svc.fit(train_ft_4, train_y)
# PassengerId was dropped from `test` earlier, so re-read it from the raw file.
submission_test = pd.read_csv("test.csv")
svc_pred = svc.predict(test_4)
# Score on the training data itself, not a held-out estimate.
train_score = svc.score(train_ft_4, train_y)
print(train_score)
submission = pd.DataFrame({
    "PassengerId": submission_test['PassengerId'],
    "Survived": svc_pred,
})
submission.to_csv("kaggle_SVC.csv", index=False)
0.832772166105
结果提交
Reference
Titanic: Machine Learning from Disaster
TitanicLearningQI