Titanic : Machine Learning from Disaster
链接:GitHub源代码
Question
- 要求你建立一个预测模型来回答这个问题:“什么样的人更有可能生存?”使用乘客数据(如姓名、年龄、性别、社会经济阶层等)。
一、导入数据包和数据集
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
-
重点:在 Kaggle Notebook 上运行时,应该把 pd.read_csv("./kaggle/input/titanic/train.csv") 引号中的第一个 '.' 去掉,即写成 pd.read_csv("/kaggle/input/titanic/train.csv")
- 读入训练集和测试集时都需要这样修改路径
# Read the Kaggle Titanic training and test CSVs from the relative input folder.
train = pd.read_csv("./kaggle/input/titanic/train.csv")
test = pd.read_csv("./kaggle/input/titanic/test.csv")
# Stack both sets under a fresh 0..n-1 index so feature engineering is applied
# to them together; they are separated again later by whether Survived is null.
allData = pd.concat([train, test], ignore_index=True)
# Print column names, non-null counts and dtypes of the training set.
train.info()
二、数据总览
概况
- 输入
train.info()
回车可以查看数据集整体信息
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB
- 输入
train.head()
可以查看数据样例
特征
Variable |
Definition |
Key |
survival |
Survival |
0 = No, 1 = Yes |
pclass |
Ticket class(客舱等级) |
1 = 1st, 2 = 2nd, 3 = 3rd |
sex |
Sex |
|
Age |
Age in years |
|
sibsp |
# of siblings / spouses aboard the Titanic(旁系亲属) |
|
parch |
# of parents / children aboard the Titanic(直系亲属) |
|
ticket |
Ticket number |
|
fare |
Passenger fare |
|
cabin |
Cabin number(客舱编号) |
|
embarked |
Port of Embarkation(上船港口编号) |
C = Cherbourg, Q = Queenstown, S = Southampton |
三、可视化数据分析
性别特征Sex
# Passenger counts per sex, split by survival. The column is passed as x=
# because newer seaborn releases no longer accept it positionally.
sns.countplot(x='Sex', hue='Survived', data=train)
plt.show()
等级特征Pclass
# Mean of Survived (i.e. survival rate) for each ticket class.
sns.barplot(data=train, x='Pclass', y='Survived')
plt.show()
家庭成员数量特征
-
FamilySize = SibSp + Parch + 1(加 1 表示乘客本人)
- 家庭成员数量适中,生存率高
# Family size = siblings/spouses + parents/children + 1
# (the +1 presumably counts the passenger themself).
allData['FamilySize'] = 1 + allData['SibSp'] + allData['Parch']
# Survival rate per family size; rows without a Survived value do not
# contribute to the bar heights.
sns.barplot(data=allData, x='FamilySize', y='Survived')
plt.show()
上船港口特征Embarked
# Passenger counts per embarkation port, split by survival. The column is
# passed as x= because newer seaborn releases no longer accept it positionally.
sns.countplot(x='Embarked', hue='Survived', data=train)
plt.show()
年龄特征Age
# Individual ages scattered per survival outcome (jittered to reduce overlap).
sns.stripplot(data=train, x="Survived", y="Age", jitter=True)
plt.show()

# Age density curves, one per survival outcome, on a shared axis.
# NOTE(review): `shade=` is reported as deprecated in favour of `fill=` in
# recent seaborn releases - confirm against the installed version.
maxAge = train['Age'].max()
facet = sns.FacetGrid(train, hue="Survived", aspect=2)
facet.map(sns.kdeplot, 'Age', shade=True)
facet.set(xlim=(0, maxAge))
facet.add_legend()
plt.xlabel('Age')
plt.ylabel('density')
plt.show()
- 儿童相对于全年龄段有特殊的生存率
-
作者将10及以下视为儿童,设置单独标签
费用特征Fare
# Individual fares scattered per survival outcome (jittered to reduce overlap).
sns.stripplot(data=train, x="Survived", y="Fare", jitter=True)
plt.show()
姓名特征Name
头衔特征Title
# Pull the honorific out of each name: the text after the first comma, up to
# the next period (e.g. "Last, Mr. First" -> "Mr").
allData['Title'] = allData['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
pd.crosstab(allData['Title'], allData['Sex'])

# Raw honorifics grouped into six coarse title categories.
TitleClassification = {
    'Officer': ['Capt', 'Col', 'Major', 'Dr', 'Rev'],
    'Royalty': ['Don', 'Sir', 'the Countess', 'Dona', 'Lady'],
    'Mrs': ['Mme', 'Ms', 'Mrs'],
    'Miss': ['Mlle', 'Miss'],
    'Mr': ['Mr'],
    'Master': ['Master', 'Jonkheer'],
}

# Print how many passengers fall into each category. The per-honorific counts
# are computed once instead of re-running the groupby for every honorific.
titleCounts = allData.groupby(['Title']).size()
for title, rawTitles in TitleClassification.items():
    cnt = 0
    for name in rawTitles:
        cnt += titleCounts[name]
    print(title, ':', cnt)

# Invert the grouping into a raw-honorific -> category lookup and apply it.
# Honorifics that are not in the lookup become NaN.
TitleMap = {}
for title, rawTitles in TitleClassification.items():
    TitleMap.update(dict.fromkeys(rawTitles, title))
allData['Title'] = allData['Title'].map(TitleMap)

# Survival rate per title category.
sns.barplot(x="Title", y="Survived", data=allData)
plt.show()
票号特征Ticket
# Number of passengers (train + test) sharing each ticket number.
TicketCnt = allData.groupby(['Ticket']).size()
# Attach that count to every passenger with a vectorized lookup rather than a
# Python-level lambda per row.
allData['SameTicketNum'] = allData['Ticket'].map(TicketCnt)
# Survival rate per shared-ticket group size.
sns.barplot(x='SameTicketNum', y='Survived', data=allData)
plt.show()
二维/多维分析
二维分析之Pclass & Age
# Age distribution per ticket class, each violin split by survival. x and y
# are passed by keyword because newer seaborn releases reject them positionally.
sns.violinplot(x="Pclass", y="Age", hue="Survived", data=train, split=True)
plt.show()
二维分析之Age & Sex
# One point per passenger: age on x, sex on y, coloured by survival.
sns.swarmplot(data=train, x='Age', y="Sex", hue='Survived')
plt.show()
四、数据清洗 & 异常处理
离散型数据
有可用标签 --> One-Hot编码
- Sex & Pclass & Embarked 都有已经设置好的标签(int或float或string等),可以直接进行get_dummies,拆分成多维向量,增加特征维度
- 其中,Embarked存在一定缺失值,通过对整体的分析,填充上估计值
# One-hot encode Sex and Pclass; the new columns are prefixed with the
# source column name (Sex_*, Pclass_*).
for labelled in ('Sex', 'Pclass'):
    allData = allData.join(pd.get_dummies(allData[labelled], prefix=labelled))

# Inspection only (shown when run as notebook cells): the rows with a missing
# Embarked value, and the mean fare per (Pclass, Embarked) pair.
allData[allData['Embarked'].isnull()]
allData.groupby(by=['Pclass', 'Embarked']).Fare.mean()

# Fill the missing ports with 'C', then one-hot encode as Embarked_*.
allData['Embarked'] = allData['Embarked'].fillna('C')
embarkedDummies = pd.get_dummies(allData['Embarked'], prefix="Embarked")
allData = allData.join(embarkedDummies)
无可用标签 --> 设计标签 --> One-Hot
- FamilySize & Name & Ticket需要对整体数据统一处理,再进行标记
def FamilyLabel(s):
    """Map a family size to an ordinal bucket label.

    Args:
        s: Family size as computed for the FamilySize column.

    Returns:
        4 for size 4; 3 for sizes 2-3; 2 for sizes 1 and 7; 1 for sizes 5-6;
        0 for every other value (sizes below 1 or above 7). Values that match
        no listed size now also return 0 instead of falling through to an
        implicit None.
    """
    if s == 4:
        return 4
    if s in (2, 3):
        return 3
    if s in (1, 7):
        return 2
    if s in (5, 6):
        return 1
    return 0
# Bucket every family size, then expand the bucket into Fam_* indicator columns.
allData['FamilyLabel'] = allData['FamilySize'].apply(FamilyLabel)
familyDummies = pd.get_dummies(allData['FamilyLabel'], prefix="Fam")
allData = allData.join(familyDummies)
# Numeric score assigned to each title category.
TitleLabelMap = {
    'Mr': 1.0,
    'Mrs': 5.0,
    'Miss': 4.5,
    'Master': 2.5,
    'Royalty': 3.5,
    'Officer': 2.0,
}


def TitleLabel(s):
    """Return the numeric score for title category ``s``.

    Raises:
        KeyError: If ``s`` is not a key of ``TitleLabelMap``.
    """
    return TitleLabelMap[s]
# Expand the title category into Title_* indicator columns.
titleDummies = pd.get_dummies(allData['Title'], prefix="Title")
allData = allData.join(titleDummies)
def TicketLabel(s):
    """Map a shared-ticket group size to an ordinal bucket label.

    Args:
        s: Number of passengers sharing the ticket (SameTicketNum).

    Returns:
        3 for sizes 3-4; 2 for sizes 2 and 8; 1 for sizes 1 and 5-7; 0 for
        every other value (sizes below 1 or above 8). Values that match no
        listed size now also return 0 instead of falling through to an
        implicit None.
    """
    if s in (3, 4):
        return 3
    if s in (2, 8):
        return 2
    if s in (1, 5, 6, 7):
        return 1
    return 0
# Bucket every shared-ticket count, then expand the bucket into TicNum_* columns.
allData['TicketLabel'] = allData['SameTicketNum'].apply(TicketLabel)
ticketDummies = pd.get_dummies(allData['TicketLabel'], prefix="TicNum")
allData = allData.join(ticketDummies)
连续型数据
Age & Fare
# Flag children (age <= 10) before Age is standardized. A missing age compares
# False and is therefore flagged 0.
allData['Child'] = (allData['Age'] <= 10).astype(int)

# Standardize Age to zero mean / unit std; missing ages then become 0, i.e.
# the mean. Assign the result back instead of calling fillna(inplace=True) on
# a column selection, which is not guaranteed to modify allData.
allData['Age'] = (allData['Age'] - allData['Age'].mean()) / allData['Age'].std()
allData['Age'] = allData['Age'].fillna(0)

# Missing fares are filled with a fixed value of 25.
allData['Fare'] = allData['Fare'].fillna(25)

# Cap extreme fares (> 500) of the training rows at 300. This must go through
# .loc: the original `allData[mask]['Fare'] = ...` assigned into a temporary
# copy, so the cap was never applied.
isTrainRow = allData['Survived'].notnull()
allData.loc[isTrainRow & (allData['Fare'] > 500), 'Fare'] = 300.0

# Standardize Fare with the mean/std computed once, not once per row.
allData['Fare'] = (allData['Fare'] - allData['Fare'].mean()) / allData['Fare'].std()
清除无用特征
# Drop the raw and intermediate columns that were replaced by engineered
# features ('Title' was listed twice in the original; once is enough).
unusedColumns = [
    'Cabin', 'PassengerId', 'Ticket', 'Name', 'Title', 'Sex', 'SibSp',
    'Parch', 'FamilySize', 'Embarked', 'Pclass', 'FamilyLabel',
    'SameTicketNum', 'TicketLabel',
]
allData.drop(columns=unusedColumns, inplace=True)
重新分割训练集/测试集
- 一开始,为了处理方便,作者将训练集和测试集合并,现在根据Survived是否缺失来将训练集和测试集分开
# Rows with a Survived value form the training set; rows without one form the
# test set, which is re-indexed from 0.
hasLabel = allData['Survived'].notnull()
train_data = allData[hasLabel]
test_data = allData[~hasLabel].reset_index(drop=True)

# Separate the features from the target.
yTrain = train_data['Survived']
xTrain = train_data.drop(columns=['Survived'])
xTest = test_data.drop(columns=['Survived'])
特征相关性分析
- 该步骤用于筛选特征后向程序员反馈,特征是否有效、是否重叠
- 若有问题,可以修改之前的特征方案
# Pairwise correlation of every remaining column (Survived included), drawn
# as an annotated heatmap.
Correlation = allData[allData.columns.to_list()]
corrMatrix = Correlation.astype(float).corr()
colormap = plt.cm.viridis
plt.figure(figsize=(24, 22))
sns.heatmap(
    corrMatrix,
    linewidths=0.1,
    vmax=1.0,
    cmap=colormap,
    linecolor='white',
    annot=True,
    square=True,
)
plt.show()
五、模型建立 & 参数优化
导入模型包
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
网格搜索调试参数
# Pipeline: keep the 10 best-scoring features, then fit a random forest.
selector = SelectKBest(k=10)
forest = RandomForestClassifier(random_state=10, max_features='sqrt')
pipe = Pipeline([('select', selector), ('classify', forest)])

# Search tree count (20..95 in steps of 5) and depth (3..9) with 10-fold
# cross-validation, scored by ROC AUC.
param_test = {
    'classify__n_estimators': list(range(20, 100, 5)),
    'classify__max_depth': list(range(3, 10, 1)),
}
gsearch = GridSearchCV(estimator=pipe, param_grid=param_test, scoring='roc_auc', cv=10)
gsearch.fit(xTrain, yTrain)
print(gsearch.best_params_, gsearch.best_score_)
{'classify__max_depth': 6, 'classify__n_estimators': 70} 0.8790924679681529
建立模型
# Final model with hard-coded hyper-parameters (they match the grid-search
# result printed above: n_estimators=70, max_depth=6).
# NOTE(review): the grid search tuned these inside a pipeline with
# SelectKBest(k=10), but this model is fit on all features - confirm that the
# mismatch is intended.
rfc = RandomForestClassifier(
    n_estimators=70,
    max_depth=6,
    random_state=10,
    max_features='sqrt',
)
rfc.fit(xTrain, yTrain)
导出结果
# Predict the test set and write the PassengerId / Survived pairs (labels
# cast to int64) to my_submission.csv without the index column.
predictions = rfc.predict(xTest)
submissionColumns = {
    'PassengerId': test['PassengerId'],
    'Survived': predictions.astype('int64'),
}
output = pd.DataFrame(submissionColumns)
output.to_csv('my_submission.csv', index=False)
六、提交评分
附:完整代码
- Jupyter Notebook导出为Python Script格式,需要ipynb格式请点击
-
GitHub源代码
import pandas as pd
from pandas import Series, DataFrame
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
# Load the Kaggle Titanic CSVs and stack them under a fresh index so feature
# engineering is applied to both; they are split again later by whether
# Survived is null.
train = pd.read_csv("./kaggle/input/titanic/train.csv")
test = pd.read_csv("./kaggle/input/titanic/test.csv")
allData = pd.concat([train, test], ignore_index=True)
train.head()

# Exploratory plots. Column names are always passed by keyword (x=, y=)
# because newer seaborn releases reject them as positional arguments.
sns.countplot(x="Sex", hue="Survived", data=train)
plt.show()
sns.barplot(x="Pclass", y="Survived", data=train)
plt.show()
sns.violinplot(x="Pclass", y="Age", hue="Survived", data=train, split=True)
plt.show()

# Family size = siblings/spouses + parents/children + 1.
allData["FamilySize"] = allData["SibSp"] + allData["Parch"] + 1
sns.barplot(x="FamilySize", y="Survived", data=allData)
plt.show()
sns.countplot(x="Embarked", hue="Survived", data=train)
plt.show()
sns.stripplot(x="Survived", y="Age", data=train, jitter=True)
plt.show()

# Age density curves, one per survival outcome.
# NOTE(review): `shade=` is reported as deprecated in favour of `fill=` in
# recent seaborn releases - confirm against the installed version.
facet = sns.FacetGrid(train, hue="Survived", aspect=2)
facet.map(sns.kdeplot, "Age", shade=True)
facet.set(xlim=(0, train["Age"].max()))
facet.add_legend()
plt.xlabel("Age")
plt.ylabel("density")
plt.show()
sns.swarmplot(x="Age", y="Sex", data=train, hue="Survived")
plt.show()
sns.stripplot(x="Survived", y="Fare", data=train, jitter=True)
plt.show()
# Pull the honorific out of each name: the text after the first comma, up to
# the next period (e.g. "Last, Mr. First" -> "Mr").
allData["Title"] = allData["Name"].apply(
    lambda x: x.split(",")[1].split(".")[0].strip()
)

# Raw honorifics grouped into six coarse title categories.
TitleClassification = {
    "Officer": ["Capt", "Col", "Major", "Dr", "Rev"],
    "Royalty": ["Don", "Sir", "the Countess", "Dona", "Lady"],
    "Mrs": ["Mme", "Ms", "Mrs"],
    "Miss": ["Mlle", "Miss"],
    "Mr": ["Mr"],
    "Master": ["Master", "Jonkheer"],
}

# Invert the grouping into a raw-honorific -> category lookup. The disabled
# counting code that sat in the loop as a string literal has been removed.
TitleMap = {}
for title, rawTitles in TitleClassification.items():
    TitleMap.update(dict.fromkeys(rawTitles, title))

# Honorifics that are not in the lookup become NaN.
allData["Title"] = allData["Title"].map(TitleMap)
sns.barplot(x="Title", y="Survived", data=allData)
plt.show()

# Number of passengers (train + test) sharing each ticket number, attached to
# every passenger with a vectorized lookup instead of a per-row lambda.
TicketCnt = allData.groupby(["Ticket"]).size()
allData["SameTicketNum"] = allData["Ticket"].map(TicketCnt)
sns.barplot(x="SameTicketNum", y="Survived", data=allData)
plt.show()
# One-hot encode Sex and Pclass as Sex_* / Pclass_* columns.
allData = allData.join(pd.get_dummies(allData["Sex"], prefix="Sex"))
allData = allData.join(pd.get_dummies(allData["Pclass"], prefix="Pclass"))

# Inspection only (no effect outside a notebook): the rows with a missing
# Embarked value, and the mean fare per (Pclass, Embarked) pair.
allData[allData["Embarked"].isnull()]
allData.groupby(by=["Pclass", "Embarked"]).Fare.mean()

# Fill the missing ports with "C", then one-hot encode as Embarked_*.
allData["Embarked"] = allData["Embarked"].fillna("C")
allData = allData.join(pd.get_dummies(allData["Embarked"], prefix="Embarked"))

# Flag children (age <= 10) before Age is standardized. A missing age compares
# False and is therefore flagged 0.
allData["Child"] = (allData["Age"] <= 10).astype(int)

# Standardize Age; missing ages then become 0, i.e. the mean. Assign the
# result back instead of calling fillna(inplace=True) on a column selection,
# which is not guaranteed to modify allData.
allData["Age"] = (allData["Age"] - allData["Age"].mean()) / allData["Age"].std()
allData["Age"] = allData["Age"].fillna(0)

# Missing fares are filled with a fixed value of 25.
allData["Fare"] = allData["Fare"].fillna(25)

# Cap extreme fares (> 500) of the training rows at 300. This must go through
# .loc: the original `allData[mask]["Fare"] = ...` assigned into a temporary
# copy, so the cap was never applied.
isTrainRow = allData["Survived"].notnull()
allData.loc[isTrainRow & (allData["Fare"] > 500), "Fare"] = 300.0

# Standardize Fare with the mean/std computed once, not once per row.
allData["Fare"] = (allData["Fare"] - allData["Fare"].mean()) / allData["Fare"].std()
def FamilyLabel(s):
    """Map a family size to an ordinal bucket label.

    Args:
        s: Family size as computed for the FamilySize column.

    Returns:
        4 for size 4; 3 for sizes 2-3; 2 for sizes 1 and 7; 1 for sizes 5-6;
        0 for every other value (sizes below 1 or above 7). Values that match
        no listed size now also return 0 instead of falling through to an
        implicit None.
    """
    if s == 4:
        return 4
    if s in (2, 3):
        return 3
    if s in (1, 7):
        return 2
    if s in (5, 6):
        return 1
    return 0
# Bucket every family size, then expand the bucket into Fam_* indicator columns.
allData["FamilyLabel"] = allData["FamilySize"].apply(FamilyLabel)
familyDummies = pd.get_dummies(allData["FamilyLabel"], prefix="Fam")
allData = allData.join(familyDummies)
# Numeric score assigned to each title category.
TitleLabelMap = {
    "Mr": 1.0,
    "Mrs": 5.0,
    "Miss": 4.5,
    "Master": 2.5,
    "Royalty": 3.5,
    "Officer": 2.0,
}


def TitleLabel(s):
    """Return the numeric score for title category ``s``.

    Raises:
        KeyError: If ``s`` is not a key of ``TitleLabelMap``.
    """
    return TitleLabelMap[s]
# Expand the title category into Title_* indicator columns.
titleDummies = pd.get_dummies(allData["Title"], prefix="Title")
allData = allData.join(titleDummies)
def TicketLabel(s):
    """Map a shared-ticket group size to an ordinal bucket label.

    Args:
        s: Number of passengers sharing the ticket (SameTicketNum).

    Returns:
        3 for sizes 3-4; 2 for sizes 2 and 8; 1 for sizes 1 and 5-7; 0 for
        every other value (sizes below 1 or above 8). Values that match no
        listed size now also return 0 instead of falling through to an
        implicit None.
    """
    if s in (3, 4):
        return 3
    if s in (2, 8):
        return 2
    if s in (1, 5, 6, 7):
        return 1
    return 0
# Bucket every shared-ticket count, then expand the bucket into TicNum_* columns.
allData["TicketLabel"] = allData["SameTicketNum"].apply(TicketLabel)
ticketDummies = pd.get_dummies(allData["TicketLabel"], prefix="TicNum")
allData = allData.join(ticketDummies)
# Drop the raw and intermediate columns that were replaced by engineered
# features ("Title" was listed twice in the original; once is enough).
unusedColumns = [
    "Cabin",
    "PassengerId",
    "Ticket",
    "Name",
    "Title",
    "Sex",
    "SibSp",
    "Parch",
    "FamilySize",
    "Embarked",
    "Pclass",
    "FamilyLabel",
    "SameTicketNum",
    "TicketLabel",
]
allData.drop(columns=unusedColumns, inplace=True)
# Rows with a Survived value form the training set; rows without one form the
# test set, which is re-indexed from 0.
hasLabel = allData["Survived"].notnull()
train_data = allData[hasLabel]
test_data = allData[~hasLabel].reset_index(drop=True)

# Separate the features from the target.
yTrain = train_data["Survived"]
xTrain = train_data.drop(columns=["Survived"])
xTest = test_data.drop(columns=["Survived"])

# Pairwise correlation of every remaining column (Survived included), drawn
# as an annotated heatmap.
Correlation = allData[allData.columns.to_list()]
corrMatrix = Correlation.astype(float).corr()
colormap = plt.cm.viridis
plt.figure(figsize=(24, 22))
sns.heatmap(
    corrMatrix,
    linewidths=0.1,
    vmax=1.0,
    cmap=colormap,
    linecolor="white",
    annot=True,
    square=True,
)
plt.show()
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest
# Pipeline: keep the 10 best-scoring features, then fit a random forest.
selector = SelectKBest(k=10)
forest = RandomForestClassifier(random_state=10, max_features="sqrt")
pipe = Pipeline([("select", selector), ("classify", forest)])

# Search tree count (20..95 in steps of 5) and depth (3..9) with 10-fold
# cross-validation, scored by ROC AUC.
param_test = {
    "classify__n_estimators": list(range(20, 100, 5)),
    "classify__max_depth": list(range(3, 10, 1)),
}
gsearch = GridSearchCV(estimator=pipe, param_grid=param_test, scoring="roc_auc", cv=10)
gsearch.fit(xTrain, yTrain)
print(gsearch.best_params_, gsearch.best_score_)
# Final model with hard-coded hyper-parameters (presumably the best values
# reported by the grid search above - confirm after re-running it).
# NOTE(review): the grid search tuned these inside a pipeline with
# SelectKBest(k=10), but this model is fit on all features - confirm that the
# mismatch is intended.
rfc = RandomForestClassifier(
    n_estimators=70,
    max_depth=6,
    random_state=10,
    max_features="sqrt",
)
rfc.fit(xTrain, yTrain)

# Predict the test set and write the PassengerId / Survived pairs (labels
# cast to int64) to my_submission.csv without the index column.
predictions = rfc.predict(xTest)
submissionColumns = {
    "PassengerId": test["PassengerId"],
    "Survived": predictions.astype("int64"),
}
output = pd.DataFrame(submissionColumns)
output.to_csv("my_submission.csv", index=False)
链接:GitHub源代码