第2关 填充缺失值
import pandas as pd
import numpy as np
def process_nan_value(data):
    """Handle the missing values in the training features.

    The columns with missing values are ``Age``, ``Cabin`` and ``Embarked``:
    missing ``Age`` entries are filled with the median of the known ages,
    the ``Cabin`` column is dropped entirely, and missing ``Embarked``
    entries are filled with ``'S'``.

    The frame is modified in place and the same object is returned.

    :param data: training-set features, a DataFrame
    :return: the same DataFrame with its missing values handled
    """
    #********* Begin *********#
    # Assign the filled column back rather than calling an inplace method on
    # ``data['Age']``: the chained form operates on an intermediate Series,
    # which pandas warns about and which does not update ``data`` at all
    # when Copy-on-Write is enabled.
    data['Age'] = data['Age'].fillna(data['Age'].median())
    data.drop(labels='Cabin', axis=1, inplace=True)
    data['Embarked'] = data['Embarked'].fillna('S')
    return data
    #********* End *********#
第3关 特征工程与生还预测
import pandas as pd
import numpy as np
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
#********* Begin *********#
# Load the labelled training set from the current working directory.
titanic = pd.read_csv("./train.csv")
def set_missing_ages(df):
    """Fill missing ``Age`` values with random-forest predictions.

    A ``RandomForestRegressor`` is fitted on the rows whose age is known,
    using ``Fare``, ``Parch``, ``SibSp`` and ``Pclass`` as features, and its
    predictions are written into the rows whose age is missing.

    The frame is modified in place and the same object is returned. When no
    age is missing the frame is returned untouched and no model is trained.

    :param df: DataFrame containing the columns ``Age``, ``Fare``, ``Parch``,
        ``SibSp`` and ``Pclass``
    :return: ``df`` with the missing ``Age`` entries filled in
    """
    feature_cols = ['Fare', 'Parch', 'SibSp', 'Pclass']
    age_missing = df['Age'].isnull()

    # Nothing to impute: asking the regressor to predict on zero rows would
    # raise, and fitting 2000 trees would be wasted work anyway.
    if not age_missing.any():
        return df

    # Split the passengers into those with a known age (training data) ...
    known = df.loc[~age_missing]
    X = known[feature_cols].values
    y = known['Age'].values

    rfr = RandomForestRegressor(random_state=0, n_estimators=2000, n_jobs=-1)
    rfr.fit(X, y)

    # ... and those without, whose ages are predicted and written back.
    predicted_ages = rfr.predict(df.loc[age_missing, feature_cols].values)
    df.loc[age_missing, 'Age'] = predicted_ages
    return df
def _encode_features(frame):
    """Return *frame* with one-hot columns for ``Embarked``, ``Sex`` and
    ``Pclass`` appended and the raw categorical / text columns removed."""
    dummies = [
        pd.get_dummies(frame[col], prefix=col)
        for col in ('Embarked', 'Sex', 'Pclass')
    ]
    encoded = pd.concat([frame] + dummies, axis=1)
    return encoded.drop(
        ['Pclass', 'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], axis=1
    )


# Training set: impute ages, encode, then split label from features.
titanic = set_missing_ages(titanic)
df = _encode_features(titanic)
train_label = df['Survived']
# ``axis`` has to be passed by keyword; pandas 2.0 rejects ``drop(label, 1)``.
train_titanic = df.drop('Survived', axis=1)

# Test set: same imputation and encoding as the training set.
titanic_test = pd.read_csv('./test.csv')
titanic_test = set_missing_ages(titanic_test)
df_test = _encode_features(titanic_test)
# The two frames are one-hot encoded independently, so a category present in
# only one of them would give the model a different column set. Align the
# test columns (and their order) to the training columns; a column absent
# from the test frame is filled with 0.
df_test = df_test.reindex(columns=train_titanic.columns, fill_value=0)

# No random_state is set, so the fitted forest (and the predictions) vary
# from run to run.
model = RandomForestClassifier(n_estimators=10)
model.fit(train_titanic, train_label)
predictions = model.predict(df_test)

# Write the predicted labels as a single-column CSV.
result = pd.DataFrame({'Survived': predictions.astype(np.int32)})
result.to_csv("./predict.csv", index=False)
#********* End *********#
结果不是很稳定(随机森林分类器没有固定 random_state),多运行几次,准确率能到 80% 以上