数据集介绍:
PassengerId乘客编号, Survived是否被救, Parch父母子女数量, SibSp兄弟姐妹配偶数, Pclass客舱等级, Embarked登船港口, Ticket票号, Fare票价, Cabin客舱号。Cabin:缺失值很多; Fare票价由客舱等级决定,不必重复分析; Ticket没有参考价值。
Age均值补齐;Cabin删除,因为Pclass和Fare能体现出来;Ticket也删除;Embarked众数补齐。
# Excerpt: missing-value handling and categorical encoding.
# Fill missing Age with each frame's own mean; fill missing test Fare with the test mean.
train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())
# Missing Embarked values in the training frame are filled with 'S'.
train_data['Embarked'] = train_data['Embarked'].fillna('S')
# Drop the columns that are not used further.
train_data = train_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)
test_data = test_data.drop(['Name', 'Ticket', 'Cabin'], axis=1)
# Encode Sex as female=0 / male=1 and Embarked as S=0 / C=1 / Q=2 (train frame).
train_data.loc[train_data['Sex'] == 'female', 'Sex'] = 0
train_data.loc[train_data['Sex'] == 'male', 'Sex'] = 1
train_data.loc[train_data['Embarked'] == 'S', 'Embarked'] = 0
train_data.loc[train_data['Embarked'] == 'C', 'Embarked'] = 1
train_data.loc[train_data['Embarked'] == 'Q', 'Embarked'] = 2
# Same encoding for the test frame.
test_data.loc[test_data['Sex'] == 'female', 'Sex'] = 0
test_data.loc[test_data['Sex'] == 'male', 'Sex'] = 1
test_data.loc[test_data['Embarked'] == 'S', 'Embarked'] = 0
test_data.loc[test_data['Embarked'] == 'C', 'Embarked'] = 1
test_data.loc[test_data['Embarked'] == 'Q', 'Embarked'] = 2
年龄划分几个等级:
# Excerpt: overwrite Age with a band code: 0 for <= 20, 1 for (20, 40], 2 for (40, 60], 3 for > 60.
data.loc[data['Age'] <= 20, 'Age'] = 0
data.loc[(data['Age'] > 20) & (data['Age'] <= 40), 'Age'] = 1
data.loc[(data['Age'] > 40) & (data['Age'] <= 60), 'Age'] = 2
data.loc[data['Age'] > 60, 'Age'] = 3
船费按四分位数划分:
# Excerpt: overwrite Fare with its quartile index (0-3).
# Quantiles are computed over every column of the frame; only the Fare entries are used.
q = data.quantile([0.25, 0.50, 0.75])
# NOTE(review): the codes are written into the same column that the later
# comparisons read, so if a quartile threshold is smaller than an already
# written code (1, 2 or 3) that row can match a later range and be re-labelled.
data.loc[data['Fare'] <= q['Fare'][0.25], 'Fare'] = 0
data.loc[(data['Fare'] > q['Fare'][0.25]) & (data['Fare'] <= q['Fare'][0.50]), 'Fare'] = 1
data.loc[(data['Fare'] > q['Fare'][0.50]) & (data['Fare'] <= q['Fare'][0.75]), 'Fare'] = 2
data.loc[data['Fare'] > q['Fare'][0.75], 'Fare'] = 3
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import pandas as pd
import numpy as np
import seaborn as sns
plt.rcParams['font.sans-serif'] = ['SimHei'] # font used for the Chinese plot labels
plt.rcParams['axes.unicode_minus'] = False # work around the minus sign not rendering
# Basic cleaning: drop unused columns, impute missing values, encode categoricals.
def simple(train_data, test_data):
    """Clean the train and test frames and encode their categorical columns.

    The work is done on copies, so the frames passed in are not modified:

    * ``Name``, ``Ticket`` and ``Cabin`` are dropped from both frames;
    * missing ``Age`` is filled with the mean age of the same frame;
    * missing ``Fare`` in the test frame is filled with the test mean fare;
    * missing ``Embarked`` in the train frame is filled with ``'S'``
      (hard-coded; presumably the most frequent port - verify against the data);
    * ``Sex`` is encoded as female=0 / male=1 and ``Embarked`` as
      S=0 / C=1 / Q=2.

    Values that are not one of the listed categories (including missing
    ``Embarked`` entries in the test frame, which are not imputed) become NaN
    after encoding.

    Args:
        train_data: training DataFrame containing the columns named above.
        test_data: test DataFrame containing the columns named above.

    Returns:
        A ``(train_data, test_data)`` tuple of new, cleaned DataFrames.
    """
    unused_columns = ['Name', 'Ticket', 'Cabin']
    sex_codes = {'female': 0, 'male': 1}
    embarked_codes = {'S': 0, 'C': 1, 'Q': 2}

    # drop() returns new frames, so the assignments below never touch the
    # caller's objects.
    train_data = train_data.drop(unused_columns, axis=1)
    test_data = test_data.drop(unused_columns, axis=1)

    # Missing-value handling
    train_data['Age'] = train_data['Age'].fillna(train_data['Age'].mean())
    test_data['Age'] = test_data['Age'].fillna(test_data['Age'].mean())
    test_data['Fare'] = test_data['Fare'].fillna(test_data['Fare'].mean())
    # print(train_data['Embarked'].value_counts())
    train_data['Embarked'] = train_data['Embarked'].fillna('S')

    # Numeric encoding. map() builds a fresh numeric column instead of writing
    # integers into a string column cell by cell, which would leave the column
    # with a non-numeric dtype (or be rejected by a dedicated string dtype).
    for frame in (train_data, test_data):
        frame['Sex'] = frame['Sex'].map(sex_codes)
        frame['Embarked'] = frame['Embarked'].map(embarked_codes)
    return train_data, test_data
def age_label(data):
    """Replace ``Age`` with an age-band code, modifying ``data`` in place.

    Codes: 0 for age <= 20, 1 for (20, 40], 2 for (40, 60], 3 for age > 60.
    Rows whose age is missing match no band and are left unchanged.

    Args:
        data: DataFrame with a numeric ``Age`` column.

    Returns:
        The same DataFrame object that was passed in.
    """
    column = 'Age'
    # Open-ended lowest band first, then the two bounded bands, then the top.
    data.loc[data[column] <= 20, column] = 0
    for code, (lower, upper) in enumerate(((20, 40), (40, 60)), start=1):
        in_band = data[column].gt(lower) & data[column].le(upper)
        data.loc[in_band, column] = code
    data.loc[data[column] > 60, column] = 3
    return data
# Bucket Fare directly by its quartiles.
def fare_label(data):
    """Replace ``Fare`` with its quartile index, modifying ``data`` in place.

    Codes: 0 for fare <= Q1, 1 for (Q1, Q2], 2 for (Q2, Q3], 3 for fare > Q3,
    where Q1/Q2/Q3 are the 25%/50%/75% quantiles of this frame's ``Fare``
    column. Rows whose fare is missing match no range and are left unchanged.

    Args:
        data: DataFrame with a numeric ``Fare`` column.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Work from a snapshot of the original fares and quantiles of that column
    # only, so the result does not depend on the dtypes of the other columns.
    fare = data['Fare'].copy()
    q1, q2, q3 = fare.quantile([0.25, 0.50, 0.75])

    # Build every mask before writing any code. Writing codes and then
    # comparing the same column again would re-label rows whenever a quartile
    # threshold is smaller than a code (1, 2, 3) that was already written.
    buckets = [
        (fare <= q1, 0),
        ((fare > q1) & (fare <= q2), 1),
        ((fare > q2) & (fare <= q3), 2),
        (fare > q3, 3),
    ]
    for mask, code in buckets:
        data.loc[mask, 'Fare'] = code
    return data
# Visualisations follow.
def survived_pie(train_data):  # overall rescue rate
    """Show a pie chart of how many passengers did and did not survive.

    Args:
        train_data: DataFrame with a ``Survived`` column coded 0 / 1.
    """
    outcome = train_data['Survived']
    # Count each outcome explicitly so every wedge is paired with its own
    # label; value_counts() orders by frequency, which would swap the labels
    # whenever survivors outnumber non-survivors.
    survived_y = [(outcome == 0).sum(), (outcome == 1).sum()]
    survived = ['未被营救', '生存']
    plt.pie(survived_y, labels=survived, autopct='%.2f%%')
    plt.title("整体营救率")
    plt.legend()
    plt.show()
def sex_bar(train_data):
    """Show total and surviving passenger counts per sex as overlaid bars.

    Args:
        train_data: DataFrame with ``Sex`` coded male=1 / female=0 and
            ``Survived`` coded 0 / 1.
    """
    sex_x = ['male', 'female']
    sex_codes = [1, 0]  # same order as sex_x
    sex = train_data['Sex']
    is_survivor = train_data['Survived'] == 1

    # Count per code rather than relying on value_counts(), whose
    # frequency-based order need not match the order of sex_x.
    sex_y = [int((sex == code).sum()) for code in sex_codes]
    sex_survived = [int(((sex == code) & is_survivor).sum()) for code in sex_codes]

    total_bars = plt.bar(sex_x, sex_y, width=0.4, tick_label=sex_x, color='#008B8B')
    survivor_bars = plt.bar(sex_x, sex_survived, width=0.4, tick_label=sex_x, color='#FF6347')
    plt.title("性别和生存")
    total_patch = mpatches.Patch(color='#008B8B', label='总人数')
    survivor_patch = mpatches.Patch(color='#FF6347', label='生存数')
    plt.legend(handles=[total_patch, survivor_patch])

    # Annotate every bar with its height; iterating the bars directly works
    # for any number of categories.
    for bar in (*total_bars, *survivor_bars):
        plt.text(bar.get_x() + bar.get_width() / 2, bar.get_height(), str(bar.get_height()),
                 size=15, ha='center', va='bottom')
    plt.show()
def pclass_bar(train_data):
    """Show age histograms in a grid: one row per ``Pclass``, one column per ``Survived``.

    Args:
        train_data: DataFrame with ``Pclass``, ``Survived`` and ``Age`` columns.
    """
    hist_options = {'alpha': .5, 'bins': 20}
    facets = sns.FacetGrid(train_data, row='Pclass', col='Survived')
    facets.map(plt.hist, 'Age', **hist_options)
    facets.add_legend()
    plt.show()
def embarked_barh(train_data):
    """Show total and surviving passenger counts per embarkation port.

    Draws overlaid horizontal bars and writes each port's survival percentage
    on the chart.

    Args:
        train_data: DataFrame with ``Embarked`` coded S=0 / C=1 / Q=2 and
            ``Survived`` coded 0 / 1.
    """
    embarked_x = ['S', 'C', 'Q']
    port_codes = [0, 1, 2]  # same order as embarked_x
    port = train_data['Embarked']
    is_survivor = train_data['Survived'] == 1

    # Count per code so each bar lines up with its label even when the ports
    # are not in descending order of frequency or a port is absent.
    embarked_y = [int((port == code).sum()) for code in port_codes]
    embarked_survived = [int(((port == code) & is_survivor).sum()) for code in port_codes]

    plt.barh(embarked_x, embarked_y, color='#008B8B')
    plt.barh(embarked_x, embarked_survived, color='#FF6347')
    plt.title("登船口和生存")
    total_patch = mpatches.Patch(color='#008B8B', label='总人数')
    survivor_patch = mpatches.Patch(color='#FF6347', label='生存数')
    plt.legend(handles=[total_patch, survivor_patch])

    for row, (n_survived, n_total) in enumerate(zip(embarked_survived, embarked_y)):
        # A port with no passengers gets 0% instead of a division by zero.
        ratio = round(n_survived / n_total, 2) if n_total else 0.0
        # NOTE(review): the x position is the ratio itself (0..1) on an axis
        # measured in passenger counts, so the label sits at the left edge of
        # the bar - confirm this placement is intended.
        plt.text(ratio, row, str('%.1f' % (ratio * 100)) + '%', fontweight='bold', size=16)
    plt.show()
def age_hist(train_data):
    """Show a histogram of all ages with the survivors' ages overlaid.

    Args:
        train_data: DataFrame with ``Age`` and ``Survived`` (0 / 1) columns.
    """
    ages = train_data['Age']
    # Keep the bin edges of the full histogram and reuse them for the
    # survivors, so both layers use identical bins and the bars line up.
    _, bin_edges, _ = plt.hist(ages, bins=20, color="steelblue", edgecolor="black", label="总人数")
    plt.title('年龄与生存的直方图')
    plt.xlabel('年龄')
    plt.ylabel('人数')
    survivor_ages = train_data.loc[(train_data['Survived'] == 1), 'Age']
    plt.hist(survivor_ages, bins=bin_edges, color="#FF6347", label="生存数")
    plt.legend()
    plt.show()
def fare_hist(train_data):
    """Show a histogram of all fares with the survivors' fares overlaid.

    Args:
        train_data: DataFrame with ``Fare`` and ``Survived`` (0 / 1) columns.
    """
    fare = train_data['Fare']
    # Keep the bin edges of the full histogram and reuse them for the
    # survivors, so both layers use identical bins and the bars line up.
    _, bin_edges, _ = plt.hist(fare, color="steelblue", edgecolor="black", label="总人数", bins=20)
    plt.title('船费与生存的直方图')
    plt.xlabel('船费')
    plt.ylabel('人数')
    survivor_fares = train_data.loc[(train_data['Survived'] == 1), 'Fare']
    plt.hist(survivor_fares, color="#FF6347", label="生存数", bins=bin_edges)
    plt.legend()
    plt.show()
if __name__ == '__main__':
    # Load the raw data sets.
    train_data = pd.read_csv('data/train.csv')
    test_data = pd.read_csv("data/test.csv")
    # pd.set_option('display.max_columns', 20)
    # print(train_data.head(5))
    # DataFrame.info() prints its own report and returns None, so it is called
    # directly; wrapping it in print() would add a stray "None" line.
    train_data.info()
    test_data.info()
    print(train_data.describe())

    # Clean and encode, then plot the encoded training data.
    train_data, test_data = simple(train_data, test_data)
    survived_pie(train_data)
    sex_bar(train_data)
    pclass_bar(train_data)
    embarked_barh(train_data)
    # The age histogram needs the raw ages, so it is drawn before banding.
    age_hist(train_data)
    # Replace ages with band codes.
    train_data = age_label(train_data)
    test_data = age_label(test_data)
    # The fare histogram needs the raw fares, so it is drawn before bucketing.
    fare_hist(train_data)
    # Replace fares with quartile codes.
    train_data = fare_label(train_data)
    test_data = fare_label(test_data)
    # # print(train_data.info())
    # # print(test_data.info())
    # train_data.to_csv('data/train_pre.csv', index=False)
    # test_data.to_csv('data/test_pre.csv', index=False)
    # print("Your PreprocessedData was successfully saved!")