import pandas as pd
# Load the Titanic passenger records from the CSV file in the working directory.
df = pd.read_csv('Titanic.csv')
# Print column names, non-null counts and dtypes to spot missing values.
df.info()
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId 891 non-null int64
Survived 891 non-null int64
Pclass 891 non-null int64
Name 891 non-null object
Sex 891 non-null object
Age 714 non-null float64
SibSp 891 non-null int64
Parch 891 non-null int64
Ticket 891 non-null object
Fare 891 non-null float64
Cabin 204 non-null object
Embarked 889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 66.2+ KB
(1) 年龄有177个缺失(891 - 714 = 177);
(2) 舱位编号大量缺失;
(3) 登船口岸有2个缺失
返回顶部
# Preview the first five rows of the raw data.
df.head(5)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
符号意义:
#PassengerId:乘客ID编号
#Survived:是否存活,0-未存活,1-存活
#Pclass:客舱等级,共1、2、3三个等级
#Name:乘客姓名
#Sex:乘客性别,male、female
#Age:乘客年龄
#SibSp:兄弟姐妹/配偶数量,0~8
#Parch:父母/子女数量,0~6
#Ticket:船票编号
#Fare:票价
#Cabin:舱位编号
#Embarked:登船口岸,C、Q、S
返回顶部
# Count passengers for every (Sex, Survived) combination; the result is a
# Series with a two-level index.
surv_sex_result = (
    df[['Sex', 'Survived']]
    .groupby(['Sex', 'Survived'])
    .size()
)
surv_sex_result
Sex Survived
female 0 81
1 233
male 0 468
1 109
dtype: int64
# Flatten the two-level Series into a dict keyed by (sex, survived) tuples.
dct_surv_sex = surv_sex_result.to_dict()
dct_surv_sex
{('female', 0): 81, ('female', 1): 233, ('male', 0): 468, ('male', 1): 109}
# Survival rate of female passengers: surviving women / all women on board.
female_sum = 0   # every female passenger
female_surv = 0  # female passengers who survived
for (sex, survived), count in dct_surv_sex.items():
    if sex != 'female':
        continue
    female_sum += count
    if survived == 1:
        female_surv = count
# Guard the division so data without female passengers yields NaN instead of
# raising ZeroDivisionError.
female_surv_rate = round(female_surv / female_sum, 2) if female_sum else float('nan')
female_surv_rate
0.74
# Survival rate of male passengers: surviving men / all men on board.
male_sum = 0   # every male passenger
male_surv = 0  # male passengers who survived
for (sex, survived), count in dct_surv_sex.items():
    if sex != 'male':
        continue
    male_sum += count
    if survived == 1:
        male_surv = count
# Guard the division so data without male passengers yields NaN instead of
# raising ZeroDivisionError.
male_surv_rate = round(male_surv / male_sum, 2) if male_sum else float('nan')
male_surv_rate
0.19
女性获救率74%远大于男性19%
返回顶部
# Spread between the most and the least expensive ticket, to two decimals.
round(df['Fare'].max() - df['Fare'].min(), 2)
512.33
从票价上看,贫富差异很大
返回顶部
# Work on a deep copy so the helper column is not added to the original frame.
df_sib = df.copy(deep=True)
# 'Y' when the passenger has at least one sibling/spouse or parent/child
# aboard (SibSp > 0 or Parch > 0), otherwise 'N'.
df_sib['sib_or_parch'] = ((df['SibSp'] > 0) | (df['Parch'] > 0)).map({True: 'Y', False: 'N'})
df_sib.head(5)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | sib_or_parch | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | Y |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | Y |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | N |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | Y |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | N |
有无亲戚的获救情况:
# Count passengers for every (sib_or_parch, Survived) combination, then turn
# the two-level Series into a dict keyed by (flag, survived) tuples.
surv_sib_result = (
    df_sib[['Survived', 'sib_or_parch']]
    .groupby(['sib_or_parch', 'Survived'])
    .size()
)
dct_surv_sib = surv_sib_result.to_dict()
dct_surv_sib
{('N', 0): 374, ('N', 1): 163, ('Y', 0): 175, ('Y', 1): 179}
# Survival rate of passengers travelling with relatives (flag 'Y').
sib_sum = 0   # every passenger with relatives aboard
sib_surv = 0  # those of them who survived
for (has_relatives, survived), count in dct_surv_sib.items():
    if has_relatives != 'Y':
        continue
    sib_sum += count
    if survived == 1:
        sib_surv = count
# Guard the division so data without such passengers yields NaN instead of
# raising ZeroDivisionError.
sib_surv_rate = round(sib_surv / sib_sum, 2) if sib_sum else float('nan')
sib_surv_rate
0.51
# Survival rate of passengers travelling without relatives (flag 'N').
nosib_sum = 0   # every passenger without relatives aboard
nosib_surv = 0  # those of them who survived
for (has_relatives, survived), count in dct_surv_sib.items():
    if has_relatives != 'N':
        continue
    nosib_sum += count
    if survived == 1:
        nosib_surv = count
# Guard the division so data without such passengers yields NaN instead of
# raising ZeroDivisionError.
nosib_surv_rate = round(nosib_surv / nosib_sum, 2) if nosib_sum else float('nan')
nosib_surv_rate
0.3
有亲戚在船上的获救率51%高于无亲戚的获救率30%
返回顶部
# Count passengers for every (Embarked, Sex) combination.
embark_sex = (
    df[['Embarked', 'Sex']]
    .groupby(['Embarked', 'Sex'])
    .size()
)
embark_sex
Embarked Sex
C female 73
male 95
Q female 36
male 41
S female 203
male 441
dtype: int64
(登船地点的票价中位数)
返回顶部
# Median fare per port of embarkation.
embark_fare = (
    df[['Embarked', 'Fare']]
    .groupby('Embarked')
    .median()
)
embark_fare
Fare | |
---|---|
Embarked | |
C | 29.70 |
Q | 7.75 |
S | 13.00 |
# Highest fare per port of embarkation.
embark_fare_max = (
    df[['Embarked', 'Fare']]
    .groupby('Embarked')
    .max()
)
embark_fare_max
Fare | |
---|---|
Embarked | |
C | 512.3292 |
Q | 90.0000 |
S | 263.0000 |
C地最富, S地次之, Q地最穷
返回顶部
按不同年龄规约
children:6岁及以下;
teen:6~16岁;
youth:16~36岁;
middle:36~60岁;
old:大于60岁
# Label every passenger with an age bracket in a new 'age_scope' column.
# Rows whose Age is missing match no bracket and therefore stay NaN.
df_age = df.copy(deep=True)
# (label, exclusive lower bound, inclusive upper bound)
for label, low, high in (
    ('children', 0, 6),
    ('teen', 6, 16),
    ('youth', 16, 36),
    ('middle', 36, 60),
):
    df_age.loc[(df_age['Age'] <= high) & (df_age['Age'] > low), 'age_scope'] = label
# The oldest bracket has no upper bound.
df_age.loc[df_age['Age'] > 60, 'age_scope'] = 'old'
df_age
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | age_scope | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S | youth |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C | middle |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S | youth |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S | youth |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S | youth |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.0000 | NaN | S | youth |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.0000 | B42 | S | youth |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | S | NaN |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.0000 | C148 | C | youth |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.7500 | NaN | Q | youth |
891 rows × 13 columns
# Passenger counts per (age_scope, Survived) pair, as a dict keyed by tuples.
age_surv = dict(
    df_age[['age_scope', 'Survived']]
    .groupby(['age_scope', 'Survived'])
    .size()
)
age_surv
{('children', 0): 14,
('children', 1): 33,
('middle', 0): 106,
('middle', 1): 67,
('old', 0): 17,
('old', 1): 5,
('teen', 0): 31,
('teen', 1): 22,
('youth', 0): 256,
('youth', 1): 163}
# Survival rate per age bracket, derived from the (age_scope, survived) counts.
age_surv_dct = {}
age_set = {scope for scope, _ in age_surv}
# Iterate in sorted order: the iteration order of a set of strings can differ
# between interpreter runs, which would shuffle the resulting dict (and any
# chart built from it).
for ast in sorted(age_set):
    total = 0  # named `total` so the builtin sum() is not shadowed
    surv = 0
    for (scope, survived), count in age_surv.items():
        if scope != ast:
            continue
        total += count
        if survived == 1:
            surv += count
    age_surv_dct[ast] = round(surv / total, 2)
age_surv_dct
{'teen': 0.42, 'old': 0.23, 'children': 0.7, 'youth': 0.39, 'middle': 0.39}
from pyecharts.charts import Bar
from pyecharts.commons.utils import JsCode
from pyecharts import options as opts
def bar_border_radius(dct):
    """Build a bar chart with rounded, gradient-filled bars titled by age bracket.

    Args:
        dct: Mapping whose keys become the x-axis categories and whose values
            become the bar heights of the "获救率" (survival rate) series.

    Returns:
        The configured pyecharts ``Bar`` chart; rendering is left to the caller.
    """
    # Colour gradient between two blue tones, evaluated by ECharts on the page.
    gradient = JsCode("""new echarts.graphic.LinearGradient(0, 0, 0, 1, [{
offset: 0,
color: 'rgba(0, 244, 255, 1)'
}, {
offset: 1,
color: 'rgba(0, 77, 167, 1)'
}], false)""")
    bar_style = {
        "normal": {
            "color": gradient,
            "barBorderRadius": [30, 30, 30, 30],
            "shadowColor": 'rgb(0, 160, 221)',
        }
    }
    chart = Bar()
    chart.add_xaxis(list(dct))
    chart.add_yaxis("获救率", list(dct.values()), category_gap="60%")
    chart.set_series_opts(itemstyle_opts=bar_style)
    chart.set_global_opts(title_opts=opts.TitleOpts(title="不同年龄段获救率"))
    return chart
# Draw the per-age-bracket survival rates inline in the notebook.
bar_border_radius(age_surv_dct).render_notebook()
小孩获救率最高,老人获救率最低,其他年龄段获救率在40%左右
返回顶部
# Passenger counts per (Pclass, Survived) pair, as a dict keyed by tuples.
pcls_surv = dict(
    df[['Pclass', 'Survived']]
    .groupby(['Pclass', 'Survived'])
    .size()
)
pcls_surv
{(1, 0): 80, (1, 1): 136, (2, 0): 97, (2, 1): 87, (3, 0): 372, (3, 1): 119}
# Survival rate per cabin class, derived from the (pclass, survived) counts.
pcls_surv_dct = {}
pcls_set = {pclass for pclass, _ in pcls_surv}
# Sorted so the classes always appear in ascending order in the result.
for pst in sorted(pcls_set):
    total = 0  # named `total` so the builtin sum() is not shadowed
    surv = 0
    for (pclass, survived), count in pcls_surv.items():
        if pclass != pst:
            continue
        total += count
        if survived == 1:
            surv += count
    pcls_surv_dct[pst] = round(surv / total, 2)
pcls_surv_dct
{1: 0.63, 2: 0.47, 3: 0.24}
def bar_reversal_axis(dct) -> Bar:
    """Build a bar chart with swapped axes, titled by cabin class.

    Args:
        dct: Mapping whose keys become the categories and whose values become
            the bar lengths of the "获救率" (survival rate) series.

    Returns:
        The configured pyecharts ``Bar`` chart; rendering is left to the caller.
    """
    chart = Bar()
    chart.add_xaxis(list(dct))
    chart.add_yaxis("获救率", list(dct.values()))
    # Swap the x and y axes, then put each value label to the right of its bar.
    chart.reversal_axis()
    chart.set_series_opts(label_opts=opts.LabelOpts(position="right"))
    chart.set_global_opts(title_opts=opts.TitleOpts(title="不同等级客舱获救率"))
    return chart
# Draw the per-class survival rates inline in the notebook.
bar_reversal_axis(pcls_surv_dct).render_notebook()
不同等级客舱的获救率差别很大,1级客舱更便于逃生
返回顶部
# Search the passenger names for 'Jack'; the last two search terms are said to
# be the real-life inspirations for the character.
df.loc[df['Name'].str.contains('Jack|Emilio Portaluppi|Dawson')]
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
766 | 767 | 0 | 1 | Brewe, Dr. Arthur Jackson | male | NaN | 0 | 0 | 112379 | 39.6 | NaN | C |
# List every passenger whose name contains the substring 'Rose'.
df.loc[df['Name'].str.contains('Rose')]
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
855 | 856 | 1 | 3 | Aks, Mrs. Sam (Leah Rosen) | female | 18.0 | 0 | 1 | 392091 | 9.35 | NaN | S |
船上没有Jack和Rose,只有他们的故事
返回顶部
和年龄有关系的参数:Survived、Pclass、Name(Mr、Mrs、Miss、Other)、SibSp、Parch、Fare、Embarked
# Encode the categorical columns of df_age as numbers so they can be used as
# regression features.

# Sex: male -> 1, female -> 2.
df_age['Sex'] = df_age['Sex'].map({'male': 1, 'female': 2})


def _title_code(name):
    """Return the numeric code of the title found in a passenger name.

    'Mrs.' -> 1, 'Mr.' -> 2, 'Miss.' -> 3, anything else -> 4.
    """
    # Plain substring tests, so the trailing dot is matched literally instead
    # of acting as a regex wildcard.
    for code, title in enumerate(('Mrs.', 'Mr.', 'Miss.'), start=1):
        if title in name:
            return code
    return 4


# Name: replaced by its title code in a single pass.
df_age['Name'] = df_age['Name'].map(_title_code)

# Embarked: C -> 1, S -> 2, Q -> 3. A missing port stays NaN, which makes the
# column float-typed.
df_age['Embarked'] = df_age['Embarked'].map({'C': 1, 'S': 2, 'Q': 3})
df_age.head(5)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | age_scope | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | 2 | 1 | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | 2 | youth |
1 | 2 | 1 | 1 | 1 | 2 | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | 1 | middle |
2 | 3 | 1 | 3 | 3 | 2 | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | 2 | youth |
3 | 4 | 1 | 1 | 1 | 2 | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | 2 | youth |
4 | 5 | 0 | 3 | 2 | 1 | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | 2 | youth |
# Check the encoding on the last five rows as well.
df_age.tail(5)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | age_scope | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
886 | 887 | 0 | 2 | 4 | 1 | 27.0 | 0 | 0 | 211536 | 13.00 | NaN | 2 | youth |
887 | 888 | 1 | 1 | 3 | 2 | 19.0 | 0 | 0 | 112053 | 30.00 | B42 | 2 | youth |
888 | 889 | 0 | 3 | 3 | 2 | NaN | 1 | 2 | W./C. 6607 | 23.45 | NaN | 2 | NaN |
889 | 890 | 1 | 1 | 2 | 1 | 26.0 | 0 | 0 | 111369 | 30.00 | C148 | 1 | youth |
890 | 891 | 0 | 3 | 2 | 1 | 32.0 | 0 | 0 | 370376 | 7.75 | NaN | 3 | youth |
from sklearn.linear_model import LinearRegression
try:
    # joblib is a standalone package; the copy bundled as
    # sklearn.externals.joblib was removed in scikit-learn 0.23.
    import joblib
except ImportError:
    # Fallback for old scikit-learn installs without the standalone package.
    from sklearn.externals import joblib

# Feature columns used to predict a passenger's age.
train_columns = ['Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch', 'Fare', 'Embarked']
# Train only on rows that have both a known age and an encoded port.
df_train = df_age[(df_age['Age'] > 0) & (df_age['Embarked'] > 0)]
x_train = df_train.loc[:, train_columns]
y_target = df_train['Age']
lr = LinearRegression()
lr.fit(x_train, y_target)
# Persist the fitted model to disk, then load it back from the saved file.
joblib.dump(lr, './lr.pkl')
lr = joblib.load('./lr.pkl')
# Rows whose age is missing but whose port is encoded: the rows to predict.
df_test = df_age.loc[df_age['Age'].isna() & (df_age['Embarked'] > 0)]
df_test
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | age_scope | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
5 | 6 | 0 | 3 | 2 | 1 | NaN | 0 | 0 | 330877 | 8.4583 | NaN | 3 | NaN |
17 | 18 | 1 | 2 | 2 | 1 | NaN | 0 | 0 | 244373 | 13.0000 | NaN | 2 | NaN |
19 | 20 | 1 | 3 | 1 | 2 | NaN | 0 | 0 | 2649 | 7.2250 | NaN | 1 | NaN |
26 | 27 | 0 | 3 | 2 | 1 | NaN | 0 | 0 | 2631 | 7.2250 | NaN | 1 | NaN |
28 | 29 | 1 | 3 | 3 | 2 | NaN | 0 | 0 | 330959 | 7.8792 | NaN | 3 | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
859 | 860 | 0 | 3 | 2 | 1 | NaN | 0 | 0 | 2629 | 7.2292 | NaN | 1 | NaN |
863 | 864 | 0 | 3 | 3 | 2 | NaN | 8 | 2 | CA. 2343 | 69.5500 | NaN | 2 | NaN |
868 | 869 | 0 | 3 | 2 | 1 | NaN | 0 | 0 | 345777 | 9.5000 | NaN | 2 | NaN |
878 | 879 | 0 | 3 | 2 | 1 | NaN | 0 | 0 | 349217 | 7.8958 | NaN | 2 | NaN |
888 | 889 | 0 | 3 | 3 | 2 | NaN | 1 | 2 | W./C. 6607 | 23.4500 | NaN | 2 | NaN |
177 rows × 13 columns
# Fill every missing age in df with the model's prediction, limited to [0, 80].
if len(df_test) > 0:  # predict() rejects an empty feature matrix
    # One batched call instead of several predict() calls per row.
    age_pred = lr.predict(df_test.loc[:, train_columns])
    for passenger_id, raw_age in zip(df_test['PassengerId'], age_pred):
        # Apply both bounds together: below 0 becomes 0, above 80 becomes 80.
        age = round(min(max(float(raw_age), 0.0), 80.0), 1)
        # Write back by passenger id, since df_test was taken from a copy of df.
        df.loc[df['PassengerId'] == passenger_id, 'Age'] = age
# Inspect the first five rows after the ages were filled in.
df.head(5)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22.0 | 1 | 0 | A/5 21171 | 7.2500 | NaN | S |
1 | 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Th... | female | 38.0 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
2 | 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26.0 | 0 | 0 | STON/O2. 3101282 | 7.9250 | NaN | S |
3 | 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35.0 | 1 | 0 | 113803 | 53.1000 | C123 | S |
4 | 5 | 0 | 3 | Allen, Mr. William Henry | male | 35.0 | 0 | 0 | 373450 | 8.0500 | NaN | S |
# Inspect the last five rows after the ages were filled in.
df.tail(5)
PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
886 | 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27.0 | 0 | 0 | 211536 | 13.00 | NaN | S |
887 | 888 | 1 | 1 | Graham, Miss. Margaret Edith | female | 19.0 | 0 | 0 | 112053 | 30.00 | B42 | S |
888 | 889 | 0 | 3 | Johnston, Miss. Catherine Helen "Carrie" | female | 18.2 | 1 | 2 | W./C. 6607 | 23.45 | NaN | S |
889 | 890 | 1 | 1 | Behr, Mr. Karl Howell | male | 26.0 | 0 | 0 | 111369 | 30.00 | C148 | C |
890 | 891 | 0 | 3 | Dooley, Mr. Patrick | male | 32.0 | 0 | 0 | 370376 | 7.75 | NaN | Q |
# Median of the Age column.
df['Age'].median()
29.0
(1) Titanic号是一条“绅士船”,遇难后妇女儿童的获救率更高。
(2) Titanic号上贫富差距也很大,1等舱的获救率更高。
(3) Rose和Jack不在船上,船上只有他们的传说。
(1) 对层次索引,求构造指标(如获救率等)。
(2) 线性回归预测缺失值,并填入表格。
欢迎关注,敬请点赞!
返回顶部