泰坦尼克号海难幸存状况分析

matplotlib可视化练习

%matplotlib inline
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

练习4:泰坦尼克号海难幸存状况分析

  • 不同舱位等级中幸存和遇难的乘客比例(堆积柱状图)
  • 不同性别的幸存比例(堆积柱状图)
  • 幸存和遇难乘客的票价分布(分类箱式图)
  • 幸存和遇难乘客的年龄分布(分类箱式图)
  • 不同上船港口的乘客舱位等级分布(分组柱状图)
  • 幸存和遇难乘客兄弟姐妹及配偶的数量分布(分类箱式图)
  • 幸存和遇难乘客父母子女的数量分布(分类箱式图)
  • 单独乘船与否和幸存之间有没有联系(堆积柱状图或者分组柱状图)
# Load seaborn's "titanic" example dataset and preview its first rows.
data = sns.load_dataset("titanic")
data.head()
# Columns, in order: survived flag, passenger class, sex, age, siblings/spouses aboard (sibsp),
# parents/children aboard (parch), fare, embarkation port code, class label, passenger category,
# adult-male flag, deck, embarkation town, alive label, travelling-alone flag
泰坦尼克号海难幸存状况分析_第1张图片
Paste_Image.png
# Count passengers for every (survived, pclass) pair, then pivot the
# `survived` level into columns: one row per class, one column per outcome.
pclass_counts = data.groupby(['survived', 'pclass']).size()
survived_pclass = pclass_counts.unstack(level='survived')
survived_pclass
泰坦尼克号海难幸存状况分析_第2张图片
Paste_Image.png
# Per-class head count plus the share of each outcome within the class
# (column 1 feeds 'yes_prop', column 0 feeds 'no_prop').
pclass_died, pclass_lived = survived_pclass[0], survived_pclass[1]
survived_pclass['total'] = pclass_died + pclass_lived
survived_pclass['yes_prop'] = pclass_lived / survived_pclass['total']
survived_pclass['no_prop'] = pclass_died / survived_pclass['total']
survived_pclass
泰坦尼克号海难幸存状况分析_第3张图片
Paste_Image.png
# Draw a stacked bar chart
def stackedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw a stacked bar chart on a new figure.

    Args:
        x_data: x positions (or categories) shared by every series.
        y_data_list: sequence of series; each holds one height per x position.
        y_data_names: legend label for each series, indexed like y_data_list.
        colors: bar colour for each series, indexed like y_data_list.
        x_label: label for the x axis.
        y_label: label for the y axis.
        title: axes title.

    The heights are drawn as given; no normalisation happens here. Pass
    proportions if every stack should add up to 1.
    """
    _, ax = plt.subplots()
    # Running top of the stack at each x position. Every series must start
    # where ALL previous series end, not just where the previous one's own
    # height is, otherwise three or more series overlap instead of stacking.
    stack_top = None
    for i, y_data in enumerate(y_data_list):
        heights = np.asarray(y_data, dtype=float)
        if stack_top is None:
            stack_top = np.zeros_like(heights)
        ax.bar(x_data, heights, color=colors[i], bottom=stack_top,
               align='center', label=y_data_names[i])
        stack_top = stack_top + heights
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='upper right')  # legend position

# Stacked bars: share of each outcome within every passenger class.
pclass_shares = [survived_pclass['yes_prop'], survived_pclass['no_prop']]
stackedbarplot(
    x_data=survived_pclass.index.values,
    y_data_list=pclass_shares,
    y_data_names=['Survived', 'Not survived'],
    colors=['#539caf', '#7663b0'],
    x_label='pclass',
    y_label='Survival rate',
    title='pclass and Survival Rate',
)
泰坦尼克号海难幸存状况分析_第4张图片
output_60_0.png

不同性别幸存比例

# Passenger counts per (sex, survived) pair, pivoted so each sex is a row
# and each outcome is a column.
sex_counts = data.groupby(['sex', 'survived']).size()
sex_survived = sex_counts.unstack(level=-1)
sex_survived
泰坦尼克号海难幸存状况分析_第5张图片
Paste_Image.png
# Per-sex head count plus the share of each outcome within that sex.
sex_died, sex_lived = sex_survived[0], sex_survived[1]
sex_survived['sum'] = sex_died + sex_lived
sex_survived['yes_prop'] = sex_lived / sex_survived['sum']
sex_survived['no_prop'] = sex_died / sex_survived['sum']
sex_survived
泰坦尼克号海难幸存状况分析_第6张图片
Paste_Image.png
# Stacked bars: share of each outcome within every sex.
sex_positions = [0, 1]
stackedbarplot(
    x_data=sex_positions,
    y_data_list=[sex_survived['yes_prop'], sex_survived['no_prop']],
    y_data_names=['Survived', 'Not survived'],
    colors=['#539caf', '#7663b0'],
    x_label='Sex',
    y_label='Survival rate',
    title='Sex and Survival Rate',
)

# The bars sit at bare positions 0 and 1; put the category names from the
# table's index under them so the reader can tell which bar is which.
ax = plt.gca()
ax.set_xticks(sex_positions)
ax.set_xticklabels(sex_survived.index.values)
泰坦尼克号海难幸存状况分析_第7张图片
output_64_0.png

幸存和遇难乘客的票价分布(分类箱式图)

def boxplot(x_data, y_data, base_color, median_color, x_label, y_label, title):
    """Draw one filled box per series in y_data on a new figure.

    Args:
        x_data: tick labels, one per box, in the same order as y_data.
        y_data: sequence of series; each becomes one box.
        base_color: colour of the median line, the box outline and the caps.
        median_color: colour of the box fill and the whiskers.
        x_label: label for the x axis.
        y_label: label for the y axis.
        title: axes title.

    NOTE(review): the two colour parameters are applied the other way round
    from what their names suggest (the median line uses base_color).
    """
    _, ax = plt.subplots()
    median_style = dict(color=base_color)
    box_style = dict(color=base_color, facecolor=median_color)  # edge / fill
    whisker_style = dict(color=median_color)
    cap_style = dict(color=base_color)
    ax.boxplot(
        y_data,
        patch_artist=True,  # boxes are patches, so they can be filled
        medianprops=median_style,
        boxprops=box_style,
        whiskerprops=whisker_style,
        capprops=cap_style,
    )
    # One tick label per box, taken from x_data
    ax.set_xticklabels(x_data)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
# Fare distribution, one box per value of the `survived` column.
survived = data['survived'].unique()
bp_data = [
    data.loc[data['survived'] == outcome, 'fare']
    for outcome in (survived[0], survived[1])
]

boxplot(
    x_data=survived,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='survived',
    y_label='fare',
    title='Distribution of Fare By Survived',
)
泰坦尼克号海难幸存状况分析_第8张图片
output_67_0.png

幸存和遇难乘客的年龄分布(分类箱式图)

# Age distribution, one box per value of the `survived` column.
# Missing ages are dropped for this plot rather than filled with 0: a
# zero-filled age would be drawn as a real newborn and pull the quartiles
# down, and filling in place would also change `data` for every later cell.
survived = data['survived'].unique()
bp_data = [
    data.loc[data['survived'] == outcome, 'age'].dropna()
    for outcome in (survived[0], survived[1])
]

boxplot(
    x_data=survived,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='survived',
    y_label='age',
    title='Distribution of Age By Survived',
)
泰坦尼克号海难幸存状况分析_第9张图片
output_69_0.png

不同上船港口的乘客舱位等级分布(分组柱状图)

# Passenger counts per (embark_town, pclass) pair: one row per town, one
# column per class; combinations with no passengers become 0.
town_class_counts = data.groupby(['embark_town', 'pclass']).size()
embark_pclass = town_class_counts.unstack().fillna(0)
embark_pclass
泰坦尼克号海难幸存状况分析_第10张图片
Paste_Image.png
# One series per class column (the first three columns), each indexed by town.
pclass_list = [embark_pclass.iloc[:, col] for col in range(3)]
pclass_list
[embark_town
 Cherbourg       85
 Queenstown       2
 Southampton    127
 Name: 1, dtype: int64, embark_town
 Cherbourg       17
 Queenstown       3
 Southampton    164
 Name: 2, dtype: int64, embark_town
 Cherbourg       66
 Queenstown      72
 Southampton    353
 Name: 3, dtype: int64]
# Draw a grouped (side-by-side) bar chart
def groupedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw a grouped bar chart on a new figure.

    Args:
        x_data: numeric centre position of each group.
        y_data_list: sequence of series; each holds one height per group.
        y_data_names: legend label for each series, indexed like y_data_list.
        colors: bar colour for each series, indexed like y_data_list.
        x_label: label for the x axis.
        y_label: label for the y axis.
        title: axes title.

    With an empty y_data_list only the labelled, empty axes are drawn.
    """
    _, ax = plt.subplots()
    # Width taken up by one whole group of bars
    total_width = 0.8
    n_series = len(y_data_list)
    if n_series:
        # Width of a single bar inside a group
        ind_width = total_width / n_series
        # Centre offset of every bar relative to its group centre. Built from
        # integer steps so the number of offsets is always exactly n_series;
        # a float-step arange can gain or lose an element through rounding.
        alteration = (np.arange(n_series) - (n_series - 1) / 2) * ind_width
        # Explicit array so lists and ranges can be shifted by an offset
        positions = np.asarray(x_data, dtype=float)

        # Draw each series next to the previous one inside every group
        for i in range(n_series):
            ax.bar(positions + alteration[i], y_data_list[i], color=colors[i],
                   label=y_data_names[i], width=ind_width)
        ax.legend(loc='upper right')
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)

# Grouped bars: passengers per class at each embarkation town.
town_slots = range(3)
groupedbarplot(
    x_data=town_slots,
    y_data_list=pclass_list,
    y_data_names=embark_pclass.columns,
    colors=['#539caf', '#7663b0', '#00ff00'],
    x_label='embark_town',
    y_label='counts of pclass',
    title='Counts of Pclass vs Embark Town',
)

# Replace the numeric group positions with the town names from the index.
ax = plt.gca()
ax.set_xticks(town_slots)
ax.set_xticklabels(embark_pclass.index.values)
泰坦尼克号海难幸存状况分析_第11张图片
output_74_1.png

幸存和遇难乘客兄弟姐妹及配偶的数量分布(分类箱式图)

# Distribution of the `sibsp` column, one box per value of `survived`.
survived = data['survived'].unique()
bp_data = [
    data.loc[data['survived'] == outcome, 'sibsp']
    for outcome in (survived[0], survived[1])
]

boxplot(
    x_data=survived,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='survived',
    y_label='sibsp',
    title='Distribution of Sibsp By Survived',
)
泰坦尼克号海难幸存状况分析_第12张图片
output_76_0.png

幸存和遇难乘客父母子女的数量分布(分类箱式图)

# Distribution of the `parch` column, one box per value of `survived`.
survived = data['survived'].unique()
bp_data = [
    data.loc[data['survived'] == outcome, 'parch']
    for outcome in (survived[0], survived[1])
]

boxplot(
    x_data=survived,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='survived',
    y_label='parch',
    title='Distribution of Parch By Survived',
)
泰坦尼克号海难幸存状况分析_第13张图片
output_78_0.png

单独乘船与否和幸存之间有没有联系(堆积柱状图或者分组柱状图)

# Passenger counts per (alone, survived) pair: one row per `alone` value,
# one column per outcome.
alone_counts = data.groupby(['alone', 'survived']).size()
alone_survived = alone_counts.unstack(level=-1)
alone_survived
泰坦尼克号海难幸存状况分析_第14张图片
Paste_Image.png
# Grouped bars: outcome counts for each value of the `alone` column.
_, ax = plt.subplots()
width = 0.4
index = alone_survived.index.values
# Place the groups on integer slots instead of doing arithmetic on the index
# values themselves (presumably booleans - verify against the dataset).
positions = np.arange(len(index))
# Centre each pair on its slot with explicit centre alignment, so the tick
# sits between the two bars whatever the installed matplotlib's default is.
ax.bar(positions - width / 2, alone_survived[0], color='#ff0000',
       label='Not survived', width=width, align='center')
ax.bar(positions + width / 2, alone_survived[1], color='#00ff00',
       label='Survived', width=width, align='center')

ax.set_ylabel('numbers of People')
ax.set_xlabel('alone')
ax.set_title('People Survived vs Alone')
ax.legend(loc='upper right')
ax.set_xticks(positions)
ax.set_xticklabels(index)
泰坦尼克号海难幸存状况分析_第15张图片
output_81_1.png

你可能感兴趣的:(泰坦尼克号海难幸存状况分析)