matplotlib可视化练习
%matplotlib inline
import matplotlib as mpl
from matplotlib import pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
练习4:泰坦尼克号海难幸存状况分析
- 不同舱位等级中幸存和遇难的乘客比例(堆积柱状图)
- 不同性别的幸存比例(堆积柱状图)
- 幸存和遇难乘客的票价分布(分类箱式图)
- 幸存和遇难乘客的年龄分布(分类箱式图)
- 不同上船港口的乘客舱位等级分布(分组柱状图)
- 幸存和遇难乘客同船兄弟姐妹/配偶(sibsp)的数量分布(分类箱式图)
- 幸存和遇难乘客父母子女的数量分布(分类箱式图)
- 单独乘船与否和幸存之间有没有联系(堆积柱状图或者分组柱状图)
# Load the Titanic dataset through seaborn and peek at the first rows.
data = sns.load_dataset("titanic")
data.head()
# Column meanings, in order: survived flag, cabin class, sex, age, sibling
# count, parent/child count, fare, embarkation port code, cabin class,
# person category, adult-male flag, deck, embarkation port, survived or not,
# travelling alone or not.

# Count passengers per (survived, pclass) pair, then pivot the survival flag
# into columns so each row is one cabin class.
survived_pclass = (
    data[['survived', 'pclass']]
    .groupby(['survived', 'pclass'])
    .size()
    .unstack(level=0)
)
survived_pclass
# Per-class total, followed by the survived / not-survived share of that total.
survived_pclass['total'] = survived_pclass[0].add(survived_pclass[1])
survived_pclass['yes_prop'] = survived_pclass[1].div(survived_pclass['total'])
survived_pclass['no_prop'] = survived_pclass[0].div(survived_pclass['total'])
survived_pclass
# Draw a stacked bar chart
def stackedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label, title):
    """Draw a stacked bar chart on a newly created figure.

    Parameters
    ----------
    x_data : sequence
        One x position (or category label) per bar.
    y_data_list : sequence of array-like
        One series of bar heights per stacked layer; each series must have
        the same length as ``x_data``. Layers are stacked in list order.
    y_data_names : sequence
        Legend label for each layer, indexed like ``y_data_list``.
    colors : sequence
        Bar colour for each layer, indexed like ``y_data_list``.
    x_label, y_label, title : str
        Axis labels and chart title.

    Returns
    -------
    None
        The chart is drawn on the current pyplot figure.
    """
    _, ax = plt.subplots()
    # Running top edge of the stack. Each layer must start where ALL previous
    # layers end, not just the immediately preceding one, otherwise a third
    # or later layer would be drawn over earlier layers.
    stack_top = np.zeros(len(x_data), dtype=float)
    for i, layer in enumerate(y_data_list):
        heights = np.asarray(layer, dtype=float)
        ax.bar(
            x_data,
            heights,
            color=colors[i],
            bottom=stack_top,
            align='center',
            label=y_data_names[i],
        )
        stack_top = stack_top + heights
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='upper right')  # legend position
# Plot the survived / not-survived proportions for every cabin class as
# stacked bars; the two proportion columns are the stacked layers.
stackedbarplot(
    x_data=survived_pclass.index.values,
    y_data_list=[survived_pclass[column] for column in ('yes_prop', 'no_prop')],
    y_data_names=['Survived', 'Not survived'],
    colors=['#539caf', '#7663b0'],
    x_label='pclass',
    y_label='Survival rate',
    title='pclass and Survival Rate',
)
不同性别幸存比例
# Count passengers per (sex, survived) pair and pivot the survival flag into
# columns, giving one row per sex.
sex_survived = data.groupby(['sex', 'survived']).size().unstack()
sex_survived
# Per-sex total, followed by the survived / not-survived share of that total.
sex_survived['sum'] = sex_survived[0] + sex_survived[1]
sex_survived['yes_prop'] = sex_survived[1] / sex_survived['sum']
sex_survived['no_prop'] = sex_survived[0] / sex_survived['sum']
sex_survived
# Plot the proportions as stacked bars. The bar positions come from the
# table's own index (the sex labels) rather than anonymous 0/1 positions, so
# the x axis states which bar belongs to which sex.
stackedbarplot(
    x_data=sex_survived.index.values,
    y_data_list=[sex_survived['yes_prop'], sex_survived['no_prop']],
    y_data_names=['Survived', 'Not survived'],
    colors=['#539caf', '#7663b0'],
    x_label='Sex',
    y_label='Survival rate',
    title='Sex and Survival Rate',
)
幸存和遇难乘客的票价分布(分类箱式图)
def boxplot(x_data, y_data, base_color, median_color, x_label, y_label, title):
    """Draw one filled box per group on a newly created figure.

    Parameters
    ----------
    x_data : sequence
        Tick label for each box, in the same order as ``y_data``.
    y_data : sequence of array-like
        One collection of values per box.
    base_color : color
        Colour of the box outline, box fill, whiskers and whisker caps.
    median_color : color
        Colour of the median line inside each box.
    x_label, y_label, title : str
        Axis labels and chart title.

    Returns
    -------
    None
        The chart is drawn on the current pyplot figure.
    """
    _, ax = plt.subplots()
    ax.boxplot(
        y_data,
        # Fill the boxes with colour instead of drawing outlines only.
        patch_artist=True,
        # The median line is the only element drawn in median_color so that
        # it stands out against the box body.
        medianprops={'color': median_color},
        # color is the outline, facecolor is the fill.
        boxprops={'color': base_color, 'facecolor': base_color},
        whiskerprops={'color': base_color},
        capprops={'color': base_color},
    )
    # Label the boxes with the group names supplied by the caller.
    ax.set_xticklabels(x_data)
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
# Distinct values of the survival flag; they double as the x tick labels.
survived = data['survived'].unique()
# One fare series per survival flag, ordered like the labels in `survived`.
bp_data = [
    data.loc[data['survived'] == flag, 'fare']
    for flag in (survived[0], survived[1])
]
# Draw the fare distribution of each group as a box plot.
boxplot(
    x_data=survived,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='survived',
    y_label='fare',
    title='Distribution of Fare By Survived',
)
幸存和遇难乘客的年龄分布(分类箱式图)
# Distinct values of the survival flag; they double as the x tick labels.
survived = data['survived'].unique()
# One age series per survival flag. Missing ages are dropped from the plotted
# series only: overwriting them with 0 would add a cluster of fake age-0
# passengers that drags the quartiles down, and would permanently alter the
# shared `data` frame for every later analysis.
bp_data = [
    data.loc[data['survived'] == flag, 'age'].dropna()
    for flag in survived
]
# Draw the age distribution of each group as a box plot.
boxplot(
    x_data=survived,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='survived',
    y_label='age',
    title='Distribution of Age By Survived',
)
不同上船港口的乘客舱位等级分布(分组柱状图)
# Count passengers per (embark_town, pclass) pair and pivot the cabin class
# into columns; combinations with no passengers become 0 instead of NaN.
embark_pclass = (
    data.groupby(['embark_town', 'pclass'])
    .size()
    .unstack()
    .fillna(0)
)
embark_pclass
# One series per cabin-class column (first three columns), each indexed by
# embarkation town.
pclass_list = [embark_pclass.iloc[:, column] for column in range(3)]
pclass_list
[embark_town
Cherbourg 85
Queenstown 2
Southampton 127
Name: 1, dtype: int64, embark_town
Cherbourg 17
Queenstown 3
Southampton 164
Name: 2, dtype: int64, embark_town
Cherbourg 66
Queenstown 72
Southampton 353
Name: 3, dtype: int64]
# Draw a grouped (side-by-side) bar chart
def groupedbarplot(x_data, y_data_list, y_data_names, colors, x_label, y_label,title):
    """Draw a grouped bar chart on a newly created figure.

    Parameters
    ----------
    x_data : sequence of numbers
        Centre position of each group on the x axis.
    y_data_list : sequence of array-like
        One series of bar heights per group member; each series must have
        the same length as ``x_data``.
    y_data_names : sequence
        Legend label for each series, indexed like ``y_data_list``.
    colors : sequence
        Bar colour for each series, indexed like ``y_data_list``.
    x_label, y_label, title : str
        Axis labels and chart title.

    Returns
    -------
    None
        The chart is drawn on the current pyplot figure.
    """
    _, ax = plt.subplots()
    # Total width occupied by one group of bars.
    total_width = 0.8
    n_series = len(y_data_list)
    # Width of a single bar; max(..., 1) keeps an empty series list from
    # dividing by zero (nothing is drawn in that case).
    ind_width = total_width / max(n_series, 1)
    # Centre offset of every bar relative to its group centre. Built from an
    # integer range so exactly n_series offsets are produced; a float-step
    # np.arange can gain or lose an element through rounding.
    offsets = (np.arange(n_series) - (n_series - 1) / 2) * ind_width
    # Coerce to an array so the offset arithmetic works for lists and ranges.
    positions = np.asarray(x_data, dtype=float)
    for i, heights in enumerate(y_data_list):
        # Shift each series sideways so the bars sit next to each other.
        ax.bar(
            positions + offsets[i],
            heights,
            color=colors[i],
            label=y_data_names[i],
            width=ind_width,
        )
    ax.set_ylabel(y_label)
    ax.set_xlabel(x_label)
    ax.set_title(title)
    ax.legend(loc='upper right')
# Draw one group of bars per embarkation town, with one bar per cabin class.
groupedbarplot(
    x_data=range(3),
    y_data_list=pclass_list,
    y_data_names=embark_pclass.columns,
    colors=['#539caf', '#7663b0', '#00ff00'],
    x_label='embark_town',
    y_label='counts of pclass',
    title='Counts of Pclass vs Embark Town',
)
# The groups were drawn at positions 0..2; put a tick on each group and
# label it with the corresponding town name from the table index.
ax = plt.gca()
ax.set_xticks(range(3))
ax.set_xticklabels(embark_pclass.index.values)
幸存和遇难乘客同船兄弟姐妹/配偶(sibsp)的数量分布(分类箱式图)
# Distinct values of the survival flag; they double as the x tick labels.
survived = data['survived'].unique()
# One sibsp series per survival flag, ordered like the labels in `survived`.
bp_data = [
    data.loc[data['survived'] == flag, 'sibsp']
    for flag in (survived[0], survived[1])
]
# Draw the sibsp distribution of each group as a box plot.
boxplot(
    x_data=survived,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='survived',
    y_label='sibsp',
    title='Distribution of Sibsp By Survived',
)
幸存和遇难乘客父母子女的数量分布(分类箱式图)
# Distinct values of the survival flag; they double as the x tick labels.
survived = data['survived'].unique()
# Split the parch column into one series per survival flag, keeping the
# same order as the labels in `survived`.
is_first_group = data['survived'] == survived[0]
is_second_group = data['survived'] == survived[1]
bp_data = [data['parch'][is_first_group], data['parch'][is_second_group]]
# Draw the parch distribution of each group as a box plot.
boxplot(
    x_data=survived,
    y_data=bp_data,
    base_color='b',
    median_color='r',
    x_label='survived',
    y_label='parch',
    title='Distribution of Parch By Survived',
)
单独乘船与否和幸存之间有没有联系(堆积柱状图或者分组柱状图)
# Count passengers per (alone, survived) pair and pivot the survival flag
# into columns, giving one row per value of `alone`.
alone_survived = data.groupby(['alone', 'survived']).size().unstack()
alone_survived
_, ax = plt.subplots()
width = 0.4
index = alone_survived.index.values
# Numeric group positions; the index values themselves are only used as tick
# labels, so the bar arithmetic never depends on the index dtype.
positions = np.arange(len(index))
ax.bar(positions, alone_survived[0], color='#ff0000', label='Not survived', width=width)
ax.bar(positions + width, alone_survived[1], color='#00ff00', label='Survived', width=width)
ax.set_ylabel('numbers of People')
ax.set_xlabel('alone')
ax.set_title('People Survived vs Alone')
ax.legend(loc='upper right')
# Bars are centre-aligned at `positions` and `positions + width`, so the
# middle of each pair is half a bar width to the right of the first bar.
ax.set_xticks(positions + width / 2)
ax.set_xticklabels(index)