该数据集整理了从1896年雅典奥运会至2016年里约热内卢奥运会120年的奥林匹克运动会的历史数据。
120年奥运会数据集
需要注意的是,在1896年-1992年期间,冬季奥运会与夏季奥运会都是在同一年举行的。在这之后,冬季与夏季的奥运会才被错开举办:冬季奥运会从1994年开始每4年举办一次,夏季奥运会从1996年开始每4年举办一次。大家在分析这些数据时,经常会犯的一个错误,就是认为夏季与冬季奥运会一直是错开举办的。
```python
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pyecharts.charts import *
from pyecharts.components import Table
from pyecharts import options as opts
from pyecharts.commons.utils import JsCode
import random
import datetime
import warnings
warnings.filterwarnings("ignore")
# Load the Olympic datasets from the two CSV files below.
# NOTE(review): judging by the file names, `df` holds athlete/event records and
# `df_region` is a NOC-to-region lookup table -- confirm against the data.
df=pd.read_csv("/home/kesci/input/olympic/athlete_events.csv")
df_region=pd.read_csv("/home/kesci/input/olympic/noc_regions.csv")
```
```python
# The data needs cleaning; first look at what problems it has.
# Number of missing values in each column.
df.isnull().sum()
# Summary statistics of the table (pandas' default `describe()` output).
df.describe()
# Word cloud of Olympic sports, weighted by how many rows each sport has in df.

# Row count per sport, indexed by sport name.
sports_cate = df.groupby("Sport")["Sport"].count()
# Spot-check one sport (shown when this is the last expression of a notebook cell).
sports_cate.loc["Gymnastics"]

# (sport name, row count) pairs, the input format used for the word cloud below.
data = [(sport, rows) for sport, rows in sports_cate.items()]

sports_title = opts.TitleOpts(
    title="奥运会项目汇总",
    title_textstyle_opts=opts.TextStyleOpts(font_size=23),
)

wc = WordCloud()
wc.add("奥运会项目", data, word_size_range=[6, 100])
wc.set_global_opts(
    title_opts=sports_title,
    tooltip_opts=opts.TooltipOpts(is_show=True),
)
wc.render_notebook()
# "Emily" word cloud: a small aside showing an attractive way to visualise a
# person's name.
#
# The pairs are stored under `emily_words` rather than `list`: binding the name
# `list` would shadow the builtin and break any later `list(...)` call in the
# same session.

# 100 entries, each pairing the word "Emily" with a random integer weight
# drawn from 0..100 (both ends inclusive).
emily_words = [("Emily", random.randint(0, 100)) for _ in range(100)]
# Display the generated pairs (shown when this is the last expression of a notebook cell).
emily_words

wc = (
    WordCloud()
    .add("Emily", emily_words, word_size_range=[6, 100])
    .set_global_opts(
        title_opts=opts.TitleOpts(
            title="Emily",
            title_textstyle_opts=opts.TextStyleOpts(font_size=23),
        ),
        tooltip_opts=opts.TooltipOpts(is_show=True),
    )
)
wc.render_notebook()
# Pie chart of the share of rows per sex, computed on the de-duplicated table.

# Row count per sex after removing exact duplicate rows ...
tmp1 = df.drop_duplicates().groupby("Sex")["Sex"].count()
# ... converted to a percentage of the total.
tmp1 = tmp1.div(tmp1.sum()).mul(100)
tmp1

# (sex, percentage) pairs for the pie series.
data = [(sex, share) for sex, share in tmp1.items()]
data

pie = Pie()
pie.add("", data, radius=["20%", "55%"])
pie.set_global_opts(
    title_opts=opts.TitleOpts(title="历届奥运会男女比例"),
    legend_opts=opts.LegendOpts(orient="vertical", pos_top="15%", pos_left="2%"),
)
# Label each slice as "<name>: <percent>%".
pie.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {d}%"))
pie.render_notebook()
# Box plot of participant ages, split by sex.

# De-duplicated copy of the raw table.
tmp2 = df.drop_duplicates()
tmp2.head()

# Ages of male and female rows. Missing ages are dropped before they are
# handed to `prepare_data`: the box statistics are derived from the ordered
# values, and NaN does not compare consistently with other floats, so leaving
# NaN in the lists can corrupt the computed quartiles.
trace0 = tmp2.loc[tmp2["Sex"] == "M", "Age"].dropna().to_list()
trace1 = tmp2.loc[tmp2["Sex"] == "F", "Age"].dropna().to_list()
y_data = [trace0, trace1]
gender = ["男", "女"]

# Look up the rows whose Age is exactly 97.
tmp2[tmp2["Age"] == 97.0]

c = Boxplot()
c.add_xaxis(gender)
c.add_yaxis("比例", c.prepare_data(y_data))
c.set_global_opts(title_opts=opts.TitleOpts(title="奥运会男女参赛者的年龄分布"))
c.render_notebook()
# Mean age per year for each sex, prepared as chart-ready lists.
#
# Both series are aligned on one shared year axis. Grouping a single sex by
# year only yields the years in which that sex has rows, so without the
# reindex a series could be shorter than `x_data` and its points would be
# paired with the wrong years. Years with no (or only missing) ages for a sex
# become None instead of making int() fail on NaN.


def _mean_age_per_year(frame, sex, years):
    """Return the truncated mean Age of `sex` for every entry of `years`.

    The result has exactly one element per year, in the order of `years`;
    a year without a computable mean yields None.
    """
    yearly_mean = (
        frame[frame["Sex"] == sex].groupby(by="Year")["Age"].mean().reindex(years)
    )
    return [None if pd.isna(mean) else int(mean) for mean in yearly_mean]


tmp2 = tmp2.sort_values(by="Year", ascending=True)
# Distinct years in ascending order (tmp2 was just sorted by Year).
_years = tmp2["Year"].unique()

male_data = _mean_age_per_year(tmp2, "M", _years)
female_data = _mean_age_per_year(tmp2, "F", _years)
x_data = [str(year) for year in _years]
# Two styled line charts (male / female mean age per year) overlaid into one.

# JavaScript gradient expressions, passed to the charts through JsCode.
background_color_js = (
    "new echarts.graphic.LinearGradient(0, 0, 0, 1, "
    "[{offset: 0, color: '#c86589'}, {offset: 1, color: '#06a7ff'}], false)"
)
area_color_js = (
    "new echarts.graphic.LinearGradient(0, 0, 0, 1, "
    "[{offset: 0, color: '#eb64fb'}, {offset: 1, color: '#3fbbff0d'}], false)"
)


def _faint_stroke():
    """Return a fresh line style in the faint colour used for ticks and grid lines."""
    return opts.LineStyleOpts(color="#ffffff1f")


def _age_axis(kind, label_margin, tick_length, axisline, **axis_kwargs):
    """Build an axis sharing the label, tick and split-line styling of both axes."""
    return opts.AxisOpts(
        type_=kind,
        axislabel_opts=opts.LabelOpts(margin=label_margin, color="#ffffff63"),
        axisline_opts=axisline,
        axistick_opts=opts.AxisTickOpts(
            is_show=True,
            length=tick_length,
            linestyle_opts=_faint_stroke(),
        ),
        splitline_opts=opts.SplitLineOpts(
            is_show=True,
            linestyle_opts=_faint_stroke(),
        ),
        **axis_kwargs,
    )


def _age_trend_line(series_name, y_values, symbol):
    """Build one line chart over `x_data` for a single series.

    The two charts differ only in series name, data and marker symbol;
    everything else (colours, axes, title, legend) is identical.
    """
    chart = Line(init_opts=opts.InitOpts(bg_color=JsCode(background_color_js)))
    chart.add_xaxis(xaxis_data=x_data)
    chart.add_yaxis(
        series_name=series_name,
        y_axis=y_values,
        is_smooth=True,
        is_symbol_show=True,
        symbol=symbol,
        symbol_size=6,
        linestyle_opts=opts.LineStyleOpts(color="#fff"),
        label_opts=opts.LabelOpts(is_show=True, position="top", color="white"),
        itemstyle_opts=opts.ItemStyleOpts(
            color="red", border_color="#fff", border_width=3
        ),
        tooltip_opts=opts.TooltipOpts(is_show=False),
        areastyle_opts=opts.AreaStyleOpts(color=JsCode(area_color_js), opacity=1),
    )
    chart.set_global_opts(
        title_opts=opts.TitleOpts(
            title="历年奥运男女平均年龄的变化",
            pos_bottom="5%",
            pos_left="center",
            title_textstyle_opts=opts.TextStyleOpts(color="#fff", font_size=16),
        ),
        xaxis_opts=_age_axis(
            "category",
            30,
            25,
            opts.AxisLineOpts(is_show=False),
            boundary_gap=False,
        ),
        yaxis_opts=_age_axis(
            "value",
            20,
            15,
            opts.AxisLineOpts(
                linestyle_opts=opts.LineStyleOpts(width=2, color="#fff")
            ),
            position="right",
        ),
        legend_opts=opts.LegendOpts(is_show=True),
    )
    return chart


c = _age_trend_line("男", male_data, "circle")
line = _age_trend_line("女", female_data, "triangle")

# Draw the female chart on top of the male chart and render the combination.
overlap = c.overlap(line)
overlap.render_notebook()
# Bar chart: the 20 teams with the most rows whose Medal is "Gold".
gold_rows = tmp2[tmp2["Medal"] == "Gold"]
gold_per_team = gold_rows.groupby("Team")["Medal"].count().sort_values(ascending=False)
gold_per_team.iloc[:20].plot(kind="bar", figsize=(20, 8))

# Bar chart: the 20 sports with the most rows.
rows_per_sport = tmp2.groupby("Sport")["Sport"].count().sort_values(ascending=False)
rows_per_sport.iloc[:20].plot(kind="bar", figsize=(20, 8))

# Bar chart: for team "China", the number of medals of each type at each Games
# (one row per Games, one column per Medal value).
china_rows = tmp2[tmp2["Team"] == "China"].sort_values(by="Year")
china_data = china_rows.groupby(["Games", "Medal"])["Medal"].count().unstack()
china_data.plot(kind="bar", figsize=(20, 8))
# Funnel chart of the ten sports in which team "China" has the most medals.

# Number of non-null Medal entries per sport for team "China", as a
# one-column ("Medal") frame, reduced to the ten largest counts.
data = tmp2[tmp2["Team"] == "China"].groupby(by="Sport")["Medal"].count().to_frame()
data = data.sort_values(by="Medal", ascending=False)[0:10]
data

sport_name = [str(sport) for sport in data.index]
# Read the counts from the "Medal" column. Iterating `data.values` would yield
# one 1-element array per row, and calling int() on such an array is
# deprecated in recent NumPy releases.
sports = [int(count) for count in data["Medal"]]

# (sport, medal count) pairs computed from the dataset above.
funnel = Funnel().add("", [(name, count) for name, count in zip(sport_name, sports)])
funnel.render_notebook()