这学期上了数据通讯这门课,其中有一个作业是要求爬取某一天各重要城市到上海虹桥以及上海浦东两机场的航班信息,然后进行可视化、数据分析。在这一份作业用到的可视化工具是python的pyecharts库,其中一幅图的效果如下:
事实上这张航线图是可以动的,这里只是截了个屏,下面具体说一下怎么操作。用到的库除了pyecharts,还有numpy、pandas。
要做数据的可视化,首先肯定要获取数据。我是采用C#+selenium动态爬取携程网上的10月30号的数据,selenium通过调用命令模拟人与浏览器的交互,效果不错。爬取下来的数据放在了一个文件夹里,每个城市到上海的存入一个文本文档中。
在这里我是用numpy+pandas处理数据的,没安装这两个包的先安装再进行操作。
import numpy as np
import pandas as pd
import os
# Directory holding the scraped text files; every file in it is read below.
DATA_DIR = r"C:\Users\58381\Desktop\data"
# Column names assigned to the header-less fixed-width files.
COLUMNS = ["航班号", "飞机机型", "出发时间", "出发机场", "到达时间", "到达机场", "出发城市"]

# Read each file exactly once and stack the rows into a single frame.
# (The earlier approach loaded the Beijing file twice and relied on an outer
# merge over all columns to cancel the duplication.)
frames = [
    pd.read_fwf(
        os.path.join(DATA_DIR, filename),
        encoding="utf-8",
        header=None,
        names=COLUMNS,
    )
    for filename in sorted(os.listdir(DATA_DIR))
]
df = pd.concat(frames, ignore_index=True)

# Drop rows whose flight number contains "共享" (code-share entries), so that
# only the operating flights remain.
df = df[~df["航班号"].str.contains("共享")]
df.tail()  # notebook display: show the last five rows
分别统计各城市到虹桥机场和浦东机场的航班数:
# Rows whose arrival airport does not mention Pudong are taken as Hongqiao arrivals.
dh = df[~df["到达机场"].str.contains("浦东")]
# Rows whose arrival airport does not mention Hongqiao are taken as Pudong arrivals.
dp = df[~df["到达机场"].str.contains("虹桥")]

# Flights per departure city, sorted by count in descending order.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
dh1 = dh["出发城市"].value_counts()  # arrivals at Hongqiao
dp1 = dp["出发城市"].value_counts()  # arrivals at Pudong
然后将航班数量排名前十的城市拿出来制作条形图,这里就开始用到pyecharts库。关于怎么安装,我在上一篇博客《在jupyter中安装pyecharts》中已经说过了。
# Keep only the ten busiest departure cities for each airport; the counts are
# already sorted in descending order by value_counts.
TOP_N = 10
hq_top = dh1.iloc[:TOP_N]  # Hongqiao
pd_top = dp1.iloc[:TOP_N]  # Pudong
from pyecharts.charts import Bar
from pyecharts import options as opts
# Horizontal bar chart of the ten cities with the most flights into Hongqiao.
# pyecharts expects native Python types, so the pandas index and the numpy
# counts are converted to plain lists before being handed to the chart.
bar = (
    Bar()
    .add_xaxis(hq_top.index.tolist())
    .add_yaxis("航班数", hq_top.values.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title="到达虹桥航班数top10城市"))
    .reversal_axis()  # swap axes so the bars run horizontally
    .set_series_opts(
        label_opts=opts.LabelOpts(position="right"),
        # Mark lines at the smallest and largest flight counts.
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_="min", name="最小值"),
                opts.MarkLineItem(type_="max", name="最大值"),
            ]
        ),
    )
)
bar.render_notebook()
# Horizontal bar chart of the ten cities with the most flights into Pudong.
# Fixes the `pd_top.value` typo (a Series has `.values`, not `.value`) and
# converts the pandas/numpy data to plain lists, which pyecharts expects.
bar1 = (
    Bar()
    .add_xaxis(pd_top.index.tolist())
    .add_yaxis("航班数", pd_top.values.tolist())
    .set_global_opts(title_opts=opts.TitleOpts(title="到达浦东航班数top10城市"))
    .reversal_axis()  # swap axes so the bars run horizontally
    .set_series_opts(
        label_opts=opts.LabelOpts(position="right"),
        # Mark lines at the smallest and largest flight counts.
        markline_opts=opts.MarkLineOpts(
            data=[
                opts.MarkLineItem(type_="min", name="最小值"),
                opts.MarkLineItem(type_="max", name="最大值"),
            ]
        ),
    )
)
bar1.render_notebook()
from pyecharts import options as opts
from pyecharts.charts import Geo
from pyecharts.globals import ChartType, SymbolType
def geo_lines_background():
    """Build the Shanghai route map on a China base map and render it inline.

    The chart has three series: an effect-scatter series of per-city flight
    counts, and two animated line series (blue arrows for the Pudong routes,
    yellow arrows for the Hongqiao routes).  All data is hard-coded.

    Returns:
        The result of ``Geo.render_notebook()``, i.e. the notebook-renderable
        output rather than the ``Geo`` chart object itself.
    """
    # NOTE(review): "郑州" appears twice (17 and 14) while "天津", which is used
    # in the Hongqiao routes below, has no entry -- the second "郑州" may have
    # been meant as "天津"; confirm against the scraped data.
    flight_counts = [
        ("深圳", 55), ("北京", 56), ("广州", 46), ("昆明", 35), ("成都", 35),
        ("重庆", 33), ("西安", 29), ("哈尔滨", 20), ("沈阳", 20), ("长沙", 17),
        ("郑州", 17), ("长春", 17), ("郑州", 14), ("乌鲁木齐", 13),
    ]
    pudong_routes = [
        ("上海", "广州"), ("上海", "沈阳"), ("上海", "深圳"), ("上海", "重庆"),
        ("上海", "西安"), ("上海", "成都"), ("上海", "长春"), ("上海", "昆明"),
        ("上海", "郑州"), ("上海", "哈尔滨"),
    ]
    hongqiao_routes = [
        ("上海", "广州"), ("上海", "北京"), ("上海", "深圳"), ("上海", "重庆"),
        ("上海", "西安"), ("上海", "成都"), ("上海", "长沙"), ("上海", "昆明"),
        ("上海", "天津"), ("上海", "乌鲁木齐"),
    ]

    c = (
        Geo()
        # The original had a stray "),\n)" after add_schema that closed the
        # expression early and made the whole function a syntax error.
        .add_schema(maptype="china")
        .add(
            "航班数",
            flight_counts,
            type_=ChartType.EFFECT_SCATTER,
            color="black",
        )
        .add(
            "浦东航线",
            pudong_routes,
            type_=ChartType.LINES,
            effect_opts=opts.EffectOpts(
                symbol=SymbolType.ARROW, symbol_size=6, color="blue"
            ),
            linestyle_opts=opts.LineStyleOpts(curve=0.2),
        )
        .add(
            "虹桥航线",
            hongqiao_routes,
            type_=ChartType.LINES,
            effect_opts=opts.EffectOpts(
                symbol=SymbolType.ARROW, symbol_size=6, color="yellow"
            ),
            linestyle_opts=opts.LineStyleOpts(curve=0.2),
        )
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(
            title_opts=opts.TitleOpts(title="上海虹桥、浦东联通城市TOP10航线图")
        )
    )
    return c.render_notebook()
这里有点繁琐,因为数据都是手动敲上去的,我想这里应该可以用一个字典把数据存入,这样或许更方便些。效果如图:
桑基图可以很好地表达出数量流动的关系,这里可以用来表示机场航班数的流向,而pyecharts中也很贴心地提供了制作相关图像的函数。
import json
import os
from pyecharts import options as opts
from pyecharts.charts import Page, Sankey
def sankey_base():
    """Build the airport-to-city Sankey diagram and render it inline.

    Nodes are the two Shanghai airports plus the connected cities; each link
    runs from an airport to a city and carries a hard-coded numeric value.

    Returns:
        The result of ``Sankey.render_notebook()``, i.e. the
        notebook-renderable output rather than the ``Sankey`` chart object.
    """
    # Every name used as a link source or target must appear here.
    nodes = [
        {"name": "沈阳"},
        {"name": "虹桥机场"},
        {"name": "昆明"},
        {"name": "长春"},
        {"name": "哈尔滨"},
        {"name": "成都"},
        {"name": "重庆"},
        {"name": "深圳"},
        {"name": "郑州"},
        {"name": "广州"},
        {"name": "西安"},
        {"name": "北京"},
        {"name": "长沙"},
        {"name": "乌鲁木齐"},
        {"name": "天津"},
        {"name": "浦东机场"},
    ]
    # Flow links: airport -> city, with the link weight in "value".
    links = [
        {"source": "虹桥机场", "target": "深圳", "value": 43},
        {"source": "浦东机场", "target": "沈阳", "value": 19},
        {"source": "浦东机场", "target": "昆明", "value": 21},
        {"source": "浦东机场", "target": "长春", "value": 17},
        {"source": "浦东机场", "target": "成都", "value": 20},
        {"source": "虹桥机场", "target": "北京", "value": 47},
        {"source": "虹桥机场", "target": "广州", "value": 36},
        {"source": "虹桥机场", "target": "重庆", "value": 17},
        {"source": "虹桥机场", "target": "西安", "value": 17},
        {"source": "虹桥机场", "target": "成都", "value": 15},
        {"source": "虹桥机场", "target": "昆明", "value": 14},
        {"source": "虹桥机场", "target": "长沙", "value": 11},
        {"source": "虹桥机场", "target": "天津", "value": 10},
        {"source": "虹桥机场", "target": "乌鲁木齐", "value": 9},
        {"source": "浦东机场", "target": "哈尔滨", "value": 18},
        {"source": "浦东机场", "target": "重庆", "value": 16},
        {"source": "浦东机场", "target": "郑州", "value": 10},
        {"source": "浦东机场", "target": "深圳", "value": 12},
        {"source": "浦东机场", "target": "广州", "value": 10},
        {"source": "浦东机场", "target": "西安", "value": 12},
    ]
    c = (
        Sankey()
        .add(
            "飞机",
            nodes,
            links,
            # NOTE(review): opacity is normally in the range 0-1; 2 looks like a
            # typo for 0.2 -- value kept as-is, confirm the intended look.
            linestyle_opt=opts.LineStyleOpts(opacity=2, curve=0.2, color="source"),
            label_opts=opts.LabelOpts(position="right"),
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title="上海虹桥、浦东联通城市TOP10")
        )
    )
    return c.render_notebook()
这就是一个简单的数据可视化过程,但是数据是死的。之后我又结合了城市的GDP、人口数量、旅游总收入、到上海的航空里程等数据,采用机器学习的方法分析并预测航班数与这些因素的关系,具体过程就不写出来了,有兴趣的可以私信我索取具体代码。