(1)编程环境为 Anaconda + Jupyter。以下源码已按代码块划分好;如果使用 Jupyter 编写,请按顺序把各段分别写在不同的代码块(cell)中。注意:若将全部程序写在同一个代码块中,将无法运行!
(2)源码中被注释掉的部分是单步调试用的代码;如果想查看某个代码块执行后得到的数据,可以稍微修改代码,将数据直接 print 出来。
爬取天气网站数据并保存为.csv表格数据。
import time
import requests
from bs4 import BeautifulSoup
# Scrape the national AQI ranking table and append one CSV record per city to
# ./big_data.csv with the columns: rank, city, AQI, PM2.5, air-quality level,
# province.
import csv

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
url = 'http://www.tianqihoubao.com/aqi/aqi_rank.html'
params = {"show_ram": 1}

# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as data.
response = requests.get(url, params=params, headers=headers, timeout=30)
response.raise_for_status()

soup = BeautifulSoup(response.text, 'html.parser')
rank_table = soup.find('table', class_='b')
if rank_table is None:
    # Fail with a readable message instead of "'NoneType' has no attribute".
    raise RuntimeError("AQI ranking table (<table class='b'>) not found at " + url)
tr = rank_table.find_all('tr')

# Open the output once (append mode, as before) instead of once per row.
# csv.writer quotes any field containing a comma, and the explicit '\n'
# terminator replaces the old '\n ' which prefixed every following record
# with a stray space.
with open(r'./big_data.csv', 'a+', encoding='utf-8', newline='') as f1:
    writer = csv.writer(f1, lineterminator='\n')
    # NOTE(review): the slice skips the first five <tr> rows, although the
    # original comment says only row 0 is the header -- confirm that rows 1-4
    # are really meant to be dropped.
    for j in tr[5:]:
        td = j.find_all('td')
        if len(td) < 6:
            # Not a data row (e.g. a spacer or merged row); nothing to record.
            continue
        # Strip every cell: previously the last two columns were written
        # unstripped, so surrounding whitespace/newlines could split a record.
        num, city, AQI, pm, AQI_rank, province = (
            cell.get_text().strip() for cell in td[:6]
        )
        writer.writerow([num, city, AQI, pm, AQI_rank, province])
数据处理:将爬取的数据处理成 Echarts 图表所需的数据格式。
#数据处理
import pandas as pd
import numpy as np
# Load the chart inputs from the scraped CSV.
# The file is written without a header row, so header=None is required;
# otherwise pandas consumes the first data record as column names and that
# city silently disappears from every chart. The file is also parsed once
# instead of three times.
_aqi_table = pd.read_csv("./big_data.csv", header=None, usecols=[2, 3, 5])
sj0 = _aqi_table[[5]]  # province (single-column DataFrame, as before)
sj1 = _aqi_table[[2]]  # AQI
sj2 = _aqi_table[[3]]  # PM2.5

i = 0  # kept for compatibility: position of the only column in each frame

# Flat (1-D) lists, one entry per CSV record. sj_0 used to stay a list of
# one-element lists while sj_1/sj_2 were flattened; all three are now flat so
# they can be used directly as axis labels and series values.
sj_0 = sj0.iloc[:, i].tolist()
sj_1 = sj1.iloc[:, i].tolist()
sj_2 = sj2.iloc[:, i].tolist()
将AQI - PM2.5数据以折线图形式表示。
#AQI - PM2.5折线图示例
import json
import pyecharts
from pyecharts import Line
# Line chart with two series that share the same x-axis labels:
# AQI (average marked as a point, data-zoom slider enabled) and
# PM2.5 (average marked as a line, smoothed curve).
attr, v1, v2 = sj_0, sj_1, sj_2  # x-axis labels, AQI values, PM2.5 values
line = Line("AQI - PM2.5折线图示例", width=800, height=500)
for series_name, series_values, series_options in (
    ("AQI", v1, {"mark_point": ['average'], "is_datazoom_show": True}),
    ("PM2.5", v2, {"mark_line": ['average'], "is_smooth": True}),
):
    line.add(series_name, attr, series_values, **series_options)
# Bare expression: as the last statement of a Jupyter cell it displays the chart.
line
将AQI - PM2.5数据以柱状图形式表示。
#AQI - PM2.5柱状图示例
import json
import pyecharts
from pyecharts import Bar
# Bar chart of the same two series: AQI (maximum marked as a point, red legend
# text, data-zoom slider enabled) and PM2.5 (minimum marked as a line, blue
# legend text).
attr, v1, v2 = sj_0, sj_1, sj_2  # x-axis labels, AQI values, PM2.5 values
bar = Bar('AQI - PM2.5柱状图示例')
for series_name, series_values, series_options in (
    ('AQI', v1, {"mark_point": ['max'], "legend_text_color": 'red', "is_datazoom_show": True}),
    ('PM2.5', v2, {"mark_line": ['min'], "legend_text_color": 'blue'}),
):
    bar.add(series_name, attr, series_values, **series_options)
# Bare expression: as the last statement of a Jupyter cell it displays the chart.
bar
爬取各省各城市的AQI并保存为.csv表格文件,同时计算各省的平均AQI。
#全国空气质量平均分布图
import time
import requests
from bs4 import BeautifulSoup
def Get_Average(list):
    """Return the arithmetic mean of the numbers in *list*.

    Args:
        list: A sized collection of numbers (anything supporting ``len()``
            and iteration). The parameter name shadows the builtin but is
            kept so existing keyword callers keep working.

    Returns:
        ``0`` when the collection is empty (avoids a ZeroDivisionError),
        otherwise the sum of the items divided by their count.
    """
    count = len(list)
    if count == 0:
        return 0
    # Use the builtin sum() instead of a hand-rolled loop whose accumulator
    # was itself named `sum` and hid the builtin inside this function.
    return sum(list) / count
# Scrape per-city AQI rows for every province listed on tianqi.com, append the
# raw rows to ./guizhou.csv, and build two parallel lists:
#   province_list[k] -> province name
#   AQI_list[k]      -> average AQI of that province's cities
import csv
from urllib.parse import urljoin

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
url = 'http://www.tianqi.com/air/guizhou.html'
# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() stops an HTTP error page from being parsed as data.
response = requests.get(url=url, headers=headers, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.text, 'html.parser')

# Collect the link target and display name of every province on the page.
province_all = soup.find('div', class_='air_citybox1')
if province_all is None:
    # Fail with a readable message instead of "'NoneType' has no attribute".
    raise RuntimeError("province link box (<div class='air_citybox1'>) not found at " + url)
provinces_all = province_all.find_all('a')
provinces_name = []
province_list = []
AQI_list = []
listData = []
for pn in provinces_all:
    provinces_name.append(pn['href'])
    province_list.append(pn.get_text().strip())

# Entries whose pages use a different layout and cannot be parsed here
# (per the original notes: 27 = Tianjin, 24 = Shanghai, 3 = Chongqing,
# 2 = Beijing). Removing from the highest index down keeps the lower indices
# valid, and handling both lists in one loop keeps them aligned.
for skip_index in (27, 24, 3, 2):
    provinces_name.pop(skip_index)
    province_list.pop(skip_index)

# Open the output once (append mode, as before) instead of once per row.
# The explicit '\n' terminator replaces the old '\n ' which prefixed every
# following record with a stray space.
with open(r'./guizhou.csv', 'a+', encoding='utf-8', newline='') as f:
    writer = csv.writer(f, lineterminator='\n')
    for tail in provinces_name:
        # urljoin copes with hrefs that start with '/' or are already absolute.
        url = urljoin('http://www.tianqi.com/', tail)
        response = requests.get(url=url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        air_tab = soup.find('table', class_='air_tab')
        if air_tab is None:
            raise RuntimeError("AQI table (<table class='air_tab'>) not found at " + url)
        tr = air_tab.find_all('tr')
        AQI_temp1 = []
        # tr[0] is the column-title row; data rows start at index 1.
        for j in tr[1:]:
            td = j.find_all('td')
            if len(td) < 4:
                # Not a data row; nothing to record.
                continue
            dat0, dat1, dat2, dat3 = (cell.get_text().strip() for cell in td[:4])
            writer.writerow([dat0, dat1, dat2, dat3])
            # Third column is the AQI reading. Cells that are not a plain
            # integer (empty or placeholder text) are left out of the average
            # instead of crashing int().
            if dat2.isdigit():
                AQI_temp1.append(int(dat2))
        # One average per province, in the same order as province_list.
        AQI_list.append(Get_Average(AQI_temp1))
画中国版图和柱状图
#画中国版图,全国空气质量平均分布图
from pyecharts import Map, Geo
import pandas as pd
# Map of China coloured by each province's average AQI, with a visual-map
# legend whose text is black.
# NOTE(review): the name `map` hides Python's builtin map(); it is kept because
# the grid layout code further down refers to the chart by this name.
map = Map("", width=300, height=300)
map_options = dict(maptype='china', is_visualmap=True, visual_text_color='#000')
map.add("", province_list, AQI_list, **map_options)
#画柱状图
import json
import pyecharts
from pyecharts import *
# Bar chart of the average AQI per province.
attr, v1 = province_list, AQI_list  # province names, average AQI per province
bar = Bar('全国空气质量平均AQI(仅包含部分城市)', width=300, height=1000)
bar.add('平均AQI', attr, v1)
# Bare expression: displays the bar chart when it ends a Jupyter cell.
bar
# Combine both charts in one Grid, positioning the bar chart with
# grid_bottom='50%' and the map with grid_top='50%'.
grid = Grid()
for chart, placement in (
    (bar, {"grid_bottom": '50%'}),
    (map, {"grid_top": '50%'}),
):
    grid.add(chart, **placement)
# Bare expression: displays the combined layout when it ends a Jupyter cell.
grid