Scraping Data from the China Weather Network

Scrape the regional forecast pages of www.weather.com.cn (all regions), collect every city's minimum temperature, and chart the ten coldest cities with pyecharts.

import requests
from bs4 import BeautifulSoup
from pyecharts import Bar  # charting library (pyecharts 0.x API)

def main():
    # one forecast page per region: North, Northeast, East, Central and South China,
    # Northwest, Southwest, and Hong Kong / Macau / Taiwan
    urls = [
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml',
    ]
    for url in urls:
        parse_data(url)
    render_chart()  # build the chart once, after every page has been parsed
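
If you want to sanity-check the parser on a single region before looping over all eight pages, you can call parse_data (defined in the next section) directly, for example:

parse_data('http://www.weather.com.cn/textFC/hb.shtml')
print(len(all_data))   # number of city rows collected from that one page
print(all_data[:5])    # each entry looks like {'city': ..., 'min_tem': ...}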

Fetching and parsing the data

parse_data downloads one regional page, walks every province table inside the conMidtab container, and appends each city's minimum temperature to the shared all_data list; render_chart then ranks the results and draws the bar chart.

all_data = []  # accumulated across all regional pages; one dict per city

def parse_data(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    # response.text guesses the encoding from the HTTP headers and can come out garbled,
    # so decode the raw bytes explicitly
    text = response.content.decode('utf-8')
    # html5lib is slow but tolerates malformed markup; lxml is fast but expects well-formed HTML
    soup = BeautifulSoup(text, 'html5lib')
    conMidtab = soup.find('div', attrs={'class': 'conMidtab'})
    tables = conMidtab.find_all('table')

    for table in tables:
        trs = table.find_all('tr')[2:]  # drop the two header rows
        # enumerate() pairs each row with its index so the first data row can be special-cased
        for index, tr in enumerate(trs):
            tds = tr.find_all('td')
            if index == 0:
                # the first row of each table also carries the province cell,
                # so the city name sits in the second cell
                city_td = tds[1]
            else:
                city_td = tds[0]
            # stripped_strings yields the cell text with whitespace and blank lines filtered out
            city = list(city_td.stripped_strings)[0]
            tem_td = tds[-2]  # second-to-last column is the minimum temperature
            min_tem = list(tem_td.stripped_strings)[0]
            # store the temperature as an int so the list can be sorted numerically
            all_data.append({'city': city, 'min_tem': int(min_tem)})

def render_chart():
    # sort ascending by minimum temperature and keep the ten coldest cities
    all_data.sort(key=lambda x: x['min_tem'])
    data = all_data[0:10]
    # x-axis / y-axis values: map() pulls a single field out of each dict
    cities = list(map(lambda x: x['city'], data))
    min_tems = list(map(lambda x: x['min_tem'], data))
    # bar chart of the ten coldest cities
    chart = Bar('China lowest-temperature ranking')
    chart.add('', cities, min_tems)
    chart.render('temperature.html')

if __name__ == '__main__':
    main()
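
The ranking step on its own, run against a few made-up entries (hypothetical data, only to show what the sort/slice/map combination produces):

sample = [
    {'city': 'A', 'min_tem': -5},
    {'city': 'B', 'min_tem': -12},
    {'city': 'C', 'min_tem': 3},
]
sample.sort(key=lambda x: x['min_tem'])      # ascending: coldest first
coldest = sample[0:10]                       # at most ten entries
print(list(map(lambda x: x['city'], coldest)))     # ['B', 'A', 'C']
print(list(map(lambda x: x['min_tem'], coldest)))  # [-12, -5, 3]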
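
Note that from pyecharts import Bar and Bar(title) belong to the pyecharts 0.x interface. If you have pyecharts 1.x or later installed, the chart is built with the options-based API instead; a minimal sketch, reusing the cities and min_tems lists from render_chart:

from pyecharts.charts import Bar
from pyecharts import options as opts

bar = (
    Bar()
    .add_xaxis(cities)                        # city names on the x-axis
    .add_yaxis('min temperature', min_tems)   # minimum temperatures as the bar series
    .set_global_opts(title_opts=opts.TitleOpts(title='China lowest-temperature ranking'))
)
bar.render('temperature.html')

Either version writes a temperature.html file that can be opened in a browser.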
