import requests
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
import os
import csv
import time

def get_one_page(url):
    '''
    Fetch a page and return its raw content, or None on failure.
    '''
    print('Loading ' + url)
    headers = {'User-Agent': 'Mozilla/5.0'}  # the value must not repeat the header name
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.content
        return None
    except RequestException:
        return None
def parse_one_page(html):
    '''
    Parse the page content into (date, weather, high, low) rows.
    '''
    soup = BeautifulSoup(html, "lxml")
    info = soup.find('div', class_='wdetail')
    rows = []
    tr_list = info.find_all('tr')[1:]  # skip the header row
    for tr in tr_list:
        td_list = tr.find_all('td')
        # take each cell's text and strip whitespace and newlines
        date = td_list[0].text.strip().replace("\n", "")
        weather = td_list[1].text.strip().replace("\n", "").split("/")[0].strip()
        temperature_high = td_list[2].text.strip().replace("\n", "").split("/")[0].strip()
        temperature_low = td_list[2].text.strip().replace("\n", "").split("/")[1].strip()
        rows.append((date, weather, temperature_high, temperature_low))
    return rows
cities = ['chengdu','aba','bazhong','dazhou','deyang','ganzi','guangan',
'guangyuan','leshan','luzhou','meishan','mianyang','neijiang','nanchong',
'panzhihua','scsuining','yaan','yibin','ziyang','zigong','liangshan']
years = ['2012','2013','2014','2015','2016','2017','2018']
months = ['01','02','03','04','05','06', '07', '08','09','10','11','12']
if __name__ == '__main__':
    # os.chdir()  # set the working directory here if needed
    for city in cities:
        # 'w' instead of 'a' so re-running does not duplicate the header row
        with open(city + '_weather.csv', 'w', newline='') as f:
            writer = csv.writer(f)
            writer.writerow(['date', 'weather', 'temperature_high', 'temperature_low'])
            for year in years:
                for month in months:
                    url = 'http://www.tianqihoubao.com/lishi/' + city + '/month/' + year + month + '.html'
                    html = get_one_page(url)
                    if html is None:  # skip months that failed to download
                        print(city + year + month + ' failed, skipped')
                        continue
                    writer.writerows(parse_one_page(html))
                    print(city + year + month + ' is OK!')
                    time.sleep(2)  # be polite to the server
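To sanity-check the output, you can read one of the generated CSVs back. This is a minimal sketch; it assumes pandas is installed and that the script above has already produced chengdu_weather.csv:

import pandas as pd

# load one generated file and peek at the first rows
df = pd.read_csv('chengdu_weather.csv')
print(df.head())
print(df.shape)  # roughly 7 years x 12 months x ~30 rows per month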
If you'd rather not type the lists out by hand, you can also scrape the city list first and generate the years and months with loops:
import requests
from bs4 import BeautifulSoup
import re

url = 'http://www.tianqihoubao.com/lishi/'
headers = {'User-Agent': 'Mozilla/5.0'}
response = requests.get(url, headers=headers)
html = response.content
soup = BeautifulSoup(html, "lxml")
# works for other provinces too: just change 四川 in the title attribute
results = soup.find('a', title='四川历史天气预报').parent.next_sibling
pattern = re.compile('href=.*?/lishi/([a-z]+).*?html')
cities = re.findall(pattern, str(results))
years = [str(y) for y in range(2012, 2019)]   # '2012' .. '2018'
months = [str(m).zfill(2) for m in range(1, 13)]  # '01' .. '12'
The code could probably still be optimized; suggestions are welcome.
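One possible optimization, offered as a sketch rather than as part of the original script: reuse a single requests.Session with automatic retries in place of the bare requests.get call, so connections are kept alive and transient server errors are retried:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
session.headers.update({'User-Agent': 'Mozilla/5.0'})
# retry up to 3 times with backoff on common transient status codes
retry = Retry(total=3, backoff_factor=1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retry))

def get_one_page(url):
    '''Drop-in replacement for get_one_page using the shared session.'''
    print('Loading ' + url)
    try:
        response = session.get(url, timeout=10)
        if response.status_code == 200:
            return response.content
        return None
    except requests.exceptions.RequestException:
        return None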