昨天做了一个爬虫小任务,要求运用BeautifulSoup。弄得我要死了,不过今天总算解决了,虽然方法可能有点笨,但是总归是解决了。
方法并未封装。
from bs4 import BeautifulSoup
import requests
# Scrape the China Weather text-forecast page (North China region) and print
# each city's morning/evening weather, wind and temperature readings.
url = 'http://www.weather.com.cn/textFC/hb.shtml'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.9 Safari/537.36',
    'charset': 'utf-8',
    'Referer': 'http://www.weather.com.cn/textFC/hb.shtml'
}
res = requests.get(url=url, headers=headers)
res.encoding = 'utf-8'  # the page is UTF-8; requests may mis-guess from headers
soup = BeautifulSoup(res.text, 'lxml')
for div in soup.select('div.conMidtab2'):
    for tr in div.select('tr'):
        # Hoist each find() into a local so the DOM is queried once per cell
        # (the original repeated every identical lookup two or three times).
        city_td = tr.find('td', width='83')
        if city_td and city_td.a:
            print(city_td.a.string)
        day_td = tr.find('td', width='89')
        if day_td:
            print('上午:')
            print(day_td.string)  # daytime weather description
            day_wind = tr.find('td', width='162')
            print(day_wind.contents[1].string)  # wind direction
            print(day_wind.contents[3].string)  # wind force
            print('最高温度' + tr.find('td', width='92').string)  # daily high
            print('晚上:')
            print(tr.find('td', width='98').string)  # night weather description
            night_wind = tr.find('td', width='177')
            print(night_wind.contents[1].string)  # wind direction
            print(night_wind.contents[3].string)  # wind force
            print('最低温度' + tr.find('td', width='86').string)  # daily low
            print('*****************')
---------------------------------------------------------------------------------------------
添加一下运用xpath爬取的方法:
import json
import requests
from lxml import etree
def get_url(url):
    """Yield absolute URLs of every region page linked from the nav list at *url*."""
    response = requests.get(url=url)
    tree = etree.HTML(response.text)
    # The nav hrefs are site-relative; prefix the host to make them absolute.
    for href in tree.xpath('//ul[@class="lq_contentboxTab2"]//a/@href'):
        yield 'http://www.weather.com.cn' + href
def get_html(url):
    """Fetch *url* and return its body decoded as UTF-8 text."""
    response = requests.get(url=url)
    # Force UTF-8: the site serves UTF-8 but requests may guess another charset.
    response.encoding = 'utf-8'
    return response.text
def parse_html(html):
    """Parse one weather text-forecast page into a list of per-city dicts.

    Each dict carries the city name, weather description, day/night wind,
    high and low temperatures, forecast date, and the province name.
    The original shadowed ``html`` at three nesting levels and computed a
    dead ``htmls = html.xpath('//table')`` that was immediately overwritten;
    both are fixed here without changing the extracted data.
    """
    tree = etree.HTML(html)
    records = []
    for table in tree.xpath('//table'):
        data_rows = table.xpath('./tr')[3:]  # first three rows are headers
        # The date cell reads like "8日白天"; strip the "白天" suffix.
        date_time = table.xpath('./tr[1]/td[3]/text()')[0].replace('白天', '')
        # The province name appears only in the first data row of each table.
        provice = table.xpath('./tr[3]/td[1]/a/text()')[0]
        for row in data_rows:
            records.append({
                'city': row.xpath('./td[1]/a/text()')[0],
                'describe': row.xpath('./td[2]/text()')[0],
                'd_wind': row.xpath('./td[3]/span[1]/text()')[0],
                's_wind': row.xpath('./td[3]/span[2]/text()')[0],
                'h_tem': row.xpath('./td[4]/text()')[0],
                'l_tem': row.xpath('./td[7]/text()')[0],
                'date': date_time,
                'provice': provice,  # (sic) key spelling kept for compatibility
            })
    return records
def go():
    """Crawl every region page and dump all city records to tianqi.json."""
    url = 'http://www.weather.com.cn/textFC/hb.shtml'
    # Open the output once (the original reopened it for every record) and pin
    # the encoding so the Chinese text is written correctly on any platform.
    with open('tianqi.json', 'a+', encoding='utf-8') as fb:
        for page_url in get_url(url):
            for record in parse_html(get_html(page_url)):
                print(record)
                json.dump(record, fb, ensure_ascii=False)
                # One object per line (JSON Lines) keeps the file parseable;
                # the original concatenated objects with no separator at all.
                fb.write('\n')
if __name__ == "__main__":
    go()