2019-06-25——BeautifulSoup4

pip install bs4
pip install lxml（lxml 解析器底层使用 C 语言库，解析速度快）

find_all和find找

image.png

image.png

image.png

image.png

image.png

image.png

find_all找所有，find找第一个

获得标签属性

image.png

获得标签下的文字

css选择器

image.png

select找

image.png

image.png

标签内容有多行（包含多个子节点）时，.string 获取不到内容（返回 None），要用 .contents

image.png

image.png

爬取天气预报

image.png

image.png

image.png

image.png

pip install html5lib，这个解析器能自动补全不完整的 html 标签，但是速度没有 lxml 快

完整代码

import requests
from bs4 import BeautifulSoup
from pyecharts.charts import Bar

# Shared accumulator: parse_page appends one {'city': ..., 'temp': ...} dict
# per table row here, and sorts the list in place after every page.
ALL_DATA = []

def parse_page(url):
    """Scrape one regional forecast page and refresh the top-10 chart.

    Downloads ``url``, walks every ``<table>`` inside the
    ``div.conMidtab`` element and appends one
    ``{'city': ..., 'temp': ...}`` record per data row to the module-level
    ``ALL_DATA`` list. Afterwards ``ALL_DATA`` is sorted in place by
    temperature (highest first) and the ten leading entries are rendered
    as a bar chart to ``temperture.html``. The chart is therefore
    rewritten on every call and reflects all pages parsed so far.

    Args:
        url: Address of the forecast page to fetch.

    Raises:
        AttributeError: If the page contains no ``div.conMidtab`` element.
        ValueError: If a temperature cell does not hold an integer.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    text = response.content.decode('utf-8')
    # html5lib is used because it fills in incomplete HTML tags.
    soup = BeautifulSoup(text, 'html5lib')
    conMidtab = soup.find('div', class_='conMidtab')
    for table in conMidtab.find_all('table'):
        # The first two rows of each table are skipped
        # (presumably header rows -- confirm against the page markup).
        rows = table.find_all('tr')[2:]
        for index, tr in enumerate(rows):
            tds = tr.find_all('td')
            # The first data row of a table is assumed to start with an
            # extra leading cell (the province name), which pushes the city
            # one cell to the right. The original code assigned this cell to
            # a misspelled, never-read variable, so the special case had no
            # effect.
            city_td = tds[1] if index == 0 else tds[0]
            # Fifth cell from the end; presumably the daytime high.
            high_temp_td = tds[-5]
            city = list(city_td.stripped_strings)[0]
            temp = list(high_temp_td.stripped_strings)[0]
            ALL_DATA.append({'city': city, 'temp': int(temp)})

    ALL_DATA.sort(key=lambda data: data['temp'], reverse=True)
    top_ten = ALL_DATA[:10]
    cities = [item['city'] for item in top_ten]
    temps = [item['temp'] for item in top_ten]

    bar = Bar()
    bar.add_xaxis(cities)
    bar.add_yaxis("高温城市TOP10", temps)
    bar.render('temperture.html')

def main():
    """Run ``parse_page`` once for each regional forecast page, in order."""
    region_pages = (
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml',
    )

    for page in region_pages:
        parse_page(page)

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()