Python · Data Collection · bs4 (Scraping, Part 2)


  • bs4
    • Extracting web pages with bs4
    • Inserting links and images in a web page
    • Parsing web pages with bs4
      • The find_all() method
      • The select() method
    • Scraping a weather forecast
  • pyecharts
    • Plotting the weather forecast (pyecharts)
    • Scraping Douban movies

bs4

Beautiful Soup is a Python library for extracting data from HTML or XML files. It lets you navigate, search, and modify the document in an idiomatic way through the parser of your choice.

Official documentation: Beautiful Soup 4.4.0 documentation.
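
As a quick taste of that navigation API, here is a minimal sketch (the HTML string and its contents are made up for illustration):

from bs4 import BeautifulSoup

html = "<html><head><title>demo</title></head><body><a href='https://www.scuec.edu.cn/'>中南民族大学官网</a></body></html>"
soup = BeautifulSoup(html, 'lxml')
print(soup.title)         # <title>demo</title>
print(soup.title.string)  # demo
print(soup.a['href'])     # https://www.scuec.edu.cn/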

Extracting web pages with bs4

from bs4 import BeautifulSoup

html = """
<table>
    <tr><th>职位名称</th><th>职位类别</th><th>人数</th><th>地点</th><th>发布时间</th></tr>
    <tr><td>22989-金融云区块链高级研发工程师(深圳)</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-25</td></tr>
    <tr><td>22989-金融云高级后台开发</td><td>技术类</td><td>2</td><td>深圳</td><td>2017-11-25</td></tr>
    <tr><td>SNG16-腾讯音乐运营开发工程师(深圳)</td><td>技术类</td><td>2</td><td>深圳</td><td>2017-11-25</td></tr>
    <tr><td>SNG16-腾讯音乐业务运维工程师(深圳)</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-25</td></tr>
    <tr><td>TEG03-高级研发工程师(深圳)</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
    <tr><td>TEG03-高级图像算法研发工程师(深圳)</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
    <tr><td>TEG11-高级AI开发工程师(深圳)</td><td>技术类</td><td>4</td><td>深圳</td><td>2017-11-24</td></tr>
    <tr><td>15851-后台开发工程师</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
    <tr><td>15851-后台开发工程师</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
    <tr><td>SNG11-高级业务运维工程师(深圳)</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
</table>
"""
soup = BeautifulSoup(html, 'lxml')
# soup = BeautifulSoup(open('tencent.html', encoding='utf-8'), 'lxml')
# print(soup)
print(soup.prettify())

Inserting links and images in a web page

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>中南民族大学官网</title>
</head>
<body>
    <a href='https://www.scuec.edu.cn/'>中南民族大学官网</a>
    <img src="https://img2.baidu.com/it/u=4158369516,1500848430&fm=26&fmt=auto&gp=0.jpg?qq-pf-to=pcqq.c2c">
</body>
</html>
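
To tie this page back to bs4, a minimal sketch that pulls the link target and the image address out of the HTML above (the file name scuec.html is only an assumption for this example):

from bs4 import BeautifulSoup

soup = BeautifulSoup(open('scuec.html', encoding='utf-8'), 'lxml')
print(soup.a['href'])   # https://www.scuec.edu.cn/
print(soup.img['src'])  # the image address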

Parsing web pages with bs4

The find_all() method

  1. The find_all() method searches all tag descendants of the current tag and checks whether each one matches the filter conditions.
  2. The string parameter can be used to search for string content in the document (a short sketch follows this list).
  3. find_all() returns every matching result, so the search can be slow if the document tree is large. If you do not need all of the
    results, the limit parameter caps the number returned.
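
A minimal sketch of the string and limit parameters described above, assuming the same tencent.html file used below:

from bs4 import BeautifulSoup

soup = BeautifulSoup(open('tencent.html', encoding='utf-8'), 'lxml')
# string matches text nodes instead of tags and returns the matching strings
print(soup.find_all(string='技术类', limit=3))
# limit caps the number of tags returned
print(soup.find_all('tr', limit=2))
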
from bs4 import BeautifulSoup

soup = BeautifulSoup(open('tencent.html',encoding='utf-8'),'lxml')
#print(soup)
# Get all <tr> tags
trs = soup.find_all('tr')
print(trs)
for tr in trs:
    print('='*50)
    print(tr)
# Get the second <tr> tag (limit=2 returns at most two results, index 1 is the second)
tr = soup.find_all('tr', limit=2)[1]
print(tr)
# Get all <tr> tags whose class is "even"
# trs = soup.find_all('tr', attrs={'class': 'even'})
trs = soup.find_all('tr', class_='even')
#print(trs)
for tr in trs:
    print('='*50)
    print(tr)
    
# Get all <a> tags whose id is "test" and class is "test"
tag_as = soup.find_all('a', attrs={'id': 'test', 'class': 'test'})
# tag_as = soup.find_all('tr', attrs={'id': 'test', 'class': 'test'})
for t in tag_as:
    print('='*50)
    print(t)

# Get the href attribute of every <a> tag
alist = soup.find_all('a')
for a in alist:
    href = a['href']
    href = 'http://' + href
    print(href)

# Get the plain text of every job posting (skip the header row)
trs = soup.find_all('tr')[1:]
movies = []
for tr in trs:
    movie = {}
    # Alternative: read each <td> individually
    # tds = tr.find_all('td')
    # title = tds[0].string
    # category = tds[1].string
    # number = tds[2].string
    # city = tds[3].string
    # pubtime = tds[4].string
    # movie['title'] = title
    # movie['category'] = category
    # movie['number'] = number
    # movie['city'] = city
    # movie['pubtime'] = pubtime
    # movies.append(movie)
    # Simpler: stripped_strings yields every non-empty text fragment in the row
    infos = list(tr.stripped_strings)
    # print(infos)
    movie['title'] = infos[0]
    movie['category'] = infos[1]
    movie['number'] = infos[2]
    movie['city'] = infos[3]
    movie['pubtime'] = infos[4]
    # print(movie)
    movies.append(movie)
# print(movies)

The select() method

from bs4 import BeautifulSoup

soup = BeautifulSoup(open('tencent.html',encoding='utf-8'),'lxml')
# select() returns a list of all matches; select_one() returns only the first match
#print(soup)
# Get all <tr> tags
trs = soup.select('tr')
print(trs)
for tr in trs:
    print('='*50)
    print(tr)

# Get all <tr> tags whose class is "even"
trs = soup.select('tr[class="even"]')  # the CSS shorthand 'tr.even' also works
print(trs)
for tr in trs:
    print('='*50)
    print(tr)

# Get all <a> tags whose id is "test" and class is "test"
al = soup.select('a[class="test"][id="test"]')
print(al)


# Get the href attribute of every <a> tag

alist = soup.select('a')
for a in alist:
    href = a['href']
    href = 'http://' + href
    print(href)

# Get the plain text of every job posting
trs = soup.select('tr')[1:]  # skip the header row
movies= []
for tr in trs:
    movie = {}
    #print(tr)
    infos = list(tr.stripped_strings)
    print(infos)
    movie['title'] = infos[0]
    movie['category'] = infos[1]
    movie['number'] = infos[2]
    movie['city'] = infos[3]
    movie['pubtime'] = infos[4]
    movies.append(movie)
print(movies)
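
The comment above also mentions select_one(), which returns only the first match instead of a list. A minimal sketch, again against tencent.html:

from bs4 import BeautifulSoup

soup = BeautifulSoup(open('tencent.html', encoding='utf-8'), 'lxml')
first_tr = soup.select_one('tr')        # only the first <tr>
even_links = soup.select('tr.even a')   # descendant selector: <a> tags inside class="even" rows
print(first_tr)
print(even_links)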

Scraping a weather forecast

import requests
from bs4 import BeautifulSoup

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Referer": "http://www.weather.com.cn",
}
ALL_DATA = []
def parse_weather(url):
    response  = requests.get(url,headers=headers)
    data = response.content.decode('utf-8')
    soup = BeautifulSoup(data,'html5lib')
    conMidtab = soup.find('div',class_='conMidtab')
    tables = conMidtab.find_all('table')
    for table in tables:
        trs = table.find_all('tr')[2:]  # skip the two header rows
        for index, tr in enumerate(trs):
            tds = tr.find_all('td')
            # print(index, tds)
            city_td = tds[0]
            temp_td = tds[3]
            if index == 0:
                # the first data row carries an extra province cell, so the columns shift right by one
                city_td = tds[1]
                temp_td = tds[4]
            city_td = list(city_td.stripped_strings)[0]

            temp_td = list(temp_td.stripped_strings)[0]
            print(city_td,temp_td)
            ALL_DATA.append({'city':city_td,'temp':temp_td})
urls = [
    'http://www.weather.com.cn/textFC/hb.shtml',
    'http://www.weather.com.cn/textFC/hd.shtml',
    'http://www.weather.com.cn/textFC/hz.shtml',
    'http://www.weather.com.cn/textFC/hn.shtml',
    'http://www.weather.com.cn/textFC/xn.shtml',
    'http://www.weather.com.cn/textFC/xb.shtml',
    'http://www.weather.com.cn/textFC/db.shtml',
    'http://www.weather.com.cn/textFC/gat.shtml',
]

for url in urls:
    parse_weather(url)

print(ALL_DATA)
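
If the scraped data should be kept rather than just printed, a minimal sketch that writes ALL_DATA to a CSV file with the standard csv module (the file name weather.csv is just an example):

import csv

with open('weather.csv', 'w', encoding='utf-8', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=['city', 'temp'])
    writer.writeheader()
    writer.writerows(ALL_DATA)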

pyecharts

Plotting the weather forecast (pyecharts)

Link to the official pyecharts documentation: Quickstart - pyecharts.

from pyecharts.charts import Bar
bar = Bar()
bar.add_xaxis(["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"])
bar.add_yaxis("商家A", [5, 20, 36, 10, 75, 90])
# render() writes a local HTML file; by default it creates render.html in the current directory
# a path can also be passed in, e.g. bar.render("mycharts.html")
bar.render("mycharts.html")

[Figure 1: the bar chart rendered to mycharts.html]

import requests
from bs4 import BeautifulSoup
from pyecharts.charts import Bar

urls = [
    'http://www.weather.com.cn/textFC/hb.shtml',
    'http://www.weather.com.cn/textFC/db.shtml',
    'http://www.weather.com.cn/textFC/hd.shtml',
    'http://www.weather.com.cn/textFC/hz.shtml',
    'http://www.weather.com.cn/textFC/hn.shtml',
    'http://www.weather.com.cn/textFC/xn.shtml',
    'http://www.weather.com.cn/textFC/xb.shtml',
    'http://www.weather.com.cn/textFC/gat.shtml',
]

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    'Referer': 'http://www.weather.com.cn',
}
ALL_DATA = []
def parse_weather(url):
    #print(url)
    response = requests.get(url,headers=headers)
    data= response.content.decode('utf-8')
    soup=BeautifulSoup(data,'html5lib')
    conMidtab=soup.find('div',class_='conMidtab')
    #print(conMidtab)
    tables = conMidtab.find_all('table')
    #print(tables)
    for table in tables:
        trs = table.find_all('tr')[2:]
        for index,tr in enumerate(trs):
            tds = tr.find_all('td')
            #print(tds)
            city_td = tds[0]
            temp_td = tds[3]
            if index == 0:
                city_td = tds[1]  # the first row carries an extra province cell; shift right by one
                temp_td = tds[4]
            city_td = list(city_td.stripped_strings)[0]
            #print(city_td,temp_td)
            #temp_td = tds[-2]
            temp_td = list(temp_td.stripped_strings)[0]
            #print(city_td,temp_td)
            ALL_DATA.append({'city':city_td,'temp':temp_td})
            #print(ALL_DATA)

for url in urls:
    parse_weather(url)
print(ALL_DATA)
ALL_DATA.sort(key=lambda x: int(x['temp']), reverse=True)  # temps were scraped as strings, so compare them numerically
data = ALL_DATA[0:10]
print(data)
cities = list(map(lambda x:x['city'],data))
max_temp = list(map(lambda x:x['temp'],data))
print(cities,max_temp)

bar = Bar()
bar.add_xaxis(cities)
bar.add_yaxis("最高气温", max_temp)
# render() writes a local HTML file; by default it creates render.html in the current directory
# a path can also be passed in, e.g. bar.render("mycharts.html")
bar.render("天气预报.html")

[Figure 2: bar chart of the ten highest temperatures, rendered to 天气预报.html]

Scraping Douban movies

import requests
# After parsing, write the results to a CSV file
import csv
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Cookie': '_vwo_uuid_v2=DC80EBC21226283A1240342BE3480FE0A|c7fd601d031edcb8037bc15529ab56f6; gr_user_id=284e3253-d581-45f9-8fa4-98744c58c832; __utmv=30149280.6234; bid=ffka2svYxXA; ll="118254"; push_noty_num=0; push_doumail_num=0; douban-fav-remind=1; dbcl2="62342531:dY5ZmFspoS8"; __utmz=30149280.1624602715.35.9.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ck=qaUj; __utmc=30149280; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1626077642%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DC0Kk1oJojNi2sgqp5GxndVXB782gWb8vlrFb_10D7JmKbbqowJSZPE3-yozCL_MYOeLYx7v_dPUeg0y5gDnRPK%26wd%3D%26eqid%3Ddca96deb0031f01b0000000660caa793%22%5D; _pk_id.100001.8cb4=287231f71f55669d.1586610883.23.1626077642.1625994977.; _pk_ses.100001.8cb4=*; ap_v=0,6.0; __utma=30149280.1293880704.1583806417.1626052464.1626077642.40; __utmt=1; __utmb=30149280.3.9.1626077642'
}
file = open('豆瓣电影.csv', 'w', encoding='utf-8', newline='')  # create the output file
writer = csv.writer(file)  # create a CSV writer
writer.writerow(('影片名称', '评分', '海报'))  # header row
for x in range(10):
    # Build the URL for this page of results (20 items per page)
    url = f'https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=recommend&page_limit=20&page_start={x*20}'
    res = requests.get(url,headers=headers)
    data = res.json()
    data_list = data.get('subjects')
    for data in data_list:
        title = data['title']
        rate = data['rate']
        cover = data['cover']
        movielist = [title, rate, cover]
        print(title, rate, cover)
        writer.writerow(movielist)
file.close()

url = "https://job.alibaba.com/zhaopin/socialPositionList/doList.json"
