Beautiful Soup is a Python library for extracting data from HTML and XML files. Working with the parser of your choice, it gives you idiomatic ways to navigate, search, and modify the parse tree.
Official documentation: Beautiful Soup 4.4.0 documentation.
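Before the full example below, here is a minimal sketch of the basic workflow; the markup, tag names, and URL in it are made up purely for illustration:
# pip install beautifulsoup4 lxml
from bs4 import BeautifulSoup

doc = '<html><body><p class="intro">Hello, <a href="https://example.com">world</a></p></body></html>'
soup = BeautifulSoup(doc, 'lxml')       # parse the string with the lxml parser
print(soup.p['class'])                  # ['intro']  -- attribute access on a tag
print(soup.find('a')['href'])           # https://example.com
print(soup.get_text())                  # Hello, world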
from bs4 import BeautifulSoup
html = """
<table>
    <tr><td>职位名称</td><td>职位类别</td><td>人数</td><td>地点</td><td>发布时间</td></tr>
    <tr><td>22989-金融云区块链高级研发工程师(深圳)</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-25</td></tr>
    <tr><td>22989-金融云高级后台开发</td><td>技术类</td><td>2</td><td>深圳</td><td>2017-11-25</td></tr>
    <tr><td>SNG16-腾讯音乐运营开发工程师(深圳)</td><td>技术类</td><td>2</td><td>深圳</td><td>2017-11-25</td></tr>
    <tr><td>SNG16-腾讯音乐业务运维工程师(深圳)</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-25</td></tr>
    <tr><td>TEG03-高级研发工程师(深圳)</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
    <tr><td>TEG03-高级图像算法研发工程师(深圳)</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
    <tr><td>TEG11-高级AI开发工程师(深圳)</td><td>技术类</td><td>4</td><td>深圳</td><td>2017-11-24</td></tr>
    <tr><td>15851-后台开发工程师</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
    <tr><td>15851-后台开发工程师</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
    <tr><td>SNG11-高级业务运维工程师(深圳)</td><td>技术类</td><td>1</td><td>深圳</td><td>2017-11-24</td></tr>
</table>
"""
soup = BeautifulSoup(html,'lxml')
#soup = BeautifulSoup(open('tencent.html',encoding='utf-8'),'lxml')
# print(soup)
print(soup.prettify())
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>中南民族大学官网</title>
</head>
<body>
<a href='https://www.scuec.edu.cn/'>中南民族大学官网</a>
<img src="https://img2.baidu.com/it/u=4158369516,1500848430&fm=26&fmt=auto&gp=0.jpg?qq-pf-to=pcqq.c2c">
</body>
</html>
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('tencent.html',encoding='utf-8'),'lxml')
#print(soup)
# Get all tr tags
trs = soup.find_all('tr')
print(trs)
for tr in trs:
    print('='*50)
    print(tr)
# Get the second tr tag
tr = soup.find_all('tr', limit=2)[1]
print(tr)
# Get all tr tags whose class is "even"
# trs = soup.find_all('tr', attrs={'class': 'even'})
trs = soup.find_all('tr', class_='even')
# print(trs)
for tr in trs:
    print('='*50)
    print(tr)
# Get all a tags whose id is "test" and whose class is "test"
tag_as = soup.find_all('a', attrs={'id': 'test', 'class': 'test'})
# tag_as = soup.find_all('tr', attrs={'id': 'test', 'class': 'test'})
for t in tag_as:
    print('='*50)
    print(t)
# Get the href attribute of every a tag
alist = soup.find_all('a')
for a in alist:
    href = a['href']
    href = 'http://' + href
    print(href)
# Get the plain text of every job posting
trs = soup.find_all('tr')[1:]   # skip the header row
movies = []
for tr in trs:
    movie = {}
    # Alternative: read each td and use .string
    # tds = tr.find_all('td')
    # print(tds)
    # title = tds[0].string
    # category = tds[1].string
    # number = tds[2].string
    # city = tds[3].string
    # pubtime = tds[4].string
    # movie['title'] = title
    # movie['category'] = category
    # movie['number'] = number
    # movie['city'] = city
    # movie['pubtime'] = pubtime
    # print(movie)
    # movies.append(movie)
    infos = list(tr.stripped_strings)   # every non-whitespace string inside this row
    # print(infos)
    movie['title'] = infos[0]
    movie['category'] = infos[1]
    movie['number'] = infos[2]
    movie['city'] = infos[3]
    movie['pubtime'] = infos[4]
    # print(movie)
    movies.append(movie)
# print(movies)
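A quick note on the two approaches above: tag.string only returns a value when the tag has exactly one string inside it (it is None when the tag contains several children), while tag.stripped_strings iterates over every text fragment in the tag with surrounding whitespace removed. A small illustrative sketch, using the built-in html.parser so the bare table fragments are kept as written:
from bs4 import BeautifulSoup

cell = BeautifulSoup('<td><a href="#">后台开发</a></td>', 'html.parser').td
print(cell.string)                  # 后台开发 -- a single string, reachable even through the nested a tag
row = BeautifulSoup('<tr><td>后台开发</td><td>深圳</td></tr>', 'html.parser').tr
print(row.string)                   # None -- the tr has more than one child, so .string gives up
print(list(row.stripped_strings))   # ['后台开发', '深圳']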
from bs4 import BeautifulSoup
soup = BeautifulSoup(open('tencent.html',encoding='utf-8'),'lxml')
# select returns a list of all matching tags; select_one returns only the first match
# print(soup)
# Get all tr tags
trs = soup.select('tr')
print(trs)
for tr in trs:
    print('='*50)
    print(tr)
# Get all tr tags whose class is "even"
trs = soup.select('tr[class="even"]')
print(trs)
for tr in trs:
    print('='*50)
    print(tr)
# Get all a tags whose id is "test" and whose class is "test"
al = soup.select('a[class="test"][id="test"]')
print(al)
# Get the href attribute of every a tag
alist = soup.select('a')
for a in alist:
    href = a['href']
    href = 'http://' + href
    print(href)
# Get the plain text of every job posting
trs = soup.select('tr')[1:]   # skip the header row, as in the find_all version
movies = []
for tr in trs:
    movie = {}
    # print(tr)
    infos = list(tr.stripped_strings)
    print(infos)
    movie['title'] = infos[0]
    movie['category'] = infos[1]
    movie['number'] = infos[2]
    movie['city'] = infos[3]
    movie['pubtime'] = infos[4]
    movies.append(movie)
print(movies)
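The attribute-selector syntax used above also has the usual CSS shorthands: .even matches by class, #test matches by id, and select_one is handy when only the first match is needed. A short sketch of equivalent queries against the same soup:
even_rows = soup.select('tr.even')          # same result as soup.select('tr[class="even"]')
test_link = soup.select_one('a#test.test')  # first a tag with id="test" and class="test", or None
first_row = soup.select_one('tr')           # first tr in the document
print(even_rows, test_link, first_row)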
import requests
from bs4 import BeautifulSoup
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Referer": "http://www.weather.com.cn",
}
ALL_DATA = []
def parse_weather(url):
    response = requests.get(url, headers=headers)
    data = response.content.decode('utf-8')
    # html5lib is slower than lxml but far more tolerant of the broken markup on some of these pages
    soup = BeautifulSoup(data, 'html5lib')
    conMidtab = soup.find('div', class_='conMidtab')
    tables = conMidtab.find_all('table')
    for table in tables:
        trs = table.find_all('tr')[2:]          # skip the two header rows of each province table
        for index, tr in enumerate(trs):
            tds = tr.find_all('td')
            # print(index, tds)
            city_td = tds[0]
            temp_td = tds[3]
            if index == 0:
                # the first data row starts with an extra province cell, so shift one column to the right
                city_td = tds[1]
                temp_td = tds[4]
            city_td = list(city_td.stripped_strings)[0]
            temp_td = list(temp_td.stripped_strings)[0]
            print(city_td, temp_td)
            ALL_DATA.append({'city': city_td, 'temp': temp_td})
urls = [
    'http://www.weather.com.cn/textFC/hb.shtml',
    'http://www.weather.com.cn/textFC/hd.shtml',
    'http://www.weather.com.cn/textFC/hz.shtml',
    'http://www.weather.com.cn/textFC/hn.shtml',
    'http://www.weather.com.cn/textFC/xn.shtml',
    'http://www.weather.com.cn/textFC/xb.shtml',
    'http://www.weather.com.cn/textFC/db.shtml',
    'http://www.weather.com.cn/textFC/gat.shtml',
]
for url in urls:
    parse_weather(url)
print(ALL_DATA)
pyecharts official documentation: Quick Start - pyecharts.
from pyecharts.charts import Bar
bar = Bar()
bar.add_xaxis(["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"])
bar.add_yaxis("商家A", [5, 20, 36, 10, 75, 90])
# render writes a local HTML file; by default it creates render.html in the current directory
# a path can also be passed, e.g. bar.render("mycharts.html")
bar.render("mycharts.html")
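The chart can also be given a title and other chart-level options before rendering. The sketch below assumes the pyecharts v1 API (pyecharts.options and set_global_opts); the title text is just an illustrative choice:
from pyecharts import options as opts
from pyecharts.charts import Bar

bar = Bar()
bar.add_xaxis(["衬衫", "羊毛衫", "雪纺衫", "裤子", "高跟鞋", "袜子"])
bar.add_yaxis("商家A", [5, 20, 36, 10, 75, 90])
# set_global_opts configures chart-level options such as the title
bar.set_global_opts(title_opts=opts.TitleOpts(title="某商场销售情况"))
bar.render("mycharts_with_title.html")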
import requests
from bs4 import BeautifulSoup
from pyecharts.charts import Bar
urls = [
    'http://www.weather.com.cn/textFC/hb.shtml',
    'http://www.weather.com.cn/textFC/db.shtml',
    'http://www.weather.com.cn/textFC/hd.shtml',
    'http://www.weather.com.cn/textFC/hz.shtml',
    'http://www.weather.com.cn/textFC/hn.shtml',
    'http://www.weather.com.cn/textFC/xn.shtml',
    'http://www.weather.com.cn/textFC/xb.shtml',
    'http://www.weather.com.cn/textFC/gat.shtml',
]
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    'Referer': 'http://www.weather.com.cn',
}
ALL_DATA = []
def parse_weather(url):
    # print(url)
    response = requests.get(url, headers=headers)
    data = response.content.decode('utf-8')
    soup = BeautifulSoup(data, 'html5lib')
    conMidtab = soup.find('div', class_='conMidtab')
    # print(conMidtab)
    tables = conMidtab.find_all('table')
    # print(tables)
    for table in tables:
        trs = table.find_all('tr')[2:]
        for index, tr in enumerate(trs):
            tds = tr.find_all('td')
            # print(tds)
            city_td = tds[0]
            temp_td = tds[3]
            if index == 0:
                city_td = tds[1]  # skip the leading province cell in the first row
                temp_td = tds[4]
            city_td = list(city_td.stripped_strings)[0]
            # print(city_td, temp_td)
            # temp_td = tds[-2]
            temp_td = list(temp_td.stripped_strings)[0]
            # print(city_td, temp_td)
            ALL_DATA.append({'city': city_td, 'temp': temp_td})
#print(ALL_DATA)
for url in urls:
    parse_weather(url)
print(ALL_DATA)
ALL_DATA.sort(key=lambda x: int(x['temp']), reverse=True)  # temps were scraped as strings, so compare them as integers
data = ALL_DATA[0:10]
print(data)
cities = list(map(lambda x:x['city'],data))
max_temp = list(map(lambda x:x['temp'],data))
print(cities,max_temp)
bar = Bar()
bar.add_xaxis(cities)
bar.add_yaxis("最高气温", max_temp)
# render writes a local HTML file; by default it creates render.html in the current directory
# a path can also be passed, e.g. bar.render("mycharts.html")
bar.render("天气预报.html")
import requests
# after parsing, write the results to a CSV file
import csv
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
'Cookie': '_vwo_uuid_v2=DC80EBC21226283A1240342BE3480FE0A|c7fd601d031edcb8037bc15529ab56f6; gr_user_id=284e3253-d581-45f9-8fa4-98744c58c832; __utmv=30149280.6234; bid=ffka2svYxXA; ll="118254"; push_noty_num=0; push_doumail_num=0; douban-fav-remind=1; dbcl2="62342531:dY5ZmFspoS8"; __utmz=30149280.1624602715.35.9.utmcsr=accounts.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/; ck=qaUj; __utmc=30149280; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1626077642%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DC0Kk1oJojNi2sgqp5GxndVXB782gWb8vlrFb_10D7JmKbbqowJSZPE3-yozCL_MYOeLYx7v_dPUeg0y5gDnRPK%26wd%3D%26eqid%3Ddca96deb0031f01b0000000660caa793%22%5D; _pk_id.100001.8cb4=287231f71f55669d.1586610883.23.1626077642.1625994977.; _pk_ses.100001.8cb4=*; ap_v=0,6.0; __utma=30149280.1293880704.1583806417.1626052464.1626077642.40; __utmt=1; __utmb=30149280.3.9.1626077642'
}
file = open('豆瓣电影.csv', 'w', encoding='utf-8', newline='')  # create the output file
writer = csv.writer(file)  # create a CSV writer
writer.writerow(('影片名称', '评分', '海报'))  # header row
for x in range(10):
    # build the URL for each page of 20 results
    url = f'https://movie.douban.com/j/search_subjects?type=movie&tag=%E8%B1%86%E7%93%A3%E9%AB%98%E5%88%86&sort=recommend&page_limit=20&page_start={x*20}'
    res = requests.get(url, headers=headers)
    data = res.json()
    data_list = data.get('subjects')
    for data in data_list:
        title = data['title']
        rate = data['rate']
        cover = data['cover']
        movielist = [title, rate, cover]
        print(title, rate, cover)
        writer.writerow(movielist)
file.close()  # close the file so the buffered rows are flushed to disk
url = "https://job.alibaba.com/zhaopin/socialPositionList/doList.json"