BeautifulSoup是什么
一个灵活方便的网页解析库,处理高效,支持多种解析器
利用它不用编写正则表达式即可方便地实现网页信息的提取
安装
pip install beautifulsoup4
支持的解析库
解析器 | 使用方法 | 优势 | 劣势 |
---|---|---|---|
Python标准库 | BeautifulSoup(markup, "html.parser") | 内置库,速度一般,容错率不错 | python老版本容错率差 |
lxml HTML解析库 | BeautifulSoup(markup, "lxml") | 速度快,容错率强 | 需要安装C语言库 |
lxml XML解析库 | BeautifulSoup(markup, "xml") | 速度快,唯一支持的XML解析器 | 需要安装C语言库 |
html5lib | BeautifulSoup(markup, "html5lib") | 最好的容错性,以浏览器的方式解析文档,生成HTML5格式的文档 | 速度慢,需要依赖外部Python库(需额外安装) |
基本使用
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "lxml")
print(soup.prettify())
print(soup.title.string)
标签选择器
选择元素
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "lxml")
print(type(soup.title))
print(soup.title)
print(soup.head)
print(soup.p) # 匹配第一个结果
获取名称
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "lxml")
print(soup.title.name)
获取属性
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "lxml")
print(soup.p.attrs['name'])
print(soup.p['name'])
获取内容
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "lxml")
print(soup.p.string)
嵌套选择
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "lxml")
print(soup.head.title.string)
子节点 and 子孙节点
用 contents 以返回列表的形式获取所有子节点
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "lxml")
print(soup.p.contents)
用 children 以返回迭代器的形式获取所有子节点
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "lxml")
print(soup.p.children)
for i, child in enumerate(soup.p.children):
    print(i, child)
用 descendants 以返回生成器的形式获取所有子孙节点
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, "lxml")
print(soup.p.descendants)
for i, descendant in enumerate(soup.p.descendants):
    print(i, descendant)
父节点 and 祖先节点
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.a.parent)
print(list(soup.a.parents))
兄弟节点
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(list(enumerate(soup.a.next_siblings)))
print(list(enumerate(soup.a.previous_siblings)))
标准选择器
find_all(name, attrs, recursive, text, **kwargs)
可以根据标签、属性、内容查找文档
name
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all('li'))
print(type(soup.find_all('li')[0]))
attrs
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={'id': '...'}))
print(soup.find_all(attrs={'name': '...'}))
print(soup.find_all(id='...'))
print(soup.find_all(class_='...'))
text
根据文本内容进行匹配,返回的是匹配到的文本字符串,而不是包含该文本的标签
其他
find(name, attrs, recursive, text, **kwargs)
find_parents() and find_parent()
find_next_siblings() and find_next_sibling()
find_previous_siblings() and find_previous_sibling()
find_all_next() and find_next()
find_all_previous() and find_previous()
CSS选择器
select()
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
soup.select('.class_content .blabla2')
soup.select('tag_name li')
soup.select('#id_content .element')
获取属性
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
获取内容
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('li'):
    print(li.get_text())