1. Parse HTML and display it in a friendly form: BeautifulSoup(html_doc, 'html.parser'), then print(soup.prettify())
html_doc = """
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())
2. Navigating the structure:
soup.title                 # the <title> tag
soup.title.name            # the tag's name: 'title'
soup.title.string          # the text inside the title tag: "The Dormouse's story"
soup.title.parent.name     # the parent tag's name: 'head'
soup.p                     # the first <p> tag
soup.p['class']            # the class attribute of the first <p> tag
soup.a                     # the first <a> tag
soup.find_all('a')         # all <a> tags, returned as a list
soup.find(id="link3")      # find by attribute
for link in soup.find_all('a'):
    print(link.get('href'))
# http://example.com/elsie
# http://example.com/lacie
# http://example.com/tillie
print(soup.get_text())     # the document's text content, without any tags
3. Installing optional parsers:
pip install lxml
pip install html5lib
4. Available parsers:
BeautifulSoup(markup, "html.parser")
BeautifulSoup(markup, "lxml")
BeautifulSoup(markup, "html5lib")
5. Working with tags:
soup = BeautifulSoup('<b class="boldest">Extremely bold</b>', 'html.parser')
tag = soup.b
tag.name                   # 'b'
tag.name = "blockquote"    # rename the tag
tag.string                 # 'Extremely bold'
tag.string.replace_with("No longer bold")
tag['class']               # read an attribute
tag.attrs                  # all attributes as a dict
tag['class'] = 'verybold'  # set attributes
tag['id'] = 1
del tag['class']           # delete attributes
del tag['id']
6. tag.contents returns the child nodes as a list.
A tag's .children generator lets you loop over its direct children:
title_tag = soup.title
for child in title_tag.children:
    print(child)
The .descendants attribute loops recursively over all of a tag's descendants:
head_tag = soup.head
for child in head_tag.descendants:
    print(child)
7. Loop over all text content in the document, without tags:
for string in soup.strings:
    print(repr(string))
To strip the whitespace:
for string in soup.stripped_strings:
    print(repr(string))
8. .parent gets the parent node
.parents iterates over all of a node's ancestors
.next_sibling / .previous_sibling get the sibling nodes
.next_element and .previous_element point to the object parsed immediately after/before this one (a sketch follows below)
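A minimal sketch of these navigation attributes, reusing the soup built from html_doc in section 1:
link = soup.find(id="link1")
print(link.parent.name)                 # 'p' -- the enclosing paragraph
print([p.name for p in link.parents])   # ['p', 'body', 'html', '[document]']
print(repr(link.next_sibling))          # ',\n' -- the text node between the first two links
print(repr(link.next_element))          # 'Elsie' -- the string parsed right after the <a> tag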
9. find / find_all
Using a regular expression:
import re
for tag in soup.find_all(re.compile("^b")):
    print(tag.name)
# body
# b
Using a list:
soup.find_all(["a", "b"])
tag.has_attr('id')                                   # test whether an attribute exists
soup.find_all(href=re.compile("elsie"), id='link1')  # combine keyword filters
data_soup.find_all(attrs={"data-foo": "value"})      # for attribute names that aren't valid keywords
soup.find_all("a", class_="sister")                  # search by CSS class (class_ avoids the reserved word)
soup.find_all(string="Elsie")                        # search by string content
soup.find_all("a", limit=2)                          # return at most 2 results
soup.html.find_all("title", recursive=False)         # check direct children only
The same arguments also work with these variants (sketch below):
find_parents() and find_parent()
find_next_siblings() and find_next_sibling()
find_previous_siblings() and find_previous_sibling()
find_all_next() and find_next()
find_all_previous() and find_previous()
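A minimal sketch of these tree-searching variants, again on the soup from section 1:
first_link = soup.a                        # <a id="link1">Elsie</a>
first_link.find_parent("p")                # the enclosing <p class="story">
first_link.find_next_sibling("a")          # <a id="link2">Lacie</a>
first_link.find_all_next("a")              # [link2, link3] -- all <a> tags parsed after this one
soup.find(id="link3").find_previous("p")   # the <p class="story"> parsed before link3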
Searching with CSS selectors:
soup.select("p:nth-of-type(3)")
# [<p class="story">...</p>]
soup.select("body a")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("html head title")
# [<title>The Dormouse's story</title>]
soup.select("body > a")   # > matches direct children only; returns [] here, since the <a> tags sit inside <p>
Sibling combinators:
soup.select("#link1 ~ .sister")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("#link1 + .sister")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
Find by CSS class: .classname
soup.select(".sister")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
#  <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select("[class~=sister]")
# (the same three <a> tags as above)
Find by ID:
soup.select("#link1")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup.select("a#link2")
# [<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
soup.select("#link1,#link2")
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>,
#  <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]
Find by attribute presence:
soup.select('a[href]')
# (all three <a> tags, as above)
Find by attribute value:
soup.select('a[href="http://example.com/elsie"]')
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
soup.select('a[href^="http://example.com/"]')   # value starts with ...
# (all three <a> tags)
soup.select('a[href$="tillie"]')                # value ends with ...
# [<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]
soup.select('a[href*=".com/el"]')               # value contains ...
# [<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>]
Find only the first match:
soup.select_one(".sister")
# <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
10. append() adds content to a tag:
soup = BeautifulSoup("<a>Foo</a>", 'html.parser')
soup.a.append("Bar")
soup
# <a>FooBar</a>
soup.a.contents
# ['Foo', 'Bar']
insert() adds content at a numeric position:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
tag = soup.a
tag.insert(1, "but did not endorse ")
tag
# <a href="http://example.com/">I linked to but did not endorse <i>example.com</i></a>
tag.contents
# ['I linked to ', 'but did not endorse ', <i>example.com</i>]
soup = BeautifulSoup("stop")
tag = soup.new_tag("i")
tag.string = "Don't"
soup.b.string.insert_before(tag)
soup.b
Don'tstop
soup.b.i.insert_after(soup.new_string(" ever "))
soup.b
Don't ever stop
soup.b.contents
[Don't, u' ever ', u'stop']
clear() removes a tag's contents:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
tag = soup.a
tag.clear()
tag
# <a href="http://example.com/"></a>
extract() removes an element from the tree and returns it:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
i_tag = soup.i.extract()
a_tag
# <a href="http://example.com/">I linked to </a>
i_tag
# <i>example.com</i>
print(i_tag.parent)
# None
decompose() also removes an element, destroying it completely:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
soup.i.decompose()
a_tag
# <a href="http://example.com/">I linked to </a>
replace_with() substitutes one element for another:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
new_tag = soup.new_tag("b")
new_tag.string = "example.net"
a_tag.i.replace_with(new_tag)
a_tag
# <a href="http://example.com/">I linked to <b>example.net</b></a>
wrap() wraps an element in a new tag:
soup = BeautifulSoup("<p>I wish I was bold.</p>", 'html.parser')
soup.p.string.wrap(soup.new_tag("b"))
# <b>I wish I was bold.</b>
soup.p.wrap(soup.new_tag("div"))
# <div><p><b>I wish I was bold.</b></p></div>
unwrap() is the opposite: it replaces a tag with its contents:
markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
soup = BeautifulSoup(markup, 'html.parser')
a_tag = soup.a
a_tag.i.unwrap()
a_tag
# <a href="http://example.com/">I linked to example.com</a>
prettify() formats the output; an output encoding can be specified (sketch below).
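A minimal sketch, reusing the markup string from the unwrap() example above; my understanding is that passing an encoding makes prettify() return bytes rather than str:
soup = BeautifulSoup(markup, 'html.parser')
print(soup.prettify())    # str: one tag per line, indented
soup.prettify("latin-1")  # bytes in the requested encoding (assumed behavior)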
get_text() returns the document's text content; a separator can be specified:
soup.get_text("|")
# '\nI linked to |example.com|\n'
If the document's encoding is unknown, UnicodeDammit can detect it automatically (note it takes bytes):
from bs4 import UnicodeDammit
dammit = UnicodeDammit(b"Sacr\xc3\xa9 bleu!")
print(dammit.unicode_markup)
# Sacré bleu!
dammit.original_encoding
# 'utf-8'
11. lxml parses faster than the alternatives
Beautiful Soup can never parse a document faster than the parser it sits on top of. If parsing time is critical, or computer time is worth more than programmer time, use lxml directly.
Put another way, one easy win for Beautiful Soup performance is to use lxml as the parser: Beautiful Soup with lxml is much faster than with html5lib or Python's built-in parser (rough timing sketch below).
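A rough timing sketch using the standard-library timeit module; absolute numbers depend on the document and installed versions, and lxml/html5lib must be installed (section 3):
import timeit

setup = "from bs4 import BeautifulSoup; markup = '<p>text</p>' * 1000"
for parser in ("html.parser", "lxml", "html5lib"):
    t = timeit.timeit(f"BeautifulSoup(markup, '{parser}')", setup=setup, number=100)
    print(parser, round(t, 3))  # lxml is typically the fastest of the three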
https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh/