Beautiful Soup是一个可以从HTML或XML中提取数据的python库,了解了HTML或XML的结构,能很方便地获取数据
这里使用的是 beautifulsoup4 以及 lxml 作为解析器
pip install beautifulsoup4
pip install lxml
下面一段HTML代码将作为例子被多次用到
html_doc = '''
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
'''
使用 BeautifulSoup() 解析html代码,能够得到一个 BeautifulSoup 对象,并能按照标准的缩进格式的结构输出
BeautifulSoup(doc, 解析器 [, 解析方式])
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.prettify())
#
#
#
# The Dormouse's story
#
#
#
#
#
# The Dormouse's story
#
#
#
# Once upon a time there were three little sisters; and their names were
#
# Elsie
#
# ,
#
# Lacie
#
# and
#
# Tillie
#
# ; and they lived at the bottom of a well.
#
#
# ...
#
#
#
Beautiful Soup将HTML文档转换成复杂的树结构,每个节点都是python对象,可分为4类:Tag、NavigableString、BeautifulSoup、Comment
soup = BeautifulSoup('<p class="boldest">Extremely bold</p>')
tag = soup.p
type(tag) # <class 'bs4.element.Tag'>
Tag的属性:
print(tag.name) # u'p'
tag.name = 'span'
# <span class="boldest">Extremely bold</span>
# 获取
print(tag['class']) # u'boldest'
print(tag.attrs['class']) # u'boldest'
# 添加、修改
tag['class'] = 'verybold'
tag['id'] = 1
tag
# <span class="verybold" id="1">Extremely bold</span>
# 删除
del tag['class']
del tag['id']
tag
# <span>Extremely bold</span>
# 获取不存在
tag['class']
# KeyError: 'class'
print(tag.get('class'))
# None
# 获取多值属性时,返回类型是list
css_soup = BeautifulSoup('<p class="body strikeout"></p>')
css_soup.p['class']
# ["body", "strikeout"]
# 获取不是多值属性时,作为字符串返回
id_soup = BeautifulSoup('<p id="my id"></p>')
id_soup.p['id']
# 'my id'
# 修改多值属性时,赋值list,会将多个属性值合并为一个值
rel_soup = BeautifulSoup('<p>Back to the <a rel="index">homepage</a></p>')
rel_soup.a['rel']
# ['index']
rel_soup.a['rel'] = ['index', 'contents']
print(rel_soup.p)
# <p>Back to the <a rel="index contents">homepage</a></p>
# xml不包含多值属性
xml_soup = BeautifulSoup('<p class="body strikeout"></p>', 'xml')
xml_soup.p['class']
# u'body strikeout'
soup = BeautifulSoup('<p class="boldest">Extremely bold</p>')
tag = soup.p
tag.string # u'Extremely bold'
type(tag.string) # <class 'bs4.element.NavigableString'>
unicode_string = unicode(tag.string) # u'Extremely bold'
type(unicode_string) # <type 'unicode'>  (Python 2;Python 3 中直接使用 str)
tag.string.replace_with('hello world')
tag # <p class="boldest">hello world</p>
注意:NavigableString类仅包含字符串,不包含其他内容(如tag)
markup = ""
soup = BeautifulSoup(markup)
comment = soup.b.string
type(comment)
# <class 'bs4.element.Comment'>
一个 tag 可能包含多个字符串或其他的 tag,其都是 tag 的子节点;而字符串没有子节点
from bs4 import BeautifulSoup
soup = BeautifulSoup(html_doc)
soup.a
# Elsie
head_tag = soup.head
head_tag.contents
# [The Dormouse's story ]
通过 tag的 .children 生成器,可以对子节点进行循环
for child in head_tag.contents[0].children:
print(child) # The Dormouse's story
.descendants 对所有tag的子孙节点进行递归循环
for child in head_tag.descendants:
print(child)
# The Dormouse's story
# The Dormouse's story
head_tag.contents[0].string # u'The Dormouse's story'
head_tag.string # u'The Dormouse's story'
soup.html.string # None
循环获取tag中包含的所有字符串(包括空格和空行)
for string in soup.strings:
print(repr(string))
# u"The Dormouse's story"
# u'\n\n'
# u"The Dormouse's story"
# u'\n\n'
# u'Once upon a time there were three little sisters; and their names were\n'
# u'Elsie'
# u',\n'
# u'Lacie'
# u' and\n'
# u'Tillie'
# u';\nand they lived at the bottom of a well.'
# u'\n\n'
# u'...'
# u'\n'
循环获取tag中包含的所有字符串(全部是空格的行被忽略,段首、段末的空白被删除)
for string in soup.stripped_strings:
print(repr(string))
# u"The Dormouse's story"
# u"The Dormouse's story"
# u'Once upon a time there were three little sisters; and their names were'
# u'Elsie'
# u','
# u'Lacie'
# u'and'
# u'Tillie'
# u';\nand they lived at the bottom of a well.'
# u'...'
soup.title.parent # The Dormouse's story
soup.title.string.parent # The Dormouse's story
type(soup.html.parent) # <class 'bs4.BeautifulSoup'>
soup.parent # None
link = soup.a
for parent in link.parents:
if parent is None:
print(parent)
else:
print(parent.name)
# p
# body
# html
# [document]
# None
同一节点下的所有子节点互为兄弟节点(不一定是同一类标签,可以是tag或字符串)
sibling_soup = BeautifulSoup("<a><b>text1</b><c>text2</c></a>")
sibling_soup.b.next_sibling # <c>text2</c>
sibling_soup.c.next_sibling # None
sibling_soup.c.previous_sibling # <b>text1</b>
sibling_soup.b.previous_sibling # None
迭代获取当前节点的兄弟节点
last_a_tag = soup.find("a", id="link3") # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
last_a_tag.next_sibling # '; and they lived at the bottom of a well.'
last_a_tag.next_element # u'Tillie'
last_a_tag.previous_element # u' and\n'
for element in last_a_tag.next_elements:
print(repr(element))
# u'Tillie'
# u';\nand they lived at the bottom of a well.'
# u'\n\n'
# ...
# u'...'
# u'\n'
# None
Beautiful Soup定义了很多搜索方法,主要介绍 find() 和 find_all() ,其他方法与其类似
过滤器可被用于 tag 的 name 、节点属性、字符串或其混合中
soup.find_all('b')
# [The Dormouse's story]
import re
for tag in soup.find_all(re.compile("^b")):
print(tag.name)
# body
# b
传入列表参数,返回与列表中任一元素匹配的内容
soup.find_all(["a", "b"])
# [The Dormouse's story,
# Elsie,
# Lacie,
# Tillie]
传入True,则匹配所有tag,不包括字符串节点
for tag in soup.find_all(True):
print(tag.name)
# html
# head
# title
# body
# p
# b
# p
# a
# a
# a
# p
传入方法,方法被定义为只接受一个元素参数,若方法返回True则当前元素被匹配并且被找到,否则返回False
def has_class_but_no_id(tag):
    """Filter for find_all(): match tags that carry a 'class' attribute but lack an 'id'."""
    if not tag.has_attr('class'):
        return False
    return not tag.has_attr('id')
soup.find_all(has_class_but_no_id)
# [The Dormouse's story
,
# Once upon a time there were...
,
# ...
]
soup.find_all("title")
# [The Dormouse's story ]
soup.find_all(href=re.compile("elsie"))
# [Elsie]
data_soup = BeautifulSoup('foo!')
data_soup.find_all(attrs={"data-foo": "value"})
# [foo!]
css_soup = BeautifulSoup('')
css_soup.find_all("p", class_="strikeout")
# []
css_soup.find_all("p", class_="body")
# []
css_soup.find_all("p", class_="body strikeout")
# []
soup.find_all(text=["Tillie", "Elsie", "Lacie"])
# [u'Elsie', u'Lacie', u'Tillie']
soup.find_all("a", text="Elsie")
# [Elsie]
限制返回的搜索结果的数量
soup.find_all("a", limit=2)
# [Elsie,
# Lacie]
ss = '''
The Dormouse's story
'''
soup.html.find_all("title")
# [The Dormouse's story ]
soup.html.find_all("title", recursive=False)
# []
像调用find_all()一样调用tag
# 以下两行代码是等价的
soup.title.find_all(text=True)
soup.title(text=True)
soup.find_all('title', limit=1)
# [The Dormouse's story ]
soup.find('title')
# The Dormouse's story
print(soup.find("nosuchtag"))
# None
a_string = soup.find(text="Lacie")
a_string
# u'Lacie'
a_string.find_parents("a")
# [Lacie]
a_string.find_parent("p")
# Once upon a time there were three little sisters; and their names were
# Elsie,
# Lacie and
# Tillie;
# and they lived at the bottom of a well.
first_link = soup.a
first_link
# Elsie
first_link.find_next_siblings("a")
# [Lacie,
# Tillie]
last_link = soup.find("a", id="link3")
last_link
# Tillie
last_link.find_previous_siblings("a")
# [Lacie,
# Elsie]
first_link = soup.a
first_link
# Elsie
first_link.find_all_next(text=True)
# [u'Elsie', u',\n', u'Lacie', u' and\n', u'Tillie',
# u';\nand they lived at the bottom of a well.', u'\n\n', u'...', u'\n']
与find_all_next()、find_next()类似,只是是查找当前 tag 之前的节点
soup.select("title") # 标签查找
soup.select("body a") # 标签逐层查找
soup.select("p > a:nth-of-type(2)") # 直接子标签
soup.select("#link1 + .sister") # 兄弟节点
soup.select(".sister") # 类名
soup.select("a#link2") # id
soup.select('a[href]') # 是否有某属性
soup.select('a[href$="tillie"]') # 属性值