基本使用
实例1:
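html = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><!-- Elsie --></a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""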
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.title)
print(type(soup.title))
print(soup.head)
print(soup.p)
#输出:
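<title>The Dormouse's story</title>
<class 'bs4.element.Tag'>
<head><title>The Dormouse's story</title></head>
<p class="title" name="dromouse"><b>The Dormouse's story</b></p>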
获取名称
html = """
The Dormouse's storyThe Dormouse's story
Once upon a time there were three little sisters; and their names were
,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.title.name)
#输出:
title #获取的是标签的名称
获取属性
html = """
The Dormouse's storyThe Dormouse's story
Once upon a time there were three little sisters; and their names were
,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.attrs['name'])
print(soup.p['name'])
#输出:
dromouse
dromouse
获取内容
html = """
The Dormouse's storyThe Dormouse's story
Once upon a time there were three little sisters; and their names were
,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.string)
#输出:
The Dormouse's story
嵌套选择
html = """
The Dormouse's storyThe Dormouse's story
Once upon a time there were three little sisters; and their names were
,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.head.title.string)
#输出:
The Dormouse's story
子节点和子孙节点
实例1:
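html = """
<html>
    <head>
        <title>The Dormouse's story</title>
    </head>
    <body>
        <p class="story">
            Once upon a time there were three little sisters; and their names were
            <a href="http://example.com/elsie" class="sister" id="link1">
                <span>Elsie</span>
            </a>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
            and
            <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
            and they lived at the bottom of a well.
        </p>
        <p class="story">...</p>
"""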
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.contents)#子节点
#输出:
['\n Once upon a time there were three little sisters; and their names were\n ',
Elsie
, '\n', Lacie, ' \n and\n ', Tillie, '\n and they lived at the bottom of a well.\n ']
实例2:
html = """
The Dormouse's storyOnce upon a time there were three little sisters; and their names were
Elsie
Lacie
and
Tillie
and they lived at the bottom of a well.
...
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.children)#也是子节点,但是是以遍历的方式输出
for i, child in enumerate(soup.p.children):
    print(i, child)
#输出:
0
Once upon a time there were three little sisters; and their names were
1
Elsie
2
3 Lacie
4
and
5 Tillie
6
and they lived at the bottom of a well.
实例3:
html = """
The Dormouse's storyOnce upon a time there were three little sisters; and their names were
Elsie
Lacie
and
Tillie
and they lived at the bottom of a well.
...
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):#子孙节点,会输出子节点和孙节点
    print(i, child)
#输出:
0
Once upon a time there were three little sisters; and their names were
1
Elsie
2
3 Elsie
4 Elsie
5
6
7 Lacie
8 Lacie
9
and
父节点和祖先节点
实例1:
html = """
The Dormouse's storyOnce upon a time there were three little sisters; and their names were
Elsie
Lacie
and
Tillie
and they lived at the bottom of a well.
...
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.a.parent)#a的父节点输出
#输出
Once upon a time there were three little sisters; and their names were
Elsie
Lacie
and
Tillie
and they lived at the bottom of a well.
实例2:
html = """
The Dormouse's storyOnce upon a time there were three little sisters; and their names were
Elsie
Lacie
and
Tillie
and they lived at the bottom of a well.
...
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(list(enumerate(soup.a.parents)))#输出a的所有祖先节点:依次为p、body、html,最后是整个文档对象
兄弟节点
html = """
The Dormouse's storyOnce upon a time there were three little sisters; and their names were
Elsie
Lacie
and
Tillie
and they lived at the bottom of a well.
...
"""
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(list(enumerate(soup.a.next_siblings)))#输出第一个a标签之后的所有兄弟节点(与a同级的节点,包括文本节点)
print(list(enumerate(soup.a.previous_siblings)))#输出第一个a前面的,与a同级的内容
#输出:
[(0, '\n'), (1, Lacie), (2, ' \n and\n '), (3, Tillie), (4, '\n and they lived at the bottom of a well.\n ')]
[(0, '\n Once upon a time there were three little sisters; and their names were\n ')]
标准选择器
find_all( name , attrs , recursive , text , **kwargs ) #可根据标签名、属性、内容查找文档
实例1:
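html='''
<div class="panel">
    <div class="panel-heading">
        <h4>Hello</h4>
    </div>
    <div class="panel-body">
        <ul class="list" id="list-1" name="elements">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
            <li class="element">Jay</li>
        </ul>
        <ul class="list list-small" id="list-2">
            <li class="element">Foo</li>
            <li class="element">Bar</li>
        </ul>
    </div>
</div>
'''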
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all('ul'))
print(type(soup.find_all('ul')[0]))
#输出:
[
attrs
实例1:
html='''
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={'id': 'list-1'}))
print(soup.find_all(attrs={'name': 'elements'}))
#输出:
[
[
实例2:
html='''
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(id='list-1'))
print(soup.find_all(class_='element'))
#输出:
[
[
Foo, Bar, Jay, Foo, Bar]
text
html='''
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(text='Foo'))
#输出:['Foo', 'Foo']
find( name , attrs , recursive , text , **kwargs )
find返回单个元素,find_all返回所有元素
实例:
html='''
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.find('ul'))
print(type(soup.find('ul')))
print(soup.find('page'))
#输出:
None
CSS选择器
通过select()直接传入CSS选择器即可完成选择
实例1:
html='''
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
print(soup.select('.panel .panel-heading'))
print(soup.select('ul li'))
print(soup.select('#list-2 .element'))
print(type(soup.select('ul')[0]))
#输出:
[
[
Foo, Bar, Jay, Foo, Bar][
Foo, Bar]
实例2:
html='''
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul.select('li'))
#输出:
[
Foo, Bar, Jay][
Foo, Bar]
获取属性
实例:
html='''
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for ul in soup.select('ul'):
    print(ul['id'])
    print(ul.attrs['id'])
#输出:
list-1
list-1
list-2
list-2
获取内容
实例1:
html='''
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'lxml')
for li in soup.select('li'):
    print(li.get_text())
#输出:
Foo
Bar
Jay
Foo
Bar