XPath 常用规则如下:
nodename:选取此节点的所有子节点
/:从当前节点选取直接子节点
//:从当前节点选取子孙节点
.:选取当前节点
..:选取当前节点的父节点
@:选取属性
一个示例,表示选取所有名称为 title、且 lang 属性的值为 eng 的节点:
//title[@lang='eng']
一个实例:
下面的代码运行后可以看到:经过处理,li 节点标签被补全,并且自动添加了 body、html 节点
from lxml import etree
# 声明一段 HTML 文本
text = '''
'''
# 调用 HTML 类进行初始化,构造一个 XPath 解析对象
# 注意最后一个 li 节点是没有闭合的,但 etree 模块可以自动修正
html = etree.HTML(text)
# 使用 tostring 方法输出修正后的 HTML 代码,其结果为 bytes 类型
result = etree.tostring(html)
# 通过 decode() 方法转化为 str 类型
print(result.decode('utf-8'))
from lxml import etree
html = etree.parse('test.html', etree.HTMLParser())
result = etree.tostring(html)
print(result.decode('utf-8'))
from lxml import etree
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//*')
print(result)
[, , , , , , , , , , , , , ]
from lxml import etree
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//li')
print(result)
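[<Element li at 0x...>, <Element li at 0x...>, <Element li at 0x...>, <Element li at 0x...>, <Element li at 0x...>]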
from lxml import etree
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//li/a')
print(result)
[, , , , ]
from lxml import etree
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//ul//a')
print(result)
[, , , , ]
from lxml import etree
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//a[@href="link4.html"]/../@class')
print(result)
['item-1']
from lxml import etree
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//a[@href="link4.html"]/parent::*/@class')
print(result)
['item-1']
from lxml import etree
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]')
print(result)
[, ]
from lxml import etree
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]/text()')
print(result)
['\r\n ']
# 先获取 a 节点:
from lxml import etree
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]/a/text()')
print(result)
# 使用 // 获取子孙节点:
from lxml import etree
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//li[@class="item-0"]//text()')
print(result)
['first item', 'fifth item']
['first item', 'fifth item', '\r\n ']
from lxml import etree
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//li/a/@href')
print(result)
['link1.html', 'link2.html', 'link3.html', 'link4.html', 'link5.html']
from lxml import etree
text = '''
first item
'''
html = etree.HTML(text)
# 使用之前的匹配方法获得的结果为空
result = html.xpath('//li[@class="li"]/a/text()')
print(result)
# 使用 contains() 方法
result = html.xpath('//li[contains(@class, "li")]/a/text()')
print(result)
[]
['first item']
from lxml import etree
text = '''
first item
'''
html = etree.HTML(text)
result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()')
print(result)
['first item']
from lxml import etree
text = '''
'''
html = etree.HTML(text)
result = html.xpath('//li[1]/a/text()')
print(result)
result = html.xpath('//li[last()]/a/text()')
print(result)
result = html.xpath('//li[last()-2]/a/text()')
print(result)
result = html.xpath('//li[position()<3]/a/text()')
print(result)
['first item']
['fifth item']
['third item']
['first item', 'second item']
from lxml import etree
text = '''
'''
html = etree.HTML(text)
result = html.xpath('//li[1]/ancestor::*')
print(result)
result = html.xpath('//li[1]/ancestor::div')
print(result)
result = html.xpath('//li[1]/attribute::*')
print(result)
result = html.xpath('//li[1]/child::a[@href="link1.html"]')
print(result)
result = html.xpath('//li[1]/descendant::span')
print(result)
result = html.xpath('//li[1]/following::*[2]') # following 轴不包含当前节点,[2] 表示当前节点之后的第 2 个节点
print(result)
result = html.xpath('//li[1]/following-sibling::*') # 不包含当前节点,获取其后的所有同级节点
print(result)
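[<Element html at 0x...>, <Element body at 0x...>, <Element div at 0x...>, <Element ul at 0x...>]
[<Element div at 0x...>]
['item-0']
[<Element a at 0x...>]
[<Element span at 0x...>]
[<Element a at 0x...>]
[<Element li at 0x...>, <Element li at 0x...>, <Element li at 0x...>, <Element li at 0x...>]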
from bs4 import BeautifulSoup
# 声明 html
html = """
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# 实例化 BeautifulSoup 对象
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())
print(soup.title.string)
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
,
Lacie
and
Tillie
;
and they lived at the bottom of a well.
...
The Dormouse's story
from bs4 import BeautifulSoup
# 声明 html
html = """
The Dormouse's story
The Dormouse's story
Once upon a time there were three little sisters; and their names were
,
Lacie and
Tillie;
and they lived at the bottom of a well.
...
"""
# 实例化 BeautifulSoup 对象
soup = BeautifulSoup(html, 'lxml')
print(soup.title)
print(type(soup.title))
print(soup.title.string)
print(soup.head)
print(soup.p)
The Dormouse's story
The Dormouse's story
The Dormouse's story
The Dormouse's story
print(soup.title.name)
title
print(soup.p.attrs) # 获取所有属性
print(soup.p.attrs['name']) # 获取 name 属性
print(soup.p['name']) # 同上
print(soup.p['class']) # 获取 class 属性,由于 class 可能有多个,因此返回的是列表
{'class': ['title'], 'name': 'dromouse'}
dromouse
dromouse
['title']
from bs4 import BeautifulSoup
html = """
The Dormouse's story
"""
soup = BeautifulSoup(html, 'lxml')
print(soup.head.title)
print(type(soup.head.title))
print(soup.head.title.string)
The Dormouse's story
The Dormouse's story
from bs4 import BeautifulSoup
html = """
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
Lacie
and
Tillie
and they lived at the bottom of a well.
...
"""
soup = BeautifulSoup(html, 'lxml')
# contents 属性
print(soup.p.contents)
['\n Once upon a time there were three little sisters; and their names were\n ',
Elsie
, '\n', Lacie, ' \n and\n ', Tillie, '\n and they lived at the bottom of a well.\n ']
# children 属性
print(soup.p.children)
for i, child in enumerate(soup.p.children):
print(i, child)
0
Once upon a time there were three little sisters; and their names were
1
Elsie
2
3 Lacie
4
and
5 Tillie
6
and they lived at the bottom of a well.
# descendants 属性
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):
print(i, child)
0
Once upon a time there were three little sisters; and their names were
1
Elsie
2
3 Elsie
4 Elsie
5
6
7 Lacie
8 Lacie
9
and
10 Tillie
11 Tillie
12
and they lived at the bottom of a well.
from bs4 import BeautifulSoup
html = """
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
...
"""
soup = BeautifulSoup(html, 'lxml')
# parent 属性
print(soup.a.parent)
Once upon a time there were three little sisters; and their names were
Elsie
# parents 属性
print(soup.a.parents)
for i, parent in enumerate(soup.a.parents):
print(i, parent)
0
Once upon a time there were three little sisters; and their names were
Elsie
1
Once upon a time there were three little sisters; and their names were
Elsie
...
2
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
...
3
The Dormouse's story
Once upon a time there were three little sisters; and their names were
Elsie
...
from bs4 import BeautifulSoup
html = """
Once upon a time there were three little sisters; and their names were
Elsie
Hello
Lacie
and
Tillie
and they lived at the bottom of a well.
"""
soup = BeautifulSoup(html, 'lxml')
# 兄弟节点相关属性:next_sibling、previous_sibling、next_siblings、previous_siblings
print('Next Sibling', soup.a.next_sibling)
print('Previous Sibling', soup.a.previous_sibling)
print('Next Siblings', list(enumerate(soup.a.next_siblings)))
print('Previous Siblings', list(enumerate(soup.a.previous_siblings)))
Next Sibling
Hello
Previous Sibling
Once upon a time there were three little sisters; and their names were
Next Siblings [(0, '\n Hello\n '), (1, Lacie), (2, ' \n and\n '), (3, Tillie), (4, '\n and they lived at the bottom of a well.\n ')]
Previous Siblings [(0, '\n Once upon a time there were three little sisters; and their names were\n ')]
from bs4 import BeautifulSoup
html = """
Once upon a time there were three little sisters; and their names were
BobLacie
"""
soup = BeautifulSoup(html, 'lxml')
print('Next Sibling:')
print(type(soup.a.next_sibling))
print(soup.a.next_sibling)
print(soup.a.next_sibling.string)
print('Parents:')
print(type(soup.a.parents))
print(list(soup.a.parents)[0])
print(list(soup.a.parents)[0].attrs['class'])
Next Sibling:
Lacie
Lacie
Parents:
Once upon a time there were three little sisters; and their names were
BobLacie
['story']
find_all() 查询所有符合条件的元素,API 如下:
find_all(name, attrs, recursive, text, **kwargs)
name:根据节点名查询元素:
from bs4 import BeautifulSoup
html='''
Hello
- Foo
- Bar
- Jay
- Foo
- Bar
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(name='ul'))
print(type(soup.find_all(name='ul')[0]))
[
- Foo
- Bar
- Jay
,
- Foo
- Bar
]
# 嵌套查询
for ul in soup.find_all(name='ul'):
print(ul.find_all(name='li'))
for li in ul.find_all(name='li'):
print(li.string)
[Foo , Bar , Jay ]
Foo
Bar
Jay
[Foo , Bar ]
Foo
Bar
from bs4 import BeautifulSoup
html='''
Hello
- Foo
- Bar
- Jay
- Foo
- Bar
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(attrs={'id': 'list-1'}), '\n')
print(soup.find_all(id='list-1'), '\n')
print(soup.find_all(attrs={'class': 'element'}), '\n')
print(soup.find_all(class_='element'))
[
- Foo
- Bar
- Jay
]
[
- Foo
- Bar
- Jay
]
[Foo , Bar , Jay , Foo , Bar ]
[Foo , Bar , Jay , Foo , Bar ]
import re
from bs4 import BeautifulSoup
html='''
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.find_all(text=re.compile('link')))
['Hello, this is a link', 'Hello, this is a link, too']
from bs4 import BeautifulSoup
html='''
Hello
- Foo
- Bar
- Jay
- Foo
- Bar
'''
soup = BeautifulSoup(html, 'lxml')
print(soup.select('.panel .panel-heading')) # 选择 class 为 panel 中 class 为 panel-heading 的节点
print(soup.select('ul li')) # 选择 ul 节点里的 li 节点
print(soup.select('#list-2 .element')) # 选择 id 为 list-2 中 class 为 element 的节点
print(type(soup.select('ul')[0]))
[
Hello
]
[Foo , Bar , Jay , Foo , Bar ]
[Foo , Bar ]
for ul in soup.select('ul'):
print(ul.select('li'))
[Foo , Bar , Jay ]
[Foo , Bar ]
for ul in soup.select('ul'):
print(ul['id'])
print(ul.attrs['id'])
list-1
list-1
list-2
list-2
for li in soup.select('li'):
print('Get Text:', li.get_text())
print('String:', li.string)
Get Text: Foo
String: Foo
Get Text: Bar
String: Bar
Get Text: Jay
String: Jay
Get Text: Foo
String: Foo
Get Text: Bar
String: Bar
from pyquery import PyQuery as pq
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
doc = pq(html)
print(doc('li'))
first item
second item
third item
fourth item
fifth item
from pyquery import PyQuery as pq
doc = pq(url='https://www.taobao.com')
print(doc('title'))
# 如下代码等效
from pyquery import PyQuery as pq
import requests
doc = pq(requests.get('https://www.taobao.com').text)
print(doc('title'))
淘宝网 - 淘!我喜欢
淘宝网 - 淘!我喜欢
from pyquery import PyQuery as pq
doc = pq(filename='demo.html')
print(doc('title'))
This is a Demo
from pyquery import PyQuery as pq
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
doc = pq(html)
print(doc('#container .list li'))
print(type(doc('#container .list li')))
first item
second item
third item
fourth item
fifth item
from pyquery import PyQuery as pq
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
doc = pq(html)
items = doc('#container')
print(type(items))
print(items)
# 所有子孙节点
lis = items.find('ul, li')
print(type(lis))
print(lis)
# 直接子节点
lis = items.children('.list')
print(type(lis))
print(lis)
- first item
- second item
- third item
- fourth item
- fifth item
- first item
- second item
- third item
- fourth item
- fifth item
first item
second item
third item
fourth item
fifth item
- first item
- second item
- third item
- fourth item
- fifth item
from pyquery import PyQuery as pq
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
doc = pq(html)
items = doc('.list')
# 直接父节点
container = items.parent()
print(type(container))
print(container, '\n')
# 所有祖先节点
parents = items.parents()
print(type(parents))
print(parents, '\n')
# class 为 wrap 的祖先节点
parent = items.parents('.wrap')
print(type(parent))
print(parent)
- first item
- second item
- third item
- fourth item
- fifth item
- first item
- second item
- third item
- fourth item
- fifth item
- first item
- second item
- third item
- fourth item
- fifth item
- first item
- second item
- third item
- fourth item
- fifth item
from pyquery import PyQuery as pq
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
doc = pq(html)
li = doc('.list .item-0.active')
# 所有兄弟节点
print(li.siblings(), '\n')
# 筛选后的兄弟节点
print(li.siblings('.active'))
second item
first item
fourth item
fifth item
fourth item
from pyquery import PyQuery as pq
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
doc = pq(html)
# 单个节点
li = doc('.item-0.active')
print(li)
print(str(li))
# 多个节点
lis = doc('li').items()
print(type(lis))
for li in lis:
print(str(li).strip(), type(li), sep='\n')
third item
third item
first item
second item
third item
fourth item
fifth item
from pyquery import PyQuery as pq
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
doc = pq(html)
# 单个元素
a = doc('.item-0.active a')
print(a, type(a))
print(a.attr('href'))
print(a.attr.href)
print('\n')
# 多个元素
a = doc('a')
for item in a.items():
print(item.attr('href'))
third item
link3.html
link3.html
link2.html
link3.html
link4.html
link5.html
from pyquery import PyQuery as pq
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
doc = pq(html)
# 单个节点
a = doc('.item-0.active a')
print(a)
print(a.text()) # text()
print('\n')
print(a.html()) # html()
print('\n')
# 多个节点
li = doc('li')
print(li)
print(li.text())
print('\n')
for item in li.items():
print(item.html())
third item
third item
third item
first item
second item
third item
fourth item
fifth item
first item second item third item fourth item fifth item
first item
second item
third item
fourth item
fifth item
from pyquery import PyQuery as pq
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.removeClass('active')
print(li)
li.addClass('active')
print(li)
third item
third item
third item
from pyquery import PyQuery as pq
html = '''
'''
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name', 'link')
print(li)
li.text('changed item')
print(li)
li.html('changed item')
print(li)
third item
third item
changed item
changed item
from pyquery import PyQuery as pq
html = '''
Hello, World
This is a paragraph.
'''
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
# 只要 Hello, World,不要 p 节点里的内容
wrap.find('p').remove()
print(wrap.text())
Hello, World
This is a paragraph.
Hello, World
from pyquery import PyQuery as pq
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
doc = pq(html)
# 第一个 li 节点
li = doc('li:first-child')
print('第一个 li 节点', li)
# 最后一个 li 节点
li = doc('li:last-child')
print('最后一个 li 节点', li)
# 第二个 li 节点
li = doc('li:nth-child(2)')
print('第二个 li 节点', li)
# 第三个 li 之后的 li 节点
li = doc('li:gt(2)')
print('第三个 li 之后的 li 节点', li)
# 偶数位置的 li 节点
li = doc('li:nth-child(2n)')
print('偶数位置的 li 节点', li)
# 包含 second 的 li 节点
li = doc('li:contains(second)')
print('包含 second 的 li 节点', li)
第一个 li 节点 first item
最后一个 li 节点 fifth item
第二个 li 节点 second item
第三个 li 之后的 li 节点 fourth item
fifth item
偶数位置的 li 节点 second item
fourth item
包含 second 的 li 节点 second item