表达式 | 描述 |
---|---|
nodename | 选取此节点的所有子节点 |
/ | 从当前节点选取直接子节点 |
// | 从当前节点选取子孙节点 |
. | 选取当前节点 |
.. | 选取当前节点的父节点 |
@ | 选取属性 |
//title[@lang="eng"]
pip3 install lxml
from lxml import etree
# NOTE(review): the sample markup appears to have been lost — this literal
# holds only a newline; confirm against the original tutorial.
text = """
"""
# Build an element tree from the HTML string.
html = etree.HTML(text)
# Serialize the tree back out; the result is decoded as UTF-8 before printing.
result = etree.tostring(html)
print(result.decode('utf-8'))
etree 模块可以自动修正 HTML 文本
直接读取文本
test.html
<div>
<ul>
<li class="item-0"><a href="link1.html">first item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-inactive"><a href="link3.html">third item</a></li>
<li class="item-1"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a>
</ul>
</div>
test.py
from lxml import etree
# Parse the HTML file from disk using lxml's HTML parser.
html = etree.parse("./test.html", etree.HTMLParser())
# Serialize the parsed tree and print it as UTF-8 text.
result = etree.tostring(html)
print(result.decode('utf-8'))
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# "//*" matches every node in the document.
result = html.xpath("//*")
print(result)
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# "//li" matches li nodes at any depth.
result = html.xpath("//li")
print(result)
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# "//li/a": a nodes that are direct children of an li.
result = html.xpath("//li/a")
print(result)
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# "//ul//a": a nodes anywhere beneath a ul (not only direct children).
result = html.xpath("//ul//a")
print(result)
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# ".." steps from the matched a node up to its parent; "@class" then reads
# the parent's class attribute.
result = html.xpath("//a[@href=\"link4.html\"]/../@class")
print(result)
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# Same lookup written with the explicit parent:: axis instead of "..".
result = html.xpath("//a[@href=\"link4.html\"]/parent::*/@class")
print(result)
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# The predicate keeps only li nodes whose class attribute equals "item-0".
result = html.xpath("//li[@class=\"item-0\"]")
print(result)
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# "/text()" asks for text directly under the matched li nodes; text that sits
# inside their a children is not a direct child and is not selected here.
result = html.xpath("//li[@class=\"item-0\"]/text()")
print(result)
获取 class 属性为 item-0 的 li 节点,并获取其内部的文本
先选取 a 节点再获取文本
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# Step into the a child first, then take its text.
result = html.xpath("//li[@class=\"item-0\"]/a/text()")
print(result)
使用 //
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# "//text()" collects text from all descendants of the matched li nodes.
result = html.xpath("//li[@class=\"item-0\"]//text()")
print(result)
获取子孙节点下的所有文本://
获取特定子孙节点下的所有文本:/特定节点/
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# "@href" selects the href attribute of each a node under an li.
result = html.xpath("//li/a/@href")
print(result)
from lxml import etree
# NOTE(review): the markup around "first item" appears to have been stripped
# from this literal (the queries below expect an li with class/name
# attributes); confirm against the original tutorial.
text = """
first item
"""
html = etree.HTML(text)
# Exact comparison: only li nodes whose class attribute is exactly "li".
result = html.xpath("//li[@class=\"li\"]/a/text()")
print(result)
from lxml import etree
text = """
first item
"""
html = etree.HTML(text)
# contains() matches li nodes whose class attribute includes the substring "li".
result = html.xpath("//li[contains(@class, \"li\")]/a/text()")
print(result)
from lxml import etree
text = """
first item
"""
html = etree.HTML(text)
# Two conditions joined with "and": class contains "li" and name equals "item".
result = html.xpath(
"//li[contains(@class, \"li\") and @name=\"item\"]/a/text()")
print(result)
运算符 | 描述 |
---|---|
or | 或 |
and | 与 |
mod | 求余 |
| | 计算两个节点集的并集 |
+ | 加 |
- | 减 |
* | 乘 |
div | 除 |
= | 等于 |
!= | 不等于 |
< | 小于 |
<= | 小于等于 |
> | 大于 |
>= | 大于等于 |
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# Select the first node
result = html.xpath("//li[1]/a/text()")
print(result)
# Select the last node
result = html.xpath("//li[last()]/a/text()")
print(result)
# Select nodes by position (here: position() < 3)
result = html.xpath("//li[position() < 3]/a/text()")
print(result)
# Select the third node from the end
result = html.xpath("//li[last() - 2]/a/text()")
print(result)
from lxml import etree
html = etree.parse("./test.html", etree.HTMLParser())
# ancestor axis: all ancestor nodes (second query restricts them to div)
result = html.xpath("//li[1]/ancestor::*")
print(result)
result = html.xpath("//li[1]/ancestor::div")
print(result)
# attribute axis: all attribute values
result = html.xpath("//li[1]/attribute::*")
print(result)
# child axis: direct child nodes (here filtered to a[@href="link1.html"])
result = html.xpath("//li[1]/child::a[@href=\"link1.html\"]")
print(result)
# descendant axis: descendant nodes (here filtered to span)
result = html.xpath("//li[1]/descendant::span")
print(result)
# following axis: nodes after the current node (here the second one, via [2])
result = html.xpath("//li[1]/following::*[2]")
print(result)
# following-sibling axis: sibling nodes after the current node
result = html.xpath("//li[1]/following-sibling::*")
print(result)
解析器 | 使用方法 | 优势 | 劣势 |
---|---|---|---|
Python 标准库 | BeautifulSoup(markup, "html.parser") | Python 的内置标准库、执行速度适中、文档容错能力强 | Python 2.7.3 或 Python 3.2.2 前的版本中文容错能力差 |
LXML HTML 解析器 | BeautifulSoup(markup, "lxml") | 速度快、文档容错能力强 | 需要安装 C 语言库 |
LXML XML 解析器 | BeautifulSoup(markup, "xml") | 速度快、唯一支持 XML 的解析器 | 需要安装 C 语言库 |
html5lib | BeautifulSoup(markup, "html5lib") | 提供最好的容错性、以浏览器的方式解析文档、生成HTML5格式的文档 | 速度慢、依赖外部 Python 扩展 |
pip3 install beautifulsoup4
pip3 install lxml
from bs4 import BeautifulSoup
# NOTE(review): the tags in this sample appear to have been stripped (only the
# text "Test" remains); confirm against the original tutorial.
html = """
Test
Test
···
"""
# Parse the string with the "lxml" parser.
soup = BeautifulSoup(html, "lxml")
# Print with standard indentation
print(soup.prettify())
print(soup.title.string)
from bs4 import BeautifulSoup
html = """
Test
Test
···
"""
soup = BeautifulSoup(html, "lxml")
# Nodes are reached by tag name as attributes of the soup object.
print(soup.title)
print(type(soup.title))
print(soup.title.string)
print(soup.head)
print(soup.p)
# name: the tag name; attrs: the node's attributes; string: its text.
print(soup.title.name)
print(soup.p.attrs)
print(soup.p.attrs["name"])
print(soup.p.string)
# Attribute access can be chained: title looked up inside head.
print(soup.head.title)
print(soup.head.title.string)
直接子节点
子孙节点(descendants)
from bs4 import BeautifulSoup
# NOTE(review): `html` is not defined in this snippet; presumably it reuses the
# sample string from the preceding example.
soup = BeautifulSoup(html, "lxml")
# Print the descendants object itself, then walk it to show each descendant
# of the p node together with its index.
print(soup.p.descendants)
for i, child in enumerate(soup.p.descendants):
    print(i, child)
from bs4 import BeautifulSoup
# NOTE(review): the sample markup appears to have been lost from the literals
# in this group (each holds only a newline); confirm against the original.
html = """
"""
soup = BeautifulSoup(html, "lxml")
# parent: the node directly above soup.a.
print(soup.a.parent)
from bs4 import BeautifulSoup
html = """
"""
soup = BeautifulSoup(html, "lxml")
# parents: iterable of ancestors; materialized with their indices for display.
print(type(soup.a.parents))
print(list(enumerate(soup.a.parents)))
from bs4 import BeautifulSoup
html = """
"""
soup = BeautifulSoup(html, "lxml")
# Single next/previous sibling, then the full iterables of following and
# preceding siblings.
print("next sibling", soup.a.next_sibling)
print("prev sibling", soup.a.previous_sibling)
print("next siblings", list(enumerate(soup.a.next_siblings)))
print("prev siblings", list(enumerate(soup.a.previous_siblings)))
from bs4 import BeautifulSoup
html = """
"""
soup = BeautifulSoup(html, "lxml")
# Related nodes expose the same accessors: .string on a sibling, .attrs on a
# parent taken from the parents iterable.
print(soup.a.next_sibling)
print(soup.a.next_sibling.string)
print(soup.a.parents)
print(list(soup.a.parents)[0])
print(list(soup.a.parents)[0].attrs["class"])
find_all(name, attrs, recursive, text, **kwargs)
from bs4 import BeautifulSoup
html = """
Hello
- A
- B
- C
- a
- b
- c
"""
soup = BeautifulSoup(html, "lxml")
# find_all(name=...) returns the matching nodes; each result can be queried
# again with find_all.
print(soup.find_all(name="ul"))
print(type(soup.find_all(name="ul")[0]))
for ul in soup.find_all(name="ul"):
    # All li nodes under this ul, then the text of each one.
    print(ul.find_all(name="li"))
    for li in ul.find_all(name="li"):
        print(li.string)
from bs4 import BeautifulSoup
html = """
Hello
- A
- B
- C
- a
- b
- c
"""
soup = BeautifulSoup(html, "lxml")
# Filter by attribute values passed as a dict.
print(soup.find_all(attrs={"id": "list-1"}))
print(soup.find_all(attrs={"class": "element"}))
# Equivalent to
print(soup.find_all(id="list-1"))
print(soup.find_all(class_="element"))
import re
from bs4 import BeautifulSoup
# NOTE(review): the sample markup appears to have been lost — this literal
# holds only a newline; confirm against the original tutorial.
html = """
"""
soup = BeautifulSoup(html, "lxml")
# Match text nodes against a compiled regex. `string` is the current name of
# this filter; `text` is its legacy alias from older Beautiful Soup releases.
print(soup.find_all(string=re.compile("link")))
from bs4 import BeautifulSoup
html = """
Hello
- A
- B
- C
- a
- b
- c
"""
soup = BeautifulSoup(html, "lxml")
# find() takes the same filters as find_all(); its result is printed along
# with its type.
print(soup.find(name="ul"))
print(type(soup.find(name="ul")))
print(soup.find(class_="list"))
from bs4 import BeautifulSoup
html = """
Hello
- A
- B
- C
- a
- b
- c
"""
soup = BeautifulSoup(html, "lxml")
# select() takes CSS selectors: nested classes, tag descendants, id + class.
print(soup.select(".panel .panel-body .list"))
print(soup.select("ul li"))
print(soup.select("#list-2 .element"))
print(type(soup.select("ul")[0]))
# NOTE(review): these snippets rely on `html` and `BeautifulSoup` from the
# preceding example.
soup = BeautifulSoup(html, "lxml")
# Nested selection: run select() again on each selected ul.
for ul in soup.select("ul"):
    print(ul.select("li"))
soup = BeautifulSoup(html, "lxml")
# Read an attribute of each selected node.
for ul in soup.select("ul"):
    print(ul["id"])
    # Equivalent to
    print(ul.attrs["id"])
soup = BeautifulSoup(html, "lxml")
# Read the text of each selected node.
for li in soup.select("li"):
    print(f"text: {li.get_text()}")
    # Equivalent to
    print(f"string: {li.string}")
pip3 install pyquery
from pyquery import PyQuery
# NOTE(review): the tags in this sample appear to have been stripped (only the
# item texts remain); confirm against the original tutorial.
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
# Initialize from an HTML string, then query with a CSS selector.
doc = PyQuery(html)
print(doc("li"))
import requests
from pyquery import PyQuery
url = "https://www.bilibili.com/"
# Let PyQuery fetch the page itself from the URL.
doc = PyQuery(url=url)
# Equivalent to downloading the page with requests and passing the HTML text in.
doc = PyQuery(requests.get(url).text)
print(doc("title"))
from pyquery import PyQuery
# Initialize from a local file instead of a string or URL.
doc = PyQuery(filename="test.html")
print(doc("li"))
from pyquery import PyQuery
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
doc = PyQuery(html)
# Descendant CSS selector: li under .list under #container.
print(doc("#container .list li"))
print(type(doc("#container .list li")))
# items() yields each matched node separately so its text can be read one by one.
for item in doc("#container .list li").items():
    print(item.text())
from pyquery import PyQuery
# NOTE(review): `html` is not defined in these two snippets; presumably they
# reuse the sample string from the preceding example.
doc = PyQuery(html)
items = doc(".list")
# children(): the child nodes of the selection.
lis = items.children()
print(type(lis))
print(lis)
from pyquery import PyQuery
doc = PyQuery(html)
items = doc(".list")
print(type(items))
print(items)
# find("li"): li nodes located beneath the selection.
lis = items.find("li")
print(type(lis))
print(lis)
from pyquery import PyQuery
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
doc = PyQuery(html)
items = doc(".list")
# parent(): the node directly above the selection.
container = items.parent()
print(type(container))
print(container)
from pyquery import PyQuery
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
doc = PyQuery(html)
items = doc(".list")
# parents(): the ancestors of the selection.
parents = items.parents()
print(type(parents))
print(parents)
from pyquery import PyQuery
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
doc = PyQuery(html)
# Select the node carrying both the item-0 and active classes under .list.
lis = doc(".list .item-0.active")
# siblings(): the selection's sibling nodes; print the text of each.
for item in lis.siblings().items():
    print(item.text())
from pyquery import PyQuery
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
doc = PyQuery(html)
li = doc(".item-0.active")
# The selection can be printed directly or converted with str().
print(li)
print(str(li))
from pyquery import PyQuery
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
doc = PyQuery(html)
# items() gives an iterable over the matched nodes; print its type, then each
# node with the node's own type.
lis = doc("li").items()
print(type(lis))
for li in lis:
    print(li, type(li))
from pyquery import PyQuery
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
doc = PyQuery(html)
a = doc(".item-0.active a")
print(a, type(a))
# Read an attribute by calling attr() with its name.
print(a.attr("href"))
# Equivalent to
print(a.attr.href)
# With a selection of all a nodes, attr() is called once on the whole
# selection and then per node via items().
a = doc("a")
print(a, type(a))
print(a.attr("href"))
for item in a.items():
    print(item.attr("href"))
from pyquery import PyQuery
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
doc = PyQuery(html)
a = doc(".item-0.active a")
# Print the node itself, then just its text.
print(a)
print(a.text())
from pyquery import PyQuery
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
doc = PyQuery(html)
li = doc(".item-0.active")
# Print the node itself, then the result of html() on it.
print(li)
print(li.html())
from pyquery import PyQuery
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
doc = PyQuery(html)
li = doc(".item-0.active")
a = doc(".item-0.active a")
# The printed output looks the same, but the two values are not equal
print(li.html())
print(a)
# Type is str
print(type(li.html()))
# Type is pyquery.pyquery.PyQuery
print(type(a))
from pyquery import PyQuery
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
doc = PyQuery(html)
li = doc(".item-0.active")
# Print the node before and after removing, then re-adding, the "active" class.
print(li)
li.remove_class("active")
print(li)
li.add_class("active")
print(li)
from pyquery import PyQuery
# NOTE(review): the sample markup appears to have been lost — this literal
# holds only a newline; confirm against the original tutorial.
html = """
"""
doc = PyQuery(html)
li = doc(".item-0.active")
print(li)
# With a second argument attr() sets the attribute; text() and html() called
# with an argument replace the node's content. The node is printed after each.
li.attr("name", "link")
print(li)
li.text("changed item")
print(li)
li.html("changed item")
print(li)
from pyquery import PyQuery
# NOTE(review): the tags in this sample appear to have been stripped (the code
# below expects a .warp node containing a p); confirm against the original.
html = """
ABC
abc
"""
doc = PyQuery(html)
warp = doc(".warp")
# Remove the p nodes inside the selection so that text() no longer includes
# their content.
warp.find("p").remove()
print(warp.text())
from pyquery import PyQuery
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
doc = PyQuery(html)
# First li node
li = doc("li:first-child")
print(li)
# Last li node
li = doc("li:last-child")
print(li)
# Second li node
li = doc("li:nth-child(2)")
print(li)
# Nodes with index greater than 2
li = doc("li:gt(2)")
print(li)
# Nodes at positions that are multiples of 2
li = doc("li:nth-child(2n)")
print(li)
# Nodes containing the text "second"
li = doc("li:contains(second)")
print(li)
pip3 install parsel
from parsel import Selector
# NOTE(review): the tags in this sample appear to have been stripped (only the
# item texts remain); confirm against the original tutorial.
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
selector = Selector(text=html)
# The same Selector can be queried with CSS ...
items1 = selector.css(".item-0")
print(len(items1), type(items1), items1)
# ... or with XPath (class attribute containing "item-0").
items2 = selector.xpath("//li[contains(@class, 'item-0')]")
print(len(items2), type(items2), items2)
from parsel import Selector
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
selector = Selector(text=html)
items = selector.css(".item-0")
print(f"type of items: {type(items)}")
# Each element of the result can itself be queried; here a relative XPath
# (".//text()") is run on each item and get() extracts a single result.
for item in items:
    print(f"type of item: {type(item)}")
    text = item.xpath(".//text()").get()
    print(text)
from parsel import Selector
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
selector = Selector(text=html)
# XPath: text nodes beneath li nodes whose class contains "item-0";
# getall() extracts every match.
result = selector.xpath("//li[contains(@class, \"item-0\")]//text()").getall()
print(result)
# CSS selector variant using the "::text" pseudo-element.
result = selector.css(".item-0 *::text").getall()
print(result)
from parsel import Selector
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
selector = Selector(text=html)
# CSS: "::attr(href)" reads the href attribute; get() extracts a single result.
result = selector.css(".item-0.active a::attr(href)").get()
print(result)
# XPath variant: both class names tested with contains(), attribute via "@href".
result = selector.xpath("//li[contains(@class, \"item-0\") and contains(@class, \"active\")]/a/@href").get()
print(result)
from parsel import Selector
html = """
- first item
- second item
- third item
- fourth item
- fifth item
"""
selector = Selector(text=html)
# re() applies a regular expression to the selected nodes.
result = selector.css(".item-0").re("link(.*?)\"")
print(result)
# re() can also be applied to a text selection.
result = selector.css(".item-0 *::text").re(".*item")
print(result)
# Get only the first match
result = selector.css(".item-0").re_first(">(.*?item)")
print(result)