from lxml import etree
# Parse the file straight from disk, letting lxml read and decode it.
# No encoding is passed to HTMLParser() here, so the parser has to work out
# the decoding on its own; the sample outputs that follow show the Chinese
# link text coming back garbled for both GBK and UTF-8 files.
default_parser = etree.HTMLParser()
html = etree.parse('html.txt', default_parser)
result = html.xpath('//a/text()')
print(result)
['ФÉê¿ËµÄ¾ÈÊê'] # html.txt编码为:GBK时的结果
['è\x82\x96ç\x94³å\x85\x8bç\x9a\x84æ\x95\x91èµ\x8e'] # html.txt编码为:UTF-8时的结果
先使用open()函数读取html.txt,再将字符串传递给etree.HTML()
from lxml import etree
# Read the file as text first, then hand the already-decoded string to
# etree.HTML() so lxml never has to guess the byte encoding.
# open() decodes with the platform's default encoding here; if html.txt is
# saved as UTF-8, remember to pass encoding='utf8'.
# The with-block guarantees the file handle is closed, even if read() raises.
with open('html.txt', 'r') as f:
    text = f.read()
html = etree.HTML(text, etree.HTMLParser())
result = html.xpath('//a/text()')
print(result)
['肖申克的救赎']
为解析器etree.HTMLParser()传递一个参数:encoding='gbk'。注意:此处的编码必须与文件html.txt的实际编码一致
推荐该方法,简洁
from lxml import etree
# Tell the parser the file's encoding up front; the value must match the
# actual encoding of html.txt (GBK in this example).
gbk_parser = etree.HTMLParser(encoding='gbk')
html = etree.parse('html.txt', gbk_parser)
result = html.xpath('//a/text()')
print(result)
['肖申克的救赎']
我正在看的那本书根本就没有使用etree.HTMLParser(encoding='gbk'),一律使用默认版本的etree.HTMLParser()。多亏了PyCharm的补全功能,让我看见了etree.HTMLParser()的参数列表,第一个参数就是encoding=None,然后试了一下,果然解决了中文乱码问题!