HTML 解析

pyquery 使用

参考

from pyquery import PyQuery as pq
from lxml import etree
# Four ways to create a PyQuery object.
# NOTE(review): the HTML markup was stripped from the original snippet; the
# table below is reconstructed around the surviving text so that the tr/td
# queries have something to match -- confirm the cell layout against the
# real page.
table_html = (
    '<table>'
    '<tr><td>first section</td><td>1111</td><td>17-01-28 22:51</td></tr>'
    '<tr><td>second section</td><td>2222</td><td>17-01-28 22:53</td></tr>'
    '</table>'
)
doc1 = pq(etree.fromstring(table_html))  # from an lxml element
doc2 = pq(table_html)  # directly from an HTML string
doc3 = pq(filename='hello')  # from an HTML file
doc4 = pq(url='http://google.com')  # from a URL

# The queries below referenced `doc`, which was never defined; use the
# document built from the HTML string.
doc = doc2
doc('.class')  # select the elements with the given class
doc('#id')  # select the element with the given id
data = doc('tr')  # all tr elements in the document, as a list-like object
for tr in doc('tr').items():  # items() must be called; it yields PyQuery objects
    # eq() is zero-based, so eq(2) is the third td of the row; text() must
    # be called to get the string rather than the bound method.
    print(tr('td').eq(2).text())
doc('p').attr('id')  # value of the id attribute of the p tag
doc('p').find('#n')  # search inside the p block for the id "n"

beautifulsoup4

beautifulsoup4

同上

# Beautiful Soup practice
from bs4 import BeautifulSoup

# NOTE(review): the markup was stripped from the original snippet. The text
# matches the sample document from the Beautiful Soup documentation, so the
# tags are restored from there; without them soup.title is None and no <a>
# tags are found.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""

soup = BeautifulSoup(html_doc, 'lxml')
print(soup.prettify())  # pretty-print the parsed HTML
print(soup.title.string)  # text of the <title> tag
for link in soup.find_all('a'):  # print the href of every <a> tag
    print(link.get('href'))
print(soup.get_text())  # print all of the text in the document

你可能感兴趣的:(HTML 解析)