本文转自:
https://blog.csdn.net/jeremyjone/article/details/80450236
PyQuery 是一个灵活而又强大的网页解析库,在爬虫框架 PySpider 和 Scrapy 中也随处可见 PyQuery 的身影。
本节,我们通过一连串的极其简易的实例,来深入学习 PyQuery。
那我们开始吧!
PyQuery 的初始化方式有三种:可以传入字符串、传入 URL,或者传入文件。
字符串初始化
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
from pyquery import PyQuery as pq
doc = pq(html)#声明pq对象
print(doc('li'))#用css选择器来实现,如果要选id前面加#,如果选class,前面加.,如果选标签名,什么也不加
print(type(doc('li')))
for item in doc('li'):
    print(item)
first item
second item
third item
fourth item
fifth item
URL初始化
直接传入 URL 进行初始化,程序会自动请求该 URL,获取返回的 HTML 并据此构建 PyQuery 对象,之后即可在其中查找所需的内容
from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')#程序会自动请求url
print(doc('head'))#返回head标签
百度一下，你就知道(注:此处原为 UTF-8 文本被误按 Latin-1 解码而产生的乱码,已还原)
文件初始化
from pyquery import PyQuery as pq
doc = pq(filename='D://demo.html')#直接传入文件名称及路径,程序会自动读取该文件并解析
print(doc('li'))
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .list li'))#会查找id为container的元素内、class为list的元素内的所有li标签;空格表示后代关系,后者不必是前者的直接子元素
first item
second item
third item
fourth item
fifth item
查找元素
#查找元素
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
#子元素
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')#拿到items
print(type(items))
print(items)
lis = items.find('li')#利用find方法,查找items里面的li标签,得到的lis也可以继续调用find方法往下查找,层层剥离
print(type(lis))
print(lis)
- first item
- second item
- third item
- fourth item
- fifth item
first item
second item
third item
fourth item
fifth item
first item
second item
third item
fourth item
fifth item
third item
fourth item
也可以用.children()查找直接子元素
#也可以用.children()查找直接子元素
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
lis = items.children()
print(type(lis))
print(lis)
lis = items.children('.active')
print(lis)
first item
second item
third item
fourth item
fifth item
third item
fourth item
父元素
#父元素
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
container = items.parent()#.parent()查找对象的父元素
print(type(container))
print(container)
#祖先节点
parents = items.parents()#.parents()祖先节点
parent = items.parents('#container')#当然也可以传入参数
print('parent:', parent)
- first item
- second item
- third item
- fourth item
- fifth item
parent:
- first item
- second item
- third item
- fourth item
- fifth item
兄弟元素
#兄弟元素
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active')#选择器之间有空格表示后代关系;没有空格(如.item-0.active)表示同一个元素同时具有这些class
print(li.siblings())#.siblings()兄弟元素,即同级别的元素,不包括自己
second item
first item
fourth item
fifth item
获取属性
#获取属性
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.attr('href'))#定义a标签的href属性用于指定超链接目标的URL。 如果用户选择了a标签中的内容,那么浏览器会尝试检索并显示href属性指定的URL所表示的文档,或者执行JavaScript表达式、方法和函数的列表。
print(a.attr.href)
third item
link3.html
link3.html
获取文本
#获取文本
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.text())#.text()获取文本信息,注意text是方法,需要加括号调用
third item
third item
获取HTML
#获取html
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
print(li.html())#.html()获取所在html
third item
third item
#addClass、removeClass
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.removeClass('active')#删除
print(li)
li.addClass('active')#增加
print(li)
third item
third item
third item
attr、css
#attr、css
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name', 'link')#增加一个属性
print(li)
li.css('font-size', '14px')#增加一个css
print(li)
third item
third item
third item
remove
#remove
html = '''
Hello, World
This is a paragraph.
'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
wrap.find('p').remove()#找到p标签然后删除
print(wrap.text())
Hello, World
This is a paragraph.
Hello, World
其他DOM方法
其他DOM方法
http://pyquery.readthedocs.io/en/latest/api.html
html = '''
- first item
- second item
- third item
- fourth item
- fifth item
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('li:first-child')
print(li)
li = doc('li:last-child')
print(li)
li = doc('li:nth-child(2)')
print(li)
li = doc('li:gt(2)')
print(li)
li = doc('li:nth-child(2n)')
print(li)
li = doc('li:contains(second)')
print(li)
first item
fifth item
second item
fourth item
fifth item
second item
fourth item
second item