PyQuery详解

一:安装pyquery

pip install pyquery

二:初始化

1,字符串初始化

html='''
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('li'))

2,URL初始化


from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com')
print(doc('head'))

3,文件初始化


from pyquery import PyQuery as pq
doc = pq(filename='demo.html')
print(doc('li')) #'li'为选择器

三:基本CSS选择器


html='''
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .list li')) #查找id为container的元素里面、class为list的元素里面的li标签

四:查找元素

1,查找子元素


html='''
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
print(type(items))
print(items)
lis = items.find('li')   #find查找当前items元素里面的'li'
print(type(lis))
print(lis)

lis = items.children() #children查找所有直接子元素
print(type(lis))
print(lis)

lis = items.children('.active')
print(lis) #children('.active')只选取class为active的直接子元素

2,查找父元素


html='''
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
container = items.parent() # 因为'list'的父元素只有一个,所以用parent
print(type(container))
print(container)

html='''
 
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
parents = items.parents() # 因为'list'的父元素不只一个,所以用parents
print(type(parents))
print(parents)

parent = items.parents('.wrap')
print(parent)

3,查找兄弟元素


html='''
 
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active') #选择'.list'里面的'.item-0.active'标签
#li = doc('.list.item-0.active') #同时选择'.list'与'.item-0.active'标签.区别在于两标签之间有空格
print(li.siblings()) # siblings()获取所有兄弟节点

html='''
 
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active')
print(li.siblings('.active'))

五:遍历


html='''
 
'''
from pyquery import PyQuery as pq
doc = pq(html)
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li)

六:获取信息

1,获取属性


html='''
 
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a') #a前面的空格表示里面的a标签
print(a)
print(a.attr('href'))
print(a.attr.href) # 结果同上

2,获取文本


html='''
 
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a') #a前面的空格表示里面的a标签
print(a)
print(a.text()) # 获取文本

3,获取HTML


html='''
 
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a') #a前面的空格表示里面的a标签
print(a)
print(a.html()) # 获取html

七:DOM操作(节点操作)

1,addClass添加class removeClass移除class


html='''
 
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.removeClass('active') #移除active这个class
print(li)
li.addClass('active') #添加active这个class
print(li)

2,attr css


html='''
 
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name','link') #attr给li标签添加name=link属性(已存在则覆盖)
print(li)
li.css('font-size','14px') #css给li标签添加style="font-size: 14px"的属性
print(li)

3, remove


html = '''
<div class="wrap">
    Hello,World
    <p>This is a paragraph.</p>
</div>

'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
wrap.find('p').remove() #remove()移除p标签,以便下一步只打印Hello,World
print(wrap.text())

4,其他DOM方法

http://pyquery.readthedocs.io/en/latest/api.html

八:伪类选择器


html='''
 
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('li:first-child') #选择li标签中的第一个子标签
print(li)
li = doc('li:last-child') #选择li标签中的最后一个子标签
print(li)
li = doc('li:nth-child(2)') #nth-child(2)指定选择li标签中第二个子标签
print(li)
li = doc('li:gt(2)') # 选择序号比2大的标签(序号从0开始)
print(li)
li = doc('li:nth-child(2n)') # nth-child(2n)选择偶数标签
print(li)
li = doc('li:contains(second)') # 查找包含second文本的标签
print(li)

更多CSS选择器可以查看http://www.w3school.com.cn/css/index.asp

九:官方文档

http://pyquery.readthedocs.io/

你可能感兴趣的:(爬虫)