PyQuery是强大而又灵活的网页解析库,如果你觉得正则写起来太麻烦,如果你觉得BeautifulSoup语法太难记,如果你熟悉jQuery的语法,那么PyQuery就是你绝佳的选择。 一、初始化方式有三种:传入字符串、传入URL、传入文件。 字符串初始化 html = '''''' from pyquery import PyQuery as pq doc = pq(html)#声明pq对象 print(doc('li'))#用CSS选择器来查找:选id时前面加#,选class时前面加.,选标签名时什么也不加 URL初始化 也可以直接传入URL,进行URL初始化,程序会自动请求URL,获得html后即可用选择器查找所需的内容 from pyquery import PyQuery as pq doc = pq(url='http://www.baidu.com')#程序会自动请求url print(doc('head'))#返回head标签 文件初始化 from pyquery import PyQuery as pq doc = pq(filename='D://demo.html')#直接传入文件名称及路径,程序会自动读取该文件并解析 print(doc('li')) 二、基本css选择器 html = '''
- first item
- second item
- third item
- fourth item
- fifth item
''' from pyquery import PyQuery as pq doc = pq(html) print(doc('#container .list li'))#查找id为container的元素内部、class为list的元素内部的li标签;空格只表示层级(后代)关系,后者不一定是前者的直接子元素 查找元素 html = '''
- first item
- second item
- third item
- fourth item
- fifth item
''' 子元素 from pyquery import PyQuery as pq doc = pq(html) items = doc('.list')#拿到items print(type(items)) print(items) lis = items.find('li')#利用find方法,查找items里面的li标签,得到的lis也可以继续调用find方法往下查找,层层剥离 print(type(lis)) print(lis) 也可以用.children()查找直接子元素 lis = items.children() print(type(lis)) print(lis) lis = items.children('.active') print(lis) 父元素 html = '''
- first item
- second item
- third item
- fourth item
- fifth item
''' from pyquery import PyQuery as pq doc = pq(html) items = doc('.list') container = items.parent()#.parent()查找对象的父元素 print(type(container)) print(container) 祖先节点 parents = items.parents()#.parents()祖先节点 parent = items.parents('.wrap')#当然也可以传入参数 print(parent) 兄弟元素 html = '''
- first item
- second item
- third item
- fourth item
- fifth item
''' from pyquery import PyQuery as pq doc = pq(html) li = doc('.list .item-0.active')#空格表示后代关系(在.list内部查找),不加空格表示同一个元素同时具有item-0和active两个class print(li.siblings())#.siblings()兄弟元素,即同级别的元素,不包括自己 三、遍历 html = '''
- first item
- second item
- third item
- fourth item
- fifth item
''' from pyquery import PyQuery as pq doc = pq(html) lis = doc('li').items()#.items()返回一个生成器,可以用for循环逐个遍历 print(type(lis)) for li in lis: print(li) 四、获取信息 获取属性 html = '''
- first item
- second item
- third item
- fourth item
- fifth item
''' from pyquery import PyQuery as pq doc = pq(html) a = doc('.item-0.active a') print(a) print(a.attr('href'))#.attr('href')获取a标签的href属性值。a标签的href属性用于指定超链接目标的URL。 如果用户选择了a标签中的内容,那么浏览器会尝试检索并显示href属性指定的URL所表示的文档,或者执行JavaScript表达式、方法和函数的列表。 print(a.attr.href) 结果: <a href="link3.html"><span class="bold">third item</span></a> link3.html link3.html 获取文本 html = '''
- first item
- second item
- third item
- fourth item
- fifth item
''' from pyquery import PyQuery as pq doc = pq(html) a = doc('.item-0.active a') print(a) print(a.text())#.text()获取文本信息 获取html html = '''
- first item
- second item
- third item
- fourth item
- fifth item
''' from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') print(li) print(li.html())#.html()获取所在html 五、DOM操作 addClass、removeClass html = '''
- first item
- second item
- third item
- fourth item
- fifth item
''' from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') print(li) li.removeClass('active')#删除 print(li) li.addClass('active')#增加 print(li) attr、css html = '''
- first item
- second item
- third item
- fourth item
- fifth item
''' from pyquery import PyQuery as pq doc = pq(html) li = doc('.item-0.active') print(li) li.attr('name', 'link')#设置name属性(原来没有该属性时即为新增) print(li) li.css('font-size', '14px')#设置一条css样式(写入style属性) print(li) 结果: <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li> <li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li> remove html = '''
- first item
- second item
- third item
- fourth item
- fifth item
Hello, World''' from pyquery import PyQuery as pq doc = pq(html) wrap = doc('.wrap') print(wrap.text()) wrap.find('p').remove()#找到p标签然后删除 print(wrap.text()) 结果: Hello, World This is a paragraph. Hello, World 其他DOM方法 http://pyquery.readthedocs.io/en/latest/api.html 六、伪类选择器 html = '''This is a paragraph.
''' from pyquery import PyQuery as pq doc = pq(html) li = doc('li:first-child')#第一个子元素 print(li) li = doc('li:last-child')#最后一个子元素 print(li) li = doc('li:nth-child(2)')#第2个子元素 print(li) li = doc('li:gt(2)')#索引大于2的元素(索引从0开始) print(li) li = doc('li:nth-child(2n)')#第偶数个子元素 print(li) li = doc('li:contains(second)')#文本中包含second的元素 print(li) 结果: <li class="item-0">first item</li> <li class="item-0"><a href="link5.html">fifth item</a></li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a></li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-1 active"><a href="link4.html">fourth item</a></li> <li class="item-1"><a href="link2.html">second item</a></li> 更多CSS选择器可以查看 http://www.w3school.com.cn/css/index.asp 官方文档 http://pyquery.readthedocs.io/
- first item
- second item
- third item
- fourth item
- fifth item