html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq

# Initialise a PyQuery document directly from an HTML string.
doc = pq(html)
# A bare tag name is a CSS selector: print every <li> element.
print(doc('li'))
查找所有的li标签。输出结果如下:
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
from pyquery import PyQuery as pq

# Initialise the document from a URL instead of an HTML string.
doc = pq(url='http://www.baidu.com')
# Select the <head> element first, then print it.
head = doc('head')
print(head)
选出百度网站里面head标签里面的内容。
输出结果如下:
<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下,你就知道</title></head>
from pyquery import PyQuery as pq

# Initialise the document from a local HTML file.
doc = pq(filename='demo.html')
# Select every <li> element first, then print the matches.
matched = doc('li')
print(matched)
原理相同,只是改为通过本地文件来初始化。
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# Descendant selector: each space means "nested inside", so this matches
# <li> elements under class "list" under the element with id "container".
print(doc('#container .list li'))
选择id为container的元素中、类为list的元素下的li标签。选择器中的空格代表一层嵌套关系。
输出结果为:
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# Select the <ul class="list"> element; the result is itself a PyQuery object.
items = doc('.list')
print(type(items))
print(items)
# find() searches the descendants of the current selection.
lis = items.find('li')
print(type(lis))
# Print the matched <li> elements (the original printed the builtin `list`).
print(lis)
find方法在当前选中的元素内部查找所有li标签。
输出结果为:
<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
查找所有的直接子元素
# Continuation of the previous snippet: `items` is the selection made there.
# children() with a selector keeps only the child elements matching `.active`.
lis = items.children('.active')
print(lis)
查找子元素里类为active类的元素。
html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
items = doc('.list')
# parent() returns the direct parent of the selection, here <div id="container">.
container = items.parent()
print(type(container))
print(container)
打印父元素:
<class 'pyquery.pyquery.PyQuery'>
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
items = doc('.list')
# parents() returns every ancestor of the selection, not just the direct parent.
parents = items.parents()
print(type(parents))
print(parents)
返回所有的祖先元素。
结果为:
<class 'pyquery.pyquery.PyQuery'>
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div><div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
还可以加入参数进行筛选。
# Continuation of the previous snippet: `items` is the selection made there.
# Passing a selector to parents() keeps only the ancestors matching `.wrap`.
parent = items.parents('.wrap')
print(parent)
选取类为wrap的标签。
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# No space between `.item-0` and `.active`: both classes must be on the same
# element. The space after `.list` means that element is nested inside it.
li = doc('.list .item-0.active')
# siblings() returns the other elements at the same level as the selection.
print(li.siblings())
先选中类为list的元素下、同时具有item-0和active两个类的li标签。
li.siblings()用于选取它的兄弟元素。
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# items() yields each matched element wrapped as its own PyQuery object.
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li)
.items()方法返回一个生成器,可以逐个遍历选中的元素。
<class 'generator'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# Select the <a> inside the element carrying both `item-0` and `active`.
a = doc('.item-0.active a')
print(a)
# Two equivalent ways to read the href attribute.
print(a.attr('href'))
print(a.attr.href)
获取a标签的href属性值,两种写法等价。
<a href="link3.html"><span class="bold">third item</span></a>
link3.html
link3.html
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
a = doc('.item-0.active a')
print(a)
# text() returns the text content of the selection with the markup stripped.
print(a.text())
结果为:
<a href="link3.html"><span class="bold">third item</span></a>
third item
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
li = doc('.item-0.active')
print(li)
# html() returns the markup inside the selected element.
print(li.html())
html()方法获取的是所选元素内部的HTML代码。
增加类和删除类。
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# Select the <li> that carries both the `item-0` and `active` classes.
li = doc('.item-0.active')
print(li)
# Remove the `active` class from the selected element.
li.removeClass('active')
print(li)
# Add the `active` class back.
li.addClass('active')
print(li)
输出结果:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
li = doc('.item-0.active')
print(li)
# Set the attribute name="link"; an existing `name` attribute is overwritten.
li.attr('name', 'link')
print(li)
# Set the inline style font-size to 14px.
li.css('font-size', '14px')
print(li)
输出结果:
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>
html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
wrap = doc('.wrap')
# Text of the whole wrapper, including the nested <p>.
print(wrap.text())
# Remove the <p> from the document so that only the wrapper's own text remains.
wrap.find('p').remove()
print(wrap.text())
如果只想获取“Hello, World”,可以先用.remove()方法移除p标签,
再调用text()获取剩余的文本。
运行结果:
Hello, World This is a paragraph.
Hello, World
http://pyquery.readthedocs.io/en/latest/api.html
html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# The <li> that is the first child of its parent.
li = doc('li:first-child')
print(li)
# The <li> that is the last child of its parent.
li = doc('li:last-child')
print(li)
# The <li> that is the second child of its parent.
li = doc('li:nth-child(2)')
print(li)
# Matched <li> elements whose zero-based index is greater than 2.
li = doc('li:gt(2)')
print(li)
# Every <li> in an even child position.
li = doc('li:nth-child(2n)')
print(li)
# Every <li> whose text contains "second".
li = doc('li:contains(second)')
print(li)
结果:
<li class="item-0">first item</li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
http://pyquery.readthedocs.io/