五. PyQuery

PyQuery

初始化

字符串初始化

# Sample document: a <div> wrapping a five-item list.
html = '''
<div>
    <ul>
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq

# Passing a markup string to pq() parses it directly.
doc = pq(html)
# Calling the document with a CSS selector returns the matching elements.
print(doc('li'))

查找所有的li标签。输出结果如下:

<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>

URL初始化

from pyquery import PyQuery as pq

# With url=..., pq() is handed the address of the page to load and parse.
doc = pq(url='http://www.baidu.com')
head = doc('head')
print(head)

选出百度网站里面head标签里面的内容。
输出结果如下:

<head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>&#231;&#153;&#190;&#229;&#186;&#166;&#228;&#184;&#128;&#228;&#184;&#139;&#239;&#188;&#140;&#228;&#189;&#160;&#229;&#176;&#177;&#231;&#159;&#165;&#233;&#129;&#147;</title></head> 

文件初始化

from pyquery import PyQuery as pq

# With filename=..., pq() is handed the path of a local HTML file to parse.
doc = pq(filename='demo.html')
li_nodes = doc('li')
print(li_nodes)

原理相同,只是改为从本地文件初始化。

基本CSS选择器

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# "#container .list li": every <li> nested under an element with class "list"
# that itself sits inside the element whose id is "container".
print(doc('#container .list li'))

选择id为container的元素内、class为list的元素下的所有li标签。选择器之间的空格表示后代(嵌套)关系。
输出结果为:

<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>

查找元素

子元素

html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
items = doc('.list')
print(type(items))
print(items)
# find() searches inside the current selection for matching elements.
lis = items.find('li')
print(type(lis))
# Print the selection itself (the original printed the builtin `list` by mistake).
print(lis)

find找出所有li标签。
输出结果为:

<class 'pyquery.pyquery.PyQuery'>
<ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>

<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>

children

查找所有的直接子元素

# Keep only the direct children of the selection that match ".active".
active_children = items.children('.active')
print(active_children)

在直接子元素中筛选出class包含active的元素。

父元素

# The markup was missing from this literal; it is restored here to match the
# output printed below the snippet.
html = '''
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
items = doc('.list')
# parent() returns the element that directly contains the selection.
container = items.parent()
print(type(container))
print(container)

打印父元素:

<class 'pyquery.pyquery.PyQuery'>
<div id="container">
    <ul class="list">
         <li class="item-0">first item</li>
         <li class="item-1"><a href="link2.html">second item</a></li>
         <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
         <li class="item-1 active"><a href="link4.html">fourth item</a></li>
         <li class="item-0"><a href="link5.html">fifth item</a></li>
     </ul>
 </div>

parents元素

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
items = doc('.list')
# parents() returns every ancestor of the selection, not just the nearest one.
parents = items.parents()
print(type(parents))
print(parents)

返回所有的祖先元素(父元素、父元素的父元素……)。
结果为

<class 'pyquery.pyquery.PyQuery'>
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div><div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>

还可以加入参数进行筛选。

# Passing a selector keeps only the ancestors that match it.
wrap = items.parents('.wrap')
print(wrap)

选取类为wrap的标签。

兄弟元素

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# No space between ".item-0" and ".active": the element must carry both classes.
li = doc('.list .item-0.active')
# siblings() returns the other elements that share the same parent.
print(li.siblings())

选择类为list的元素下、同时带有item-0和active两个类的标签。
li.siblings()用于获取该元素的所有兄弟元素。

遍历

# The list markup was missing from this literal; it is restored here to match
# the output printed below the snippet.
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# items() yields each matched element in turn; its type is printed below.
lis = doc('li').items()
print(type(lis))
for li in lis:
    print(li)

.items()方法,返回一个迭代对象。

<class 'generator'>
<li class="item-0">first item</li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

获取信息

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# The <a> inside the <li> that carries both "item-0" and "active".
a = doc('.item-0.active a')
print(a)
# Two equivalent ways of reading the href attribute.
print(a.attr('href'))
print(a.attr.href)

获取a标签的href属性值,attr('href')和attr.href两种写法等价。

<a href="link3.html"><span class="bold">third item</span></a>
link3.html
link3.html

获取文本

# The list markup was missing from this literal; it is restored from the
# sample document used by the neighbouring snippets.
html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
a = doc('.item-0.active a')
print(a)
# text() returns the text content with all tags stripped.
print(a.text())

结果为:

<a href="link3.html"><span class="bold">third item</span></a>
third item

获取html

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
li = doc('.item-0.active')
print(li)
# html() returns the markup inside the selected element.
print(li.html())

html()返回所选元素内部的HTML代码(不包含元素自身的标签)。

DOM操作

addClass、removeClass

增加类和删除类。

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# Select the <li> that carries both "item-0" and "active".
li = doc('.item-0.active')
print(li)
# Remove the "active" class from it.
li.removeClass('active')
print(li)
# Add the "active" class back.
li.addClass('active')
print(li)

输出结果:

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

修改属性和css

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
li = doc('.item-0.active')
print(li)
# Set name="link" on the <li>; an existing name attribute would be overwritten.
li.attr('name', 'link')
print(li)
# Set the inline style font-size to 14px.
li.css('font-size', '14px')
print(li)

输出结果:

<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>

<li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>

remove

# The tags were missing from this literal; the wrapper div and the <p> are
# restored to match the '.wrap' and 'p' selectors used below.
html = '''
<div class="wrap">
    Hello, World
    <p>This is a paragraph.</p>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
# Drop the <p> child so that only the div's own text remains.
wrap.find('p').remove()
print(wrap.text())

如果只想获取"Hello, World"这段文本,可以先用.remove()移除p标签,再调用text()。
运行结果:

Hello, World This is a paragraph.
Hello, World

其他DOM方法

http://pyquery.readthedocs.io/en/latest/api.html

伪类选择器

html = '''
<div class="wrap">
    <div id="container">
        <ul class="list">
             <li class="item-0">first item</li>
             <li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>
         </ul>
     </div>
 </div>
'''
from pyquery import PyQuery as pq

doc = pq(html)
# The first child.
li = doc('li:first-child')
print(li)
# The last child.
li = doc('li:last-child')
print(li)
# The n-th child (here the second).
li = doc('li:nth-child(2)')
print(li)
# Elements whose zero-based index is greater than 2.
li = doc('li:gt(2)')
print(li)
# Every even-numbered child.
li = doc('li:nth-child(2n)')
print(li)
# Elements whose text contains "second".
li = doc('li:contains(second)')
print(li)
print(li)

结果:

<li class="item-0">first item</li>

<li class="item-0"><a href="link5.html">fifth item</a></li>

<li class="item-1"><a href="link2.html">second item</a></li>

<li class="item-1 active"><a href="link4.html">fourth item</a></li>
             <li class="item-0"><a href="link5.html">fifth item</a></li>

<li class="item-1"><a href="link2.html">second item</a></li>
             <li class="item-1 active"><a href="link4.html">fourth item</a></li>

<li class="item-1"><a href="link2.html">second item</a></li>

官方文档

http://pyquery.readthedocs.io/

你可能感兴趣的:(Python从入门到放弃系列)