python3 2018分布式爬虫教程 -7 PyQuery 库详解

PyQuery:网页解析库,相比于BeautifulSoup语法更简单

安装命令:

pip install pyquery

pyquery 初始化对象的三种方式:

1.字符串初始化:

#coding=utf-8

from pyquery import PyQuery as pq

html='''

Hello

  • Foo
  • Bar
  • Jay
  • Foo
  • Bar
'''
# doc 一个pyquery对象
doc = pq(html)
# 获取html中所有的 li 标签
print(doc('li'))

'''
  • Foo
  • Bar
  • Jay
  • Foo
  • Bar
  • '''

    2.url初始化:当请求返回为gbk编码时可以设置 encoding='gbk' 

    #coding=utf-8
    
    from pyquery import PyQuery as pq
    
    # doc 一个pyquery对象
    doc = pq(url="https://search.51job.com/list/000000,000000,0000,00,9,99,%2B,2,3.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line=&specialarea=00&from=&welfare="
             ,encoding='gbk')
    # 获取html中的 title 标签
    print(doc('title'))
    
    
    '''
    【全国招聘,求职】-前程无忧
    '''

    3.文件初始化:

    #coding=utf-8
    
    from pyquery import PyQuery as pq
    doc = pq(filename='demo.html')
    print(doc('li'))
    
    '''
    
  • first item
  • second item
  • third item
  • fourth item
  • fifth item
  • '''

    4.基本CSS选择器

    . 代表class

    # 代表id

    中间以空格隔开

    当一个 class 或者 id 的内容有空格时:比如:class="item-1 active"

    获取元素时 用: .item-1.active    (注意中间没有空格)

    # coding=utf-8
    
    from pyquery import PyQuery as pq
    
    html = '''
    
    '''
    doc = pq(html)
    print(doc('#container .item-0.active a'))
    
    '''
    third item
    '''

    5.查找元素:

    查找后代元素: doc.find('li') 查找所有后代元素中的 li 标签

    查找子元素: doc.children() 查找所有直接子元素

    查找子元素: doc.children('a') 查找直接子元素中的 a 标签

    # coding=utf-8
    
    from pyquery import PyQuery as pq
    
    html = '''
    
    '''
    doc = pq(html)
    item = doc('#container .list')
    print(item.find('li'))
    print("----------------------------")
    print(item.children('a'))
    print("----------------------------")
    print(item.children())
    
    '''
    
  • first item
  • second item
  • third item
  • fourth item
  • fifth item
  • ---------------------------- fourth item ----------------------------
  • first item
  • second item
  • third item
  • fourth item
  • fifth item
  • fourth item '''

    查找父元素:

    items.parent(): 查找直接父元素

    items.parents():查找所有祖先元素

    items.parents('.wrap'):查找指定祖先元素

    # coding=utf-8
    
    from pyquery import PyQuery as pq
    
    html = '''
    
    '''
    doc = pq(html)
    item = doc('#container .list .item-0')
    print(item.parent())
    print("----------------------------")
    print(item.parents())
    print("----------------------------")
    print(item.parents('.list'))
    
    '''
    
     
    ----------------------------
    
     
    ----------------------------
    
    '''

    查找兄弟节点:

    siblings():查找所有兄弟节点

    siblings('.item-0'):查找所有 class 为 'item-0' 的兄弟节点

    # coding=utf-8
    
    from pyquery import PyQuery as pq
    
    html = '''
    
    '''
    doc = pq(html)
    item = doc('#container .list a')
    print(item.siblings())
    print("-------------------------------")
    print(item.siblings('.item-0'))
    
    
    '''
    
  • fifth item
  • fourth item
  • third item
  • second item
  • first item
  • -------------------------------
  • fifth item
  • third item
  • first item
  • '''

    6.遍历:

    # coding=utf-8
    
    from pyquery import PyQuery as pq
    
    html = '''
    
    '''
    doc = pq(html)
    item = doc('#container .list li').items()
    for one in item:
        print(one)
    
    '''
    
  • first item
  • second item
  • third item
  • fourth item
  • fifth item
  • '''

    7.获取信息:

    获取属性: a.attr('href') 或 a.attr.href

    获取文本: a.text()

    获取html: a.html()

    # coding=utf-8
    
    from pyquery import PyQuery as pq
    
    html = '''
    
    '''
    doc = pq(html)
    item = doc('#container .list .item-0')
    a = item('a')
    print(a.attr.href)
    print(a.attr('href'))
    print('----------------------------------')
    print(a.text())
    print('----------------------------------')
    print(a.html())
    
    '''
    link3.html
    link3.html
    ----------------------------------
    third item fifth item
    ----------------------------------
    third item
    '''
    
    

    8.dom操作:

    addClass、removeClass:此操作会直接修改原对象,永久生效

    # coding=utf-8
    
    from pyquery import PyQuery as pq
    
    html = '''
    
    '''
    doc = pq(html)
    li = doc('.item-0.active')
    print(li.removeClass('active'))
    print(li)
    print("------------------------------")
    print(li.addClass('sub'))
    print(li)
    
    
    '''
    
  • third item
  • third item
  • ------------------------------
  • third item
  • third item
  • '''

    9.增加,修改,删除属性:attr、css

    增加或者修改属性: a.attr('id','test')        添加 id 属性

    增加或者修改样式: a.css('font-size','14px')  在 style 属性中设置 CSS 样式

    删除属性:a.remove_attr('class')             删除 class

    # coding=utf-8
    
    from pyquery import PyQuery as pq
    
    html = '''
    
    '''
    doc = pq(html)
    a = doc('.list .item')
    print(a.attr('id','test'))
    print(a.remove_attr('id'))
    print("------------------------------")
    print(a.attr('class','test'))
    print(a.remove_attr('class'))
    
    
    '''
    fourth item
         
    fourth item
         
    ------------------------------
    fourth item
         
    fourth item
    '''

    10.删除标签

    # coding=utf-8
    
    from pyquery import PyQuery as pq
    
    html = '''
    
    Hello, World

    This is a paragraph.

    '''
    doc = pq(html)
    div_text = doc('.sss')
    print(div_text.text())
    print("-------------------------")
    print(div_text('p').remove())
    print(div_text.text())

    '''
    Hello, World This is a paragraph.
    -------------------------

    This is a paragraph.

    Hello, World '''

    after()在节点后添加值

    before()在节点之前插入值

    append()将值添加到每个节点

    contents()返回文本节点内容

    empty()删除节点内容

    val()设置或获取属性值

    其他DOM方法:http://pyquery.readthedocs.io/en/latest/api.html

    11.伪类选择器

    # coding=utf-8
    
    from pyquery import PyQuery as pq
    
    html = '''
    
    '''
    doc = pq(html)
    li = doc('li:first-child')
    print(li)
    li = doc('li:last-child')
    print(li)
    li = doc('li:nth-child(2)')
    print(li)
    li = doc('li:gt(2)')
    print(li)
    li = doc('li:nth-child(2n)')
    print(li)
    li = doc('li:contains(second)')
    print(li)
    
    
    '''
    
  • first item
  • fifth item
  • second item
  • fourth item
  • fifth item
  • second item
  • fourth item
  • second item
  • '''

     

     

     

     

     

     

     

     

     

     

    你可能感兴趣的:(python,pyquery,详解,python,爬虫)