Python3爬虫入门之pyquery库的使用

pyquery

初始化

字符串初始化

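html = '''
<div>
    <ul>
        <li class="item-0">first item</li>
        <li class="item-1"><a href="link2.html">second item</a></li>
        <li class="item-0 active"><a href="link3.html">third item</a></li>
        <li class="item-1 active"><a href="link4.html">fourth item</a></li>
        <li class="item-0"><a href="link5.html">fifth item</a></li>
    </ul>
</div>
'''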
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('li'))  # CSS选择器
  • first item
  • second item
  • third item
  • fourth item
  • fifth item
    URL初始化

    from pyquery import PyQuery as pq
    doc = pq(url='http://www.baidu.com')
    print(doc('head'))
    百度一下,你就知道 
    

    文件初始化

    from pyquery import PyQuery as pq
    doc = pq(filename='demo.html')
    print(doc('li'))
    ---------------------------------------------------------------------------
    
    FileNotFoundError                         Traceback (most recent call last)
    
     in ()
          1 from pyquery import PyQuery as pq
    ----> 2 doc = pq(filename='demo.html')
          3 print(doc('li'))
    
    
    D:\Anaconda3\lib\site-packages\pyquery\pyquery.py in __init__(self, *args, **kwargs)
        214             # specific case to get the dom
        215             if 'filename' in kwargs:
    --> 216                 html = open(kwargs['filename'])
        217             elif 'url' in kwargs:
        218                 url = kwargs.pop('url')
    
    
    FileNotFoundError: [Errno 2] No such file or directory: 'demo.html'
    

    基本CSS选择器

    选择 id 用 “#”,选择 class 用 “.”,选择标签则直接写标签名。

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    print(doc('#container .list li'))  # 'id' = '#', 'class' = '.'
  • first item
  • second item
  • third item
  • fourth item
  • fifth item
    查找元素

    子元素

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    print(type(items))
    print(items)
    lis = items.find('li')
    print(type(lis))
    print(lis)
    
    
    
    
    
  • first item
  • second item
  • third item
  • fourth item
  • fifth item
    lis = items.children()
    print(type(lis))
    print(lis)
    
    
  • first item
  • second item
  • third item
  • fourth item
  • fifth item
    lis = items.children('.active')
    print(lis)
  • third item
  • fourth item
    父元素

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    container = items.parent()
    print(type(container))
    print(container)
    
    
    
    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    items = doc('.list')
    parents = items.parents()
    print(type(parents))
    print(parents)
    
    
    
    parent = items.parents('.wrap')
    print(parent)
    
    

    兄弟元素

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.list .item-0.active')
    print(li.siblings())
  • second item
  • first item
  • fourth item
  • fifth item
    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.list .item-0.active')
    print(li.siblings('.active'))
  • fourth item
    遍历

    单个元素

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
  • third item
    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    lis = doc('li').items()
    print(type(lis))
    for li in lis:
        print(li)
    
    
  • first item
  • second item
  • third item
  • fourth item
  • fifth item
    获取信息

    获取属性

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc('.item-0.active a')
    print(a)
    print(a.attr('href'))
    print(a.attr.href)
    third item
    link3.html
    link3.html
    

    获取文本

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    a = doc('.item-0.active a')
    print(a)
    print(a.text())
    third item
    third item
    

    获取HTML

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    print(li.html())
  • third item
  • third item

    DOM操作

    addClass、removeClass

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    li.removeClass('active')
    print(li)
    li.addClass('active')
    print(li)
  • third item
  • third item
  • third item
    attr、css

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('.item-0.active')
    print(li)
    li.attr('name', 'link')
    print(li)
    li.css('font-size', '14px')
    print(li)
  • third item
  • third item
  • third item
    remove

    html = '''
    <div class="wrap">
        Hello, World
        <p>This is a paragraph.</p>
    </div>
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    wrap = doc('.wrap')
    print(wrap.text())
    wrap.find('p').remove()
    print(wrap.text())
    Hello, World This is a paragraph.
    Hello, World
    

    其他DOM方法

    http://pyquery.readthedocs.io/en/latest/api.html

    伪类选择器

    html = '''
    
    '''
    from pyquery import PyQuery as pq
    doc = pq(html)
    li = doc('li:first-child')
    print(li)
    li = doc('li:last-child')
    print(li)
    li = doc('li:nth-child(2)')
    print(li)
    li = doc('li:gt(2)')
    print(li)
    li = doc('li:nth-child(2n)')
    print(li)
    li = doc('li:contains(second)')
    print(li)
  • first item
  • fifth item
  • second item
  • fourth item
  • fifth item
  • second item
  • fourth item
  • second item
    更多CSS选择器可以查看:
    http://www.w3school.com.cn/css/index.asp

    官方文档

    http://pyquery.readthedocs.io/

    你可能感兴趣的:(爬虫)