Python爬虫(四) | 解析库--BeautifulSoup、Xpath、pyquery

1.BeautifulSoup

#coding=utf-8
_date_ = '2019/3/28 16:58'
from bs4 import BeautifulSoup

# 1. Quick warm-up: parse an inline HTML string.
# NOTE(review): the original HTML snippet was stripped when this page was
# scraped, so the literal below contains no <title> tag.
html = """
aaaaa

"""
soup = BeautifulSoup(html, 'lxml')
print(soup.prettify())  # pretty-printed HTML
# Guard: soup.title is None when the document has no <title>, and
# None.string would raise AttributeError (as the original code did here).
print(soup.title.string if soup.title is not None else None)

# 2. Element selection: grab nodes by tag name.
# Use a context manager so the file handle is closed deterministically
# (the original leaked the handle by calling open() inline).
with open('test2.html', 'r', encoding='utf-8') as f:
    html = f.read()
soup = BeautifulSoup(html, 'lxml')
print(soup.title.string)  # text of the <title> node
print(soup.head)          # the whole <head> element
print(soup.p)             # only the FIRST <p> in the document

# 3. Extracting information from a node.
# with-block closes the file handle the original version leaked.
with open('test2.html', 'r', encoding='utf-8') as f:
    html = f.read()
soup = BeautifulSoup(html, 'lxml')
# (1) tag name
print(soup.title.name)
# (2) attributes: .attrs is a dict; subscripting the tag is a shortcut.
print(soup.p.attrs)
print(soup.p.attrs['name'])
print(soup.p['class'])  # 'class' is multi-valued, so this yields a list
# (3) text content of the first <p>
print(soup.p.string)

# 4. Relational (tree-walking) selection.
# (1) children/descendants: .contents (list), .children (iterator),
#     .descendants (all nested nodes).
# with-block closes the file handle the original version leaked.
with open('test2.html', 'r', encoding='utf-8') as f:
    html = f.read()
soup = BeautifulSoup(html, 'lxml')
print(soup.p.contents)
print(soup.p.children)  # prints the iterator object itself, not the nodes
for i, child in enumerate(soup.p.children):
    print(i, child)

# (2) .parent gives the direct parent, .parents all ancestors.
# (3) .next_sibling / .previous_sibling: the adjacent sibling node;
#     .next_siblings / .previous_siblings: all following/preceding siblings.

# 5. Method selectors: find_all().
import re

# with-block closes the file handle the original version leaked.
with open('test2.html', 'r', encoding='utf-8') as f:
    html = f.read()
soup = BeautifulSoup(html, 'lxml')
# Match on attributes via a dict...
print(soup.find_all(attrs={'class': 'story'}))
# ...or on text content via a compiled regex.
# NOTE(review): `text=` is deprecated in bs4 >= 4.4 in favour of `string=`;
# kept here so behaviour matches older bs4 versions the tutorial targeted.
print(soup.find_all(text=re.compile('they')))

# 6. CSS selectors via select().
# with-block closes the file handle the original version leaked.
with open('test2.html', 'r', encoding='utf-8') as f:
    html = f.read()
soup = BeautifulSoup(html, 'lxml')
print(soup.select('.title'))  # by class
print(soup.select('a')[2])    # third <a> in document order
for a in soup.select('a'):
    # get_text() and .string both read the node's text content here
    print(a.get_text(), a.string)
    print(a['href'])

2.Xpath

#coding=utf-8
_date_ = '2019/3/26 19:36'
from lxml import etree
# NOTE(review): the HTML sample inside this literal was lost when the page
# was scraped; with only whitespace, etree.HTML() may return None or raise,
# so the calls below would fail — confirm against the original tutorial.
text = """

"""
#1. etree can auto-complete/repair broken HTML markup
html = etree.HTML(text)  # initialise an element tree from a string
result = etree.tostring(html)  # serialise the tree back to bytes
print(result.decode('utf-8')) # decode bytes to str

# 2. Parse HTML straight from a file on disk, then serialise it back.
tree = etree.parse('test.html', etree.HTMLParser())
serialized = etree.tostring(tree)
print(serialized.decode('utf-8'))

# 3-6. Node selection with XPath path expressions:
#   //*      every node in the document
#   //li     every <li> node
#   //li/a   <a> nodes that are DIRECT children of an <li>
#   //ul//a  <a> nodes anywhere underneath a <ul>
# Each query re-parses the file and prints the matched Element list,
# exactly as the four separate demos did.
for expression in ('//*', '//li', '//li/a', '//ul//a'):
    tree = etree.parse('test.html', etree.HTMLParser())
    print(tree.xpath(expression))

# 7. Climbing to a parent: '..' and 'parent::*' are equivalent axes.
tree = etree.parse('test.html', etree.HTMLParser())
via_dots = tree.xpath('//a[@href="link1.html"]/../@class')
via_axis = tree.xpath('//a[@href="link1.html"]/parent::*/@class')
print(via_dots, via_axis)

# 8. Attribute matching: filter nodes with an @ predicate.
tree = etree.parse('test.html', etree.HTMLParser())
print(tree.xpath('//li[@class="item-0"]'))

# 9. Text extraction with text().
tree = etree.parse('test.html', etree.HTMLParser())
print(tree.xpath('//li[@class="item-0"]//text()'))

# 10. Partial attribute matching with contains().
# NOTE(review): the HTML literals in sections 10-11 were lost when this page
# was scraped (only bullet residue remained) and the surrounding code was
# collapsed onto single lines; reconstructed from context — TODO confirm
# against the original tutorial.
text = '<li class="li li-first"><a href="link.html">first item</a></li>'
html = etree.HTML(text)
result = html.xpath('//li[contains(@class,"li")]')
print(result)

# 11. Matching on several attributes combined with 'and'.
text = '<li class="li li-first" name="a"><a href="link.html">first item</a></li>'
html = etree.HTML(text)
result = html.xpath('//li[contains(@class,"li") and @name="a"]')
print(result)

# 12. Ordinal selection: [n], last(), position().
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//li[1]//a/text()')  # first <li>
print(result)
result = html.xpath('//li[last()]//a/text()')  # last <li>
print(result)
result = html.xpath('//li[position()<3]//a/text()')  # first two <li>
print(result)
result = html.xpath('//li[last()-2]//a/text()')  # third from the end
print(result)

# 13. Node-axis selection.
html = etree.parse('test.html', etree.HTMLParser())
result = html.xpath('//li[1]/ancestor::*')  # all ancestor nodes
print(result)
result = html.xpath('//li[1]/ancestor::div')  # only <div> ancestors
print(result)
result = html.xpath('//li[1]/parent::*')  # direct parent
print(result)
result = html.xpath('//li[1]/attribute::*')  # attribute values
print(result)
result = html.xpath('//li[1]/child::*')  # direct children
print(result)
result = html.xpath('//li[1]/following::li')  # all <li> after this node in document order
print(result)

    3.pyquery

    #coding=utf-8
    _date_ = '2019/3/28 17:48'
    # NOTE(review): the HTML sample inside this literal was stripped when the
    # page was scraped, so the selectors below print empty results — confirm
    # against the original tutorial.
    html="""
    
    """
    from pyquery import PyQuery as pq
    #1. Initialise from an HTML string
    doc = pq(html)
    print(doc('li'))       # CSS selector: all <li> nodes
    print(doc('.item-1'))  # by class
    print(doc('#a'))       # by id
    #2. Initialise from a URL (performs an HTTP GET)
    doc = pq(url='https://www.baidu.com')
    # print(doc)
    #3. Initialise from a local file
    doc = pq(filename='test.html')
    print(doc('.item-1'))
    
    #查找节点
    #1.查找子孙节点  find()
    #2.查找子节点 children()
    #3.查找父节点 parent()
    #4.查找祖父节点 parents()
    #5.查找兄弟节点siblings()
    
    #获取信息
    #1.获取属性 .attr('href')  or .attr.href
    #2.获取文本  .text()
    
    #伪类选择器
    #第一个节点 :first-child
    #最后一个节点 :last-child
    #第二个节点  :nth-child(2)
    #第三个节点后的 :gt(2)
    #偶数节点 :nth-child(2n)
    #包含内容的节点 :contains(second)
    

    你可能感兴趣的:(爬虫)