Python web scraping: XPath, BeautifulSoup, and regular expressions, a complete usage guide

from lxml import etree
from bs4 import BeautifulSoup
import re

html = """



xpath test





  • 时间

  • 地点

  • 任务




这里是个小标题



  1. 1

  2. 2

  3. 3



  • 84

  • 104

  • 223




这里是H3的内容
百度一下

  • test1

  • test2






  • 1

  • 2

  • 3

  • 4

  • 5

  • 6

  • 7

  • 8

  • 9

  • 10






"""

def title():

    # Method 1: XPath
    html_etree = etree.HTML(html)
    # print(type(html_etree))  # <class 'lxml.etree._Element'>
    # result = etree.tostring(html_etree)  # shows the parsed document, with any incomplete tags filled in
    # print(result.decode('utf-8'))  # tostring() returns bytes, so decode() it into str
    title_xpath1 = html_etree.xpath('/html/head/title/text()')  # text() pulls out the text node
    print('Title via absolute XPath:', title_xpath1)  # xpath() returns a list
    title_xpath2 = html_etree.xpath('//head/title/text()')  # same result: / walks from the root, // matches anywhere in the document
    print('Title via relative XPath:', title_xpath2)

    # Method 2: BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    # print(soup)
    # print(type(soup))  # <class 'bs4.BeautifulSoup'>
    title_soup = soup.select('title')  # soup.select() also returns a list; take an element out, then call get_text()
    # CSS selectors: tag names need no prefix, class names take a leading '.', ids a leading '#', and they can be combined
    # print(title_soup)
    # print(type(title_soup))  # list
    title_BeautifulSoup = title_soup[0].get_text()
    # title_BeautifulSoup = soup.title.get_text()
    print('Title via BeautifulSoup:', title_BeautifulSoup)

    # Method 3: regular expression
    re_pattern = re.compile(r'<title>(.*?)</title>', re.S)  # (.*?) is the group to capture; re.S lets . match across newlines
    # print(type(re_pattern))  # re.compile() returns a compiled pattern object
    title_re_compile = re.findall(re_pattern, html)
    print('Title via regex with re.compile:', title_re_compile)
    # re.compile() can also be skipped
    title_re = re.findall(r'<title>(.*?)</title>', html)
    print('Title via regex without re.compile:', title_re)
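All three approaches return a list even when a single title is expected. If only the first hit is needed, each library also offers a direct single-value form; a small sketch over the same html sample:

def title_single():
    print(etree.HTML(html).xpath('//title/text()')[0])      # index into the one-element list
    print(BeautifulSoup(html, 'lxml').title.get_text())     # .title jumps straight to the first <title> tag
    match = re.search(r'<title>(.*?)</title>', html, re.S)  # re.search() stops at the first match (or returns None)
    print(match.group(1) if match else None)                # group(1) is the captured text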

def price():

    # Method 1: XPath
    html_etree = etree.HTML(html)
    # price_xpath = html_etree.xpath('/html/body/div/@price')
    # price_xpath = html_etree.xpath('/html/body/child::*/@price')  # child::* selects every child element of the current node
    # price_xpath = html_etree.xpath('/html/body/child::div/@price')  # child::div restricts the children to div tags
    # price_xpath = html_etree.xpath('//@price')  # relative path; there is only one price attribute in the document
    # price_xpath = html_etree.xpath("//div[@id='testid']/ancestor::div")  # ancestor:: selects every ancestor div
    # price_xpath = html_etree.xpath("//div[@id='testid']/ancestor::div/@price")  # price attribute of the ancestor divs
    price_xpath = html_etree.xpath("//div[@id='testid']/ancestor-or-self::div/@price")  # ancestor divs plus the current div itself
    print('Price via XPath:', price_xpath)

    # Method 2: BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    price_BeautifulSoup = soup.div.attrs['price']
    # price_BeautifulSoup = soup.find('div').attrs['price']
    # price_BeautifulSoup = soup.select('div')[0].attrs['price']
    print('Price via BeautifulSoup:', price_BeautifulSoup)

    # Method 3: regular expression
    re_pattern = re.compile(r'<div price="(.*?)"', re.S)
    price_re = re.findall(re_pattern, html)
    print('Price via regex:', price_re)
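Instead of pulling the attribute straight out with @price, the element can be selected first (the predicate //div[@price] matches any div that carries a price attribute) and its attributes read off the element object; a small sketch over the same sample:

def price_via_element():
    html_etree = etree.HTML(html)
    div = html_etree.xpath('//div[@price]')[0]  # the only div carrying a price attribute
    print(div.get('price'))                     # one attribute value
    print(dict(div.attrib))                     # the full attribute mapping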

Extracting the text of the li elements in the ul under the first div:

def ul_li():

    # Method 1: XPath
    html_etree = etree.HTML(html)
    # ul_li = html_etree.xpath('//div/div[1]/ul/child::*/text()')  # child:: selects the child elements
    # ul_li = html_etree.xpath('//div/div[1]/ul/li/text()')
    # ul_li = html_etree.xpath("//div[@id='testid']/preceding::div/ul/li/text()")  # preceding:: selects every node before the current one, anchored on a known tag
    ul_li = html_etree.xpath("//div[@id='testid']/preceding::li/text()")  # preceding:: avoids trouble from repeated node names
    print('li text under the ul via XPath:', ul_li)

    # Method 2: BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    # first BeautifulSoup variant
    # ul_li = soup.select('ul')[0].select('li')
    # ul_li = [i.get_text() for i in ul_li]
    # second BeautifulSoup variant
    ul_li = soup.div.div.get_text()
    ul_li = ul_li.strip()  # drop leading/trailing whitespace
    ul_li = ul_li.split('\n')  # split the string on newlines
    print('ul_li via BeautifulSoup:', ul_li)

    # Method 3: regular expression
    re_pattern = re.compile(r'<body>.*?<ul>.*?<li>(.*?)</li>.*?<li>(.*?)</li>.*?<li>(.*?)</li>', re.S)
    re_ul_li = re.findall(re_pattern, html)
    print('ul_li via regex:', re_ul_li)
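The regex above hard-codes exactly three capture groups. When the number of li items is not known in advance, a two-step pattern is more tolerant: isolate the block first, then findall() inside it. A sketch using the div id="testid" marker that already appears in the sample:

def ul_li_two_step():
    block = re.search(r'<body>(.*?)<div id="testid">', html, re.S)   # step 1: cut out everything before the testid div
    if block:
        print(re.findall(r'<li>(.*?)</li>', block.group(1), re.S))   # step 2: collect every li text in that fragment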

def first_id():

    # Method 1: XPath
    html_etree = etree.HTML(html)
    first_id = html_etree.xpath('//div/div[2]/@id')
    print('first_id via XPath:', first_id)

    # Method 2: BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    first_id = soup.select('div')[2].attrs['id']
    print('first_id via BeautifulSoup:', first_id)

    # Method 3: regular expression
    re_compile = re.compile(r'<body>.*?<div id="(.*?)"', re.S)
    first_id = re.findall(re_compile, html)
    print('first_id via regex:', first_id)

def h2():

    # Method 1: XPath
    html_etree = etree.HTML(html)
    h2 = html_etree.xpath('//div/div[2]/h2/text()')
    print('h2 via XPath:', h2)

    # Method 2: BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    # h2 = soup.select('h2')[0].get_text()
    h2 = soup.div.h2.get_text()
    print('h2 via BeautifulSoup:', h2)

    # Method 3: regular expression
    re_compile = re.compile(r'<div id="testid">.*?<h2>(.*?)</h2>', re.S)
    h2 = re.findall(re_compile, html)
    print('h2 via regex:', h2)
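The comment in title() notes that CSS selectors can be combined; the same h2 can therefore be addressed with an id selector plus a child combinator, which is usually more robust than counting divs. A small sketch with select_one():

def h2_css():
    soup = BeautifulSoup(html, 'lxml')
    tag = soup.select_one('div#testid > h2')  # id selector combined with a direct-child combinator
    print(tag.get_text() if tag else None)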

def main():
    title()
    price()
    ul_li()
    first_id()
    h2()

if __name__ == '__main__':
    main()
