python爬虫之xpath详解(附加实战代码)

xpath解析

  • 最常用且最便捷高效的一种方式
  • xpath解析原理:
    1. 实例化一个etree的对象,且需要将被解析的页面源码数据加载到该对象中
    2. 调用etree对象中的xpath方法结合xpath表达式实现标签的定位和内容的捕获
  • 环境安装
    • pip install lxml
  • 如何实例化一个etree对象
    1. 导包:from lxml import etree
    2. 将本地的HTML文档中的数据加载到该对象中:
      • etree.parse(filePath,etree.HTMLParser())
      • 解析本地文件时第二个参数最好加上,不然可能报错
    3. 可以将互联网上获取的源码数据加载到该对象中
      • etree.HTML(page_text)
  • xpath(‘xpath表达式’)
    • 标签定位:
      • xpath表达式通过层级关系定位标签
      # 标签的定位
      #最前面的/表示从根节点开始
      # 一个标签返回一个element对象
      r=tree.xpath('/html/head/title')
      
    • 多层级定位
      #一个//表示一个多层级,也可以表示从任意位置开始定位
      r=tree.xpath('/html//title')
      
    • 精准定位
      #精准定位class为song的div
      r = tree.xpath('//div[@class="song"]')
      
    • 索引定位
      # 索引定位,返回第几个元素,且索引从1开始
      r = tree.xpath('//div[@class="song"]/p[3]')
      
    • 取直系文本
      #取文本,text()返回的是一个列表,取得是直系内容
        r = tree.xpath('//div[@class="song"]//li[5]/a/text()')
      
    • 取非直系文本
      #获取标签中非直系的文本内容
      r = tree.xpath('//li[7]//text()')
      
    • 取属性值
      #取属性值
      r = tree.xpath('//div[@class="song"]/img/@src')[0]
      
    • 以上所有xpath方法返回的都是列表

xpath实战之爬取58二手房

import requests
from lxml import etree

if __name__ == '__main__':
    # Fetch the page source of the 58.com second-hand housing listing.
    url = 'https://bj.58.com/ershoufang/'
    # Spoof the User-Agent so the request looks like a normal browser.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'
    }
    page_text = requests.get(url=url, headers=head).text
    # Parse the HTML and locate every listing-detail container.
    tree = etree.HTML(page_text)
    # Renamed from `list`, which shadowed the builtin.
    div_list = tree.xpath('//div[@class="property-content-detail"]')
    # `with` guarantees the file is closed even if an xpath lookup raises.
    with open('58.txt', 'w', encoding='utf-8') as fp:
        for div in div_list:
            # './' anchors the expression at the current div element.
            title = div.xpath('.//text()')[0]
            print(title)
            fp.write(title + '\n')

xpath实战之4k图片解析下载

import requests
from lxml import etree
import os

if __name__ == '__main__':
    # Fetch the gallery index page.
    url = 'https://pic.netbian.com/4kmeinv/'
    # Spoof the User-Agent so the request looks like a normal browser.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'
    }
    response = requests.get(url=url, headers=head)
    # Alternative: set the encoding on the whole response instead of
    # fixing each name individually (see the per-name round-trip below).
    # response.encoding = 'gbk'
    page_text = response.text
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]/ul/li')
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('./picLibs', exist_ok=True)
    for li in li_list:
        img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_name = li.xpath('./a/img/@alt')[0] + '.jpg'
        # requests decoded the GBK page as ISO-8859-1; round-trip the bytes
        # to recover the original Chinese characters (generic mojibake fix).
        img_name = img_name.encode('iso-8859-1').decode('gbk')
        # print(img_name, img_src)
        img_data = requests.get(url=img_src, headers=head).content
        img_path = 'picLibs/' + img_name
        with open(img_path, 'wb') as fp:
            fp.write(img_data)
            print(img_name + '下载完成!!')

xpath实战之全国城市名称爬取

import requests
from lxml import etree
import os

if __name__ == '__main__':
    # Fetch the historical air-quality index page.
    url = 'https://www.aqistudy.cn/historydata/'
    # Spoof the User-Agent so the request looks like a normal browser.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'
    }
    page_text = requests.get(url=url, headers=head).text
    tree = etree.HTML(page_text)
    # Hot cities and the full city list sit at different levels; a single
    # union xpath could fetch both in one query:
    # tree.xpath('//div[@class="bottom"]/ul/li/a | //div[@class="bottom"]/ul/div[2]/li/a')
    hot_li_list = tree.xpath('//div[@class="bottom"]/ul/li')
    full_li_list = tree.xpath('//div[@class="bottom"]/ul/div[2]/li')
    # Comprehension replaces the two manual append loops; order is
    # preserved (hot cities first, then the full list).
    all_city_names = [li.xpath('./a/text()')[0] for li in hot_li_list + full_li_list]
    print(all_city_names, len(all_city_names))

xpath实战之图片爬取

import requests
import os
from lxml import etree

if __name__ == '__main__':
    # Running index appended to filenames to keep them unique
    # (renamed from the opaque `lxm`).
    img_count = 0
    url = 'https://www.vilipix.com/ranking'
    # Spoof the User-Agent so the request looks like a normal browser.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.99 Safari/537.36 Edg/97.0.1072.69'
    }
    page_text = requests.get(url=url, headers=head).text
    tree = etree.HTML(page_text)
    link_list = tree.xpath('//div[@class="title"]/a')
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs('./p站', exist_ok=True)
    for link in link_list:
        # Follow each ranking entry to its detail page.
        detail_url = 'https://www.vilipix.com' + link.xpath('./@href')[0]
        detail_text = requests.get(url=detail_url, headers=head).text
        detail_tree = etree.HTML(detail_text)
        img_list = detail_tree.xpath('//a[@href="javascript: void(0)"]/img')
        for img in img_list:
            img_count += 1
            img_src = img.xpath('./@src')[0]
            img_data = requests.get(url=img_src, headers=head).content
            img_name = img.xpath('./@alt')[0] + str(img_count) + '.jpg'
            img_path = 'p站/' + img_name
            # '?' is illegal in Windows filenames; swap it for a placeholder.
            img_path = img_path.replace("?", "L")
            with open(img_path, 'wb') as fp:
                fp.write(img_data)
                print(img_name + '下载完成!!')
    print("over!!!!!")

你可能感兴趣的:(python,python,爬虫,开发语言)