Using requests, lxml, and XPath in Python

requests, lxml, XPath

requests

Environment setup

pip install requests

Usage

  • After installing the package, import it in your project:
import requests

Sending requests

GET requests

import requests

# Spoof a browser User-Agent (see the note below on why this is needed)
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

# Send the request through requests
result = requests.get("https://www.douban.com/", headers=headers)
print(result)  # <Response [200]> on success
print(result.headers)  # response headers

with open(r"H:\File\Python\爬虫\response\01 resquest\index.html", "a", encoding="utf-8") as File:
    File.write(result.text)

If the request is refused, it usually means no request headers were set: by default requests identifies itself with its own Python User-Agent, which the site's anti-scraping rules recognize and block.

# Spoof the browser's User-Agent
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

POST requests

import requests

# Spoof the browser's User-Agent
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

data = {
    "wd": "python"
}

result = requests.post("https://www.baidu.com/", headers=headers, data=data)

print(result.text)
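For APIs that expect a JSON body instead of form data, requests.post also takes a json= keyword; a minimal sketch (httpbin.org is a public echo service, used here only for illustration):

# json= serializes the dict and sets Content-Type: application/json automatically
result = requests.post("https://httpbin.org/post", headers=headers, json={"wd": "python"})
print(result.json())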

GET requests with params

import requests

# Spoof the browser's User-Agent
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

keyWord = {
    "wd": "python"
}

result = requests.get("https://www.baidu.com/", headers=headers, params=keyWord)
# Set the encoding explicitly
result.encoding = "utf-8"

# .text returns the body as text
print(result.text)

# .content returns the raw bytes, e.g. for images or video
print(result.content)

# .url shows the full final URL, including the query string
print(result.url)

# .encoding shows the current encoding
print(result.encoding)

with open(r"H:\File\Python\爬虫\response\01 resquest\02.html", "a", encoding="utf-8") as File:
    File.write(result.text)
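requests derives .encoding from the response headers, and a wrong guess produces garbled text. A common fallback is result.apparent_encoding, which is detected from the body bytes themselves:

# Re-decode the body using the encoding detected from its bytes
result.encoding = result.apparent_encoding
print(result.text)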

JSON data

  • Calling .json() on the response returns the body parsed into a Python dict:
import requests

url = "https://zhuanlan.zhihu.com/api/articles/xxx/xxx"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    "Referer": "https://zhuanlan.zhihu.com/p/620865916",
    "Cookie": "_zap=c0b8f512-012a-4601-a317-5772cb"
}

resp = requests.get(url, headers=headers).json()
print(resp["paging"]["previous"])

Session requests

import requests

url = "https://v.uuuab.com/72/72143/23xxx1.shtml"
data = {"username": "dad", "password": "dasd"}

# Session().post sends the login form; see the cookie sketch below
resp = requests.Session().post(url, data=data)
resp.encoding = "gbk"
print(resp.text)
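What a Session actually buys you is cookie persistence: cookies set by one response (e.g. a login) are sent automatically on every later request. A minimal sketch with hypothetical URLs and form fields:

session = requests.Session()
# Cookies from the login response are stored on the session
session.post("https://example.com/login", data={"username": "dad", "password": "dasd"})
# ...and sent automatically with each later request
profile = session.get("https://example.com/profile")
print(profile.status_code)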

lxml

Environment setup

  • lxml needs to be installed first:
pip install lxml

Usage

  • Import it in your project:
import requests
from lxml import etree

XPath syntax

Getting the text inside a tag

# Append /a/text() to get the text of the <a> tag
text = text.xpath('//*[@id="floor-www-index_558"]/a/text()')

Matching by attribute

# contains() matches a partial attribute value, e.g. class names like navigation-<changing suffix>
text.xpath('//*[@id="floor-nav_557"]/ul/li[contains(@class,"navigation-")]/a/text()')
# Match <a> tags whose href contains https://
text.xpath('//*[@id="floor-nav_557"]/div/div/div/ul/li/a[contains(@href,"https://")]')
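To get the attribute value itself rather than the element node, end the expression with /@href:

# Returns the href strings directly instead of <a> element objects
text.xpath('//*[@id="floor-nav_557"]/div/div/div/ul/li/a[contains(@href,"https://")]/@href')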

Example

Usage

import requests
from lxml import etree

htmlText = requests.get("https://www.csdn.net/")
text = etree.HTML(htmlText.text)

# Find the elements whose class is headswiper-item
print(text.xpath(
    '//*[@id="floor-www-index_558"]/div/div[3]/div[1]/div[2]/div[1]/div[2]/div/div[@class="headswiper-item"]'))

# Text content under the <a> tags
print(text.xpath(
    '//*[@id="floor-www-index_558"]/div/div[3]/div[1]/div[2]/div[1]/div[2]/div/div[@class="headswiper-item"]/a/text()'))

# Match by exact text
print(text.xpath('//*[@id="floor-nav_557"]/div/div/div/ul/li/a[text()="后端"]/text()'))

# contains() matches partial values, e.g. class names like navigation-<changing suffix>
print(text.xpath('//*[@id="floor-nav_557"]/div/div/div/ul/li[contains(@class,"navigation-")]/a/text()'))

# Get the <a> tags whose href contains https://
for x in text.xpath('//*[@id="floor-nav_557"]/div/div/div/ul/li/a[contains(@href,"https://")]'):
    print(etree.tostring(x))
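etree.tostring(x) serializes the whole element to bytes; when only the link is wanted, Element.get reads the attribute directly:

# Read the href attribute of each matched element
for x in text.xpath('//*[@id="floor-nav_557"]/div/div/div/ul/li/a[contains(@href,"https://")]'):
    print(x.get("href"))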

Scraping images

from lxml import etree
import requests

url = "https://tieba.baidu.com/p/84301xxx83"
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

htmlText = requests.get(url, headers=headers).text
text = etree.HTML(htmlText)

# The img src attributes on the page
imgUrlList = text.xpath('//*[@id="post_content_147694672167"]/img/@src')

for index, imgUrl in enumerate(imgUrlList):
    with open(r"H:\File\Python\爬虫\response\02 xpath\{}.jpg".format(index), "wb") as File:
        File.write(requests.get(imgUrl, headers=headers).content)
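The src values scraped here are absolute URLs; if a page yields relative ones, they must be resolved against the page URL before downloading. A minimal sketch with the standard library's urljoin (the /img/1.jpg path is hypothetical):

from urllib.parse import urljoin

# Resolve a relative src against the page URL
print(urljoin(url, "/img/1.jpg"))  # https://tieba.baidu.com/img/1.jpg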

Scraping images (object-oriented version)

import time

import requests
from lxml import etree


# Send the page request
class Request:
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }

    def __init__(self, url: str, encoding: str = "gbk"):
        # Instance attributes so responses don't leak between instances
        self.response_text = ""
        self.response_content = b""
        try:
            # Send the request
            response = requests.get(url, headers=self.headers)
            response.encoding = encoding
            self.response_text = response.text
            self.response_content = response.content
        except Exception as e:
            print(e)

    def __str__(self):
        return str(self.response_text)

    def get_text(self) -> str:
        return str(self.response_text)


# Parse the page
class EtreeHtml:
    def __init__(self, html_text: str):
        # Instance attributes, not class attributes: class-level lists would be
        # shared by every instance and accumulate results across pages
        self.img_infolist = []
        etree_html = etree.HTML(html_text)
        # Pagination links on the current page
        self.page_list = etree_html.xpath('//*[@id="main"]/div[@class="page"]/a/@href')
        # Image list
        for item in etree_html.xpath('//*[@id="main"]/div[3]/ul/li'):
            try:
                img_src = item.xpath("a/img/@src")[0]
                img_info = item.xpath("a/b/text()")[0]
                self.img_infolist.append({
                    "img_info": img_info,
                    "img_src": img_src
                })
            except Exception as e:
                print("No data", e)

    def __str__(self):
        # __str__ must return a string, not a list
        return str(self.img_infolist)

    def ImgInfolist(self):
        return self.img_infolist

    def Pagelist(self):
        return self.page_list


# Write to file
class WriteImages:
    def __init__(self, path: str, source_data: bytes, Filename: str):
        with open(f"{path}/{Filename}", "wb") as File:
            print(f"正在写入{Filename}")
            File.write(source_data)


class craw:
    def __init__(self, url: str, index: int):
        # index is the page counter passed in from __main__
        html_text = Request(url).response_text
        html_info = EtreeHtml(html_text).img_infolist
        for item in html_info:
            img_info = item["img_info"]
            img_src = item["img_src"]
            WriteImages("./response/02 xpath/美女图网", Request(img_src).response_content, f"{img_info}.jpg")  # write the image
        print("-------------write complete----------------")
        time.sleep(1)  # be polite: pause between pages


if __name__ == "__main__":
    url_list = [
        f"http://www.netbian.com/index_{index}.htm" for index in range(2, 30)
    ]
    # enumerate replaces the manual counter
    for count, item in enumerate(url_list, start=2):
        print(f"-------------{count}-------------")
        craw(item, count)

BeautifulSoup

  • print(bs.title.text) # reads the content of the title tag
  • print(bs.title.string) # reads the content of the title tag
    • The difference (see the sketch below):
      • .text does not include comment content
      • .string does return comment content
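A minimal sketch of the difference, assuming bs4 >= 4.9 (where get_text() skips comments by default):

from bs4 import BeautifulSoup

html = "<title><!-- a comment --></title>"
bs = BeautifulSoup(html, "html.parser")
print(repr(bs.title.text))    # '' : comments are excluded from .text
print(repr(bs.title.string))  # ' a comment ' : .string returns the Comment node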

Environment setup

pip install beautifulsoup4  # "pip install bs4" also works, but it is just a stub that pulls this in

Usage

url = "https://v.uuuab.com/72/72143/2xxx31.shtml"
resp = requests.Session().post(url)
resp.encoding = "gbk"
bs = BeautifulSoup(resp.text, "html.parser")

print(bs.title)  # 读取title标签
print(bs.title.text)  # 读取title标签内容
print(bs.title.string)  # 读取title标签内容
print(bs.p.attrs)  # 获取p的属性

# 获取单个属性
print(bs.p.get("class"))
print(bs.p["class"])
print(bs.a["href"])

Common methods

  • Frequently used methods:
    1. find('div', class_="abc")
    2. find_all('div', class_="abc")
    3. select() (a sketch follows the example below)
      • bs.select("#abc")
      • bs.select(".abc")
from bs4 import BeautifulSoup
from lxml import etree
import requests

url = "https://v.uuuab.com/72/72143/232xxx1.shtml"
resp = requests.Session().post(url)
resp.encoding = "gbk"
bs = BeautifulSoup(resp.text, "html.parser")

find_p_text = bs.find("p", class_="version channel")
print(etree.HTML(str(find_p_text)).xpath("//a/@href"))  # combined with XPath; find_p_text must first be converted to str

# Extract the novel's chapter text
find_div_content = bs.find("div", id="chaptercontent").get_text()
with open("./response/BeautifulSoup/楼柒沉煞小说-第1章 落入血人怀抱.txt", "w", encoding="utf-8") as File:
    File.write(find_div_content)
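find() and find_all() are shown above; select() takes CSS selectors instead and always returns a list. A minimal sketch reusing the bs object from this example (the div > p selector is only for illustration):

# select() takes CSS selectors and returns a list of matches
print(bs.select("#chaptercontent"))  # by id
print(bs.select("p.version"))        # by tag plus class
print(bs.select("div > p"))          # direct <p> children of <div>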
