# Python爬虫---结合urllib.request与xpath下载网页图片

# 网页地址:https://sc.chinaz.com/

# (1) 请求对象的定制
# (2)获取网页的源码
# (3)下载


# 需求:下载前十页的图片
# 第一页页码和其他页码不一样
# https://sc.chinaz.com/tupian/dahaitupian.html
# https://sc.chinaz.com/tupian/dahaitupian_2.html

import os
import urllib.request

from lxml import etree


def create_request(page):
    """Build the HTTP request for one page of the image listing.

    The first page has no numeric suffix in its URL; every other page
    inserts ``_<page>`` before the ``.html`` extension.

    Args:
        page: Page number to request.

    Returns:
        A ``urllib.request.Request`` for the page URL that carries a
        browser-like ``User-Agent`` header.
    """
    page_url = (
        "https://sc.chinaz.com/tupian/dahaitupian.html"
        if page == 1
        else "https://sc.chinaz.com/tupian/dahaitupian_" + str(page) + ".html"
    )

    # Send a desktop-browser User-Agent instead of urllib's default one.
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    }
    return urllib.request.Request(url=page_url, headers=browser_headers)


def get_content(request):
    """Fetch a page and return its body decoded as UTF-8 text.

    Args:
        request: The ``urllib.request.Request`` to open.

    Returns:
        The full response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Use the response as a context manager so the underlying connection
    # is closed even when read() or decode() raises; the original left
    # the response open.
    with urllib.request.urlopen(request) as response:
        return response.read().decode("utf-8")


def save_img(content):
    """Download the images referenced in a listing page into ``./image``.

    Every ``<img>`` inside ``<div class="container">`` that has both a
    ``data-original`` attribute (image address) and an ``alt`` attribute
    (used as the file name) is saved as ``./image/<alt>.jpg``.

    Args:
        content: HTML source of the listing page.
    """
    tree = etree.HTML(content)

    # Read both attributes from the SAME element. Querying @data-original
    # and @alt as two separate lists and indexing them in lockstep pairs
    # names with the wrong URLs (or raises IndexError) as soon as one
    # <img> carries only one of the two attributes.
    img_nodes = tree.xpath(
        "//div[@class='container']//img[@data-original and @alt]"
    )

    # Create the output directory instead of requiring it to exist already.
    out_dir = "./image"
    os.makedirs(out_dir, exist_ok=True)

    for img in img_nodes:
        src = img.get("data-original")
        name = img.get("alt")

        # The address is expected to be scheme-relative ("//host/path"),
        # hence the "http:" prefix; an already-absolute URL is left alone.
        if src.startswith(("http://", "https://")):
            url = src
        else:
            url = "http:" + src

        # A path separator inside the alt text would point outside out_dir
        # or at a missing sub-directory, so neutralize it.
        safe_name = name.replace("/", "_").replace("\\", "_")

        # Download the image.
        urllib.request.urlretrieve(
            url=url, filename=out_dir + "/" + safe_name + ".jpg"
        )


if __name__ == '__main__':
    # Ask the user for the inclusive range of pages to download.
    first_page = int(input("请输入起始页码"))
    last_page = int(input("请输入结束页码"))

    for page_no in range(first_page, last_page + 1):
        # Per page: build the request, fetch the HTML source, then
        # download every image found in it.
        page_request = create_request(page_no)
        page_html = get_content(page_request)
        save_img(page_html)

# 你可能感兴趣的:(python,爬虫,开发语言)