男生福利,女生勿看!!!xpath 爬取某网站小姐姐图片

效果图请自行脑补,此处不做展示(狗头保命)

from lxml import etree
import requests
import os

if __name__ == '__main__':
    # Crawl every listing page of the category and save each thumbnail into
    # ./picLibs, naming the file after the image's alt text.

    # Imported here so this script block stays self-contained.
    from urllib.parse import urljoin

    url = "http://pic.netbian.com/4kmeinv/"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
                      "85.0.4183.83 Safari/537.36"
    }
    # Without a timeout a stalled connection would block the crawl forever.
    request_timeout = 10

    # Page 1 is the bare category URL; pages 2..171 follow the index_N.html pattern.
    page_urls = [url] + [f"http://pic.netbian.com/4kmeinv/index_{x}.html" for x in range(2, 172)]

    if not os.path.exists("./picLibs"):
        os.mkdir("./picLibs")

    for page_url in page_urls:
        response = requests.get(url=page_url, headers=headers, timeout=request_timeout)
        response.encoding = response.apparent_encoding
        page_text = response.text

        tree = etree.HTML(page_text)
        li_list = tree.xpath("//div[@class='slist']/ul/li")

        for li in li_list:
            src_list = li.xpath("./a/img/@src")
            alt_list = li.xpath("./a/img/@alt")
            # Skip list items that do not carry an image instead of crashing on [0].
            if not src_list or not alt_list:
                continue

            # Resolve the src against the page it came from. Appending it to the
            # page URL (".../index_N.html" + src) produced an invalid address.
            img_src = urljoin(page_url, src_list[0])

            # The alt text comes from the remote page; strip path separators so
            # it cannot point outside picLibs or break open().
            img_name = alt_list[0].replace("/", "_").replace("\\", "_") + ".jpg"

            img_data = requests.get(url=img_src, headers=headers, timeout=request_timeout).content
            img_path = "picLibs/" + img_name

            with open(img_path, "wb") as fp:
                fp.write(img_data)
            print(img_name + "下载成功")

优化后(可能并没有优化)

from lxml import etree
import requests
import os


def get(url, headers, timeout=10):
    """Fetch ``url`` and return the response body decoded as text.

    Args:
        url: Address of the page to download.
        headers: HTTP headers sent with the request.
        timeout: Seconds to wait for the server before giving up. Without
            it a stalled connection would block the caller indefinitely.

    Returns:
        The response body as ``str``, decoded with the encoding that
        requests detects from the content (``apparent_encoding``).

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Use the detected encoding rather than the one declared in the headers.
    response.encoding = response.apparent_encoding
    return response.text


def parse(url, headers):
    """Download the page at ``url`` and return its image list items.

    The page text is fetched through ``get`` and parsed as HTML; the result
    is the list of ``<li>`` elements found under ``div.slist > ul``.
    """
    html_text = get(url=url, headers=headers)
    document = etree.HTML(html_text)
    return document.xpath("//div[@class='slist']/ul/li")


def save(url):
    """Download every image in the current ``li_list`` into ``小姐姐图片/``.

    The items to process and the request headers are read from the
    module-level names ``li_list`` and ``headers``, which the caller must set
    before calling this function. The target directory must already exist.

    Args:
        url: Address of the listing page the items were parsed from. Each
            image ``src`` is resolved relative to it.
    """
    # Imported locally so the function does not depend on a file-level import.
    from urllib.parse import urljoin

    for li in li_list:
        src_list = li.xpath("./a/img/@src")
        alt_list = li.xpath("./a/img/@alt")
        # Skip list items without an image instead of raising IndexError.
        if not src_list or not alt_list:
            continue

        # Plain concatenation (url + src) produced addresses such as
        # ".../index_2.html/uploads/..."; urljoin resolves the src properly.
        img_src = urljoin(url, src_list[0])

        # The alt text comes from the remote page; strip path separators so it
        # cannot point outside the target directory or break open().
        img_name = alt_list[0].replace("/", "_").replace("\\", "_") + ".jpg"

        # A timeout keeps one stalled download from blocking the whole run.
        img_data = requests.get(url=img_src, headers=headers, timeout=10).content
        img_path = "小姐姐图片/" + img_name

        with open(img_path, "wb") as fp:
            fp.write(img_data)
        print(img_name + "下载成功")


if __name__ == '__main__':
    # Entry point: process the first listing page, then pages 2..171.
    # `headers` and `li_list` are module-level names that save() reads.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/"
                      "85.0.4183.83 Safari/537.36"
    }

    # First page: the bare category URL.
    url = "http://pic.netbian.com/4kmeinv/"
    li_list = parse(url=url, headers=headers)

    # Create the output directory on first use.
    output_dir = "./小姐姐图片"
    if not os.path.exists(output_dir):
        os.mkdir(output_dir)

    save(url)

    # Remaining pages follow the index_N.html naming scheme.
    later_pages = (
        f"http://pic.netbian.com/4kmeinv/index_{page_number}.html"
        for page_number in range(2, 172)
    )
    for url in later_pages:
        li_list = parse(url=url, headers=headers)
        save(url)

你可能感兴趣的:(Spider_Practise)