Python: downloading Xiuren photo albums with multiple threads

import os
import requests
import time
from urllib import parse
from concurrent.futures import ThreadPoolExecutor

from lxml import etree
# requests has no working module-level retry switch (requests.DEFAULT_RETRIES
# is a no-op); mount an HTTPAdapter with max_retries on a shared Session instead.
session = requests.Session()
session.mount('https://', requests.adapters.HTTPAdapter(max_retries=2))


def download():
    # Thread pool: up to 10 images are written to disk concurrently
    pool = ThreadPoolExecutor(max_workers=10)
    # Target site: https://www.xiurenba.cc

    # Request headers

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36 Edg/96.0.1054.62'
    }

    # Initialize the working lists
    img_list = []
    url_list = []
    page_list = []

    # URL-encode the performer name typed by the user
    human_unencode = input('Enter the human_name:')
    human_encode = parse.quote(human_unencode)

    # Search URL built from the encoded name
    url_human = 'https://www.xiurenba.cc/plus/search/index.asp?keyword=' + human_encode + '&searchtype=title'

    # Count how many pages of search results exist for this performer
    res_first = session.get(url_human, headers=headers)
    tree_first = etree.HTML(res_first.text)
    Num_first = len(tree_first.xpath('/html/body/div[3]/div[1]/div/div/ul/div[3]/div/div[2]/a'))
    print(f'Page_total:{Num_first}')

    # Collect the album URLs from the chosen result pages
    i = input('Enter the PageNumber:')
    downloadPages = tuple(int(item) for item in i.split())
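    # e.g. entering "1 2 3" downloads search-result pages 1, 2 and 3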
    for dPage in downloadPages:
        res_human = session.get(url_human + '&p=' + str(dPage), headers=headers)
        tree_human = etree.HTML(res_human.text)
        jihe_human = tree_human.xpath('/html/body/div[3]/div[1]/div/div/ul/div[3]/div/div[1]/div/div[1]/h2/a/@href')
        # Append every album URL found on the current page
        page_list.extend(jihe_human)
        time.sleep(2)  # be polite between search-page requests


    # For each album: collect all of its page URLs, then every image URL
    for Page_Num in page_list:
        url = 'https://www.xiurenba.cc' + str(Page_Num)
        Num_res = session.get(url, headers=headers)
        Num_tree = etree.HTML(Num_res.text)
        Num = len(Num_tree.xpath('/html/body/div[3]/div/div/div[4]/div/div/a'))
        url_list.append(url)
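        # The album's remaining pages follow the <name>_<n>.html naming scheme;
        # the pager apparently includes a few non-page anchors, hence Num - 2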
        for n in range(1, Num - 2):
            url_other = url[:-5] + '_' + str(n) + '.html'
            url_list.append(url_other)
        # Collect every image URL from the album's pages
        for url_img in url_list:
            res = session.get(url_img, headers=headers)
            tree = etree.HTML(res.text)
            img_src = tree.xpath('/html/body/div[3]/div/div/div[5]/p/img/@src')
            img_list.extend(img_src)
        # Create the save directory, named after the album title
        res = session.get(url_list[0], headers=headers)
        res.encoding = 'utf-8'
        tree = etree.HTML(res.text)
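        # The first 11 characters of the <h1> text appear to be a fixed site
        # prefix; slicing them off leaves just the album title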
        path_name = tree.xpath('/html/body/div[3]/div/div/div[1]/h1//text()')[0][11:]
        print(path_name)
        # makedirs also creates D:/xiuren itself if it does not exist yet
        os.makedirs(f'D:/xiuren/{human_unencode}', exist_ok=True)
        the_path_name = f'D:/xiuren/{human_unencode}/' + path_name
        if not os.path.exists(the_path_name):
            os.mkdir(the_path_name)
            # Queue each image for download on the thread pool
            num = 0
            for j in img_list:
                try:
                    pool.submit(writeFile, headers, human_unencode, img_list, j, num, path_name)
                except Exception:
                    # Errors raised inside writeFile end up in the returned
                    # Future and are silently dropped, not caught here
                    continue
                num += 1
                time.sleep(0.2)  # stagger submissions to avoid hammering the server
            # Reset the lists for the next album
            img_list = []
            url_list = []
        else:
            print('gone>>>')  # album folder already exists, skip it
            # Reset the lists for the next album
            img_list = []
            url_list = []

    # Wait for every queued download to finish before reporting completion
    pool.shutdown(wait=True)
    print('Finished!')


def writeFile(headers, human_unencode, img_list, j, num, path_name):
    img_url = 'https://www.xiurenba.cc' + j
    img_data = session.get(img_url, headers=headers, timeout=3).content
    img_name = img_url.split('/')[-1]
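    # Progress label: this image's index out of the whole batch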
    finish_num = str(num) + '/' + str(len(img_list))
    with open(f'D:/xiuren/{human_unencode}/' + path_name + '/' + img_name, 'wb') as f:
        print(f'Downloading the img:{img_name}/{finish_num}')
        f.write(img_data)


# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    download()
