Python: multi-threaded download of images/files using the HTTP Range request header

While writing a crawler I found that image downloads were slow, so multi-threaded (range-request) downloading is used here to speed them up.

import threading
import requests


class MulThreadDownload(threading.Thread):
    """Worker thread that downloads one byte range of a remote file.

    Each worker issues an HTTP GET with a ``Range: bytes=start-end`` header
    and stores the received bytes into a shared dict keyed by ``startpos``,
    so the caller can reassemble the chunks in order after joining.
    """

    def __init__(self, url, startpos, endpos, temp_dict, headers, proxies):
        """
        :param url: URL of the file to download
        :param startpos: first byte of this thread's range (inclusive)
        :param endpos: last byte of this thread's range (inclusive)
        :param temp_dict: shared dict; this thread writes its chunk under key ``startpos``
        :param headers: optional base request headers (copied per thread), or None
        :param proxies: optional requests-style proxies dict, e.g. {"https": "https://ip:port"}, or None
        """
        super(MulThreadDownload, self).__init__()
        self.url = url
        self.startpos = startpos
        self.endpos = endpos
        self.temp_dict = temp_dict
        # Copy the headers so concurrent workers never mutate a shared dict:
        # each thread must carry its OWN Range header.
        self.headers = dict(headers) if headers else {}
        self.proxies = proxies

    def download(self, proxies):
        """Fetch this thread's byte range, retrying up to 10 times.

        On some retries the proxy URL scheme is rewritten (https -> http)
        to increase the chance of success with flaky proxies.
        """
        headers = self.headers
        headers['Range'] = "bytes=%s-%s" % (self.startpos, self.endpos)

        for i in range(10):
            # Switch the proxy form on selected attempts to raise the success
            # rate.  Guard against proxies being None or missing the "https"
            # key — the original code crashed the thread here.
            if proxies and proxies.get("https"):
                if i in [2, 5]:
                    # "https://ip:port" -> "http://ip:port" (strip "https:", re-prefix "http:")
                    proxies = {'http': 'http:{}'.format(proxies.get("https")[6:])}
                if i in [3, 7]:
                    proxies = {'http': proxies.get("https")}
            try:
                res = requests.get(self.url, headers=headers, proxies=proxies, timeout=3)
                if res.content:
                    self.temp_dict[self.startpos] = res.content
                    break
                else:
                    continue
            except Exception as e:
                print(f'{self.url} down load error {str(e)}')
                continue

    def run(self):
        # Thread entry point: perform the ranged download with the configured proxies.
        self.download(self.proxies)


def download_img_multi_thread(url, headers, proxies):
    """Download *url* with 5 range-request worker threads and return its bytes.

    A HEAD request probes the file size; the byte range is then split into
    chunks, one ``MulThreadDownload`` thread per chunk, and the results are
    reassembled and size-checked.

    :param url: URL of the file/image to download
    :param headers: optional base request headers, or None
    :param proxies: optional requests-style proxies dict, or None
    :return: the complete file content as bytes, or None on failure
    """
    # Probe the file size.  ``or 0`` guards servers that omit Content-Length
    # (the original int(None) raised TypeError).
    if headers:
        head = requests.head(url, headers=headers, proxies=proxies)
    else:
        head = requests.head(url, proxies=proxies)
    filesize = int(head.headers.get('Content-Length') or 0)

    if filesize:
        # Number of worker threads (fixed at 5; could be made a parameter).
        threadnum = 5
        # max(1, ...) prevents step == 0 for files smaller than threadnum
        # bytes, which made the original while-loop spin forever.
        step = max(1, filesize // threadnum)
        mtd_list = []
        start = 0
        end = -1
        # Ranges are inclusive: an 11-byte file covers byte positions 0-10,
        # so end == filesize - 1 means everything has been assigned.
        temp_dict = dict()
        while end < filesize - 1:
            start = end + 1
            end = start + step - 1
            if end > filesize - 1:
                end = filesize - 1
            # Fold a short trailing remainder into the final chunk instead of
            # spawning an extra tiny thread.
            if filesize - 1 - end < step:
                end = filesize - 1
            # Give each worker its own headers copy so threads never race on
            # a shared dict when setting their Range header.
            t = MulThreadDownload(url, start, end, temp_dict,
                                  dict(headers) if headers else None, proxies)
            t.start()
            mtd_list.append(t)

        for t in mtd_list:
            t.join()
        # All threads finished: stitch the chunks together in byte order.
        temp_b = b''.join(chunk for _, chunk in sorted(temp_dict.items()))
        if len(temp_b) == filesize:  # verify the reassembled size
            return temp_b
        else:
            print(f'file download failed temp_b {len(temp_b)} filesize {filesize}')
    return None

你可能感兴趣的:(爬虫)