Python爬虫创建进程池下载

一、使用multiprocessing.Pool

二、使用步骤

1.代码

代码如下(示例):

import json
import multiprocessing
import os
import random
from turtle import pd
import requests

# Pool of User-Agent strings to choose from
User_agent_list = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Request headers carrying one User-Agent picked at random from the pool.
# The pick happens once, when this module-level statement executes.
headers = {"User-Agent": random.choice(User_agent_list)}


# Directory the downloaded images are saved into.
# NOTE(review): the `...` segment looks like an elided part of the author's
# local path — replace with a real directory before running.
path = r'C:\Users\...\zazhi'
# URLs of the images to download.
# NOTE(review): `[。。。]` is an elided placeholder and is not valid Python —
# fill in the actual URL strings before running.
url_list = [。。。]

def read_ip():
    """Load the proxy data stored in ``ip_data.json``.

    The file is opened relative to the current working directory, read as
    UTF-8 text and deserialized with ``json.loads``.

    :return: the deserialized JSON content, or an empty list when the file
        cannot be opened or read (the error is printed).
    """
    try:
        with open('ip_data.json', 'r', encoding='utf-8') as fs:
            # fs.read() yields a str; json.loads deserializes it into
            # Python objects.
            ip = json.loads(fs.read())
    except IOError as e:
        # Without a fallback value `ip` would be unbound at the return below
        # and the function would die with UnboundLocalError; report the
        # problem and hand back "no proxies" instead.
        print(e)
        return []
    # Only announce completion when the data was actually read.
    print('读取数据完成!')
    return ip

def download_pics(url, path, can_use_ip, timeout=30):
    '''
    Download one file and save it inside ``path``.

    The file name is the last ``/``-separated segment of ``url``. If a file
    with that name already exists in ``path`` no request is made.

    :param url: URL of the file to download
    :param path: directory the file is saved into
    :param can_use_ip: sequence of proxy settings; one entry is picked at
        random and passed to ``requests.get`` as ``proxies`` (presumably each
        entry is a requests-style proxies dict — verify against the content
        of ip_data.json). When empty, no proxy is used.
    :param timeout: value passed to ``requests.get`` as ``timeout`` so a
        stalled connection cannot block the worker forever
    :return: None
    '''
    # Name of the image file
    file_name = url.split('/')[-1]
    # os.path.join uses the platform separator and avoids the invalid
    # "\{" escape sequence of manual string concatenation.
    target = os.path.join(path, file_name)

    # Check before requesting so an existing file costs no network traffic.
    if os.path.isfile(target):
        print('文件({})已存在{}'.format(file_name, url))
        print('----------------&----------------')
        return

    print('开始下载:%s' % file_name)
    # random.choice raises IndexError on an empty sequence; fall back to a
    # direct connection when no proxy is available.
    proxy = random.choice(can_use_ip) if can_use_ip else None
    try:
        response = requests.get(url, headers=headers, proxies=proxy,
                                timeout=timeout)
    except (requests.exceptions.ConnectionError,
            requests.exceptions.ChunkedEncodingError,
            requests.exceptions.Timeout) as e:
        # Report the failure instead of dropping it silently.
        print('下载失败{}.{}: {}'.format(file_name, url, e))
        return

    # An explicit check instead of `assert`, which is stripped under -O.
    if response.status_code != 200:
        print('下载失败{}.{}: HTTP {}'.format(file_name, url,
                                          response.status_code))
        return

    with open(target, 'wb') as f:
        f.write(response.content)
    print('下载完成图片{}.{}'.format(file_name, url))

if __name__ == '__main__':
    def _report_failure(exc):
        """Print an exception raised inside a worker task."""
        print('下载任务出错: {!r}'.format(exc))

    ip = read_ip()
    # Create the process pool
    pool = multiprocessing.Pool(30)
    try:
        # Queue one download task per URL. Without error_callback, an
        # exception raised in a worker is stored on the AsyncResult and never
        # shown, because the results are not collected here.
        for url in url_list:
            pool.apply_async(download_pics, (url, path, ip),
                             error_callback=_report_failure)
    finally:
        # Stop accepting new tasks, then wait for the workers to finish,
        # even if queuing the tasks failed part-way.
        pool.close()
        pool.join()

总结

通过创建进程池的方法可以提高下载速度,但提升并不十分明显;在要下载的文件不多时,可以使用这种方法。

你可能感兴趣的:(python,python,多进程)