代码如下(示例):
import json
import multiprocessing
import os
import random
from turtle import pd
import requests
# Pool of browser User-Agent strings that a request header can be drawn from.
User_agent_list = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# Pick one User-Agent at random and wrap it in a request-header dict.
# This statement runs once when the module is loaded, not once per request:
# download_pics reuses this same dict for every call it makes.
headers = {'User-Agent': random.choice(User_agent_list)}
# Directory the downloaded images are saved into (raw string, Windows-style path;
# the `...` segment is an elided part of the author's real path).
path = r'C:\Users\...\zazhi'
# URLs of the images to download.
# NOTE(review): `。。。` is a placeholder left in by the article, not valid Python —
# replace it with the real list of URL strings before running.
url_list = [。。。]
def read_ip():
    """Load the proxy list from ``ip_data.json`` in the current working directory.

    The file is read as UTF-8 and parsed as JSON; whatever object the JSON
    document encodes is returned unchanged.

    :return: the deserialized JSON content, or an empty list when the file
        cannot be opened/read or does not contain valid JSON
    """
    try:
        with open('ip_data.json', 'r', encoding='utf-8') as fs:
            # json.load reads the text from the file object and deserializes
            # it into the corresponding Python object.
            ip = json.load(fs)
    except (IOError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        # Returning here avoids falling through to `return ip` with `ip`
        # never assigned, which used to raise UnboundLocalError.
        print(e)
        return []
    # Only announce success when the data was actually loaded.
    print('读取数据完成!')
    return ip
def download_pics(url, path, can_use_ip, timeout=10):
    """Download one file and save it inside ``path``.

    The local file name is the part of ``url`` after its last ``/``. If a file
    with that name already exists in ``path`` nothing is downloaded. Network
    errors and non-200 responses are reported with ``print`` and the function
    returns without writing anything.

    :param url: URL of the file to download
    :param path: directory the file is saved into
    :param can_use_ip: sequence of proxy settings; one entry is picked at
        random and passed to ``requests.get`` as ``proxies``. When it is
        empty or ``None`` the request is made without a proxy.
    :param timeout: seconds passed to ``requests.get`` as ``timeout`` so a
        stalled server cannot block the worker forever
    :return: None
    """
    # File name = last path segment of the URL.
    file_name = url.split('/')[-1]
    print('开始下载:%s'%file_name)
    # os.path.join replaces the old `path + "\{}"`, whose "\{" is an invalid
    # escape sequence and hard-codes the Windows separator.
    target = os.path.join(path, file_name)
    # Check for an existing file *before* touching the network; previously the
    # whole file was downloaded first and then thrown away.
    if os.path.isfile(target):
        print('文件({})已存在{}'.format(file_name,url))
        print('----------------&----------------')
        return
    # random.choice raises IndexError on an empty sequence, so fall back to a
    # direct (proxy-less) request when no proxy is available.
    ip = random.choice(can_use_ip) if can_use_ip else None
    try:
        response = requests.get(url, headers=headers, proxies=ip, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Base class of ConnectionError, ChunkedEncodingError and the timeout
        # errors; report the failure instead of silently ignoring it.
        print('下载失败{}:{}'.format(url, e))
        return
    # Explicit check instead of `assert`, which is removed under `python -O`.
    if response.status_code != 200:
        print('下载失败{}:HTTP {}'.format(url, response.status_code))
        return
    with open(target, 'wb') as f:
        f.write(response.content)
    print('下载完成图片{}.{}'.format(file_name,url))
if __name__ == '__main__':
    # Script entry point: load the proxy list, then download every URL in
    # url_list in parallel using a pool of worker processes.
    ip = read_ip()
    # Pool() rejects a size of 0, so skip the pool entirely when there is
    # nothing to download.
    if url_list:
        # Create the process pool; never start more workers than there are
        # URLs (upper bound stays at the original 30).
        pool = multiprocessing.Pool(min(30, len(url_list)))
        # Queue one task per URL and keep each AsyncResult so that failures
        # inside the workers can be reported below.
        tasks = [(url, pool.apply_async(download_pics, (url, path, ip)))
                 for url in url_list]
        # The pool accepts no further tasks after close().
        pool.close()
        for url, task in tasks:
            try:
                # get() re-raises any exception raised in the worker; without
                # this call such errors are silently discarded.
                task.get()
            except Exception as e:
                print('{}: {!r}'.format(url, e))
        # Wait for the worker processes to exit.
        pool.join()
这种通过创建进程池的方法可以提高下载的速度,但是并不太明显,要下载的东西不多的时候,可以用