@python实现多线程爬取动态表情
在网上练习项目时,看到这个表情包爬取的例子,刚好也能解决一下自己表情库存告急的情况。
主要是包含了爬虫部分和多线程类的构造两大部分
直接上代码
def downloads_img(url, path):
    """Download every emoticon image linked from one listing page.

    Fetches the page at ``url``, selects each ``<img>`` matched by the XPath
    ``//div[@id="container"]//div/a/img`` and saves the file named by its
    ``data-original`` attribute into ``path``. The file name is the first
    four characters of the image's ``title`` plus the extension taken from
    the image URL, so titles sharing a four-character prefix overwrite each
    other.

    Args:
        url: Address of the listing page to scrape.
        path: Directory that receives the images.

    Raises:
        requests.RequestException: If the listing page itself cannot be
            fetched. A failure on a single image is reported and skipped.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36 HBPC/11.0.5.300'
    }
    # Without a timeout a stalled server would block the calling thread
    # forever.
    timeout = 10

    # Fetch and parse the listing page.
    response = requests.get(url=url, headers=headers, timeout=timeout)
    response.raise_for_status()
    selector = parsel.Selector(response.text)

    saved = 0
    for img in selector.xpath('//div[@id="container"]//div/a/img'):
        title = ''.join(img.xpath('./@title').getall())
        img_url = ''.join(img.xpath('./@data-original').getall())
        if not img_url:
            # Nothing to download for this tag.
            continue

        # The title comes from the remote page, so path separators are
        # neutralised before it becomes part of a local file name.
        stem = title[0:4].replace('/', '_').replace('\\', '_')
        file_name = os.path.join(path, stem + os.path.splitext(img_url)[-1])

        try:
            # Download first and open the file afterwards, so that a failed
            # request does not leave an empty file behind.
            img_response = requests.get(url=img_url, headers=headers, timeout=timeout)
            img_response.raise_for_status()
            print("正在保存图片:", title)
            with open(file_name, mode='wb') as f:
                f.write(img_response.content)
        except (requests.RequestException, OSError) as exc:
            # Best effort: report the failure and move on to the next image.
            print("保存失败:", title, exc)
            continue

        saved += 1
        print("保存成功:", title, saved)
# Worker thread: a Thread subclass. Calling start() on an instance launches a
# new thread, which in turn invokes the run() method below.
class DOwnload_img(Thread):
    """Worker thread that downloads listing pages taken from a shared queue."""

    def __init__(self, queue, path):
        """Store the work queue and make sure the output directory exists.

        Args:
            queue: Queue that yields listing-page URLs via ``get()`` and is
                acknowledged with ``task_done()``.
            path: Directory handed to ``downloads_img`` for saving images.
        """
        super().__init__()
        # Instance attributes read by run().
        self.queue = queue
        self.path = path
        # makedirs with exist_ok=True also creates missing parent directories
        # and does not fail when the directory is already there.
        os.makedirs(path, exist_ok=True)

    def run(self) -> None:
        """Process URLs from the queue forever, one page at a time."""
        while True:
            url = self.queue.get()
            try:
                downloads_img(url, self.path)
            except Exception as exc:
                # Keep the worker alive, but say which page failed and why.
                print("下载失败", url, exc)
            finally:
                # Always acknowledge the item, even after a failure;
                # otherwise queue.join() in the caller would block forever.
                self.queue.task_done()
其中用到的库(这些 import 语句需要放在文件最前面;os、queue、threading 为标准库,parsel、requests 为第三方库)
import os
import parsel
import requests
#多线程程序用到的包
from queue import Queue
from threading import Thread
main函数
# Script entry point: crawl listing pages 1-100 with a pool of worker threads.
if __name__ == '__main__':
    img_url = 'https://www.fabiaoqing.com/biaoqing/lists/page/{page}.html'
    urls = [img_url.format(page=page) for page in range(1, 101)]
    queue = Queue()
    path = 'img11/'
    # Start 10 worker threads.
    for x in range(10):
        worker = DOwnload_img(queue, path)
        # run() never returns, so the workers must be daemon threads or the
        # interpreter could not exit once the main thread is done.
        worker.daemon = True
        worker.start()
    for url in urls:
        queue.put(url)
    # Block until every queued URL has been acknowledged with task_done().
    queue.join()
    print("下载完成...")