如题:这次来个真正的多线程版本。
估计也没人看,如果有感兴趣的或者不懂的可以留言。有目前正在学爬虫的也可以交流。
import re
import threading
import urllib.request
from queue import Empty, Queue

import requests
from lxml import etree
path = 'D:/壁纸/python/斗图啦-多线程版本/' # Destination directory for downloaded images
class Producer(threading.Thread):
    """Worker thread: fetches listing pages from page_queue and queues
    (image_url, image_name) pairs onto img_queue for the consumers."""

    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36",
    }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # listing-page URLs still to be scraped
        self.img_queue = img_queue    # output: (img_url, name) pairs

    def run(self):
        """Drain the page queue, parsing each listing page."""
        while True:
            # Non-blocking get instead of empty()-then-get(): with several
            # producers, another thread may take the last URL between the
            # empty() check and a blocking get(), hanging this thread forever.
            try:
                url = self.page_queue.get(block=False)
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Download one listing page and queue every non-gif image on it."""
        response = requests.get(url, headers=self.headers)
        html = etree.HTML(response.text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        for img in imgs:
            # The real image URL lives in the lazy-load attribute.
            img_url = img.get('data-original')
            # alt may be absent (None); fall back to '' so re.sub cannot
            # raise TypeError.
            alt = img.get('alt') or ''
            # Strip characters that are illegal or awkward in filenames.
            alt = re.sub(r'[\??\.。\d!!/::=<>|"]', '', alt)
            if img_url:  # skip entries with no usable URL
                self.img_queue.put((img_url, alt))
class Consumer(threading.Thread):
    """Worker thread: pulls (image_url, name) pairs off img_queue and
    downloads each image into the module-level `path` directory."""

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # watched only to decide when to stop
        self.img_queue = img_queue    # input: (img_url, name) pairs

    def run(self):
        """Download images until both queues are drained."""
        while True:
            # Finished only when producers have no pages left AND no images
            # remain to download.
            if self.page_queue.empty() and self.img_queue.empty():
                break
            # Non-blocking get: a blocking get() after the empty() check can
            # deadlock this thread if a sibling consumer took the last item
            # in between.
            try:
                img_url, alt = self.img_queue.get(block=False)
            except Empty:
                continue
            try:
                # Save with the sanitized alt text as the file name.
                urllib.request.urlretrieve(img_url, path + alt + '.jpg')
                print('图片"%s"下载成功' % alt)
            except (OSError, ValueError) as e:
                # A single dead link or bad filename must not kill the
                # whole worker thread.
                print('图片"%s"下载失败: %s' % (alt, e))
def main(num_pages=9, num_producers=5, num_consumers=5):
    """Scrape doutula listing pages and download their images concurrently.

    Args:
        num_pages: number of listing pages to scrape (pages 1..num_pages).
            Defaults to 9, matching the original hard-coded range(1, 10).
        num_producers: threads fetching and parsing listing pages.
        num_consumers: threads downloading images to disk.
    """
    page_queue = Queue(1000)   # thread-safe queue of listing-page URLs
    img_queue = Queue(10000)   # thread-safe queue of (img_url, name) pairs
    for page in range(1, num_pages + 1):
        page_queue.put("https://www.doutula.com/photo/list/?page=%d" % page)

    threads = []
    for _ in range(num_producers):
        t = Producer(page_queue, img_queue)
        t.start()
        threads.append(t)
    for _ in range(num_consumers):
        t = Consumer(page_queue, img_queue)
        t.start()
        threads.append(t)
    # Block until every worker finishes so the script only exits after all
    # downloads complete (the original just fell off the end of main()).
    for t in threads:
        t.join()


if __name__ == '__main__':
    main()