python多线程爬虫爬取斗图啦网站表情包图片

如题:这次来个真正的多线程版本。
估计也没人看,如果有感兴趣的或者不懂的可以留言。有目前正在学爬虫的也可以交流。

import os
import re
import threading
import urllib.request
from queue import Empty, Queue

import requests
from lxml import etree

path = 'D:/壁纸/python/斗图啦-多线程版本/'  # destination directory for downloaded images (Windows path)


class Producer(threading.Thread):
    """Worker thread that fetches listing pages and enqueues image info.

    Pulls listing-page URLs from ``page_queue``, scrapes each page, and puts
    ``(img_url, name)`` tuples onto ``img_queue`` for the Consumer threads.
    """

    # Desktop-browser User-Agent so the site serves the normal HTML listing.
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36",
        }

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # queue of listing-page URLs to crawl
        self.img_queue = img_queue    # queue of (img_url, name) tuples to download

    def run(self):
        """Drain the page queue, parsing each listing page."""
        while True:
            # Use get_nowait() instead of empty()+get(): with several
            # producer threads, another thread can drain the queue between
            # the empty() check and the get(), and a plain blocking get()
            # would then hang forever.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Parse one listing page and enqueue every non-gif emoticon image."""
        response = requests.get(url, headers=self.headers)
        html = etree.HTML(response.text)
        imgs = html.xpath("//div[@class='page-content text-center']//img[@class!='gif']")
        # Collect the download URL and display name of every image on the page.
        for img in imgs:
            img_url = img.get('data-original')  # lazy-load attribute holds the real URL
            if not img_url:
                # Some <img> tags lack the lazy-load attribute; enqueueing
                # None would crash the downloader, so skip them.
                continue
            alt = img.get('alt') or ''
            # Strip characters that are illegal or awkward in file names.
            alt = re.sub(r'[\??\.。\d!!/::=<>|"]', '', alt)
            self.img_queue.put((img_url, alt))


class Consumer(threading.Thread):
    """Worker thread that downloads queued images to disk.

    Exits once the producers are done (page queue empty) and no image info
    remains in ``img_queue``.
    """

    def __init__(self, page_queue, img_queue, *args, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue  # watched only to decide when to stop
        self.img_queue = img_queue    # source of (img_url, name) work items

    def run(self):
        """Download every (img_url, name) item until both queues are drained."""
        while True:
            if self.page_queue.empty() and self.img_queue.empty():
                break
            # Use a timeout: between the empty() checks above and this get(),
            # another consumer may have taken the last item, and a plain
            # blocking get() would then hang forever. On timeout, loop back
            # and re-evaluate the exit condition.
            try:
                img_url, alt = self.img_queue.get(timeout=1)
            except Empty:
                continue
            try:
                # Download the image and name the local file after its alt text.
                urllib.request.urlretrieve(img_url, path + alt + '.jpg')
            except OSError as e:
                # One bad URL must not kill the whole worker thread.
                print('图片"%s"下载失败: %s' % (alt, e))
            else:
                print('图片"%s"下载成功' % alt)


def main():
    """Set up the work queues, start 5 producers and 5 consumers, and wait.

    Producers scrape listing pages 1-9 for image URLs; consumers download
    the images into ``path``.
    """
    page_queue = Queue(1000)   # thread-safe queue of listing-page URLs
    img_queue = Queue(10000)   # thread-safe queue of (img_url, name) tuples

    # Create the output directory up front; otherwise every download fails
    # with FileNotFoundError when the folder does not exist yet.
    os.makedirs(path, exist_ok=True)

    for page in range(1, 10):
        page_queue.put("https://www.doutula.com/photo/list/?page=%d" % page)

    workers = []
    for _ in range(5):
        workers.append(Producer(page_queue, img_queue))
    for _ in range(5):
        workers.append(Consumer(page_queue, img_queue))

    for worker in workers:
        worker.start()
    # Wait for all workers so main() returns only when crawling is finished.
    for worker in workers:
        worker.join()


if __name__ == '__main__':
    main()


你可能感兴趣的:(python,爬虫)