多线程爬取斗图啦网表情包

本文仅供交流学习,请勿用于其他用途,否则后果自负。
环境:Linux + PyCharm + Anaconda

import queue
import sys
import threading

import requests
from lxml import etree

from user_agent import UA

class Spider_img(threading.Thread):
    """Worker thread that drains a shared queue of list-page URLs.

    For every URL taken from the queue the page is downloaded and the
    ``data-original`` attribute of each ``<img>`` found inside the
    ``page-content text-center`` container is printed to stdout.
    """

    def __init__(self, url_queue, timeout=10):
        """Create a worker.

        Args:
            url_queue: ``queue.Queue`` of page URLs shared by all workers.
            timeout: Seconds passed to ``requests.get`` as its timeout, so a
                stalled connection cannot block the worker indefinitely.
        """
        super(Spider_img, self).__init__()
        self.url_queue = url_queue
        self.timeout = timeout
        self.headers = {
            "referer": "https://www.doutula.com/photo/list/",
            "upgrade-insecure-requests": "1",
            "user-agent": UA,
        }

    def run(self):
        """Process URLs until the queue is empty, then return."""
        while True:
            # A separate ``empty()`` check followed by a blocking ``get()``
            # is racy: another worker can take the last item in between and
            # this thread would then block forever.  ``get_nowait()`` makes
            # the check and the removal a single step.
            try:
                url = self.url_queue.get_nowait()
            except queue.Empty:
                return

            try:
                self.Get_img(url)
            except Exception as exc:
                # Thread boundary: report the failing page and keep going so
                # one bad page does not take the whole worker down.
                print("failed to process {}: {!r}".format(url, exc),
                      file=sys.stderr)
            finally:
                # Always balance the get(), even on failure, so that a
                # ``Queue.join()`` on the shared queue cannot hang.
                self.url_queue.task_done()

    def Get_img(self, url):
        """Download one list page and print the image URLs found on it.

        Args:
            url: Address of the list page to fetch.

        Raises:
            requests.RequestException: If the request fails or times out.
        """
        req = requests.get(url=url, headers=self.headers, timeout=self.timeout)
        html = etree.HTML(req.text)
        if html is None:
            # NOTE(review): etree.HTML appears to return None when the
            # response body yields no document; nothing to extract then.
            return

        for img in html.xpath('//div[@class="page-content text-center"]//img'):
            img_url = "".join(img.xpath('./@data-original')).strip()
            # <img> tags without a data-original attribute give an empty
            # string; skip them instead of printing blank lines.
            if img_url:
                print(img_url)

if __name__ == '__main__':
    # Shared work queue holding one list-page URL per entry.
    url_queue = queue.Queue()

    # range() excludes its upper bound, so this enqueues pages 1..2999.
    for i in range(1, 3000):
        url_queue.put('https://www.doutula.com/photo/list/?page={}'.format(i))

    # Start every worker before joining any of them.  Joining inside the
    # start loop would block until the first worker had drained the whole
    # queue by itself, making the crawl effectively single-threaded.
    workers = [Spider_img(url_queue) for _ in range(20)]
    for t in workers:
        t.start()

    # Wait for all workers to finish before the script exits.
    for t in workers:
        t.join()

你可能感兴趣的:(多线程爬取斗图啦网表情包)