学习Python的日子 爬虫(5)

多线程糗事百科案例

import json
import time
from queue import Empty, Queue
from threading import Thread, Lock

import requests
from lxml import etree

# Shutdown flags for the worker threads: True means "exit", False means
# "keep running". crawl_exit is polled by ThreadCrawl.run; parse_exit is
# presumably the equivalent flag for the parser thread -- confirm against it.
crawl_exit = parse_exit = False


# Worker thread that downloads listing pages.
class ThreadCrawl(Thread):
    """Download the pages named by ``page_queue`` into ``data_queue``.

    Each page number taken from ``page_queue`` is turned into a listing URL,
    fetched over HTTP, and the response body is put on ``data_queue``.
    The thread runs until the module-level ``crawl_exit`` flag becomes true;
    an empty ``page_queue`` on its own does not stop it.
    """

    # Seconds to wait for a page number before re-checking ``crawl_exit``.
    POLL_INTERVAL = 0.1
    # Seconds after which an unresponsive HTTP request is abandoned.
    REQUEST_TIMEOUT = 10

    def __init__(self, thread_name, page_queue, data_queue):
        """Store the queues and the label used in progress messages.

        Args:
            thread_name: Label printed in this thread's progress messages.
            page_queue: Queue of page numbers still to be downloaded.
            data_queue: Queue that receives the HTML text of each page.
        """
        super().__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.headers = {
            "User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; "
                          "WOW64; Trident/6.0)"
        }

    def run(self):
        """Fetch pages until ``crawl_exit`` is set.

        A failure on one page is reported and the loop moves on to the next
        page, so a single bad request does not kill the worker.
        """
        while not crawl_exit:
            try:
                # Wait briefly instead of polling with block=False: an empty
                # queue then costs a short sleep rather than a busy spin,
                # and the exit flag is still re-checked regularly.
                page = self.page_queue.get(timeout=self.POLL_INTERVAL)
            except Empty:
                continue

            try:
                url = "https://www.qiushibaike.com/8hr/page/" + str(page) + "/"
                print("%s开始工作了,页数是:%d,url=%s" % (self.thread_name, page, url))
                # Without a timeout a stalled connection would block this
                # thread forever and it would never notice crawl_exit.
                response = requests.get(
                    url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
                )
                # Hand the raw HTML to the parsing side.
                self.data_queue.put(response.text)
                # Pause between requests.
                time.sleep(1)
            except Exception as e:
                # Deliberately best-effort: report the failure and carry on
                # with the next page instead of silently dropping it.
                print("%s failed on page %r: %r" % (self.thread_name, page, e))


# 解析数据的线程
class 

你可能感兴趣的:(python)