# Multithreaded Qiushibaike (糗事百科) crawler example
import json
import time
from queue import Empty, Queue
from threading import Lock, Thread

import requests
from lxml import etree
# Shutdown flags: True tells the workers to exit, False lets them keep
# running. crawl_exit is polled by the crawl threads; parse_exit is
# presumably the counterpart for the parse threads (confirm against them).
crawl_exit = parse_exit = False
# Worker thread that downloads the listing pages.
class ThreadCrawl(Thread):
    """Download Qiushibaike listing pages and queue their HTML.

    Page numbers are taken from ``page_queue``; the HTML text of every
    downloaded page is put on ``data_queue``. The thread keeps looping until
    the module-level ``crawl_exit`` flag becomes true.

    Args:
        thread_name: Label printed in this worker's progress messages.
        page_queue: Queue of page numbers that still have to be fetched.
        data_queue: Queue that receives the HTML text of each fetched page.
    """

    def __init__(self, thread_name, page_queue, data_queue):
        super(ThreadCrawl, self).__init__()
        self.thread_name = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/6.0)"}

    def run(self):
        """Fetch pages until ``crawl_exit`` is set.

        An empty ``page_queue`` is not an error: the worker backs off for a
        moment and polls again, so it neither blocks forever nor spins at
        full CPU while waiting for the exit flag. Any failure while handling
        a single page is reported and the worker moves on to the next page.
        """
        while not crawl_exit:
            try:
                page = self.page_queue.get(block=False)
            except Empty:
                # Nothing to fetch right now; sleep briefly instead of
                # busy-looping, then re-check the exit flag.
                time.sleep(0.1)
                continue

            try:
                url = "https://www.qiushibaike.com/8hr/page/" + str(page) + "/"
                print("%s开始工作了,页数是:%d,url=%s" % (self.thread_name, page, url))
                # The timeout keeps a stalled connection from hanging the
                # thread forever, which would also stop it from ever seeing
                # crawl_exit.
                response = requests.get(url, headers=self.headers, timeout=10)
                # Hand the raw HTML over via data_queue.
                self.data_queue.put(response.text)
                # Pause between successful requests.
                time.sleep(1)
            except Exception as e:
                # Best effort: one bad page must not kill the worker, but the
                # failure is reported instead of being silently discarded.
                print("%s failed on page %r: %r" % (self.thread_name, page, e))
# 解析数据的线程
class