t1 = threading.Thread(target=func, args=())  # func is the callable to run; args is its argument tuple
t1.daemon = True   # mark as a daemon thread; setDaemon(True) is deprecated since Python 3.10
t1.start()   # the thread only starts running here
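# A minimal runnable sketch of what the daemon flag does (an assumed example,
# not part of the original snippet): daemon threads are killed as soon as the
# main thread exits, so the infinite loop below does not hang the program.
import threading
import time

def tick():
    while True:
        print("daemon thread is still running")
        time.sleep(0.5)

t = threading.Thread(target=tick, args=())
t.daemon = True
t.start()
time.sleep(1.2)   # the main thread exits here and the daemon dies with it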
# Using the queue module
from queue import Queue
q = Queue(maxsize=100)
item = {}
q.put_nowait(item)   # put without waiting; raises queue.Full if the queue is full
q.put(item)          # put an item; blocks while the queue is full
q.get_nowait()       # get without waiting; raises queue.Empty if the queue is empty
q.get()              # get an item; blocks while the queue is empty
q.qsize()            # number of items currently in the queue
q.join()             # the queue keeps an unfinished-task count; join() blocks the calling thread until that count reaches 0
q.task_done()
# put() increments the count; get() alone does not decrement it, so each get() must be paired with task_done()
```
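To make the unfinished-task counting above concrete, here is a minimal single-threaded sketch (everything below uses only the standard queue module):

from queue import Queue

q = Queue()
q.put("a")         # unfinished-task count: 1
q.put("b")         # unfinished-task count: 2
item = q.get()     # count stays at 2; get() alone does not decrement it
q.task_done()      # count: 1
item = q.get()
q.task_done()      # count: 0
q.join()           # returns immediately because the count is already 0
print("all tasks done")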
# 3. Using a queue inside threads; a queue can be used to pass data between threads
from queue import Queue
import threading
q = Queue()
def add_to_queue():
    for i in range(0, 100):
        print("putting into the queue: {}".format(i))
        q.put(i)
def get_from_queue():
    # The consumer does not know how many items the producer will put in,
    # so instead of a fixed-range for loop we loop forever and keep taking
    # items until the queue is drained.
    # for i in range(0, 100):
    while True:
        print("taken from the queue: {}".format(q.get()))
        q.task_done()
# create the producer thread
t = threading.Thread(target=add_to_queue)
# mark it as a daemon thread
t.daemon = True
# start the thread
t.start()
t = threading.Thread(target=get_from_queue)
t.daemon = True   # the consumer loops forever; as a daemon it dies when the main thread exits
t.start()
# block the main thread until every queued item has been marked done
q.join()
'''
Queue.qsize()      returns the approximate number of items in the queue
Queue.empty()      returns True if the queue is empty, otherwise False
Queue.full()       returns True if the queue is full, otherwise False; full() corresponds to the maxsize given at construction
Queue.get([block[, timeout]])   removes and returns an item; timeout is the maximum time to wait
Queue.get_nowait() equivalent to Queue.get(False); non-blocking
Queue.put(item[, block[, timeout]])  puts an item into the queue; blocks up to timeout seconds while the queue is full
Queue.task_done()  tells the queue that one fetched task is finished; each get() fetches a task and the matching task_done() marks it complete
Queue.join()       blocks until every item put into the queue has been fetched and marked done
'''
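The difference between the blocking and non-blocking calls listed above is how they fail, which the following small sketch demonstrates (only the standard queue module is used):

from queue import Queue, Empty, Full

q = Queue(maxsize=1)
q.put("first")
try:
    q.put_nowait("second")   # the queue is full, so this raises at once
except Full:
    print("queue is full")
print(q.get())               # prints "first"
try:
    q.get(timeout=0.5)       # the queue is now empty; raises Empty after 0.5 s
except Empty:
    print("queue is empty")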
2. Qiushibaike example
(1) Import the libraries (requests and lxml are third-party packages; install them with pip install requests lxml)
import requests
from threading import Thread
from lxml import etree
from queue import Queue
(2) Create a class that fetches the pages
class GetHTML(Thread):   # inherit from Thread
    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'}
        while not self.url_queue.empty():
            url = self.url_queue.get()
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                self.html_queue.put(response.text)
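One caveat: with several GetHTML threads sharing url_queue, the empty() check can race; empty() may return False, another thread then takes the last URL, and get() blocks forever. A common workaround, sketched here as an assumed alternative rather than the original design, is a blocking get() with a timeout:

from queue import Queue, Empty

def drain(url_queue):
    while True:
        try:
            url = url_queue.get(timeout=1)   # give up after one quiet second
        except Empty:
            break                            # treat the queue as drained
        print("would fetch:", url)

q = Queue()
q.put('https://www.qiushibaike.com/text/page/1/')
drain(q)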
(3) Create a class that parses the pages and extracts the text
class ParserHTML(Thread):
    def __init__(self, html_queue):
        Thread.__init__(self)
        self.html_queue = html_queue

    def run(self):
        while not self.html_queue.empty():
            html = self.html_queue.get()
            e = etree.HTML(html)
            span_contents = e.xpath('//div[@class="content"]/span[1]')
            for span in span_contents:
                info = span.xpath('string(.)')
                print(info.strip())
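The string(.) XPath expression used in run() concatenates all of the text nodes under an element, which is how a post split across <br/> tags comes out as one string; a small self-contained check:

from lxml import etree

html = '<div class="content"><span>first line<br/>second line</span></div>'
e = etree.HTML(html)
for span in e.xpath('//div[@class="content"]/span[1]'):
    print(span.xpath('string(.)'))   # prints: first linesecond line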
(4) Main entry point
if __name__ == "__main__":
    # queue holding the URLs to crawl
    url_queue = Queue()
    # queue holding the downloaded pages
    html_queue = Queue()
    start_url = 'https://www.qiushibaike.com/text/page/{}/'
    for i in range(1, 14):
        url = start_url.format(i)
        url_queue.put(url)
    crawl_list = []
    for i in range(3):
        crawl = GetHTML(url_queue, html_queue)
        crawl_list.append(crawl)
        crawl.start()
    for crawl in crawl_list:
        crawl.join()
    parser_list = []
    for i in range(3):
        parser = ParserHTML(html_queue)
        parser_list.append(parser)
        parser.start()
    for parser in parser_list:
        parser.join()
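Note that the crawler threads are joined before any ParserHTML thread starts. This is deliberate: ParserHTML.run() exits as soon as html_queue is momentarily empty, so a parser started too early would quit before the first page arrives. If you want downloading and parsing to overlap, one option is the task_done()/join() pattern from section 1; the sketch below is an assumed alternative (parse_worker is an illustrative stand-in for the real parsing code), not the original design:

from queue import Queue
from threading import Thread

def parse_worker(html_queue):
    while True:
        html = html_queue.get()      # blocks until a page is available
        print("parsing page of length", len(html))
        html_queue.task_done()       # pair every get() with task_done()

html_queue = Queue()
for n in range(3):
    t = Thread(target=parse_worker, args=(html_queue,))
    t.daemon = True                  # daemons die once the main thread exits
    t.start()
for page in ("<html>a</html>", "<html>bb</html>"):
    html_queue.put(page)
html_queue.join()                    # wait until every page has been parsed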
3. Complete code
import requests
from threading import Thread
from lxml import etree
from queue import Queue
# page-fetching class
class GetHTML(Thread):
    def __init__(self, url_queue, html_queue):
        Thread.__init__(self)
        self.url_queue = url_queue
        self.html_queue = html_queue

    def run(self):
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:6.0) Gecko/20100101 Firefox/6.0'}
        while not self.url_queue.empty():
            url = self.url_queue.get()
            response = requests.get(url, headers=headers)
            if response.status_code == 200:
                self.html_queue.put(response.text)
# page-parsing class that extracts the information
class ParserHTML(Thread):
    def __init__(self, html_queue):
        Thread.__init__(self)
        self.html_queue = html_queue

    def run(self):
        while not self.html_queue.empty():
            html = self.html_queue.get()
            e = etree.HTML(html)
            span_contents = e.xpath('//div[@class="content"]/span[1]')
            for span in span_contents:
                info = span.xpath('string(.)')
                print(info.strip())
if __name__ == "__main__":
    # queue holding the URLs to crawl
    url_queue = Queue()
    # queue holding the downloaded pages
    html_queue = Queue()
    start_url = 'https://www.qiushibaike.com/text/page/{}/'
    for i in range(1, 14):
        url = start_url.format(i)
        url_queue.put(url)
    crawl_list = []
    for i in range(3):
        crawl = GetHTML(url_queue, html_queue)
        crawl_list.append(crawl)
        crawl.start()
    for crawl in crawl_list:
        crawl.join()
    parser_list = []
    for i in range(3):
        parser = ParserHTML(html_queue)
        parser_list.append(parser)
        parser.start()
    for parser in parser_list:
        parser.join()