Python Crawler Series (5): A Multithreading Example

If you have followed the previous posts in this series, crawlers should no longer feel unfamiliar. Today we'll look at how to use multiple threads.

Because of the GIL (Global Interpreter Lock), people tend to dismiss Python's multithreading as a "chicken rib" of little real use. Even so, it is very much worth using in a crawler; the full reasoning won't be elaborated here, but in I/O-heavy programs, where threads spend most of their time waiting on the network, this chicken rib is actually quite tasty. Below is a concrete example of a multithreaded crawler. Without further ado, here is the code.

from urllib import request
import queue
from lxml import etree
import random,codecs
import threading
import json,time
# Crawl thread class: fetches pages and puts the raw HTML onto the data queue
class Thread_crawl(threading.Thread):
    def __init__(self,threadID,pagequeue,dataqueue):
        super(Thread_crawl,self).__init__()
        self.threadID = threadID
        self.pagequeue = pagequeue
        self.dataqueue = dataqueue
        self.headers = {"User-Agent" : "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.94 Safari/537.36"}
    def run(self):
        print(self.threadID + " started")

        while not Exit_crawl:
            try:
                # Non-blocking get; raises queue.Empty once every page number has been taken
                page = self.pagequeue.get(False)
                url = 'https://www.qiushibaike.com/8hr/page/' + str(page) + '/'

                req = request.Request(url, headers=self.headers)
                content = request.urlopen(req).read()

                # Sleep 0-2 seconds so requests are not fired too quickly
                sleep_time = random.randint(0, 2)
                time.sleep(sleep_time)
                self.dataqueue.put(content)
            except queue.Empty:
                continue  # page queue drained; keep looping until the main thread sets Exit_crawl
            except Exception:
                print('download failed')
        print(self.threadID + " finished")


# Exit flags, flipped to True by main() once the corresponding queue has been drained
Exit_crawl = False
Exit_parse = False
# Parse thread class: pulls raw HTML off the data queue, extracts items and writes them to file
class Thread_parse(threading.Thread):
    def __init__(self,threadID,dataqueue,filename,lock):
        super(Thread_parse,self).__init__()
        self.threadID = threadID
        self.dataqueue = dataqueue
        self.filename = filename
        self.lock = lock
    def run(self):
        print(self.threadID + " started")
        while not Exit_parse:
            try:
                self.parse()
            except Exception:
                pass  # data queue momentarily empty (or a page failed to parse); retry
    def parse(self):
        # Non-blocking get; raises queue.Empty when nothing is buffered yet
        html_data = self.dataqueue.get(False)
        node = etree.HTML(html_data)
        node_list = node.xpath('//div[contains(@id,"qiushi_tag")]')
        for obj in node_list:
            # xpath returns a list with a single element; index into it to get the username
            username = obj.xpath('.//h2')[0].text
            # The joke text sits in the span under the content div
            content = obj.xpath('.//div[@class="content"]/span')[0].text
            items = {
                "username": username,
                "content": content,
            }
            # Serialize and write under the lock so lines from different threads don't interleave
            with self.lock:
                data = json.dumps(items, ensure_ascii=False)
                print(data)
                self.filename.write(data.replace('\n', '') + '\n')


def main():
    # Queue of page numbers to fetch and queue of raw HTML waiting to be parsed
    pagequeue = queue.Queue(20)
    dataqueue = queue.Queue()
    lock = threading.Lock()
    filename = codecs.open('duanzi.txt', 'w', encoding='utf-8')
    # Crawl pages 1 to 10
    for i in range(1, 11):
        pagequeue.put(i)

    # Start the crawl threads
    crawlthreads = []
    crawlList = ['crawl thread 1', 'crawl thread 2', 'crawl thread 3']
    for threadID in crawlList:
        thread = Thread_crawl(threadID, pagequeue, dataqueue)
        thread.start()
        crawlthreads.append(thread)

    # Give the crawl threads a head start before parsing begins
    time.sleep(1)
    # Start the parse threads
    parsethreads = []
    parselist = ['parse thread 1', 'parse thread 2', 'parse thread 3']
    for threadID in parselist:
        thread = Thread_parse(threadID, dataqueue, filename, lock)
        thread.start()
        parsethreads.append(thread)

    # Busy-wait until every page number has been taken off the queue
    while not pagequeue.empty():
        pass

    # Tell the crawl threads to exit, then wait for them
    global Exit_crawl
    Exit_crawl = True
    print("page queue is empty")
    for thread in crawlthreads:
        thread.join()
        print("crawl thread joined")

    # Busy-wait until the parse threads have drained the data queue
    while not dataqueue.empty():
        pass

    # Tell the parse threads to exit, then wait for them
    global Exit_parse
    Exit_parse = True

    print("data queue is empty")

    for thread in parsethreads:
        thread.join()
        print("parse thread joined")

    with lock:
        filename.close()
    print("all done, thanks for using")

if __name__ == "__main__":
    main()

The code above uses a few things we haven't covered yet in this series, such as HTML parsing and the use of XPath. That's fine: run the code first and see what the crawler produces, which should make the topic more interesting, and we'll walk through those pieces together next.
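As a quick preview of what the parse threads do, here is a minimal, self-contained sketch of parsing HTML with lxml and XPath. The sample_html fragment below is made up for illustration; only etree.HTML and the two xpath expressions mirror the crawler code above.

from lxml import etree

# A made-up HTML fragment shaped roughly like the pages the crawler fetches
sample_html = '''
<div id="qiushi_tag_1">
  <h2>some_user</h2>
  <div class="content"><span>a short joke</span></div>
</div>
'''

node = etree.HTML(sample_html)  # build an element tree from the HTML string
for obj in node.xpath('//div[contains(@id,"qiushi_tag")]'):
    username = obj.xpath('.//h2')[0].text  # the <h2> under each item holds the username
    content = obj.xpath('.//div[@class="content"]/span')[0].text  # the joke text
    print(username, content)

Running this prints "some_user a short joke", which is exactly the kind of (username, content) pair the parse threads pack into JSON and write to duanzi.txt.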
