多线程爬糗事百科

import threading
import json
import requests
from lxml import etree
from Queue import Queue
import time


class ThreadCrawl(threading.Thread):
    def __init__(self, threadname, pageQueue, dataQueue):
        super(ThreadCrawl, self).__init__()

        self.threadname = threadname
        self.pageQueue = pageQueue
        self.dataQueue = dataQueue
        self.headers = {"User-Agent": "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0;"}

    def run(self):
        print "启动"+self.threadname
        while not CRAWL_EXIT:
            try:
                page = self.pageQueue.get(False)
                url = "https://www.qiushibaike.com/8hr/page/"+str(page)+"/"
                content = requests.get(url, headers=self.headers).text
                time.sleep(1)
                self.dataQueue.put(content)
            except:
                pass
        print "结束"+self.threadname


class ThreadParse(threading.Thread):
    def __init__(self, threadname, dataQueue, filename, lock):
        super(ThreadParse, self).__init__()
        self.threadname = threadname
        self.dataQueue = dataQueue
        self.filename= filename
        self.lock = lock

    def run(self):
        while not PARSE_EXIT:
            try:
                html = self.dataQueue.get(False)
                self.parse(html)
            except:
                pass

    def parse(self,html):
        html = etree.HTML(html)
        result = html.xpath('//div[contains(@id,"qiushi_tag")]')

        for res in result:
            username = res.xpath('.//img/@alt')[0]

            pic = res.xpath('./div/a/img/@src')

            duanzi = res.xpath('.//div[@class="content"]/span')[0].text.strip()

            zan = res.xpath('.//i')[0].text

            comment = res.xpath('.//i')[1].text

            items = {
                "username": username,
                "image": pic,
                "content": duanzi,
                "zan": zan,
                "comment": comment
            }

            with self.lock:
                self.filename.write(json.dumps(items, ensure_ascii=False).encode('utf=8')+"\n")


CRAWL_EXIT = False
PARSE_EXIT = False


def main():
    pageQueue = Queue(20)

    for i in range(1, 21):
        pageQueue.put(i)

    dataQueue = Queue()

    filename = open("duanzi.json", "a")

    lock = threading.Lock()

    crawlList = ["采集线程1号", "采集线程2号", "采集线程3号"]
    threadcrawl = []
    for threadname in crawlList:
        thread = ThreadCrawl(threadname, pageQueue, dataQueue)
        thread.start()
        threadcrawl.append(thread)

    parseList = ["解析线程1号", "解析线程2号", "解析线程3号"]
    threadparse = []
    for threadname in parseList:
        thread = ThreadParse(threadname, dataQueue, filename, lock)
        thread.start()
        threadparse.append(thread)

    while not pageQueue.empty():
        pass

    global CRAWL_EXIT
    CRAWL_EXIT = True

    for thread in threadcrawl:
        thread.join()
        print "1"

    while not dataQueue.empty():
        pass

    global PARSE_EXIT
    PARSE_EXIT = True

    for thread in threadparse:
        thread.join()
        print "2"

    with lock:
        filename.close()

    print "谢谢使用"


if __name__ == "__main__":
    main()

你可能感兴趣的:(多线程爬糗事百科)