Learning Notes: A Multithreaded Qiushibaike Spider

Notes from building a Qiushibaike spider~ The work is split into stages (build URLs, fetch pages, extract posts, save results) that hand items to each other through Queue objects, with a small pool of daemon worker threads per stage.
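
The whole spider rests on one coordination pattern from the standard library: workers loop over Queue.get(), call task_done() exactly once per item, and the main thread's queue.join() returns only when every put() has been matched by a task_done(). A minimal standalone sketch of just that pattern (my own illustration, separate from the spider below):

import threading
from queue import Queue

q = Queue()

def worker():
    while True:
        item = q.get()        # blocks until an item is available
        print("handled", item)
        q.task_done()         # exactly one task_done() per get()

for _ in range(4):
    t = threading.Thread(target=worker)
    t.daemon = True           # daemon threads die with the main thread
    t.start()

for i in range(10):
    q.put(i)

q.join()                      # returns once every put() is matched by a task_done()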

#!/usr/bin/python3
#coding=utf-8

import requests
from lxml import etree
import json
import time
import threading
from queue import Queue


class QiuBaiSpider:

    def __init__(self):

        self.start_url = "https://www.qiushibaike.com/8hr/page/{}/"
        self.headers = {}
        self.headers["User-Agent"] = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36"
        self.headers["cookie"] ="_ga=GA1.2.234604122.1544328176; __cur_art_index=3500; Hm_lvt_18a964a3eb14176db6e70f1dd0a3e557=1544339120; _gid=GA1.2.797685492.1544446033; _xsrf=2|9bbbff68|88fa2d00e243512b68399ae2190d78d3|1544446076; Hm_lvt_2670efbdd59c7e3ed3749b458cafaa37=1544328175,1544446032,1544446079; Hm_lpvt_2670efbdd59c7e3ed3749b458cafaa37={}; _gat=1".format(str(int(time.time())))
        self.queue_url = Queue()              # URLs waiting to be fetched
        self.queue_pares_url_temp = Queue()   # raw responses waiting to be parsed
        self.queue_content_list = Queue()     # extracted items waiting to be saved

    def get_url_list(self):
        # Seed the URL queue with the 13 list pages
        for i in range(1, 14):
            self.queue_url.put(self.start_url.format(i))

    def pares_url_temp(self):
        while True:
            # Take a URL from the URL queue (blocks until one is available)
            url_temp = self.queue_url.get()
            response = requests.get(url_temp, headers=self.headers)
            print(url_temp)
            # Put the response body on the parse queue
            self.queue_pares_url_temp.put(response.content)
            # Mark this URL as processed so queue_url.join() can eventually return
            self.queue_url.task_done()

    def content_list(self):
        while True:
            html_str = self.queue_pares_url_temp.get()
            # Parse the raw HTML string into an element tree
            html = etree.HTML(html_str)
            item_list = []
            # TODO: also handle the other card types ("typs_video",
            # "typs_multi", "typs_word"), not just "typs_image"
            div_list = html.xpath("//div[@class='recommend-article']/ul/li[contains(@class,'typs_image')]")
            for div in div_list:
                # Collect the fields of one post into a dict
                item = {}
                item["content"] = div.xpath(".//a[@class='recmd-content']/text()")
                item["content_img"] = div.xpath(".//a[contains(@class,'recmd-left')]/@href")
                item["content_img"] = "https://www.qiushibaike.com" + item["content_img"][0] if len(item["content_img"]) > 0 else None
                item["user_name"] = div.xpath(".//span[@class='recmd-name']/text()")
                # recmd-num holds the like count first and the comment count second (when present)
                num_list = div.xpath(".//div[@class='recmd-num']/text()")
                item["like"] = num_list[0] if len(num_list) > 0 else None
                item["comment"] = num_list[1] if len(num_list) > 1 else None
                print(item)
                item_list.append(item)
            # Hand the extracted items to the save queue
            self.queue_content_list.put(item_list)
            # Mark this response as processed
            self.queue_pares_url_temp.task_done()

    def save_content(self):
        while True:
            content_list = self.queue_content_list.get()
            # Raw string so the backslashes in the Windows path stay literal
            file_path = r"D:\Program Files\Project\helloworld\qiubai.txt"
            for content in content_list:
                with open(file_path, "a", encoding="utf-8") as f:
                    f.write(json.dumps(content, ensure_ascii=False, indent=2))
                    f.write("\n")
            # Mark this item list as processed
            self.queue_content_list.task_done()
            print("saved")

    def run(self):
        thread_list = []
        # 1. Seed the URL queue with the start URLs (only needs to run once)
        self.get_url_list()

        # 2. Request every URL and queue up the responses
        for i in range(4):
            thread_list.append(threading.Thread(target=self.pares_url_temp))

        # 3. Extract the data from each response
        for i in range(4):
            thread_list.append(threading.Thread(target=self.content_list))

        # 4. Save the extracted data
        for i in range(4):
            thread_list.append(threading.Thread(target=self.save_content))

        for t in thread_list:
            # Daemon threads exit automatically once the main thread finishes
            t.daemon = True
            t.start()

        for q in [self.queue_url, self.queue_pares_url_temp, self.queue_content_list]:
            # Block until every item put on this queue has been marked task_done()
            q.join()

if __name__ == "__main__":
    qiubai = QiuBaiSpider()
    qiubai.run()
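
One pitfall worth recording: threading.Thread(target=f()) calls f immediately in the main thread and passes its return value (usually None) as the target, so the "thread" ends up doing nothing. target=f, without parentheses, is what hands the function itself to the new thread. A tiny demonstration of the difference:

import threading

def f():
    print("running in", threading.current_thread().name)

# Wrong: f() executes right here in MainThread, and Thread gets target=None
t_wrong = threading.Thread(target=f())
t_wrong.start()   # this thread does nothing

# Right: the new thread is the one that calls f
t_right = threading.Thread(target=f)
t_right.start()   # prints "running in Thread-1 ..."
t_right.join()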

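The XPath expressions can also be sanity-checked without hitting the site. The snippet below runs them against a hand-written card fragment; the markup is my own simplified approximation of one "typs_image" list item, not the live page source:

from lxml import etree

# Hand-made, simplified approximation of one recommendation card
sample = """
<div class="recommend-article"><ul>
<li class="item typs_image">
  <a class="recmd-left" href="/article/121310000"></a>
  <a class="recmd-content">a funny post</a>
  <span class="recmd-name">some_user</span>
  <div class="recmd-num">1024<span>funny</span>233</div>
</li>
</ul></div>
"""

html = etree.HTML(sample)
for li in html.xpath("//div[@class='recommend-article']/ul/li[contains(@class,'typs_image')]"):
    print(li.xpath(".//a[@class='recmd-content']/text()"))        # ['a funny post']
    print(li.xpath(".//a[contains(@class,'recmd-left')]/@href"))  # ['/article/121310000']
    print(li.xpath(".//div[@class='recmd-num']/text()"))          # ['1024', '233']
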
This is just a technical record; it will be taken down on notice if it infringes any rights~
