Multithreaded Scraping of Baidu Tieba

I've written before about scraping images from Baidu Tieba.

Today I wrote a multithreaded crawler that collects threads and their replies from Baidu Tieba. On Windows, multithreading still feels like the more efficient way to do this kind of work.

Since I don't feel like writing much prose, and the code is admittedly a bit rough, I'll just post it directly. If you spot bugs or shortcomings, corrections are welcome.
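One quick note before the listing: the code targets Python 3 (it uses the queue module and urllib.parse) and relies on the third-party requests and urllib3 packages, so install those with pip first if you don't already have them.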

# -*- coding: utf-8 -*-
import re
import queue
import urllib.parse

import requests
import urllib3
from threading import Thread

class Spider:
    def __init__(self):
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/75.0.3770.100 Safari/537.36'}
    # fetch the listing page returned for the keyword search
    def get_url(self, url):
        urllib3.disable_warnings()
        response = requests.get(url, headers=self.headers, timeout=5)
        return response
    # get the page count from the "last page" pagination link
    # (the pattern assumes Tieba's current list-page markup, where the
    # "last pagination-item" link's href ends with pn=<offset of the last page>;
    # it may need adjusting if the HTML changes)
    def get_pages(self, html):
        reg = r'href="(\S+?pn=\d+)" class="last pagination-item'
        imgre = re.compile(reg)
        pageList = imgre.findall(html)
        # the last "="-separated piece of the matched href is the pn offset
        page = pageList[0].split("=")
        return page[-1]
    # get each thread's URL, title, author and creation time from one listing page
    # (the regex patterns assume Tieba's current list-page markup and may need
    # tweaking if the HTML changes)
    def get_page_url(self, html, time_end):
        url = re.compile(r'href="(/p/\d+)"')
        title = re.compile(r'class="j_th_tit ">(.*?)</a>')
        author = re.compile(r'title="主题作者: (.*?)"')
        create_time = re.compile(r'title="创建时间">(.*?)<')
        url_list = url.findall(html)
        title_list = title.findall(html)
        author_list = author.findall(html)
        create_time_list = create_time.findall(html)
        mssg_list = zip(title_list, author_list, url_list, create_time_list)
        # Tieba shows "HH:MM" for threads from today, "M-DD" for the current
        # year and "YYYY-MM" for older threads; stop once a thread is older
        # than time_end (the "M-DD" branches assume time_end is in the current year)
        for x in mssg_list:
            if ":" in x[3]:                                   # posted today, always keep
                pass
            elif int(x[3].split("-")[0]) > 2000:              # "YYYY-MM" format
                if int(x[3].split("-")[0]) < int(time_end.split("-")[0]):
                    break
                elif int(x[3].split("-")[0]) == int(time_end.split("-")[0]) and \
                        int(x[3].split("-")[1]) < int(time_end.split("-")[1]):
                    break
            elif int(x[3].split("-")[0]) < int(time_end.split("-")[1]):   # "M-DD", earlier month
                break
            elif int(x[3].split("-")[0]) == int(time_end.split("-")[1]):  # same month, compare days
                if int(x[3].split("-")[1]) < int(time_end.split("-")[2]):
                    break
            sub_url = "https://tieba.baidu.com" + x[2]
            finally_msg = [x[0], x[1], sub_url, x[3]]
            print(finally_msg)
            self.write_ret(finally_msg)

    # append one record to ret.txt
    def write_ret(self, mssg):
        with open("ret.txt", "a+", encoding="utf-8") as f:
            line = ", ".join(mssg) + "\n"
            f.write(line)
    # a queue item is "<listing-page URL>+<end date>"; split it apart, fetch
    # the page and pull the matching threads out of it
    def show_main(self, url):
        try:
            url_true = url.split("+")[0]
            time_end = url.split("+")[1]
            html = self.get_url(url_true).text
            self.get_page_url(html, time_end)
        except Exception as e:
            print(e)

class Mythread(Thread):
    def __init__(self, queue):
        super(Mythread, self).__init__()
        self.queue = queue

    def run(self):
        self.tieba()

    def tieba(self):
        # keep pulling listing-page URLs until the queue is drained;
        # get_nowait() avoids blocking forever when another thread grabs the
        # last item between the empty() check and the get()
        spider = Spider()
        while not self.queue.empty():
            try:
                url = self.queue.get_nowait()
            except queue.Empty:
                break
            spider.show_main(url)
def main():
    while True:
        try:
            keyword = input("Search keyword: ")
            time_end = input("Cut-off date to crawl back to (e.g. 2019-8-22): ")
            if len(time_end.split("-")) < 3:
                print("That date doesn't look right")
                continue
            url = "https://tieba.baidu.com/f?ie=utf-8&kw=%s&fr=search" % urllib.parse.quote(keyword)
            spider = Spider()
            html = spider.get_url(url).text
            # get_pages() returns the pn offset of the last page, i.e. (pages - 1) * 50
            page = int(spider.get_pages(html)) // 50 + 1
            print("%s pages in total" % page)
            break
        except Exception:
            print("Invalid input, please try again")
            continue

    myqueue = queue.Queue()
    for v in range(page):
        # tack the end date onto each listing-page URL with "+" so show_main() can split it off again
        url = "https://tieba.baidu.com/f?kw=%s&ie=utf-8&pn=%s" % (urllib.parse.quote(keyword), v * 50) + "+" + time_end
        myqueue.put(url)

    allthread = []
    for v in range(200):    # up to 200 workers; extras exit as soon as the queue is empty
        mythread = Mythread(myqueue)
        mythread.start()
        allthread.append(mythread)

    for v in allthread:
        v.join()
if __name__ == "__main__":
    main()
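
If you would rather not manage the queue and the pool of 200 threads by hand, the same fan-out can be expressed with the standard-library concurrent.futures module. The sketch below is only an illustration built on the Spider class above; the crawl_all helper and the pool size of 32 are my own choices, not part of the original script.

from concurrent.futures import ThreadPoolExecutor

def crawl_all(urls):
    # urls is the same list of "<listing-page URL>+<end date>" strings that main() builds
    spider = Spider()
    with ThreadPoolExecutor(max_workers=32) as pool:
        # consuming the map() iterator blocks until every page has been
        # processed, which replaces the explicit start()/join() loop and
        # sidesteps the empty()/get() race
        list(pool.map(spider.show_main, urls))

The executor keeps a fixed set of workers busy until the iterable is exhausted, so no thread ever sits waiting on an already-empty queue.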


