Crawling Baidu Tieba with Python Multithreading

Task

Crawl the username, post content, and post time of every floor (reply) in a thread, then sort the entries in chronological order.

Key Steps

  1. Parse the page
    Parsing is done with BeautifulSoup, which is quick and convenient! If you want to reuse my code directly, check whether the class names on the page are still the same; if they differ, just update them.
response = requests.get(spider_url, headers=headers).content
soup = BeautifulSoup(response, "html.parser")
# soup = BeautifulSoup(open("page{}.html".format(page)), "html.parser")
for j in soup.find_all("div", class_="l_post l_post_bright j_l_post clearfix"):
    floor = [j.find_all("li", class_="d_name")[0].text.strip(),
             j.find_all("div", class_="d_post_content j_d_post_content")[0].text.strip(),
             j.find_all("div", class_="core_reply_tail clearfix")[0].text[-16:].strip()]
    print(floor)
    self.data_queue.put(floor)
  2. Multithreading
    I use 4 threads, mainly for demonstration. Threads are combined with queues; you can picture it as several workers sharing one assembly line. A page-number queue is defined, and every thread takes page numbers from that queue, which guarantees that no two threads crawl the same page; each parsed result is then put into a result queue. See the minimal sketch after this list and the full code below for the details.

  3. Details to watch
    Baidu has anti-crawling measures! So while you are still working out the parsing, I recommend not sending requests over and over. Instead, save a few pages locally, write your tests against those copies, and only send real requests once the tests pass, so that your IP does not get banned. If it does get banned, waiting about half an hour is usually enough. This is why many people get no results when they crawl; check this first. A short sketch of this save-then-test workflow follows after this list.
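
A minimal sketch of the thread-plus-queue pattern described in step 2, with illustrative names (worker and the placeholder result string are not part of the full code below): each worker pulls page numbers from a shared queue until it is empty, so no page is fetched twice, and pushes its results into a result queue.

import threading
from queue import Queue, Empty

def worker(page_queue, data_queue):
    # Keep taking page numbers until the shared queue is drained
    while True:
        try:
            page = page_queue.get(block=False)
        except Empty:
            break  # no more pages: this worker is done
        # Placeholder for the real request + parsing step
        data_queue.put("result of page {}".format(page))

page_queue, data_queue = Queue(), Queue()
for i in range(1, 11):
    page_queue.put(i)

workers = [threading.Thread(target=worker, args=(page_queue, data_queue)) for _ in range(4)]
for t in workers:
    t.start()
for t in workers:
    t.join()  # wait for every worker instead of busy-waiting

while not data_queue.empty():
    print(data_queue.get())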

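A sketch of the save-first, test-offline workflow from step 3 (the headers dict is trimmed down here; the thread URL and the page{}.html file names follow the ones used in the full code): fetch each page once, cache it on disk, then develop the BeautifulSoup parsing against the local copies without hitting Baidu again.

import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

# Step 1: fetch a few pages once and cache them locally
for page in range(1, 4):
    url = "https://tieba.baidu.com/p/6132068127?pn={}".format(page)
    with open("page{}.html".format(page), "wb") as f:
        f.write(requests.get(url, headers=headers).content)

# Step 2: develop and test the parser against the cached copies,
# without sending any further requests
for page in range(1, 4):
    with open("page{}.html".format(page), "rb") as f:
        soup = BeautifulSoup(f, "html.parser")
    posts = soup.find_all("div", class_="l_post l_post_bright j_l_post clearfix")
    print("page {}: {} posts parsed".format(page, len(posts)))
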
Results

(Screenshot of the crawled, time-sorted results.)

Full Code (note that spider_url is the Tieba thread URL)

import csv
import threading
from queue import Queue, Empty
import requests
import pandas as pd
from bs4 import BeautifulSoup


CRAWL_EXIT = False

class ThreadCrawl(threading.Thread):
    def __init__(self, thread_name, page_queue, data_queue):
        # Call the parent class initializer
        super(ThreadCrawl, self).__init__()
        self.threadName = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue

    def run(self):
        print(self.threadName + ' started ************')
        while not CRAWL_EXIT:
            try:
                page = self.page_queue.get(block=False)  # take a page number from the queue
            except Empty:
                break  # no pages left, this thread is done
            # spider_url is the address of the page to crawl
            headers = {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'Keep-Alive',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
                'sec-ch-ua': '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"'
            }
            spider_url = "https://tieba.baidu.com/p/6132068127?pn={}".format(page)
            print(spider_url)
            response = requests.get(spider_url, headers=headers).content
            soup = BeautifulSoup(response, "html.parser")
            # soup = BeautifulSoup(open("page{}.html".format(page)), "html.parser")
            for j in soup.find_all("div", class_="l_post l_post_bright j_l_post clearfix"):
                # username, post content, and the last 16 characters of the tail (the post time)
                floor = [j.find_all("li", class_="d_name")[0].text.strip(),
                         j.find_all("div", class_="d_post_content j_d_post_content")[0].text.strip(),
                         j.find_all("div", class_="core_reply_tail clearfix")[0].text[-16:].strip()]
                print(floor)
                self.data_queue.put(floor)

# Append one row to the CSV file
def toCsv(s):
    with open('data.csv', 'a', encoding="utf-8-sig", newline='') as f:
        csv.writer(f, dialect='excel').writerow(s)
        print('row written')

# Time-sorting helper (unused alternative to sort_time below): bubble sort on
# strings such as "2019-05-24 13:06", comparing year, month and day.
def date_sort(x):
    ls = list(x)
    # bubble sort
    for j in range(len(ls) - 1):
        for i in range(len(ls) - j - 1):
            lower = ls[i].split()[0].split('-')
            upper = ls[i + 1].split()[0].split('-')
            for s in range(3):
                if int(lower[s]) > int(upper[s]):
                    ls[i], ls[i + 1] = ls[i + 1], ls[i]
                    break
                elif int(lower[s]) < int(upper[s]):
                    break
    return ls

def main():
    # Create a page-number queue and fill it with the 10 pages to crawl
    page_queue = Queue(10)
    for i in range(1, 11):
        page_queue.put(i)
    print(page_queue.queue)
    data_queue = Queue(maxsize=0)
    craw_list = ['Crawl thread 1', 'Crawl thread 2', 'Crawl thread 3', 'Crawl thread 4']
    thread_crawl = []
    for thread_name in craw_list:
        c_thread = ThreadCrawl(thread_name, page_queue, data_queue)
        c_thread.start()
        thread_crawl.append(c_thread)
    # Wait until page_queue is empty, i.e. every page number has been taken
    while not page_queue.empty():
        pass
    # page_queue is empty, so tell the crawl threads to exit their loops
    global CRAWL_EXIT
    CRAWL_EXIT = True
    # Wait for every crawl thread to finish before reading the results
    for c_thread in thread_crawl:
        c_thread.join()
    print(data_queue.empty())
    result_before = []
    while not data_queue.empty():
        result_before.append(data_queue.get())
    result = sort_time(result_before)
    for i in result:
        toCsv(i)


# Sort the collected rows by the time column ("时间") using pandas
def sort_time(rows):
    df = pd.DataFrame(columns=["用户名", "发表内容", "时间"], data=rows)
    df['时间'] = pd.to_datetime(df['时间'])
    df.sort_values('时间', inplace=True)
    return df.values.tolist()


if __name__ == '__main__':
    main()

If you have any questions or want to discuss and learn together, contact me by email: [email protected]
