爬取百度贴吧的标题,发帖人,发帖时间

爬取百度贴吧的标题,发帖人,发帖时间

import csv
import os
import re

import requests


def get(url):
    """Scrape one Baidu Tieba listing page and append its posts to ``作业.csv``.

    Downloads ``url``, extracts each thread's title, author and posting
    time with regular expressions, and appends one CSV row per thread.
    The header row is written only when the CSV file is missing or empty,
    so repeated calls do not scatter header lines through the data.

    Args:
        url: Address of the listing page to fetch.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }

    # Send the request; the timeout keeps a stalled connection from hanging
    # the script forever, and an HTTP error page is rejected instead of parsed.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()

    # HTML source of the page
    html = response.text

    # Each capture stops at the next tag / closing quote. A greedy ``.*``
    # would run to the end of the line and drag any trailing markup into
    # the CSV.
    # Title
    title_pattern = re.compile(r'"j_th_tit ">([^<\n]*)')
    titles = title_pattern.findall(html)
    # Author
    author_pattern = re.compile(r'"主题作者: ([^"\n]*)"')
    authors = author_pattern.findall(html)
    # Posting time
    time_pattern = re.compile(r'时间">([^<\n]*)')
    times = time_pattern.findall(html)

    output_path = "作业.csv"
    # The file is opened in append mode, so only a new or empty file
    # still needs the header row.
    needs_header = (
        not os.path.exists(output_path)
        or os.path.getsize(output_path) == 0
    )

    # newline='' is what the csv module expects of the file object; the
    # explicit lineterminator keeps the "\n" endings of earlier output.
    with open(output_path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f, lineterminator="\n")
        if needs_header:
            # Header row of the CSV file
            writer.writerow(["标题", "发帖人", "发帖时间"])
        # zip() stops at the shortest list, so a page where the three
        # patterns match different counts no longer raises IndexError.
        # csv.writer quotes fields containing commas or quotes.
        writer.writerows(zip(titles, authors, times))

    print("ok")


# Listing page to scrape: query-string parameters kw=%E7%88%AC%E8%99%AB, ie=utf-8, pn=50.
# The string has no placeholders, so a plain literal is used instead of an f-string.
url = "https://tieba.baidu.com/f?kw=%E7%88%AC%E8%99%AB&ie=utf-8&pn=50"
get(url=url)

结果图
爬取百度贴吧的标题,发帖人,发帖时间_第1张图片

你可能感兴趣的:(爬取百度贴吧的标题,发帖人,发帖时间)