使用python3 爬取微博评论(作者,时间,点赞数,楼数,评论)存储为excel格式

其中的 Cookie 需要替换为自己浏览器登录微博后抓取到的 Cookie 值。

代码中的 https://m.weibo.cn/detail/***** 链接、微博 id 参数以及 Excel 文件的保存路径都需要根据目标微博自行修改。

使用python3 爬取微博评论(作者,时间,点赞数,楼数,评论)存储为excel格式_第1张图片

 

 代码如下:


import requests
import time
import os
import csv
import codecs
import sys
import json
import importlib
from openpyxl import load_workbook
import openpyxl
from bs4 import BeautifulSoup
# NOTE: removed `importlib.reload(sys)` — it was a Python 2 idiom used before
# calling sys.setdefaultencoding(); it has no effect in Python 3.

 
# Base URL of the hot-comment API for the target Weibo post; the pagination
# cursor (max_id) is appended per request via the `params` argument.
url='https://m.weibo.cn/comments/hotflow?id=4393956248857472&mid=4393956248857472&max_id='
# Request headers. 'Cookie' must be replaced with the cookie copied from your
# own logged-in browser session; 'Referer' must point at the same post id as
# the URL above, or the API rejects the request.
headers={
        'Cookie':'自己电脑端的cookie',
        'Referer': 'https://m.weibo.cn/detail/4393956248857472',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
}

 
def get_page(max_id, id_type, timeout=10):
    """Fetch one page of hot comments from the Weibo mobile API.

    Parameters
    ----------
    max_id : pagination cursor from the previous page's ``max_id`` field
             (0 requests the first page).
    id_type : pagination cursor type from the previous page's
              ``max_id_type`` field.
    timeout : seconds to wait for the server before giving up.

    Returns
    -------
    dict or None
        The decoded JSON payload, or ``None`` when the request fails or
        the server answers with a non-200 status.
    """
    params = {
        'max_id': max_id,
        'max_id_type': id_type,
    }
    try:
        # A timeout keeps the crawler from hanging forever on a stalled socket.
        r = requests.get(url, params=params, headers=headers, timeout=timeout)
        if r.status_code == 200:
            return r.json()
    except requests.RequestException as e:
        # RequestException also covers Timeout and HTTP-level errors,
        # not just ConnectionError as before.
        print('error', e.args)
    return None
 


def parse_page(jsondata):
    """Extract the pagination cursors from a comment-page payload.

    Parameters
    ----------
    jsondata : dict or None — payload returned by ``get_page()``.

    Returns
    -------
    dict or None
        ``{'max_id': ..., 'max_id_type': ...}`` for requesting the next
        page, or ``None`` when the payload is missing or has no 'data'
        section (the API omits it when rate-limited or logged out).
    """
    if not jsondata:
        return None
    items = jsondata.get('data')
    if not items:
        # Previously this crashed with AttributeError when 'data' was absent.
        return None
    return {
        'max_id': items['max_id'],
        'max_id_type': items['max_id_type'],
    }

def write_excel_title(path='E:/EmotionTest/weibo.xlsx', sheet_name='陈伟霆-垃圾分类'):
    """Create the output workbook and write the header row.

    Parameters
    ----------
    path : destination .xlsx file; overwritten if it already exists.
           Defaults preserve the original hard-coded location.
    sheet_name : title shown on the worksheet tab (bottom-left in Excel).
    """
    workbook = openpyxl.Workbook()
    sheet = workbook.active
    sheet.title = sheet_name
    # Header columns: author, time, like count, floor number, comment text.
    for col, title in enumerate(('作者', '时间', '点赞数', '楼数', '评论'), start=1):
        sheet.cell(row=1, column=col, value=str(title))
    workbook.save(path)
    print("xlsx格式表格写入数据成功!")

def write_excel_xlsx(jsondata, count, path='E:/EmotionTest/weibo.xlsx'):
    """Append one page of comments to the workbook made by write_excel_title.

    Parameters
    ----------
    jsondata : dict or None — payload returned by ``get_page()``; a failed
               fetch yields None and the page is skipped instead of crashing
               (the original raised AttributeError here).
    count : number of data rows already written; the first data row goes to
            spreadsheet row ``count + 2`` (row 1 is the header).
    path : workbook to append to; must already exist.
    """
    if not jsondata:
        print("没有获取到数据,跳过本页")
        return
    # The comment list lives at payload['data']['data']; either level may be
    # missing when the API rate-limits the session.
    datas = (jsondata.get('data') or {}).get('data') or []
    workbook = load_workbook(path)
    sheet = workbook.active
    row = count + 2  # +1 for 1-based rows, +1 to skip the header row
    for data in datas:
        sheet.cell(column=1, row=row, value=str(data.get("user").get("screen_name")))
        sheet.cell(column=2, row=row, value=str(data.get("created_at")))
        sheet.cell(column=3, row=row, value=str(data.get("like_count")))
        sheet.cell(column=4, row=row, value=str(data.get("floor_number")))
        # The comment text arrives as an HTML fragment; strip the tags.
        comment = BeautifulSoup(data.get("text"), 'lxml').get_text()
        sheet.cell(column=5, row=row, value=str(comment))
        row += 1
    workbook.save(path)
    print("xlsx格式表格写入数据成功!")
        
        
        
# ---- crawl driver ---------------------------------------------------------
maxpage = 5   # number of comment pages to fetch
m_id = 0      # pagination cursor: 0 requests the first page
id_type = 0
write_excel_title()
i = 0
for page in range(0, maxpage):
    print(page)
    jsondata = get_page(m_id, id_type)
    write_excel_xlsx(jsondata, i * 20)  # each page holds up to 20 comments
    results = parse_page(jsondata)
    if not results:
        # Originally `results['max_id']` raised TypeError when parse_page
        # returned None (failed fetch / rate limit); stop cleanly instead.
        print('未能解析下一页游标,提前结束')
        break
    time.sleep(1)  # throttle requests to avoid being blocked
    m_id = results['max_id']
    id_type = results['max_id_type']
    i = i + 1

 

你可能感兴趣的:(技术点)