Python爬取微博数据

Python爬取微博数据

一、需求说明

       抓取某一用户的新浪微博数据,抓取的内容包括:id号、微博发布的时间、正文(仅提取文字)、转发数、评论数、点赞数。

在返回的json数据结构中,我们需要的是以下字段:

data
  cards
    mblog
      id #id号
      created_at # 发布时间
      text # 正文
      reposts_count # 转发数
      comments_count # 评论数
      attitudes_count # 点赞数

二、代码实现

        确保安装了requests、pyquery、xlwt和openpyxl库(后两者在第3)部分的完整代码中被导入)。

pip install pyquery
pip install requests
pip install xlwt
pip install openpyxl

       1)爬取微博数据写入weibo.txt文本中,具体代码解析:

import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq

# Host of the mobile Weibo site; the API URL below is built from it.
host = 'm.weibo.cn'
base_url = 'https://%s/api/container/getIndex?' % host
# Header *value* only. The header name comes from the 'User-Agent' key of
# `headers`, so it must not be repeated at the start of this string.
user_agent = (
    'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) '
    'AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 '
    'Mobile/13B143 Safari/601.1 wechatdevtools/0.7.0 '
    'MicroMessenger/6.3.9 Language/zh_CN webview/0'
)

# Headers sent with every API request.
# NOTE(review): 'XXX' in the Referer is a placeholder -- presumably the id of
# the account being scraped; replace it before running.
headers = {
    'Host': host,
    'Referer': 'https://m.weibo.cn/u/XXX',
    'User-Agent': user_agent
}


# Fetch the data one page at a time
def get_single_page(page, timeout=10):
    """Request one page of the container API and return its decoded JSON.

    Args:
        page: Value sent as the ``page`` query parameter.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection cannot block the crawl indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        ``None`` when the status is not 200, the request fails, or the body
        is not valid JSON.
    """
    # XXX and YYY are placeholders: replace them with the target account's
    # `oid` and `page_id` values before running.
    params = {
        'type': 'uid',
        'value': XXX,
        'containerid': YYY,
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # RequestException is the base class of ConnectionError and Timeout,
        # so read timeouts are reported here as well.
        print('抓取错误', e.args)
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError as e:
        # Raised when the body is not valid JSON.
        print('抓取错误', e.args)
        return None


# Parse the JSON data returned for one page
def parse_page(json):
    """Yield one flat record for every card that carries an ``mblog`` entry.

    Args:
        json: Decoded page payload (a dict), or ``None`` when the request
            failed. The name shadows the stdlib ``json`` module inside this
            function; it is kept so existing callers keep working.

    Yields:
        dict with the keys ``id``, ``created_at``, ``text``, ``attitudes``,
        ``comments`` and ``reposts``. ``text`` is passed through PyQuery so
        that only the textual content is kept.
    """
    if not json:
        # Nothing was fetched for this page; produce no records instead of
        # failing on None.
        return
    # Either level may be missing or null in the payload.
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            # Cards without an 'mblog' entry are skipped.
            continue
        yield {
            'id': mblog.get('id'),
            'created_at': mblog.get('created_at'),
            'text': pq(mblog.get("text")).text(),  # keep only the text
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count')
        }


if __name__ == '__main__':
    # Imported here because the import section of this listing does not
    # bring in the json module that json.dumps below needs.
    import json

    # One JSON object per line; `with` closes the file even if a page fails.
    with open('./weibo.txt', 'w', encoding='UTF-8') as fw:
        for page in range(1, 11):  # pages 1..10, i.e. the first ten pages
            json_r = get_single_page(page)
            if not json_r:
                # The request for this page failed; move on to the next one.
                continue
            for result in parse_page(json_r):
                print(result)
                fw.write(json.dumps(result, ensure_ascii=False) + '\n')

   其中代码里的占位符XXX(value)与YYY(containerid)需要替换为真实数据:进入所要爬取的微博用户主页,右键->查看网页源代码,即可在源代码中找到对应的值。

  

  value = oid对应值,containerid = page_id对应值。

运行结果:

        2)读取weibo.txt文本中数据写入weibo.xls中,具体代码解析:

def getExcel():
    """Convert ./weibo.txt (one JSON object per line) into ./weibo.xls.

    Row 0 of the 'Weibo' sheet holds the column titles and every following
    row holds one record. Keys without a column are ignored, and blank lines
    in the input file are skipped.
    """
    # Column index of every exported field.
    columns = {
        'id': 0,
        'created_at': 1,
        'text': 2,
        'attitudes': 3,
        'comments': 4,
        'reposts': 5,
    }

    # Create the workbook and its single sheet
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('Weibo')

    # Header row
    for title, col in columns.items():
        worksheet.write(0, col, label=title)

    # Read the JSON lines; a blank line would make json.loads raise, so
    # such lines are filtered out.
    with open('./weibo.txt', 'r', encoding='UTF-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Data rows start at 1 because row 0 is the header.
    for row, record in enumerate(data, start=1):
        for key, value in record.items():
            col = columns.get(key)
            if col is not None:
                worksheet.write(row, col, value)

    # Save
    workbook.save('./weibo.xls')

        运行结果:

       

3)代码整合完整实现如下:

# -*- coding: utf-8 -*-
import xlwt
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
from openpyxl import Workbook,load_workbook
import json

# Host of the mobile Weibo site; the API URL below is built from it.
host = 'm.weibo.cn'
base_url = 'https://%s/api/container/getIndex?' % host
# Header *value* only. The header name comes from the 'User-Agent' key of
# `headers`, so it must not be repeated at the start of this string.
user_agent = (
    'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) '
    'AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 '
    'Mobile/13B143 Safari/601.1 wechatdevtools/0.7.0 '
    'MicroMessenger/6.3.9 Language/zh_CN webview/0'
)

# Headers sent with every API request.
# NOTE(review): 'XXX' in the Referer is a placeholder -- presumably the id of
# the account being scraped; replace it before running.
headers = {
    'Host': host,
    'Referer': 'https://m.weibo.cn/u/XXX',
    'User-Agent': user_agent
}

# Fetch the data one page at a time
def get_single_page(page, timeout=10):
    """Request one page of the container API and return its decoded JSON.

    Args:
        page: Value sent as the ``page`` query parameter.
        timeout: Seconds handed to ``requests.get`` so that a stalled
            connection cannot block the crawl indefinitely.

    Returns:
        The decoded JSON body when the server answers with HTTP 200.
        ``None`` when the status is not 200, the request fails, or the body
        is not valid JSON.
    """
    # XXX and YYY are placeholders: replace them with the target account's
    # `oid` and `page_id` values before running.
    params = {
        'type': 'uid',
        'value': XXX,
        'containerid': YYY,
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as e:
        # RequestException is the base class of ConnectionError and Timeout,
        # so read timeouts are reported here as well.
        print('抓取错误', e.args)
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError as e:
        # Raised when the body is not valid JSON.
        print('抓取错误', e.args)
        return None

# Parse the JSON data returned for one page
def parse_page(json):
    """Yield one flat record for every card that carries an ``mblog`` entry.

    Args:
        json: Decoded page payload (a dict), or ``None`` when the request
            failed. The name shadows the stdlib ``json`` module inside this
            function; it is kept so existing callers keep working.

    Yields:
        dict with the keys ``id``, ``created_at``, ``text``, ``attitudes``,
        ``comments`` and ``reposts``. ``text`` is passed through PyQuery so
        that only the textual content is kept.
    """
    if not json:
        # Nothing was fetched for this page; produce no records instead of
        # failing on None.
        return
    # Either level may be missing or null in the payload.
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            # Cards without an 'mblog' entry are skipped.
            continue
        yield {
            'id': mblog.get('id'),
            'created_at': mblog.get('created_at'),
            'text': pq(mblog.get("text")).text(),  # keep only the text
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count')
        }

def getTXT():
    """Scrape pages 1-10 and write every record to ./weibo.txt.

    Each record is written as one JSON object per line, with non-ASCII
    characters kept as-is, and is also printed. Pages whose request failed
    are skipped.
    """
    # `with` guarantees the file is closed even if a page raises midway.
    with open('./weibo.txt', 'w', encoding='UTF-8') as fw:
        for page in range(1, 11):  # pages 1..10, i.e. the first ten pages
            json_r = get_single_page(page)
            if not json_r:
                # The request for this page failed; move on to the next one.
                continue
            for result in parse_page(json_r):
                print(result)
                fw.write(json.dumps(result, ensure_ascii=False) + '\n')

def getExcel():
    """Convert ./weibo.txt (one JSON object per line) into ./weibo.xls.

    Row 0 of the sheet holds the column titles and every following row holds
    one record. Keys without a column are ignored, and blank lines in the
    input file are skipped.
    """
    # Column index of every exported field.
    columns = {
        'id': 0,
        'created_at': 1,
        'text': 2,
        'attitudes': 3,
        'comments': 4,
        'reposts': 5,
    }

    # Create the workbook and its single sheet
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('罗小黑')

    # Header row
    for title, col in columns.items():
        worksheet.write(0, col, label=title)

    # Read the JSON lines; a blank line would make json.loads raise, so
    # such lines are filtered out.
    with open('./weibo.txt', 'r', encoding='UTF-8') as f:
        data = [json.loads(line) for line in f if line.strip()]

    # Data rows start at 1 because row 0 is the header.
    for row, record in enumerate(data, start=1):
        for key, value in record.items():
            col = columns.get(key)
            if col is not None:
                worksheet.write(row, col, value)

    # Save
    workbook.save('./weibo.xls')

def _run_pipeline():
    """Write the scraped records to the text file, then export it to Excel."""
    getTXT()
    getExcel()


# Run both steps only when the file is executed as a script.
if __name__ == '__main__':
    _run_pipeline()

 

你可能感兴趣的:(Python)