Crawling Weibo Data with Python
I. Requirements
Crawl the Sina Weibo posts of a given user. For each post, collect: the post id, the publication time, the body text (plain text only, markup stripped), the repost count, the comment count, and the like count.
The fields we need from the returned JSON structure are:
data
  cards
    mblog
      id               # post id
      created_at       # publication time
      text             # body text
      reposts_count    # repost count
      comments_count   # comment count
      attitudes_count  # like count
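For orientation, a single returned item roughly follows the shape below (a minimal sketch; the values are invented for illustration, and the real payload carries many more fields than the ones we keep):

# Illustrative shape of one getIndex response (values made up for illustration only).
sample_response = {
    'data': {
        'cards': [
            {
                'mblog': {
                    'id': '4400000000000000',             # post id
                    'created_at': '08-30',                # publication time
                    'text': '<span>post body ...</span>', # HTML fragment of the body
                    'reposts_count': 12,                  # reposts
                    'comments_count': 34,                 # comments
                    'attitudes_count': 56                 # likes
                }
            }
        ]
    }
}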
II. Implementation
Make sure the requests and pyquery libraries are installed (xlwt is also needed for the Excel export in step 2):
pip install requests
pip install pyquery
pip install xlwt
1) Crawl the Weibo data and write it to weibo.txt. Code walkthrough:
import json
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq

host = 'm.weibo.cn'
base_url = 'https://%s/api/container/getIndex?' % host
user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1 wechatdevtools/0.7.0 MicroMessenger/6.3.9 Language/zh_CN webview/0'

headers = {
    'Host': host,
    'Referer': 'https://m.weibo.cn/u/XXX',
    'User-Agent': user_agent
}
# Fetch one page of results
def get_single_page(page):
    params = {
        'type': 'uid',
        'value': XXX,
        'containerid': YYY,
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Request failed', e.args)
# Parse the JSON returned for one page
def parse_page(json_data):
    items = json_data.get('data').get('cards')
    for item in items:
        item = item.get('mblog')
        if item:
            data = {
                'id': item.get('id'),
                'created_at': item.get('created_at'),
                'text': pq(item.get('text')).text(),  # keep only the plain text of the body
                'attitudes': item.get('attitudes_count'),
                'comments': item.get('comments_count'),
                'reposts': item.get('reposts_count')
            }
            yield data
if __name__ == '__main__':
    fw = open('./weibo.txt', 'w', encoding='UTF-8')  # write results to a text file
    for page in range(1, 11):  # crawl the first ten pages
        json_r = get_single_page(page)
        results = parse_page(json_r)
        for result in results:
            print(result)
            fw.write(json.dumps(result, ensure_ascii=False))
            fw.write('\n')
    fw.close()
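A note on the text field: the API returns it as an HTML fragment (links, emoji tags and so on), and pq(item.get('text')).text() keeps only the visible text. A tiny illustration with a made-up fragment:

from pyquery import PyQuery as pq

# The raw 'text' field is an HTML fragment; .text() strips the markup
raw = '<span class="surl-text">Hello <a href="#">world</a></span>'
print(pq(raw).text())  # prints the plain text with the tags removed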
The XXX (value) and YYY (containerid) placeholders come from the page of the Weibo account you want to crawl: open it in a browser, right-click -> View Page Source, and look up the corresponding values. value is the value of oid, and containerid is the value of page_id.
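If you prefer to pull these two values out with code instead of reading the page source by hand, a rough sketch follows. It assumes the user page still embeds assignments such as $CONFIG['oid']='...' and $CONFIG['page_id']='...' in its HTML; the markup may have changed (and the page may require login), so treat the patterns as an assumption to adjust, not a guaranteed interface.

import re
import requests

def find_ids(user_page_url, headers):
    # Assumption: the page source contains $CONFIG['oid']='...' and $CONFIG['page_id']='...'.
    # Adjust the regular expressions if the markup differs.
    html = requests.get(user_page_url, headers=headers).text
    oid = re.search(r"\$CONFIG\['oid'\]='(\d+)'", html)
    page_id = re.search(r"\$CONFIG\['page_id'\]='(\d+)'", html)
    return (oid.group(1) if oid else None,
            page_id.group(1) if page_id else None)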
Run result:
2) Read the data back from weibo.txt and write it into weibo.xls. Code walkthrough:
import json
import xlwt

def getExcel():
    # Create the Excel workbook and worksheet
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('Weibo')
    # Header row
    worksheet.write(0, 0, label='id')
    worksheet.write(0, 1, label='created_at')
    worksheet.write(0, 2, label='text')
    worksheet.write(0, 3, label='attitudes')
    worksheet.write(0, 4, label='comments')
    worksheet.write(0, 5, label='reposts')
    # Read the JSON lines produced in step 1
    data = []
    with open('./weibo.txt', 'r', encoding='UTF-8') as f:
        for i in f.readlines():
            data.append(json.loads(i))
    # Write each record into the sheet
    # val tracks the row being written; there is probably a cleaner way (see the sketch after this section)
    val = 1
    for list_item in data:
        for key, value in list_item.items():
            if key == "id":
                worksheet.write(val, 0, value)
            elif key == "created_at":
                worksheet.write(val, 1, value)
            elif key == "text":
                worksheet.write(val, 2, value)
            elif key == "attitudes":
                worksheet.write(val, 3, value)
            elif key == "comments":
                worksheet.write(val, 4, value)
            elif key == "reposts":
                worksheet.write(val, 5, value)
        val += 1
    # Save the workbook
    workbook.save('./weibo.xls')
Run result:
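The if/elif chain in getExcel works but is verbose. One cleaner alternative (a sketch only, not part of the original script) is to fix the column order in a list and index into it, using the same dict keys that parse_page produces:

# Sketch of a more compact writer, assuming the dicts produced by parse_page
COLUMNS = ['id', 'created_at', 'text', 'attitudes', 'comments', 'reposts']

def write_rows(worksheet, records):
    # Header row
    for col, name in enumerate(COLUMNS):
        worksheet.write(0, col, name)
    # One row per record, columns in the fixed order above
    for row, record in enumerate(records, start=1):
        for col, name in enumerate(COLUMNS):
            worksheet.write(row, col, record.get(name))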
3) The full, combined code:
# -*- coding: utf-8 -*-
import json
import xlwt
import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq

host = 'm.weibo.cn'
base_url = 'https://%s/api/container/getIndex?' % host
user_agent = 'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) AppleWebKit/601.1.46 (KHTML, like Gecko) Version/9.0 Mobile/13B143 Safari/601.1 wechatdevtools/0.7.0 MicroMessenger/6.3.9 Language/zh_CN webview/0'

headers = {
    'Host': host,
    'Referer': 'https://m.weibo.cn/u/XXX',
    'User-Agent': user_agent
}
# Fetch one page of results
def get_single_page(page):
    params = {
        'type': 'uid',
        'value': XXX,
        'containerid': YYY,
        'page': page
    }
    url = base_url + urlencode(params)
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.json()
    except requests.ConnectionError as e:
        print('Request failed', e.args)
# Parse the JSON returned for one page
def parse_page(json_data):
    items = json_data.get('data').get('cards')
    for item in items:
        item = item.get('mblog')
        if item:
            data = {
                'id': item.get('id'),
                'created_at': item.get('created_at'),
                'text': pq(item.get('text')).text(),  # keep only the plain text of the body
                'attitudes': item.get('attitudes_count'),
                'comments': item.get('comments_count'),
                'reposts': item.get('reposts_count')
            }
            yield data
def getTXT():
    """Crawl the posts and write them to weibo.txt"""
    fw = open('./weibo.txt', 'w', encoding='UTF-8')
    for page in range(1, 11):  # crawl the first ten pages
        json_r = get_single_page(page)
        results = parse_page(json_r)
        for result in results:
            print(result)
            fw.write(json.dumps(result, ensure_ascii=False))
            fw.write('\n')
    fw.close()
def getExcel():
    # Create the Excel workbook and worksheet
    workbook = xlwt.Workbook(encoding='utf-8')
    worksheet = workbook.add_sheet('罗小黑')
    # Header row
    worksheet.write(0, 0, label='id')
    worksheet.write(0, 1, label='created_at')
    worksheet.write(0, 2, label='text')
    worksheet.write(0, 3, label='attitudes')
    worksheet.write(0, 4, label='comments')
    worksheet.write(0, 5, label='reposts')
    # Read the JSON lines produced by getTXT
    data = []
    with open('./weibo.txt', 'r', encoding='UTF-8') as f:
        for i in f.readlines():
            data.append(json.loads(i))
    # Write each record into the sheet
    # val tracks the row being written; there is probably a cleaner way to express this
    val = 1
    for list_item in data:
        for key, value in list_item.items():
            if key == "id":
                worksheet.write(val, 0, value)
            elif key == "created_at":
                worksheet.write(val, 1, value)
            elif key == "text":
                worksheet.write(val, 2, value)
            elif key == "attitudes":
                worksheet.write(val, 3, value)
            elif key == "comments":
                worksheet.write(val, 4, value)
            elif key == "reposts":
                worksheet.write(val, 5, value)
        val += 1
    # Save the workbook
    workbook.save('./weibo.xls')
if __name__ == '__main__':
    getTXT()
    getExcel()