B站有视频讲解,有不懂的地方,可以去看视频
一、手机微博爬虫(一)数据抓取
二、手机微博爬虫(二)解析、保存数据
三、手机微博爬虫(三)数据存到数据库
四、手机微博爬虫(四)词云制作
import requests
import re
import os
class Main():
    """Download the hot comments of one mobile-Weibo post and save them as text.

    Creating an instance runs the whole job: it prompts for a page count and
    a post ID, requests the comment pages from ``url``, cleans the comment
    bodies and appends ``"<screen name> <comment>"`` lines to
    ``weibocomments1.txt`` inside the ``file`` directory.

    Attributes:
        params: Query parameters of the request currently being made.
        list_username: Screen names of the authors of the collected comments.
        list_text: Comment bodies, index-aligned with ``list_username``.
        list_all: ``(screen name, cleaned body)`` pairs built by ``parseData``.
    """

    # Comment endpoint; the query string is supplied per request via params.
    url = 'https://m.weibo.cn/comments/hotflow?'
    # NOTE(review): the cookie below looks like a hard-coded login session.
    # It is kept unchanged so requests are sent exactly as before, but it
    # should be loaded from the environment or a config file instead of
    # living in source control.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.9 Safari/537.36',
        'cookie': 'ALF=1586350518; _T_WM=18754506867; WEIBOCN_FROM=1110006030; MLOGIN=1; SCF=AhxeeyC_-eMbxpwb8KLVrN7-04xwJvc9ATC-09VkEqDz38nXmtqkQbVSt5VvmWZcNldfr8k_Nf8P4SEdVDS_E84.; SUB=_2A25zaEzqDeRhGeBP61oT8SvMwz2IHXVQk1SirDV6PUNbktANLWTikW1NRWvMdQ0ctsSre5zQ_DZmBR8BBLNsxlVv; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWNVTTW1HmFFlVFFRI_IKF55JpX5KMhUgL.FoqpehnEeK-71h22dJLoIE5LxKML12-L12zLxKBLB.2L1-2LxK-LBo.LBoeN1hzpSntt; SUHB=0gHiwhGI-JYGEg; SSOLoginState=1584151738; M_WEIBOCN_PARAMS=oid%3D4481983306764793%26luicode%3D20000061%26lfid%3D4481983306764793%26uicode%3D20000061%26fid%3D4481983306764793; XSRF-TOKEN=c1db22',
        'Referer': 'https://m.weibo.cn/detail/4481983306764793'
    }
    # Directory the output file is written into.
    file = './'
    # Seconds to wait on a request before giving up instead of hanging.
    timeout = 10

    # Separator characters stripped from comment bodies. The commas inside
    # the character class are members of it, so commas are stripped as well.
    _SEPARATORS = re.compile(r'''[ ,/,",',=,.,\],\[,\-,_,;,:,?,%,&,+]''')
    # One HTML tag, attributes included.
    _TAG = re.compile(r'<[^>]*>')
    # An @-mention: "@" plus the run of word characters that follows it.
    _MENTION = re.compile(r'@\w*')

    def __init__(self):
        """Prompt for the page count and post ID, then scrape, clean and save.

        Raises:
            ValueError: If the page count typed at the prompt is not an integer.
        """
        # Per-instance containers: as class attributes these lists would be
        # shared (and keep growing) across every instance of the class.
        self.params = {}
        self.list_username = []
        self.list_text = []
        self.list_all = []

        num_page = int(input('请输入您要爬取的微博内容的页数:\n'))
        ID_MID = input('请输入您要爬取的微博内容的ID:\n')
        # Cursor for the next page; ('0', '0') requests the first page.
        return_info = ('0', '0')
        try:
            for i in range(num_page):
                print(f'正在爬取第{i + 1}页数据')
                self.params = {
                    'id': ID_MID,
                    'mid': ID_MID,
                    'max_id': return_info[0],
                    'max_id_type': return_info[1]
                }
                return_info = self.get_max_id()
                # A cursor of 0 would make the next request identical to the
                # very first one and download the first page again.
                # Presumably the API uses 0 to signal "no further pages" --
                # TODO confirm.
                if str(return_info[0]) == '0':
                    break
        finally:
            # Parse and write exactly once, after paging, so no comment is
            # written twice; doing it in ``finally`` still saves the pages
            # collected so far when a request fails part-way through.
            self.parseData()
            self.writeData()

    def get_max_id(self):
        """Fetch one page of comments and remember its authors and bodies.

        Sends ``self.params`` to ``self.url`` and appends every comment's text
        and author screen name to ``list_text`` / ``list_username``.

        Returns:
            tuple: ``(max_id, max_id_type)`` taken from the response, i.e. the
            cursor values to send with the next request.

        Raises:
            KeyError: If the JSON response lacks one of the expected keys.
        """
        res = requests.get(
            url=self.url,
            headers=self.headers,
            params=self.params,
            timeout=self.timeout,
        ).json()['data']
        for comment in res['data']:
            self.list_text.append(comment['text'])
            self.list_username.append(comment['user']['screen_name'])
        return res['max_id'], res['max_id_type']

    def parseData(self):
        """Clean every collected comment body and pair it with its author.

        Tags and @-mentions are removed *before* the separator characters.
        Stripping separators first glues a mention to the words after it
        (``@name:text`` becomes ``@nametext``), and the mention pattern would
        then delete the comment text along with the name.

        ``list_text`` is updated in place; ``list_all`` is rebuilt as a list of
        ``(screen name, cleaned body)`` tuples.
        """
        for index, text in enumerate(self.list_text):
            text = self._TAG.sub('', text)
            text = self._MENTION.sub('', text)
            self.list_text[index] = self._SEPARATORS.sub('', text)
        self.list_all = list(zip(self.list_username, self.list_text))

    def writeData(self):
        """Append every ``list_all`` pair to ``weibocomments1.txt`` in ``file``.

        Each record becomes one ``"<screen name> <comment>"`` line. Nothing is
        created or written when there are no records. Missing directories in
        ``file`` are created first.
        """
        if not self.list_all:
            return
        os.makedirs(self.file, exist_ok=True)
        path = os.path.join(self.file, 'weibocomments1.txt')
        # Open the file once for the whole batch instead of once per record.
        with open(path, 'a', encoding='utf-8') as fp:
            fp.writelines(
                f'{username} {text}\n' for username, text in self.list_all
            )
# Sample Weibo post ID (the same ID appears in the Referer header above);
# presumably meant to be typed at the ID prompt.
# 4481983306764793
if __name__ == '__main__':
    # Constructing Main performs the whole run; the instance is not kept.
    Main()