Jay Chou's new single《说好不哭》("Won't Cry") has just dropped, and my WeChat Moments feed has been flooded with it ever since. So, while the hype lasts, let's take a look at what listeners are saying about the song in the comments.
Target URL: https://y.qq.com/n/yqq/song/001qvvgF38HVc4.html?ADTAG=baiduald&play=1#comment_box
Open QQ Music, find《说好不哭》, and scroll down: there are plenty of comments. By the time I started crawling there were already more than 8,800 pages, roughly 220,000 comments in total.
Comparing the captured requests across pages, only pagenum and lasthotcommentid change: pagenum is the current comment page number, and lasthotcommentid is the id of the last comment in the previous page's list. Paste the Request URL straight into the browser's address bar and you can see the response is pure JSON; the field we want to scrape is rootcommentcontent. Each API call returns one JSON payload containing 25 comments, so the last comment sits at index 24, i.e. response.json()['comment']['commentlist'][24].
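Before writing the full scraper, it helps to probe the API once by hand. A minimal sketch, assuming the API tolerates this trimmed-down parameter set (the complete captured set is used in the real script below):

import requests

url = "https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg"
# Trimmed-down query parameters; topid is the song id captured above.
params = {
    "format": "json", "inCharset": "utf8", "outCharset": "GB2312",
    "reqtype": "2", "biztype": "1", "topid": "237773700", "cmd": "8",
    "pagenum": "0", "pagesize": "25",
}
res = requests.get(url, params=params, headers={"User-Agent": "Mozilla/5.0"})
comment_list = res.json()['comment']['commentlist']
print(len(comment_list))                      # 25 comments per page
print(comment_list[24]['commentid'])          # feeds the next page's lasthotcommentid
print(comment_list[0]['rootcommentcontent'])  # the text we actually want

The full scraper below wraps this logic in a class, pulls proxies from Redis, and hands the file writes to a thread pool.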
"""
Scrape the comments on Jay Chou's new song《说好不哭》.
Uses a thread pool (which didn't noticeably speed things up) and stores results directly in a txt file.
My self-built pool of free proxy IPs only holds about 500 addresses, so I just for-looped over 500 pages and scraped 12,500 comments.
"""
import datetime
import re
from concurrent.futures import ThreadPoolExecutor  # standard-library thread pool
from threading import RLock

import redis
import requests
from fake_useragent import UserAgent
class ShuoHaoBuKu(object):
    def __init__(self):
        """
        Initialize the Redis connection, thread pool, lock, and output file path.
        """
        self.redis_conn = redis.StrictRedis(host='localhost', port=6379, db=1)
        self.COMMENT_TEXT = "./说好不哭评论.txt"
        self.threadpool = ThreadPoolExecutor(max_workers=50)
        self.lock = RLock()
        self.count = 0  # running total of comments scraped
    def get_proxy(self):
        """
        Pop one proxy from the Redis set and wrap it in a proxies dict for requests.
        :return: e.g. {"http": "http://1.2.3.4:8080"}
        """
        proxy = self.redis_conn.spop("http_proxy", 1)[0].decode("utf-8")  # adjust to match how your proxies are stored
        print(proxy)
        return {
            "http": proxy,
        }
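    # Sketch of how the "http_proxy" Redis set is assumed to be seeded
    # beforehand (the addresses here are placeholders, not real proxies):
    #     r = redis.StrictRedis(host='localhost', port=6379, db=1)
    #     r.sadd("http_proxy", "http://1.2.3.4:8080", "http://5.6.7.8:3128")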
    def start_request(self, url, querystring):
        """
        Call self.get_comment_s() and return the id of the last comment,
        which feeds the next page's lasthotcommentid parameter.
        :param url:
        :param querystring:
        :return:
        """
        last_comment_id = self.get_comment_s(url, querystring)
        return last_comment_id
    def get_comment_s(self, url, querystring):
        """
        Request the comment API with querystring, parse the JSON response,
        and hand the comment list to self.save_comment_list() on the thread pool.
        :return: id of the last comment on this page
        """
        try:
            headers = {
                'cache-control': "no-cache",
                'User-Agent': UserAgent().random,
                'postman-token': "87c8f8ff-3604-0736-8658-b4a9ae179516",
            }
            res = requests.get(url=url, headers=headers, params=querystring, proxies=self.get_proxy())
            res.raise_for_status()
            res.encoding = "UTF-8"
            comment_list = res.json()['comment']['commentlist']
            self.threadpool.submit(self.save_comment_list, comment_list)
            return comment_list[-1]['commentid']  # index 24 on a full page of 25
        except Exception as e:
            print(e)
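    # Caveat: if the request fails, get_comment_s() returns None, so the next
    # page is fetched with lasthotcommentid=None and pagination may derail;
    # retrying the same page with a fresh proxy would be more robust.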
    def save_comment_list(self, comment_list):
        """
        Loop over comment_list, pull each comment's rootcommentcontent,
        and append it to the local txt file.
        :param comment_list:
        :return:
        """
        self.lock.acquire()
        with open(self.COMMENT_TEXT, "a", encoding='utf-8') as f:
            for comment in comment_list:
                content = comment['rootcommentcontent']
                # Strip emoji markup such as "[em]e400823[/em]"; the non-greedy
                # .*? keeps the text between two adjacent emojis intact
                content = re.sub(r"\[em\].*?\[/em\]", '', content)
                f.write(content + "\n")
                self.count += 1
                print(f"\rScraped {self.count} comments", end="")
        self.lock.release()
def main():
    s = ShuoHaoBuKu()
    url = "https://c.y.qq.com/base/fcgi-bin/fcg_global_comment_h5.fcg"
    querystring = {
        "g_tk": "5381", "loginUin": "0", "hostUin": "0", "format": "json", "inCharset": "utf8",
        "outCharset": "GB2312", "notice": "0", "platform": "yqq.json", "needNewCode": "0",
        "cid": "205360772", "reqtype": "2", "biztype": "1", "topid": "237773700", "cmd": "8",
        "needmusiccrit": "0", "pagenum": "0", "pagesize": "25",
        "lasthotcommentid": "song_237773700_642435955_1569067902", "domain": "qq.com", "ct": "24",
        "cv": "10101010",
    }
    for i in range(500):
        querystring['pagenum'] = str(i)
        last_comment_id = s.start_request(url=url, querystring=querystring)
        # Each request must carry the id of the last comment from the previous
        # page's list, so this sequential chaining is the only way I could think to write it.
        querystring['lasthotcommentid'] = last_comment_id
if __name__ == '__main__':
    start = datetime.datetime.now()
    main()
    print("Total time:", datetime.datetime.now() - start)
With the comments saved to ./说好不哭评论.txt, the last step is to cut the text with jieba and render a word cloud:
import jieba
from wordcloud import WordCloud

file = "./说好不哭评论.txt"
img = "shbk.png"
with open(file, 'r', encoding='utf-8') as f:
    text = jieba.cut(f.read())      # segment the Chinese text into words
jieba_text = " ".join(text)         # WordCloud expects space-separated tokens
wordcloud = WordCloud(
    font_path='./font.ttf',         # must point to a font with CJK glyphs
    width=1920,
    height=1080,
).generate(jieba_text)
wordcloud.to_file(img)
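One note on font_path: the font bundled with WordCloud has no CJK glyphs, so for Chinese text it must point to a font file that does (any Chinese-capable TTF works); otherwise every word renders as boxes.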