# 链接:https://v.qq.com/detail/j/j6cgzhtkuonf6te.html
# 在 F12 开发者工具的页面源代码中找不到热评相关的内容,推测热评数据是由 JS 动态加载的,只能通过抓包获取。
# 点击"查看更多"(触发网络请求),
# 找到对应的 JS 响应文件,解码后与页面上的热评进行对比,
# 内容完全一致,于是开始研究请求 URL 的规律并加以校验。
# 发现 URL 中 reqnum 字段的值就是返回的评论条数,
# 增大这个值相当于点击"查看更多"。
import json
import re
import urllib.parse
import urllib.request
# Number of reviews requested per page (sent as the "reqnum" query field).
num = "20"

# Request headers attached to every request made through the installed opener.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.22 Safari/537.36 SE 2.X MetaSr 1.0",
    "Content-Type": "application/javascript",
}

# Install a global opener so that plain urllib.request.urlopen() calls carry
# the headers above.
opener = urllib.request.build_opener()
opener.addheaders = list(headers.items())
urllib.request.install_opener(opener)

# Identifier of the video whose reviews are crawled (taken from the page URL).
VIDEO_ID = "j6cgzhtkuonf6te"
REVIEW_URL_PREFIX = "https://video.coral.qq.com/filmreviewr/c/upcomment/"
# JSONP callback and cache-buster values captured from a browser session.
CALLBACK_SUFFIX = "&callback=jQuery11240697380881603586_1573822838280&_=1573822838281"

# Body of a JSON string literal: any run of non-quote characters or
# backslash escapes, so an escaped quote (\") never ends the match early.
_JSON_STRING_BODY = r'(?:[^"\\]|\\.)'

# A title immediately followed by its abstract. Capturing both in one match
# keeps each title paired with its own abstract. A literal trailing "..."
# on the abstract is dropped when present.
REVIEW_PATTERN = re.compile(
    r'"title":"(' + _JSON_STRING_BODY + r'*)","abstract":"('
    + _JSON_STRING_BODY + r'*?)(?:\.\.\.)?"',
    re.S,
)
# The "last" field of the response, used as the cursor for the next page.
CURSOR_PATTERN = re.compile(r'"last":"(.*?)"', re.S)


def build_page_url(cursor="", reqnum=num, vid=VIDEO_ID):
    """Return the review-list URL for one page.

    Args:
        cursor: Value of the previous page's "last" field; empty for the
            first page, in which case no "commentid" parameter is sent.
        reqnum: Number of reviews to request.
        vid: Video identifier placed in the URL path.
    """
    url = REVIEW_URL_PREFIX + vid + "?reqnum=" + str(reqnum)
    if cursor:
        # NOTE(review): the "commentid" paging parameter comes from the URL
        # template that was commented out in the original script - confirm
        # the endpoint still honours it.
        url += "&commentid=" + urllib.parse.quote(cursor, safe="")
    return url + CALLBACK_SUFFIX


def decode_json_string(fragment):
    """Decode the inside of a JSON string literal (e.g. ``\\uXXXX`` escapes).

    Uses the JSON parser instead of ``eval`` so that text received from the
    network can never be executed as Python code.

    Raises:
        ValueError: If ``fragment`` is not a valid JSON string body.
    """
    # strict=False tolerates raw control characters inside the string.
    return json.loads('"' + fragment + '"', strict=False)


def extract_reviews(data):
    """Return ``(title, abstract)`` pairs found in ``data``, still escaped."""
    return REVIEW_PATTERN.findall(data)


def extract_cursor(data):
    """Return the first "last" value in ``data``, or ``None`` if absent."""
    match = CURSOR_PATTERN.search(data)
    return match.group(1) if match else None


def fetch_page(url, timeout=10):
    """Download ``url`` and return the body decoded as UTF-8."""
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode("utf-8")


def main(max_pages=100):
    """Crawl up to ``max_pages`` pages of reviews and print each one."""
    cursor = ""
    for page in range(max_pages):
        print("第"+str(page)+"页")
        data = fetch_page(build_page_url(cursor))
        for raw_title, raw_abstract in extract_reviews(data):
            # Best effort per review: a malformed escape or a console that
            # cannot encode the text (UnicodeError is a ValueError) is
            # reported and the crawl carries on with the next review.
            try:
                title = decode_json_string(raw_title)
                abstract = decode_json_string(raw_abstract)
                print("评论标题是:"+title)
                print("评论内容是:"+abstract)
                print("------")
            except ValueError as err:
                print(err)
        next_cursor = extract_cursor(data)
        # Stop when the server gives no new cursor; otherwise the same page
        # would be requested again and again.
        if not next_cursor or next_cursor == cursor:
            break
        cursor = next_cursor


if __name__ == "__main__":
    main()