爬虫-网易云评论
# post请求体
params: HMtP7KwWWgctb71g3T8v7b5SzlO1qN5JDI6WC8AqPYoakAYrpw1hm99wsn0Hp6AfP1ZNlp494Z+4XGXKiYyEXYTSoHvYTVhYpgDxUuSBdgNcZE0IXkkoA5YUEnQf2ESWO3bmt09k2ogKLOoQNWxEnXRewB0Oy2lPEdo52CVVNkUTMMd/gVPq4Zhj4LUvyjDh
encSecKey: 83e7a7f8bf53186b5c224d2732d86fb41a6366b8fb3c61b7dd4e630f6c5199e5c98732ab6fef399a8b4d08ece5a338e132c7cbc4a86a7f2d8c768431b408671acac04d05010406784afad5c36a904a784478bbc5a1fb29e46df26dc49fea70e6015d1a5409dec5a2f1bc0c997ffc3642177034138d7c2b9c872b35b81e95da7d
# js文件中
var bLq2x = window.asrsea(JSON.stringify(i8a), bvc9T(["流泪", "强"]), bvc9T(TQ2x.md), bvc9T(["爱心", "女孩", "惊恐", "大笑"]));
e8e.data = k8c.cy9p({
params: bLq2x.encText,
encSecKey: bLq2x.encSecKey
})
# 进一步查找 window.asrsea 的实现（应对应下面的函数 d）
function() {
/**
 * Build a random alphanumeric string.
 *
 * @param {number} a - Number of characters to generate.
 * @returns {string} A string of `a` characters, each picked with
 *   Math.random() from the 62-character set [a-zA-Z0-9].
 */
function a(a) {
  var alphabet = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789";
  var picked = [];
  var count = 0;
  while (count < a) {
    // Scale the [0, 1) random number to a valid index into the alphabet.
    var index = Math.floor(Math.random() * alphabet.length);
    picked.push(alphabet.charAt(index));
    count += 1;
  }
  return picked.join("");
}
/**
 * AES encryption (CBC mode, fixed IV "0102030405060708").
 *
 * @param {string} a - Plain text to encrypt (parsed as UTF-8).
 * @param {string} b - Key text (parsed as UTF-8).
 * @returns {string} String form of the CryptoJS cipher result
 *   (presumably Base64 - confirm against the CryptoJS build in use).
 */
function b(a, b) {
  var key = CryptoJS.enc.Utf8.parse(b);
  var options = {
    iv: CryptoJS.enc.Utf8.parse("0102030405060708"),
    mode: CryptoJS.mode.CBC
  };
  var plainText = CryptoJS.enc.Utf8.parse(a);
  var cipherResult = CryptoJS.AES.encrypt(plainText, key, options);
  return cipherResult.toString();
}
/**
 * RSA encryption through the page's RSAKeyPair / encryptedString helpers.
 *
 * @param {string} a - Text to encrypt.
 * @param {string} b - First RSAKeyPair argument (the public exponent, per the
 *   values recorded in these notes: "010001").
 * @param {string} c - Third RSAKeyPair argument (the modulus, as a hex string).
 * @returns {*} Whatever encryptedString(keyPair, a) returns.
 */
function c(a, b, c) {
  // 131 was annotated by the author as the hex-digit capacity needed for n.
  setMaxDigits(131);
  // The second argument (left empty) was annotated by the author as the "d" key.
  var keyPair = new RSAKeyPair(b, "", c);
  return encryptedString(keyPair, a);
}
/**
 * Produce the encrypted request payload.
 *
 * The message is AES-encrypted twice: first with the fixed key `g`, then with
 * a freshly generated random 16-character key. That random key is in turn
 * RSA-encrypted so it can be sent along as `encSecKey`.
 *
 * @param {string} d - Message to encrypt.
 * @param {string} e - RSA exponent passed through to c().
 * @param {string} f - RSA modulus passed through to c().
 * @param {string} g - Key for the first AES pass.
 * @returns {{encText: string, encSecKey: *}} The doubly AES-encrypted text and
 *   the RSA-encrypted random key.
 */
function d(d, e, f, g) {
  var h = {};
  var i = a(16); // random key for the second AES pass
  h.encText = b(d, g); // first AES pass (msg, key)
  h.encText = b(h.encText, i); // second AES pass
  h.encSecKey = c(i, e, f); // RSA-encrypt the random key
  return h;
}
/**
 * RSA-encrypt the concatenation of two values.
 *
 * @param {string} a - First part of the text.
 * @param {string} b - RSA exponent passed through to c().
 * @param {string} d - RSA modulus passed through to c().
 * @param {string} e - Second part of the text, appended to `a`.
 * @returns {{encText: *}} Object holding the result of c(a + e, b, d).
 */
function e(a, b, d, e) {
  var combined = a + e;
  return { encText: c(combined, b, d) };
}
d = '{"csrf_token":""}', e = "010001", f = "00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7", g = "0CoJUm6Qyw8W8jud"
# Python 代码
'''爬取指定歌曲的评论信息
点入歌曲详情页面,通过以下链接取出评论
/weapi/v1/resource/comments/R_SO_4_254574?csrf_token= HTTP/1.1
'''
import base64
import random
from math import floor, ceil
from multiprocessing import Pool
import jieba
from Crypto.Cipher import AES
import codecs
import requests
from wordcloud import WordCloud
class CommentSpider(object):
    """Crawl the comments of one NetEase Cloud Music song and render a word cloud.

    The request body is encrypted the same way the site's JavaScript does it:
    the JSON-like message is AES-CBC encrypted twice (first with a fixed key,
    then with a random 16-character key) and the random key itself is
    RSA-encrypted and sent along as ``encSecKey``.

    Comments are appended to ``<song_name>.txt`` and the word cloud is written
    to ``<song_name>.png``.
    """

    # Comments per page; must agree with the ``limit:"20"`` field of the payload.
    PAGE_SIZE = 20

    # Alphabet the random AES key is drawn from.
    _CHARSET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"

    def __init__(self, song_name, song_id):
        """
        :param song_name: name used for the output ``.txt`` / ``.png`` files.
        :param song_id: song id (as a string) appended to the comment URL.
        """
        self.song_name = song_name
        self.song_id = song_id
        self.headers = {'Host': 'music.163.com',
                        'Referer': 'http://music.163.com/',
                        'Upgrade-Insecure-Requests': '1',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                      'Chrome/66.0.3359.181 Safari/537.36'
                        }

    def generate_random_string(self, length):
        """Return ``length`` characters picked at random from ``_CHARSET``."""
        return ''.join(random.choice(self._CHARSET) for _ in range(length))

    @staticmethod
    def _pkcs7_pad(data, block_size=16):
        """Pad ``data`` (bytes) to a multiple of ``block_size`` with PKCS#7 padding."""
        pad_len = block_size - len(data) % block_size
        return data + bytes([pad_len]) * pad_len

    def aes_encrypt(self, msg, key):
        """AES-CBC encrypt ``msg`` with ``key`` and return the Base64 text.

        :param msg: plain text (str).
        :param key: AES key (str, encoded as UTF-8).
        :return: Base64-encoded ciphertext as str.
        """
        iv = '0102030405060708'
        # Pad the encoded bytes, not the str: a multi-byte character would
        # otherwise leave the data misaligned with the 16-byte block size.
        padded = self._pkcs7_pad(msg.encode('utf8'))
        cipher = AES.new(key.encode('utf8'), AES.MODE_CBC, iv.encode('utf8'))
        return base64.b64encode(cipher.encrypt(padded)).decode('utf8')

    def rsa_encrypt(self, random_string, key, f):
        """Textbook-RSA encrypt ``random_string``.

        :param random_string: text to encrypt.
        :param key: public exponent as a hex string.
        :param f: modulus as a hex string.
        :return: lowercase hex ciphertext, left-padded with zeros to 256 characters.
        """
        # The string is reversed before being read as a big-endian integer
        # (presumably to mirror the byte order of the site's JS RSA helper).
        text = random_string[::-1].encode('utf-8')
        # Three-argument pow performs modular exponentiation without ever
        # materialising the enormous intermediate power.
        sec_key = pow(int.from_bytes(text, 'big'), int(key, 16), int(f, 16))
        return format(sec_key, 'x').zfill(256)

    def get_params(self, page):
        """Build the encrypted form fields for comment page ``page`` (1-based).

        :return: tuple ``(encText, encSecKey)``.
        """
        offset = (page - 1) * self.PAGE_SIZE
        # Arguments of the JS "d" function, found by setting a breakpoint there.
        msg = '{rid:"", offset:"%s", total:"%s", limit:"20", csrf_token:""}' % (offset, 'false')
        key = '0CoJUm6Qyw8W8jud'
        f = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a87' \
            '6aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9' \
            'd05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b' \
            '8e289dc6935b3ece0462db0a22b8e7'
        e = '010001'
        # Random 16-character string used as the key of the second AES pass.
        str_16 = self.generate_random_string(16)
        # First AES pass with the fixed key.
        first_aes = self.aes_encrypt(msg, key)
        # Second AES pass with the random key.
        encText = self.aes_encrypt(first_aes, str_16)
        # RSA-encrypt the random key to obtain encSecKey.
        encSecKey = self.rsa_encrypt(str_16, e, f)
        return encText, encSecKey

    @staticmethod
    def _page_count(total, page_size=20):
        """Return how many pages of ``page_size`` items are needed for ``total`` items."""
        return max(ceil(total / page_size), 0)

    def _comment_url(self):
        """Return the comment endpoint URL for this song."""
        return 'https://music.163.com/weapi/v1/resource/comments/R_SO_4_' + self.song_id + '?csrf_token='

    def _post_page(self, page):
        """POST the encrypted request for comment page ``page`` and return the response."""
        params, encSecKey = self.get_params(page)
        # NOTE(review): verify=False disables TLS certificate verification;
        # kept as in the original, but consider removing it.
        return requests.post(self._comment_url(), data={'params': params, 'encSecKey': encSecKey},
                             headers=self.headers, verify=False)

    def get_comment(self, song_data):
        """Fetch one page of comments and append them to ``<song_name>.txt``.

        :param song_data: sequence ``[song_name, song_id, page]``.
        """
        page = song_data[2]
        res = self._post_page(page)
        if res.status_code == 200:
            print('正在爬取第%s页的内容' % page)
            comments = res.json()['comments']
            with open(song_data[0] + '.txt', 'a', encoding='utf-8') as f:
                for comment in comments:
                    f.write(comment['content'] + '\n')
        else:
            print('爬取第%s页失败' % page)

    def make_wordcloud(self, file_name):
        """Render ``<file_name>.txt`` as a word cloud saved to ``<file_name>.png``."""
        with open('%s.txt' % file_name, 'r', encoding='utf-8') as f:
            txt = f.read()
        # Segment with jieba and join the tokens with spaces: joining with an
        # empty string would just rebuild the unsegmented text.
        text = ' '.join(jieba.cut(txt))
        wc = WordCloud(
            font_path="simhei.ttf",  # must be a font installed on the machine, e.g. C:\Windows\Fonts
            width=1200,
            height=800,
            max_words=100,
            max_font_size=200,
            min_font_size=10
        )
        wc.generate(text)
        wc.to_file(file_name + '.png')

    def run(self):
        """Fetch the total comment count, crawl every page with a process pool,
        then build the word cloud."""
        res = self._post_page(1)
        # Number of pages = total comments / page size, rounded up.
        # NOTE(review): assumes 'total' counts exactly the comments that are
        # paginated by offset - confirm against the API response.
        page_count = self._page_count(res.json()['total'], self.PAGE_SIZE)
        if page_count == 0:
            # Nothing to crawl, so no comment file would exist for the word cloud.
            print('没有可爬取的评论')
            return
        song_data = [(self.song_name, self.song_id, page) for page in range(1, page_count + 1)]
        # The context manager releases the worker processes once map() returns.
        with Pool(processes=4) as pool:
            pool.map(self.get_comment, song_data)
        # All pages have been written to the file; generate the word cloud.
        self.make_wordcloud(self.song_name)
if __name__ == '__main__':
    # Crawl every comment page of the hard-coded song and build its word cloud.
    spider = CommentSpider('太多', '1339315554')
    spider.run()