这首歌是《安河桥》,宇西唱的版本,个人感觉她和宋冬野唱的都很好,十分喜欢。
其中的评论也是十分有趣:
摘取几个看看:
如果直接对 https://music.163.com/#/song?id=416892296 进行爬取,解析不到任何评论内容;参照知乎上的讨论才知道,评论接口的请求参数被网易云加密了。
参照知乎问题 https://www.zhihu.com/question/36081767 进行了一下分析。
可以看到两个参数params和encSecKey都是加密的字符串,那么一定是通过js进行加密的,这里就查看core.js的源代码:
发现 core.js 有几千行,无法逐行阅读,因此只需要搜索我们关心的函数,搜索关键字:encSecKey
整理函数之后如下:
// Request-encryption helpers extracted from core.js. The IIFE publishes
// d as window.asrsea and e as window.ecnonasr.
!function () {
    // Build a random string of `a` characters drawn from [a-zA-Z0-9]
    // (called below with a = 16).
    function a(a) {
        var d, e,
            b = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
            c = "";
        for (d = 0; a > d; d += 1)
            e = Math.random() * b.length, e = Math.floor(e), c += b.charAt(e);
        return c
    }

    // AES-encrypt the plaintext `a` with the key `b` and return it as a string.
    function b(a, b) {
        var c = CryptoJS.enc.Utf8.parse(b),                  // key
            d = CryptoJS.enc.Utf8.parse("0102030405060708"), // fixed IV
            e = CryptoJS.enc.Utf8.parse(a),                  // plaintext
            f = CryptoJS.AES.encrypt(e, c, {iv: d, mode: CryptoJS.mode.CBC}); // CBC mode
        return f.toString()
    }

    // RSA-encrypt `a` with the key pair built from `b` and `c`.
    function c(a, b, c) {
        var d, e;
        return setMaxDigits(131),
            d = new RSAKeyPair(b, "", c),
            e = encryptedString(d, a)
    }

    // Main entry point that produces the encrypted request fields.
    // `d` is the JSON string; `g` is the key of the first AES pass (the IV
    // "0102030405060708" is fixed inside b); `e` and `f` are handed to c as
    // the RSAKeyPair arguments.
    function d(d, e, f, g) {
        var h = {}, i = a(16);                // i: random 16-character key
        return h.encText = b(d, g),           // first AES pass, key g
            h.encText = b(h.encText, i),      // second AES pass, key i
            h.encSecKey = c(i, e, f), h       // i itself is RSA-encrypted
    }

    // RSA-only variant: encrypts the concatenation a + e.
    function e(a, b, d, e) {
        var f = {};
        return f.encText = c(a + e, b, d), f
    }

    window.asrsea = d, window.ecnonasr = e
}();
具体细节没有完全搞懂,只知道其中用到了AES加密和RSA加密。
。。。
参考:https://www.zhihu.com/question/36081767,实现了能爬取评论的python代码。
EncryptUtil.py
# -*- coding:utf-8 -*-
import os
import base64
import time
from Crypto.Cipher import AES
def createSecretKey(size):
    """Return a random string of ``size`` lowercase hexadecimal characters.

    The characters come from ``os.urandom``. Every byte is rendered as exactly
    two hex digits, so all sixteen digits are equally likely and the code runs
    unchanged on Python 2 and Python 3.

    :param size: number of characters wanted.
    :returns: a ``str`` of length ``size`` made of ``0-9a-f``.
    """
    # bytearray yields integers on both Python 2 and 3, unlike iterating str/bytes.
    random_bytes = bytearray(os.urandom(size))
    # size bytes give 2 * size digits; keep only the first ``size`` of them.
    return ''.join('%02x' % byte for byte in random_bytes)[:size]
def aesEncrypt(text, secKey):
    """Encrypt ``text`` with AES in CBC mode and return it Base64-encoded.

    The input is padded up to a multiple of 16 bytes, where every padding byte
    holds the padding length, and encrypted with the fixed IV
    ``0102030405060708``.

    :param text: plaintext; text strings are encoded as UTF-8 first.
    :param secKey: AES key; text strings are encoded as UTF-8 first.
    :returns: the Base64-encoded ciphertext as a byte string.
    """
    def _as_bytes(value):
        # The cipher works on byte strings, so encode text input exactly once.
        return value if isinstance(value, bytes) else value.encode('utf-8')

    data = _as_bytes(text)
    # Pad on the byte length (not the character count) so non-ASCII text
    # still ends on a block boundary.
    pad = 16 - len(data) % 16
    data += bytes(bytearray([pad]) * pad)
    encryptor = AES.new(_as_bytes(secKey), AES.MODE_CBC, b'0102030405060708')
    return base64.b64encode(encryptor.encrypt(data))
def rsaEncrypt(text, pubKey, modulus):
    """Encrypt ``text`` with unpadded RSA and return 256 hexadecimal digits.

    The text is reversed, its bytes are read as one big-endian integer, and
    that integer is raised to ``pubKey`` modulo ``modulus``.

    :param text: message to encrypt (text strings are encoded as UTF-8).
    :param pubKey: public exponent as a hexadecimal string.
    :param modulus: modulus as a hexadecimal string.
    :returns: the result in lowercase hex, left-padded with zeros to 256 digits.
    """
    reversed_text = text[::-1]
    if not isinstance(reversed_text, bytes):
        reversed_text = reversed_text.encode('utf-8')
    # Fold the bytes into an integer; bytearray yields ints on Python 2 and 3.
    message = 0
    for byte in bytearray(reversed_text):
        message = (message << 8) | byte
    # Three-argument pow() reduces modulo ``modulus`` at every step instead of
    # first materialising the astronomically large message ** exponent.
    cipher = pow(message, int(pubKey, 16), int(modulus, 16))
    return format(cipher, 'x').zfill(256)
def timeStamp(timeNum):
    """Format a millisecond timestamp as local time, ``YYYY-MM-DD HH:MM:SS``.

    :param timeNum: timestamp in milliseconds.
    :returns: the formatted local date and time.
    """
    seconds = float(timeNum / 1000)
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))
main.py
# -*- coding:utf-8 -*-
import EncryptUtil
import json
import requests
import chardet
import time
import sys
# Python 2 only: reload(sys) brings back sys.setdefaultencoding, which is then
# used to make implicit str/unicode conversions use UTF-8 instead of ASCII.
reload(sys)
sys.setdefaultencoding('utf8')
def getUTF8(str):
    """Decode a byte string using the encoding guessed by chardet.

    :param str: the byte string to decode.
    :returns: the decoded text.
    """
    detected = chardet.detect(str).get('encoding')
    # chardet reports None when it cannot guess (for example on empty input);
    # fall back to UTF-8 rather than failing inside decode(None).
    return str.decode(detected or 'utf-8')
# Output file for the crawled comments. It is opened when the module loads;
# Crawler.databaseSave writes to this handle and the script entry closes it.
filename=u"安河桥.txt"
f=open(filename,'w') # file that stores the comments
class Crawler(object):
    """Download the comments of one NetEase Cloud Music song.

    Each request posts an AES-encrypted JSON payload (``params``) together with
    the RSA-encrypted AES key (``encSecKey``) to the song's ``weapi`` comment
    URL, and appends the text of every returned comment to the module-level
    file ``f``.
    """

    def __init__(self,id):
        """Prepare the keys, URL and headers for the song ``id``.

        :param id: numeric song id (anything ``int()`` accepts).
        """
        modulus = '00e0b509f6259df8642dbc35662901477df22677ec152b5ff68ace615bb7b725152b3ab17a876aea8a5aa76d2e417629ec4ee341f56135fccf695280104e0312ecbda92557c93870114af6c9d05c4f7f0c3685b7a46bee255932575cce10b424d813cfe4875d3e82047b97ddef52741d546b8e289dc6935b3ece0462db0a22b8e7'
        # Fixed key of the first AES pass.
        self.nonce = '0CoJUm6Qyw8W8jud'
        pubKey = '010001'
        # Generate a random 16-character key, then RSA-encrypt it for the server.
        self.secKey = EncryptUtil.createSecretKey(16)
        self.encSecKey = EncryptUtil.rsaEncrypt(self.secKey, pubKey, modulus)
        self.musicId = id
        self.requestUrl = "http://music.163.com/weapi/v1/resource/comments/R_SO_4_%d/" % int(id)
        # No hard-coded Content-Length here: the encrypted body changes size on
        # every request and requests fills in the correct value itself.
        # The cookie was copied from a logged-in browser session. Do not reuse
        # this value verbatim; replace it with the cookie of your own login.
        self.headers = {
            'Host': 'music.163.com',
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Origin': 'http://music.163.com',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.84 Safari/537.36',
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept': '*/*',
            'DNT': '1',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
            'Cookie':'Province=010; City=010; UM_distinctid=1602578c1d752a-05caafbbb6b4ed-5d1b3316-100200-1602578c1d8125; __gads=ID=df9439195f8812f3:T=1512457091:S=ALNI_Mbq3NxhcJNKV34hJxpeRl1xHJu_uw; vjuids=-24b9e9c81.1602578f3da.0.916edb1478579; vjlast=1512457172.1512457172.30; _ntes_nnid=4a0115d9d11deb7f4f95440f93d3485a,1512457171947; _ntes_nuid=4a0115d9d11deb7f4f95440f93d3485a; vinfo_n_f_l_n3=f1e95f51a84140be.1.0.1512457171960.0.1512458909433; usertrack=ezq0pVoqLxBf/xF7BigcAg==; _ga=GA1.2.236401197.1512714080; JSESSIONID-WYYY=Y1%2BxxraahUi5J9%2B%2FJ%5CwldEQPDTPuyfQ%2Fk6dv4P%2Bujl1jQI5%5CdnWYMDq%5Cg1%2FEsbOl6ifFsv5a8tulM%5Cfrcc5jzKr2m%5Cnb5k5DkY4g2d7F44rJIAhuQyw73k6yw%2B8wIrsSxUnkXKW7Ok%5C9YEu8g2weGjqhZu4%2BgaEb49z0kz8%2BPSH8b72a%3A1513251636847; _iuqxldmzr_=32; __remember_me=true; MUSIC_U=bd0b51d45f62f43e3c84edd001c989bf487f8e44ab9624124ab3dff983848bd1903b0f8306f156520fc32027149e300f8bafcdfe5ad2b092; __csrf=6c14615855e66814d82b9c1a3bb9bb6f; __utma=94650624.236401197.1512714080.1513249837.1513249837.1; __utmb=94650624.3.10.1513249837; __utmc=94650624; __utmz=94650624.1513249837.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic'
        }

    def getComment(self, offset):
        """Fetch the comments at ``offset``, save them and return the total.

        :param offset: value sent as the ``offset`` field of the request.
        :returns: the ``total`` field of the response as an ``int``.
        :raises requests.RequestException: on a timeout, connection failure or
            HTTP error status.
        """
        text = json.dumps({
            'username': "",  # may be empty
            'password': "",  # may be empty
            'rememberLogin': 'true',
            'offset': offset
        })
        # Two AES passes: first with the fixed nonce, then with the random key.
        encText = EncryptUtil.aesEncrypt(EncryptUtil.aesEncrypt(text, self.nonce), self.secKey)
        data = {
            'params': encText,
            'encSecKey': self.encSecKey
        }
        # A timeout keeps a stalled connection from hanging the crawl forever.
        res = requests.post(self.requestUrl, headers=self.headers, data=data, timeout=10)
        # Surface an HTTP error as such instead of as a confusing JSON failure.
        res.raise_for_status()
        jsonData = res.json()
        self.databaseSave(jsonData)
        return int(jsonData["total"])

    def databaseSave(self, jsonData):
        """Append the text of every comment in ``jsonData`` to the file ``f``.

        Only the ``content`` field is stored, one comment per line.

        :param jsonData: decoded response holding a ``comments`` list.
        """
        for comment in jsonData["comments"]:
            # Write the comment to the output file.
            f.write(comment['content']+'\n')

    def process(self, offset):
        """Crawl from ``offset`` to the end in steps of 10.

        :param offset: first offset to request; ``-1`` means do nothing.
        """
        if offset == -1:
            return
        total = self.getComment(offset)
        # total is an int and must be converted before concatenation.
        print('评论的总数:' + str(total))
        # Stop before ``total`` so no request is issued past the last comment.
        for off in range(offset + 10, total, 10):
            self.getComment(off)
def main(id=416892296):
    """Crawl the comments of the song ``id``, starting at offset 1."""
    # NOTE(review): if the API's offsets are zero-based, starting at 1 skips
    # the first comment; confirm against the endpoint.
    Crawler(id).process(1)
if __name__ == '__main__':
    # Close the output file even when the crawl aborts with an exception,
    # so the comments already written are flushed to disk.
    try:
        main(416892296)
    finally:
        f.close()
糟糕!这里网站识别出了爬虫,禁止继续爬取……
之后发现又可以再次爬取了。
贴上github地址:https://github.com/wu-yy/163music
yes!这是安河桥(宇西)评论生成的词云,大家还是很有情怀的嘛!