利用暑假闲暇时间学习了一下爬虫,也尝试了几个爬取图片的小项目,但感觉爬取图片有些枯燥,就想着试试爬取酷狗音乐。
用的方法可能有点老套,欢迎大家指出,也希望同大家一起学习交流。
首先,打开酷狗官网(https://www.kugou.com/),搜索一下许嵩,
点一下第一首
再右键检查一下,可以看到这样一个请求,它返回歌曲《有何不可》的直链:‘https://webfs.ali.kugou.com/202109122212/147ac6f126a3764f33a9ebaf7b785014/G209/M06/0C/04/cYcBAF55hfOAae55ADqQAB_WtKE603.mp3’
我们再观察该请求:
刷新一下,发现变化的只有最后的时间戳
换一首歌,发现除了时间戳,hash和album_id也发生了变化
所以我们只需要知道hash和album_id,就可以得到歌曲的直链了。
那我们回到刚开始搜索歌曲的界面
然后右键检查,点击network,并刷新,可以看到这样一个请求
该请求会返回歌曲的相关信息,刚好30首歌曲的信息
点开第一个,可以看到我们所需要的hash和album_id
观察该请求
可以猜测时间戳和最后的 signature 是变化的
刷新一下:验证刚刚的猜想
快捷键ctrl+f,搜索signature
点击第一个试试:在该界面快捷键ctrl+f,搜索signature,可以看到生成signature的方法,将该返回数据全部复制粘贴,保存为js文件(这个之后会用到)
右键,open in sources panel
找到signature的位置,设置断点,将o添加到watch
F5刷新网页,可以看到o的值,这个后面会用到。
至此,对网站的分析就差不多结束了。
class kugou(object):
    """Skeleton of the Kugou Music downloader.

    Every method is an empty placeholder here; the bodies are filled in
    step by step later in the article.
    """

    def __init__(self):
        """Prepare the state needed by the other steps (placeholder)."""

    def get_num(self):
        """Ask how many songs to download (placeholder)."""

    def get_time(self):
        """Produce the timestamp sent with the requests (placeholder)."""

    def get_signature(self, keyword: str = '有点甜', time: str = ''):
        """Compute the ``signature`` query parameter (placeholder).

        Args:
            keyword: Search keyword.
            time: Timestamp string used for clienttime/mid/uuid.
        """

    def get_mp3_url(self):
        """Collect the per-song info URLs (placeholder)."""

    def get_mp3(self, url_list):
        """Download the songs behind ``url_list`` (placeholder)."""

    def run(self):
        """Run the whole download flow (placeholder)."""
if __name__ == '__main__':
    # Bind the instance to its own name: `kugou = kugou()` would rebind the
    # class name to the instance and make the class unreachable afterwards.
    spider = kugou()
    spider.run()
首先,想要请求https://complexsearch.kugou.com/v2/search/song?callback=callback123&keyword=%E8%AE%B8%E5%B5%A9&page=1&pagesize=30&bitrate=0&isfuzzy=0&tag=em&inputtype=0&platform=WebFilter&userid=0&clientver=2000&iscorrection=1&privilege_filter=0&srcappid=2919&clienttime=1631458588374&mid=1631458588374&uuid=1631458588374&dfid=-&signature=4198110DA5D11DAFA8F187FA2C97AD52
就必须先获取signature:
def get_signature(self, keyword: str = '有点甜', time: str = ''):
    """Compute the ``signature`` query parameter for the song search request.

    The signed text is the list of ``key=value`` pairs in alphabetical key
    order, wrapped by the same secret string at both ends. The list is
    hashed by ``faultylabs.MD5`` from the saved ``kugou.js`` file.

    Args:
        keyword: Search keyword, exactly as sent in the ``keyword`` parameter.
        time: Timestamp string used for ``clienttime``, ``mid`` and ``uuid``.
            Note that inside this method the name shadows the ``time`` module.

    Returns:
        Whatever ``faultylabs.MD5`` returns for the list (the JS source
        upper-cases its hex digest).

    Raises:
        OSError: If ``kugou.js`` cannot be read from the working directory.
    """
    import json  # local import: only needed to serialise the argument for JS

    secret = "NVPh5oo715z5DIWAeQlhMDsWXXQV4hwt"
    sign = [
        secret,
        "bitrate=0",
        "callback=callback123",
        "clienttime=" + time,
        "clientver=2000",
        "dfid=-",
        "inputtype=0",
        "iscorrection=1",
        "isfuzzy=0",
        "keyword=" + keyword,
        "mid=" + time,
        "page=1",
        "pagesize=30",
        "platform=WebFilter",
        "privilege_filter=0",
        "srcappid=2919",
        "tag=em",
        "userid=0",
        "uuid=" + time,
        secret,
    ]

    # Close the file deterministically instead of leaking the handle.
    with open('kugou.js', encoding='utf-8') as js_file:
        gen = execjs.get().compile(js_file.read())

    # json.dumps yields a valid JavaScript array literal for any keyword;
    # Python's repr() can emit escapes (e.g. \U0001F600) that JS rejects.
    # NOTE(review): faultylabs.MD5 looks like a plain MD5 with upper-case hex
    # output, so hashlib.md5 could presumably replace the JS round-trip --
    # confirm against a captured signature before switching.
    return gen.eval('faultylabs.MD5({})'.format(json.dumps(sign)))
其中,keyword指搜索的关键词,time指生成的时间戳,kugou.js指前面保存的js文件
// faultylabs.MD5(a): hashes `a` and returns the digest as an upper-case hex string.
// `a` may be a string, an array of strings, an array of numbers (used as the byte
// array directly), an ArrayBuffer or a typed array; see the dispatch near the end.
"undefined" == typeof faultylabs && (faultylabs = {}),
faultylabs.MD5 = function(a) {
// b: format `a` as an unsigned 32-bit value, zero-padded to 8 hex digits.
function b(a) {
var b = (a >>> 0).toString(16);
return "00000000".substr(0, 8 - b.length) + b
}
// c: convert every string of the array with k() and concatenate the byte arrays.
function c(a) {
for (var b = [], c = 0; c < a.length; c++)
b = b.concat(k(a[c]));
return b
}
// d: split `a` into 8 bytes, least-significant byte first (used for the bit length).
function d(a) {
for (var b = [], c = 0; 8 > c; c++)
b.push(255 & a),
a >>>= 8;
return b
}
// e: rotate the 32-bit value `a` left by `b` bits.
function e(a, b) {
return a << b & 4294967295 | a >>> 32 - b
}
// f, g, h, i: the four bitwise mixing functions, one per group of 16 steps in o().
function f(a, b, c) {
return a & b | ~a & c
}
function g(a, b, c) {
return c & a | ~c & b
}
function h(a, b, c) {
return a ^ b ^ c
}
function i(a, b, c) {
return b ^ (a | ~c)
}
// j: read a 32-bit little-endian word from byte array `a` at offset `b`.
function j(a, b) {
return a[b + 3] << 24 | a[b + 2] << 16 | a[b + 1] << 8 | a[b]
}
// k: convert a string to bytes; code units above 127 go through
// encodeURIComponent and the resulting %XX escapes are parsed as bytes.
function k(a) {
for (var b = [], c = 0; c < a.length; c++)
if (a.charCodeAt(c) <= 127)
b.push(a.charCodeAt(c));
else
for (var d = encodeURIComponent(a.charAt(c)).substr(1).split("%"), e = 0; e < d.length; e++)
b.push(parseInt(d[e], 16));
return b
}
// l: render its four arguments as hex, last argument first, reversing the
// byte order of each 32-bit word before formatting it with b().
function l() {
for (var a = "", c = 0, d = 0, e = 3; e >= 0; e--)
d = arguments[e],
c = 255 & d,
d >>>= 8,
c <<= 8,
c |= 255 & d,
d >>>= 8,
c <<= 8,
c |= 255 & d,
d >>>= 8,
c <<= 8,
c |= d,
a += b(c);
return a
}
// m: shallow-copy an array-like into a plain Array.
function m(a) {
for (var b = new Array(a.length), c = 0; c < a.length; c++)
b[c] = a[c];
return b
}
// n: add two numbers and keep the low 32 bits.
function n(a, b) {
return 4294967295 & a + b
}
// o: pad the byte array p, process it in 64-byte blocks and return the digest.
function o() {
// a: one step -- shifts the working registers s/t/u/v and folds the mixing
// function value, the step constant and one message word into t.
function a(a, b, c, d) {
var f = v;
v = u,
u = t,
t = n(t, e(n(s, n(a, n(b, c))), d)),
s = f
}
// Padding: append 0x80, zero-fill until the length is 56 mod 64, then append
// the original length in bits as 8 bytes.
var b = p.length;
p.push(128);
var c = p.length % 64;
if (c > 56) {
for (var k = 0; 64 - c > k; k++)
p.push(0);
c = p.length % 64
}
for (k = 0; 56 - c > k; k++)
p.push(0);
p = p.concat(d(8 * b));
// m/o/q/r hold the running state; s/t/u/v are the per-block working copies.
var m = 1732584193
, o = 4023233417
, q = 2562383102
, r = 271733878
, s = 0
, t = 0
, u = 0
, v = 0;
for (k = 0; k < p.length / 64; k++) {
s = m,
t = o,
u = q,
v = r;
// w is the byte offset of the current 64-byte block.
var w = 64 * k;
a(f(t, u, v), 3614090360, j(p, w), 7),
a(f(t, u, v), 3905402710, j(p, w + 4), 12),
a(f(t, u, v), 606105819, j(p, w + 8), 17),
a(f(t, u, v), 3250441966, j(p, w + 12), 22),
a(f(t, u, v), 4118548399, j(p, w + 16), 7),
a(f(t, u, v), 1200080426, j(p, w + 20), 12),
a(f(t, u, v), 2821735955, j(p, w + 24), 17),
a(f(t, u, v), 4249261313, j(p, w + 28), 22),
a(f(t, u, v), 1770035416, j(p, w + 32), 7),
a(f(t, u, v), 2336552879, j(p, w + 36), 12),
a(f(t, u, v), 4294925233, j(p, w + 40), 17),
a(f(t, u, v), 2304563134, j(p, w + 44), 22),
a(f(t, u, v), 1804603682, j(p, w + 48), 7),
a(f(t, u, v), 4254626195, j(p, w + 52), 12),
a(f(t, u, v), 2792965006, j(p, w + 56), 17),
a(f(t, u, v), 1236535329, j(p, w + 60), 22),
a(g(t, u, v), 4129170786, j(p, w + 4), 5),
a(g(t, u, v), 3225465664, j(p, w + 24), 9),
a(g(t, u, v), 643717713, j(p, w + 44), 14),
a(g(t, u, v), 3921069994, j(p, w), 20),
a(g(t, u, v), 3593408605, j(p, w + 20), 5),
a(g(t, u, v), 38016083, j(p, w + 40), 9),
a(g(t, u, v), 3634488961, j(p, w + 60), 14),
a(g(t, u, v), 3889429448, j(p, w + 16), 20),
a(g(t, u, v), 568446438, j(p, w + 36), 5),
a(g(t, u, v), 3275163606, j(p, w + 56), 9),
a(g(t, u, v), 4107603335, j(p, w + 12), 14),
a(g(t, u, v), 1163531501, j(p, w + 32), 20),
a(g(t, u, v), 2850285829, j(p, w + 52), 5),
a(g(t, u, v), 4243563512, j(p, w + 8), 9),
a(g(t, u, v), 1735328473, j(p, w + 28), 14),
a(g(t, u, v), 2368359562, j(p, w + 48), 20),
a(h(t, u, v), 4294588738, j(p, w + 20), 4),
a(h(t, u, v), 2272392833, j(p, w + 32), 11),
a(h(t, u, v), 1839030562, j(p, w + 44), 16),
a(h(t, u, v), 4259657740, j(p, w + 56), 23),
a(h(t, u, v), 2763975236, j(p, w + 4), 4),
a(h(t, u, v), 1272893353, j(p, w + 16), 11),
a(h(t, u, v), 4139469664, j(p, w + 28), 16),
a(h(t, u, v), 3200236656, j(p, w + 40), 23),
a(h(t, u, v), 681279174, j(p, w + 52), 4),
a(h(t, u, v), 3936430074, j(p, w), 11),
a(h(t, u, v), 3572445317, j(p, w + 12), 16),
a(h(t, u, v), 76029189, j(p, w + 24), 23),
a(h(t, u, v), 3654602809, j(p, w + 36), 4),
a(h(t, u, v), 3873151461, j(p, w + 48), 11),
a(h(t, u, v), 530742520, j(p, w + 60), 16),
a(h(t, u, v), 3299628645, j(p, w + 8), 23),
a(i(t, u, v), 4096336452, j(p, w), 6),
a(i(t, u, v), 1126891415, j(p, w + 28), 10),
a(i(t, u, v), 2878612391, j(p, w + 56), 15),
a(i(t, u, v), 4237533241, j(p, w + 20), 21),
a(i(t, u, v), 1700485571, j(p, w + 48), 6),
a(i(t, u, v), 2399980690, j(p, w + 12), 10),
a(i(t, u, v), 4293915773, j(p, w + 40), 15),
a(i(t, u, v), 2240044497, j(p, w + 4), 21),
a(i(t, u, v), 1873313359, j(p, w + 32), 6),
a(i(t, u, v), 4264355552, j(p, w + 60), 10),
a(i(t, u, v), 2734768916, j(p, w + 24), 15),
a(i(t, u, v), 1309151649, j(p, w + 52), 21),
a(i(t, u, v), 4149444226, j(p, w + 16), 6),
a(i(t, u, v), 3174756917, j(p, w + 44), 10),
a(i(t, u, v), 718787259, j(p, w + 8), 15),
a(i(t, u, v), 3951481745, j(p, w + 36), 21),
// Fold this block's result back into the running state.
m = n(m, s),
o = n(o, t),
q = n(q, u),
r = n(r, v)
}
return l(r, q, o, m).toUpperCase()
}
// p: the input as a byte array; q: type name of an unsupported input, if any.
var p = null
, q = null;
// Input dispatch: string -> k(); array of strings -> c(); empty array or array
// of numbers -> used as-is; ArrayBuffer / typed arrays -> copied via m();
// anything else is recorded in q.
return "string" == typeof a ? p = k(a) : a.constructor == Array ? 0 === a.length ? p = a : "string" == typeof a[0] ? p = c(a) : "number" == typeof a[0] ? p = a : q = typeof a[0] : "undefined" != typeof ArrayBuffer ? a instanceof ArrayBuffer ? p = m(new Uint8Array(a)) : a instanceof Uint8Array || a instanceof Int8Array ? p = m(a) : a instanceof Uint32Array || a instanceof Int32Array || a instanceof Uint16Array || a instanceof Int16Array || a instanceof Float32Array || a instanceof Float64Array ? p = m(new Uint8Array(a.buffer)) : q = typeof a : q = typeof a,
// NOTE(review): alert() is a browser global; outside a browser this line
// presumably throws when q is set -- confirm if unsupported input is possible.
q && alert("MD5 type mismatch, cannot process " + q),
o()
}
,
// Immediately-invoked function that defines the request-signing helper `b`
// and exports it (module.exports, AMD define, or the global getInterFacePublic).
function() {
// a: run callback `a` once window.KgMobileCall exists, loading the
// mobilecall script first when it is missing.
function a(a) {
if (window.KgMobileCall)
a && a();
else {
var b = document.createElement("script");
b.src = "https://m3ws.kugou.com/static/js/common/mobilecall_3.0.js",
b.onload = function() {
this.readyState && "loaded" != this.readyState && "complete" != this.readyState || a && a()
}
,
document.body.appendChild(b)
}
}
// b(b, c, d): build the signed parameter object.
//   b - query parameters supplied by the caller
//   c - optional request body (object or string) that is included in the signed text
//   d - either the callback itself, or options {callback, useH5, postType}
// The callback receives the parameter object h with h.signature filled in.
function b(b, c, d) {
b = b || {},
c = c || "",
d = d || {};
var e, f = !1, g = "json";
"function" == typeof d ? e = d : (e = d.callback,
f = d.useH5 || !1,
g = d.postType || "json");
// h: working copy of the caller's query parameters.
var h = {};
for (var i in b)
!h[i] && (h[i] = b[i]);
// j: true when the user agent matches KGBrowser / kugouandroid or
// external.superCall exists.
var j = function() {
var a = navigator.userAgent.match(/KGBrowser/gi) ? !0 : !1
, b = navigator.userAgent.match(/kugouandroid/gi) ? !0 : !1
, c = "undefined" == typeof external ? !1 : "undefined" == typeof external.superCall ? !1 : !0;
return c || b || a ? !0 : !1
}()
// k: current time in milliseconds, used for clienttime / mid / uuid defaults.
, k = (new Date).getTime()
, l = []
, m = {}
, n = []
, o = []
// p: fixed string placed at both ends of the text that gets hashed.
, p = "NVPh5oo715z5DIWAeQlhMDsWXXQV4hwt"
// q: providers of default parameter values; each hands its value to a callback.
, q = {
appid: function(a) {
return a()
},
srcappid: function(a) {
return a("2919")
},
clientver: function(a) {
return a("20000")
},
"clienttime,mid,uuid,dfid": function(a) {
return a({
clienttime: k,
mid: k,
uuid: k,
dfid: "-"
})
}
}
// r: collect the defaults from q, then compute the signature.
, r = function() {
for (var a in q)
l.push(a);
!function(a) {
// b(a): call provider number `a`, store its result in m, then recurse to the
// next provider; after the last one, build the signature.
function b(a) {
if (a < l.length)
q[l[a]](function(c) {
if (c)
if ("[object Object]" == Object.prototype.toString.call(c))
for (var d in c)
m[d] = c[d];
else
m[l[a]] = c;
b(a + 1)
});
else {
// Defaults only fill keys the caller did not supply.
for (var d in m)
!h[d] && (h[d] = m[d]);
for (var d in h)
n.push(d);
// Sort the keys and build the "key=value" list; then append the request
// body (JSON or key=value&... form) when one was passed.
if (n.sort(),
n.forEach(function(a) {
o.push(a + "=" + h[a])
}),
c)
if ("[object Object]" == Object.prototype.toString.call(c))
if ("json" == g)
o.push(JSON.stringify(c));
else {
var f = [];
for (var d in c)
f.push(d + "=" + c[d]);
o.push(f.join("&"))
}
else
o.push(c);
// Wrap the list with p on both sides, join without a separator and hash it.
o.unshift(p),
o.push(p),
h.signature = faultylabs.MD5(o.join("")),
e && e(h)
}
}
b(a)
}(0)
};
// When j is set and useH5 is off, ask KgMobileCall (cmd 764) for the
// parameters and fall back to r() if it reports no status; otherwise call
// r() directly.
if (c && ("[object Object]" != Object.prototype.toString.call(c) ? j = !1 : "urlencoded" == g && (j = !1)),
j && !f) {
// s: guards against the callback being handled more than once.
var s = !1;
a(function() {
KgMobileCall.callCmd({
cmd: 764,
jsonStr: JSON.stringify({
get: h,
post: c
}),
callback: function(a) {
if (s)
return !1;
if (s = !0,
a && a.status) {
delete a.status;
for (var b in a)
!h[b] && (h[b] = a[b]);
return e && e(h)
}
j = !1,
r()
}
})
})
} else
j = !1,
r()
}
// Export b: CommonJS, then AMD, otherwise as the global getInterFacePublic.
"undefined" != typeof module && module.exports ? module.exports = b : "function" == typeof define && define.amd ? define(function() {
return b
}) : getInterFacePublic = b
}();
倒数第二行的 'window.' 需要删掉(上面贴出的代码已经删掉了),不然会报错
"""2021-9-12"""
#酷狗音乐爬虫
import json
import os
import re
import time
import execjs
import requests
class kugou(object):
    """Search Kugou Music for a keyword and download the first few hits.

    Creating an instance prompts on stdin for the keyword and the number of
    songs, and computes the request signature (which needs ``kugou.js`` in
    the working directory). ``run()`` then performs the search and saves the
    MP3 files below ``./酷狗下载``.
    """

    # Directory the MP3 files are written to.
    _DOWNLOAD_DIR = './酷狗下载'
    # Seconds to wait for the JSON API calls and for the MP3 download.
    _API_TIMEOUT = 10
    _DOWNLOAD_TIMEOUT = 60

    def __init__(self):
        """Prompt for keyword and song count and build the signed search request."""
        self.base_url = 'https://complexsearch.kugou.com/v2/search/song?'
        self.time = self.get_time()
        self.keyword = input('输入要下载的歌曲名:')
        self.num = self.get_num()
        self.signature = self.get_signature(self.keyword, self.time)
        # Query parameters of the search request. The values must stay in sync
        # with the list that get_signature() signs.
        self.base_paramr = {
            'callback': 'callback123',
            'keyword': self.keyword,
            'page': '1',
            'pagesize': '30',
            'bitrate': '0',
            'isfuzzy': '0',
            'tag': 'em',
            'inputtype': '0',
            'platform': 'WebFilter',
            'userid': '0',
            'clientver': '2000',
            'iscorrection': '1',
            'privilege_filter': '0',
            'srcappid': '2919',
            'clienttime': self.time,
            'mid': self.time,
            'uuid': self.time,
            'dfid': '-',
            'signature': self.signature
        }
        # Headers for the search request.
        self.base_headers = {
            'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        }
        # Headers for the per-song play/getdata request (sent with a cookie).
        self.headers = {
            'cookie': 'kg_mid=671056f931bdd92c3d9cfbc1996e21ec; kg_dfid=34dT144TKakk4bjeOl0X7qkb; kg_dfid_collect=d41d8cd98f00b204e9800998ecf8427e; Hm_lvt_aedee6983d4cfc62f509129360d6bb3d=1630837212,1630926700,1630938173; Hm_lpvt_aedee6983d4cfc62f509129360d6bb3d=1630941313; kg_mid_temp=671056f931bdd92c3d9cfbc1996e21ec; ACK_SERVER_10016=%7B%22list%22%3A%5B%5B%22gzreg-user.kugou.com%22%5D%5D%7D; ACK_SERVER_10017=%7B%22list%22%3A%5B%5B%22gzverifycode.service.kugou.com%22%5D%5D%7D; ACK_SERVER_10015=%7B%22list%22%3A%5B%5B%22gzlogin-user.kugou.com%22%5D%5D%7D',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        }
        self.name_list = []
        self.url_list = []

    def get_num(self):
        """Ask how many songs to download and clamp the answer to 1..30.

        Non-numeric input is asked for again instead of raising ValueError.
        """
        while True:
            try:
                num = int(input('输入要下载的歌曲数(1~30):'))
            except ValueError:
                continue
            return max(1, min(30, num))

    def get_time(self):
        """Return the current Unix time in milliseconds as a digit string.

        Slicing ``str(time.time())`` is avoided on purpose: the float's
        string form can carry fewer than three decimals, which would yield a
        timestamp that is too short.
        """
        return str(int(time.time() * 1000))

    def get_signature(self, keyword: str = '有点甜', time: str = ''):
        """Compute the ``signature`` query parameter for the search request.

        The signed text is the list of ``key=value`` pairs in alphabetical
        key order, wrapped by the same secret string at both ends, hashed by
        ``faultylabs.MD5`` from the saved ``kugou.js`` file.

        Args:
            keyword: Search keyword, as sent in the ``keyword`` parameter.
            time: Timestamp string used for ``clienttime``, ``mid`` and
                ``uuid``. Inside this method the name shadows the ``time``
                module.

        Returns:
            Whatever ``faultylabs.MD5`` returns for the list (the JS source
            upper-cases its hex digest).

        Raises:
            OSError: If ``kugou.js`` cannot be read.
        """
        secret = "NVPh5oo715z5DIWAeQlhMDsWXXQV4hwt"
        sign = [
            secret,
            "bitrate=0",
            "callback=callback123",
            "clienttime=" + time,
            "clientver=2000",
            "dfid=-",
            "inputtype=0",
            "iscorrection=1",
            "isfuzzy=0",
            "keyword=" + keyword,
            "mid=" + time,
            "page=1",
            "pagesize=30",
            "platform=WebFilter",
            "privilege_filter=0",
            "srcappid=2919",
            "tag=em",
            "userid=0",
            "uuid=" + time,
            secret,
        ]
        # Close the file deterministically instead of leaking the handle.
        with open('kugou.js', encoding='utf-8') as js_file:
            gen = execjs.get().compile(js_file.read())
        # json.dumps yields a valid JavaScript array literal for any keyword;
        # Python's repr() can emit escapes that JavaScript does not accept.
        return gen.eval('faultylabs.MD5({})'.format(json.dumps(sign)))

    def get_mp3_url(self):
        """Run the search and collect the info URL of the first ``num`` songs.

        Appends to ``self.name_list`` and ``self.url_list`` in step.

        Returns:
            ``self.url_list``; shorter than ``self.num`` when the search
            returns fewer hits.

        Raises:
            ValueError: If the response is not the expected
                ``callback123(...)`` wrapper or its payload is not JSON.
        """
        res = requests.get(url=self.base_url, headers=self.base_headers,
                           params=self.base_paramr, timeout=self._API_TIMEOUT)
        # The endpoint answers with JSONP: callback123({...})
        wrapped = re.search(r'callback123\((.*)\)', res.text)
        if wrapped is None:
            raise ValueError('unexpected search response: ' + res.text[:200])
        songs = json.loads(wrapped.group(1))['data']['lists']
        # Slice instead of indexing so fewer hits than requested is not an error.
        for song in songs[:self.num]:
            self.name_list.append(song['FileName'])
            self.url_list.append('https://wwwapi.kugou.com/yy/index.php?r=play/getdata&hash={}&album_id={}&_={}'.format(song['FileHash'], song['AlbumID'], self.time))
        return self.url_list

    @classmethod
    def _build_file_name(cls, name):
        """Return the path a song is saved under, given its search-result name."""
        # The original chained .replace('', '') calls were no-ops; presumably
        # the '<em>' / '</em>' literals were stripped when the article was
        # rendered (the search is sent with tag=em) -- TODO confirm.
        clean = name.replace('<em>', '').replace('</em>', '')
        # Path separators and characters Windows forbids in file names.
        clean = re.sub(r'[\\/:*?"<>|]', '_', clean)
        return '{}/{}.mp3'.format(cls._DOWNLOAD_DIR, clean)

    def get_mp3(self, url_list):
        """Resolve each info URL to its play URL and save the MP3 file.

        A song that fails (network error, missing ``play_url``, write error)
        is reported and skipped; the remaining songs are still downloaded.

        Args:
            url_list: Info URLs, index-aligned with ``self.name_list``.
        """
        for name, info_url in zip(self.name_list, url_list):
            file_name = self._build_file_name(name)
            print('开始下载:' + file_name)
            try:
                r = requests.get(url=info_url, headers=self.headers,
                                 timeout=self._API_TIMEOUT)
                url = json.loads(r.text)['data']['play_url']
                if not url:
                    raise ValueError('empty play_url')
                res = requests.get(url=url, timeout=self._DOWNLOAD_TIMEOUT)
                # Do not save an HTTP error page as an .mp3 file.
                res.raise_for_status()
                with open(file_name, 'wb') as f:
                    f.write(res.content)
            except (requests.RequestException, OSError, ValueError, KeyError, TypeError):
                print('下载失败:' + file_name)
            else:
                print('下载成功:' + file_name)

    def run(self):
        """Create the download directory, then search and download."""
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(self._DOWNLOAD_DIR, exist_ok=True)
        url_list = self.get_mp3_url()
        self.get_mp3(url_list)
if __name__ == '__main__':
    # Bind the instance to its own name: `kugou = kugou()` would rebind the
    # class name to the instance and make the class unreachable afterwards.
    spider = kugou()
    spider.run()