最近公司需要爬取直播商品的一些数据,其中就有淘宝直播。
很显然,直接搞难度很大,想到找h5页面,半天没找见;然后搞淘宝直播app,先抓包,请求尝试:
import requests

# Request headers copied from a packet capture of the Taobao Live Android app.
# Commented-out entries were part of the captured request but are left
# disabled in this experiment.
headers = {
    # 'x-region-channel': 'CN',
    'x-appkey': '25443018',
    # 'x-mini-wua': 'HHnB_eYSxY%2FP5kUxgq%2FQIgCU83XxVv3KoZpJ1h0fB7V4SuWhLGBKoYfOcFZ%2Boc6tZRGIOpO8Sny1d1%2F628GMP%2FEQvKWUOclKK8p2HPkBcnjc3DZPF8mmebb6IvVyrcNyl%2BH%2BJ',
    # 'x-c-traceid': 'Wpf1jzs8U2ADAF5M9MvdM4PH1563432256319079117686',
    # 'x-m-biz-live-bizcode': 'TAOBAO',
    # 'x-app-conf-v': '0',
    # 'content-type': 'application/x-www-form-urlencoded;charset=UTF-8',
    'x-features': '27',  # TODO: stays constant
    'x-pv': '6.2',  # TODO: stays constant
    'x-t': '1563432256',  # TODO: timestamp; changes on every request and must match x-sign
    # 'x-page-name': 'g.alicdn.com%2Fmtb%2Fapp-live-weex-v2%2F0.5.7%2Fitem-list-brand%2Fnative-min.js',
    # 'f-refer': 'mtop',
    # 'user-agent': 'MTOPSDK%2F3.1.1.7+%28Android%3B6.0%3Bvivo%3Bvivo+Y67%29',
    'x-ttid': '10003993%40taobaolive_android_1.2.30',  # TODO: stays constant
    'x-sid': '1b651f345dee3937d09e4c16d25fe9ec',  # TODO: stays constant
    # 'a-orange-q': 'appKey=25443018&appVersion=1.2.30&clientAppIndexVersion=1120190620234201338&clientVersionIndexVersion=0',
    'x-utdid': 'Wpf1jzs8U2ADAF5M9MvdM4PH',  # TODO: stays constant
    # 'x-umt': '6JJL0dNLOr3przVr%2FnJBT7jD3q0TUef7',
    'x-devid': 'AoUuvlqqu09D4BPSlNsUeHBX5-8ghDemAGdYTmfqis0V',  # TODO: stays constant
    'x-sign': 'ab29200090ecf2ca4c44eac78895d87c8c107e6bb1f237ecfc',  # TODO: changes on every request
    # 'x-page-url': 'https%3A%2F%2Fg.alicdn.com%2Fmtb%2Fapp-live-weex-v2%2F0.5.7%2Fitem-list-brand%2Fnative-min.js',
    'x-uid': '2240029942',  # TODO: stays constant
    # 'Host': 'acs.m.taobao.com',
}

# The search payload is sent as one JSON string in the `data` query parameter.
# It is kept as a literal (not rebuilt with json.dumps) so the bytes stay
# exactly as captured.
params = (
    ('data',
     '{"searchID":"281e42e0b963b37d2f03251a1372aa13","n":"10","s":"0","PARCELABLE_WRITE_RETURN_VALUE":"1","CREATOR":"{}","searchKey":"呈悦礼品定制","CONTENTS_FILE_DESCRIPTOR":"1","type":"accountLive"}'),
)

# requests never times out unless told to; without `timeout` a stalled
# connection would block this script indefinitely.
response = requests.get(
    'https://acs.m.taobao.com/gw/mtop.mediaplatform.live.searchv2/1.0/',
    headers=headers,
    params=params,
    timeout=10,
)
print(response.text)
发现x-sign与searchID都是动态生成的加密参数。
对淘宝直播app反编译,共二十多兆,searchID产生的方法很容易就找到了,但是x-sign方法不知道是如何生成的,查看smali代码也没有找到。进度陷入停滞,很尴尬。。
问朋友找到淘宝直播h5的页面:https://h5.m.taobao.com/taolive/video.html?userId=2234284001 发现替换userId就可以爬取了。
相比app反编译来说,js逆向要简单很多。如图所示,出现sign值,借助浏览器进行分析,可以搜索全局:
发现已经找到了sign生成的位置,sign:m,是由m生成的,而m = af(d.token + "&" + v + "&" + x + "&" + g.data),对这里进行断点调试:
如图,可以清晰的看到各个值是多少,这里也可以到console当中进行验证。进入af()方法:
发现有很多方法,在此为了简单并不将它改写,用execjs这个包去执行这段js,这里要注意,执行会报错,原因是由于\r\n,处理一下就可以了:
处理过后,就不会报错了。
import hashlib
import execjs
# todo 淘宝h5 js破解
js = """
/**
 * Computes the lowercase hexadecimal MD5 digest of a string.
 *
 * The input is first expanded to UTF-8 style bytes (each UTF-16 code unit is
 * encoded on its own as 1, 2 or 3 bytes), then hashed with the MD5 block
 * function.
 *
 * @param {string} aR - Text to hash.
 * @returns {string} 32-character lowercase hex digest.
 */
function af(aR) {
    /* Per-round rotation amounts: four values per round, cycled within it. */
    var SHIFTS = [
        7, 12, 17, 22,
        5, 9, 14, 20,
        4, 11, 16, 23,
        6, 10, 15, 21
    ];

    /* Additive constant for each of the 64 steps, in execution order. */
    var ROUND_CONSTANTS = [
        3614090360, 3905402710, 606105819, 3250441966,
        4118548399, 1200080426, 2821735955, 4249261313,
        1770035416, 2336552879, 4294925233, 2304563134,
        1804603682, 4254626195, 2792965006, 1236535329,
        4129170786, 3225465664, 643717713, 3921069994,
        3593408605, 38016083, 3634488961, 3889429448,
        568446438, 3275163606, 4107603335, 1163531501,
        2850285829, 4243563512, 1735328473, 2368359562,
        4294588738, 2272392833, 1839030562, 4259657740,
        2763975236, 1272893353, 4139469664, 3200236656,
        681279174, 3936430074, 3572445317, 76029189,
        3654602809, 3873151461, 530742520, 3299628645,
        4096336452, 1126891415, 2878612391, 4237533241,
        1700485571, 2399980690, 4293915773, 2240044497,
        1873313359, 4264355552, 2734768916, 1309151649,
        4149444226, 3174756917, 718787259, 3951481745
    ];

    /* Rotates a 32-bit value left by the given number of bits. */
    function rotateLeft(value, bits) {
        return (value << bits) | (value >>> (32 - bits));
    }

    /* Encodes every UTF-16 code unit of the text as 1-3 byte-valued chars. */
    function toByteString(text) {
        var bytes = "";
        var position;
        var code;
        for (position = 0; position < text.length; position++) {
            code = text.charCodeAt(position);
            if (code < 128) {
                bytes += String.fromCharCode(code);
            } else if (code < 2048) {
                bytes += String.fromCharCode((code >> 6) | 192, (code & 63) | 128);
            } else {
                bytes += String.fromCharCode(
                    (code >> 12) | 224,
                    ((code >> 6) & 63) | 128,
                    (code & 63) | 128
                );
            }
        }
        return bytes;
    }

    /*
     * Packs the byte string into little-endian 32-bit words, appends the
     * 0x80 marker byte and stores the bit length in the final two words.
     * The word count is always a multiple of 16 (one 64-byte block).
     */
    function toPaddedWords(bytes) {
        var byteCount = bytes.length;
        var reserved = byteCount + 8;
        var wordCount = 16 * ((reserved - reserved % 64) / 64 + 1);
        var words = [];
        var position;

        for (position = 0; position < wordCount; position++) {
            words.push(0);
        }
        for (position = 0; position < byteCount; position++) {
            words[(position - position % 4) / 4] |= bytes.charCodeAt(position) << (position % 4 * 8);
        }
        words[(byteCount - byteCount % 4) / 4] |= 128 << (byteCount % 4 * 8);
        words[wordCount - 2] = byteCount << 3;
        words[wordCount - 1] = byteCount >>> 29;
        return words;
    }

    /* Formats a 32-bit word as 8 hex digits, least significant byte first. */
    function wordToHex(word) {
        var hex = "";
        var shift;
        var digits;
        for (shift = 0; shift <= 24; shift += 8) {
            digits = ((word >>> shift) & 255).toString(16);
            hex += digits.length < 2 ? "0" + digits : digits;
        }
        return hex;
    }

    var words = toPaddedWords(toByteString(aR));

    /* Running digest state, seeded with the fixed initial values. */
    var stateA = 1732584193;
    var stateB = 4023233417;
    var stateC = 2562383102;
    var stateD = 271733878;

    var offset, step, round, a, b, c, d, mixed, wordIndex, sum, next;

    for (offset = 0; offset < words.length; offset += 16) {
        a = stateA;
        b = stateB;
        c = stateC;
        d = stateD;

        for (step = 0; step < 64; step++) {
            round = step >> 4;
            /* Each round has its own mixing function and message-word order. */
            if (round === 0) {
                mixed = (b & c) | (~b & d);
                wordIndex = step;
            } else if (round === 1) {
                mixed = (b & d) | (c & ~d);
                wordIndex = (5 * step + 1) % 16;
            } else if (round === 2) {
                mixed = b ^ c ^ d;
                wordIndex = (3 * step + 5) % 16;
            } else {
                mixed = c ^ (b | ~d);
                wordIndex = (7 * step) % 16;
            }

            /* "| 0" wraps the exact double-precision sum to 32 bits. */
            sum = (a + mixed + words[offset + wordIndex] + ROUND_CONSTANTS[step]) | 0;
            next = (b + rotateLeft(sum, SHIFTS[(round << 2) | (step & 3)])) | 0;

            a = d;
            d = c;
            c = b;
            b = next;
        }

        /* Fold this block's result back into the running state. */
        stateA = (stateA + a) | 0;
        stateB = (stateB + b) | 0;
        stateC = (stateC + c) | 0;
        stateD = (stateD + d) | 0;
    }

    var digest = wordToHex(stateA) + wordToHex(stateB) + wordToHex(stateC) + wordToHex(stateD);
    return digest.toLowerCase();
}
"""
# Compile the extracted JavaScript once so that af() can be called from Python.
comjs = execjs.compile(js)

# Sample values for the inputs of the page's sign expression.
x = '12574478'
v = '1563849365204'
token = "9e460ec0197a1c8edd8eea35d68e5c4c"
# `data` must be defined: it is the last component of the signed string.
data = '{"type":"0","liveId":"231820090607","creatorId":"379092709"}'

# Mirrors the page's JS: sign = af(d.token + "&" + v + "&" + x + "&" + g.data)
aR = token + "&" + v + "&" + x + "&" + data
sign = comjs.call('af', aR)
print(sign)
可以看到,最终就生成了sign值,其实加密方式就是md5,两种方法得到的sign是一样的:
import hashlib

# Same signature computed with hashlib instead of the extracted JavaScript.
# `token`, `v` and `x` are the values defined in the execjs snippet.
data = '{"type":"0","liveId":"231820090607","creatorId":"379092709"}'
aR = token + "&" + v + "&" + x + "&" + data
# str.encode() defaults to UTF-8.
sign = hashlib.md5(aR.encode()).hexdigest()
sign解决了,d.token还没有解决,而d = this.options,那么this.options是什么呢?经过寻找发现:
搜索d.token的值,发现这个值在cookie当中出现了,与时间戳结合在一起,这个token值会过期,大概是2小时:
那么只要获取到_m_h5_tk的值,取出token不就可以了吗?那么如何获取登录淘宝的cookie呢?
淘宝对selenium进行了反爬,通过selenium动态登录会弹出永远也不可能验证成功的滑框,这种方法不可行。换用pyppeteer,进行请求,可以完美进行登录,但是获取的cookie没有_m_h5_tk,尴尬了。。
最后经过分析,请求数据接口的时候请求头必须带cookie,且包含_m_h5_tk与_m_h5_tk_enc这两个字段,而且发现不需要登录就可以获取这个cookie,响应返回的信息当中也包括了这两个字段,这样就可以获取到token了:
获取到token,就可以获取到sign了,至此h5的请求已经基本解决,下一步就是获取data当中的liveId与creatorId了。源源不断的获取sign,进行数据请求。