1.使用selenium进行模拟浏览器进行选择元素,然后进行爬取,这种方法最简单,但是牺牲了速度,爬虫关键就在速度,因此针对大量的数据的话,就远远不能满足了
2.使用js逆向破解,这个方法的要求高,需要你懂得chrome的调试工具和JavaScricp的语法,至少得看东js代码,但是这个方法对于爬去大量数据速度回比第一种方法快的很多
function e(r) {
var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
if (null === o) {
var t = r.length;
t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
} else {
for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
"" !== e[C] && f.push.apply(f, a(e[C].split(""))),
C !== h - 1 && f.push(o[C]);
var g = f.length;
g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
}
var u = void 0
, l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
u = null !== i ? i : (i = window[l] || "") || "";
for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
var A = r.charCodeAt(v);
128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
S[c++] = A >> 18 | 240,
S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
S[c++] = A >> 6 & 63 | 128),
S[c++] = 63 & A | 128)
}
for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
p += S[b],
p = n(p, F);
return p = n(p, D),
p ^= s,
0 > p && (p = (2147483647 & p) + 2147483648),
p %= 1e6,
p.toString() + "." + (p ^ m)
}
var i="320305.131321201"
function n(r, o) {
for (var t = 0; t < o.length - 2; t += 3) {
var a = o.charAt(t + 2);
a = a >= "a" ? a.charCodeAt(0) - 87 : Number(a),
a = "+" === o.charAt(t + 1) ? r >>> a : r << a,
r = "+" === o.charAt(t) ? r + a & 4294967295 : r ^ a
}
return r
}
function e(r) {
var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
if (null === o) {
var t = r.length;
t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
} else {
for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
"" !== e[C] && f.push.apply(f, a(e[C].split(""))),
C !== h - 1 && f.push(o[C]);
var g = f.length;
g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
}
var u = void 0
, l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
u = null !== i ? i : (i = window[l] || "") || "";
for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
var A = r.charCodeAt(v);
128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
S[c++] = A >> 18 | 240,
S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
S[c++] = A >> 6 & 63 | 128),
S[c++] = 63 & A | 128)
}
for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
p += S[b],
p = n(p, F);
return p = n(p, D),
p ^= s,
0 > p && (p = (2147483647 & p) + 2147483648),
p %= 1e6,
p.toString() + "." + (p ^ m)
}
import execjs
query = 'hello' #是要翻译的内容
with open('baidu_translate_js.js', 'r', encoding='utf-8') as f:
ctx = execjs.compile(f.read())
sign = ctx.call('e', query)
print(sign)
10.得到了sign值,基本上已经把这个js搞定了,然后就是编写代码进行抓取所翻译的内容了,代码如下:
import execjs
import requests
headers={'accept': '*/*', 'accept-encoding': 'gzip, deflate, br', 'accept-language': 'zh-CN,zh;q=0.9', 'content-length': '106', 'content-type': 'application/x-www-form-urlencoded; charset=UTF-8', 'cookie': 'BIDUPSID=EF5D2DCB95CD02713C504B965E680572; PSTM=1508391259; BAIDUID=FE94A1C6870007735C0EA30CA092352A:FG=1; BDUSS=HhpVTc3VjZrQ2ppRX5RcVFoQW9-WExTQ29zYWR-TUluOUQxRGVaWHZrWGlOWmRkRVFBQUFBJCQAAAAAAAAAAAEAAAAUxiG2ZnJlZc31vNG~pQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAOKob13iqG9dW; locale=zh; __guid=37525047.783289347368707300.1568961749022.282; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; to_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D; from_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; yjs_js_security_passport=67080cbdf7d8d4ad0eb8f1513b5feb52c128c29b_1569324592_js; monitor_count=3; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1568961749,1569324577,1569324592,1569324674; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1569324674; __yjsv5_shitong=1.0_7_9055159b9a5e975fcd2c2c48931b3bc7b406_300_1569324677995_117.32.216.70_70981334', 'origin': 'https://fanyi.baidu.com', 'referer': 'https://fanyi.baidu.com/', 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36', 'x-requested-with': 'XMLHttpRequest'}
def signs(n):
query = n
with open('baidu_translate_js.js', 'r', encoding='utf-8') as f:
ctx = execjs.compile(f.read())
sign = ctx.call('e', query)
return sign
def datas(sign,n,zh,en):
data={
'from':zh,
'to': en,
'query': n,
'simple_means_flag':' 3',
'sign': sign,
'token':'8d588b57816e1213f2bcfaf52bddbbe2',
}
return data
def xinxi(url,data):
r=requests .post(url=url,headers=headers,data=data).json()
return r
if __name__=='__main__':
url='https://fanyi.baidu.com/v2transapi'
print("请输入需要的要求(1或者2):")
print("1.英译中 2.中译英")
q=input()
if q=='2':
print("输入翻译内容:")
n=input()
zh='zh'
en='en'
sign=signs(n)
data=datas(sign,n,zh,en)
r=xinxi(url,data)
print("翻译结果:"+'\n'+r['trans_result']['data'][0]['dst'])
else:
print("输入翻译内容:")
n = input()
zh='en'
en='zh'
sign = signs(n)
data = datas(sign, n,zh,en)
r = xinxi(url, data)
print("翻译结果:" + '\n' + r['trans_result']['data'][0]['dst'])