确定URL是进行爬虫的第一步。确定URL最简单的方法是:找到你要下载资源的页面 –> 按F12 –> 点击Network –> 点击XHR(即XMLHttpRequest)–> 然后刷新当前页面,就会出现下图的页面:
然后点击第二个文件,点击Headers,就会出现百度翻译的地址:
Request URL: https://fanyi.baidu.com/v2transapi?from=zh&to=en
请求头里面携带的数据都是关于浏览器的一些相关信息,服务器可以通过请求头来判断发送请求的是不是一个爬虫,从而达到反爬的效果。针对请求头的信息,一般从这三个方面来进行设置:User-Agent、Cookie、Referer。
百度翻译中携带的参数有以下这些参数,不难发现真正在变化的,除了query,就是sign
而query就是你要查询的内容,所以真正在变化的就是sign值。那怎么查找sign值呢?
找到js文件后,在文件中搜索sign,找到sign的实现。找到之后,会发现sign是由f(n)函数实现的
点击这个地址,就是进入到f(n) 函数的具体实现中。进入到函数的具体实现后,就要确定函数的范围,确定函数范围就需要你掌握js的知识,我就直接贴出你要复制的内容
/**
 * Returns a new array holding the same elements as `r`.
 *
 * @param {*} r - An array, or any value accepted by `Array.from`.
 * @returns {Array} A fresh array; real arrays are copied slot by slot,
 *   everything else is converted with `Array.from`.
 */
function a(r) {
  if (!Array.isArray(r)) {
    // Strings, Sets and array-likes are handed to the built-in converter.
    return Array.from(r);
  }
  var copy = Array(r.length);
  var index = 0;
  while (index < r.length) {
    copy[index] = r[index];
    index += 1;
  }
  return copy;
}
/**
 * Mixes the integer `r` according to the instruction string `o`.
 *
 * `o` is read three characters at a time (a trailing partial group is ignored):
 *   - char 0: "+" adds the shifted value (result truncated to a signed 32-bit
 *             integer by `& 4294967295`); anything else XORs it in;
 *   - char 1: "+" shifts right with `>>>`; anything else shifts left with `<<`;
 *   - char 2: the shift amount, either a digit or a letter from "a" upwards
 *             (char code minus 87, so "a" is 10).
 *
 * @param {number} r - Starting value.
 * @param {string} o - Instruction string.
 * @returns {number} The mixed value.
 */
function n(r, o) {
  var acc = r;
  for (var pos = 0; pos + 2 < o.length; pos += 3) {
    var combineOp = o.charAt(pos);
    var shiftDir = o.charAt(pos + 1);
    var amountChar = o.charAt(pos + 2);

    var amount;
    if (amountChar >= "a") {
      amount = amountChar.charCodeAt(0) - 87;
    } else {
      amount = Number(amountChar);
    }

    var shifted;
    if (shiftDir === "+") {
      shifted = acc >>> amount;
    } else {
      shifted = acc << amount;
    }

    if (combineOp === "+") {
      acc = (acc + shifted) & 4294967295;
    } else {
      acc = acc ^ shifted;
    }
  }
  return acc;
}
/**
 * Builds the "sign" string for the text `r`.
 *
 * Steps: shorten long input to a 30-character sample, encode it as UTF-8
 * bytes, fold the bytes into a number with `n`, and format the result as
 * "<p>.<p ^ m>".
 *
 * Relies on the sibling helpers `a` and `n`, and on the free names `i` and
 * `window`, neither of which is declared in this function.
 * NOTE(review): `i` is presumably a variable of the enclosing page script used
 * as a cache for the gtk value - confirm against the original script.
 *
 * @param {string} r - Text to sign.
 * @returns {string} The sign value.
 */
function e(r) {
// Find all surrogate pairs; `o` is null when there are none.
var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
if (null === o) {
// No surrogate pairs: when longer than 30 UTF-16 units keep the first 10,
// the 10 around the middle and the last 10.
var t = r.length;
t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
} else {
// Rebuild the text as a list `f` in which each surrogate pair is one element,
// then apply the same 10 / 10 / 10 sampling to that list.
for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
"" !== e[C] && f.push.apply(f, a(e[C].split(""))),
C !== h - 1 && f.push(o[C]);
var g = f.length;
g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
}
// `l` is the string "gtk" (char codes 103, 116, 107).
var u = void 0
, l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
// Use `i` when it is not null; otherwise read window["gtk"] (or "") and store it in `i`.
u = null !== i ? i : (i = window[l] || "") || "";
// Split the seed on "." into the numbers `m` and `s` (0 when missing or not numeric),
// then encode `r` as UTF-8 bytes into `S`, combining surrogate pairs into 4-byte sequences.
for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
var A = r.charCodeAt(v);
128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
S[c++] = A >> 18 | 240,
S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
S[c++] = A >> 6 & 63 | 128),
S[c++] = 63 & A | 128)
}
// F is "+-a^+6" and D is "+-3^+b+-f". Starting from `m`, add each byte and mix with n(p, F).
for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
p += S[b],
p = n(p, F);
// Final mix with D, XOR with `s`, map a negative value to its unsigned 32-bit
// equivalent, reduce modulo 1e6, and return "<p>.<p ^ m>".
return p = n(p, D),
p ^= s,
0 > p && (p = (2147483647 & p) + 2147483648),
p %= 1e6,
p.toString() + "." + (p ^ m)
}
l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
/*这个变量是l(小写字母L),不是数字1,也不是i*/
/*变量l 是由这三个函数调用的结果拼接而来,这三个函数分别返回ascii码103、116、107对应的字符g、t、k*/
u = null !== i ? i : (i = window[l] || "") || "";
/*变量u 是通过js中的三目运算计算出来的
规则是:如果i不为null,u的值等于i的值
如果i等于null,u的值就等于window对象下面的gtk的属性值
*/
所以通过分析就要确定window['gtk']的值。
确定好了以后,上述复制的js代码就可以改写成以下的形式了。
/**
 * Returns a new array holding the same elements as `r`.
 *
 * @param {*} r - An array, or any value accepted by `Array.from`.
 * @returns {Array} A fresh array; real arrays are copied slot by slot,
 *   everything else is converted with `Array.from`.
 */
function a(r) {
  if (!Array.isArray(r)) {
    // Strings, Sets and array-likes are handed to the built-in converter.
    return Array.from(r);
  }
  var copy = Array(r.length);
  var index = 0;
  while (index < r.length) {
    copy[index] = r[index];
    index += 1;
  }
  return copy;
}
/**
 * Mixes the integer `r` according to the instruction string `o`.
 *
 * `o` is read three characters at a time (a trailing partial group is ignored):
 *   - char 0: "+" adds the shifted value (result truncated to a signed 32-bit
 *             integer by `& 4294967295`); anything else XORs it in;
 *   - char 1: "+" shifts right with `>>>`; anything else shifts left with `<<`;
 *   - char 2: the shift amount, either a digit or a letter from "a" upwards
 *             (char code minus 87, so "a" is 10).
 *
 * @param {number} r - Starting value.
 * @param {string} o - Instruction string.
 * @returns {number} The mixed value.
 */
function n(r, o) {
  var acc = r;
  for (var pos = 0; pos + 2 < o.length; pos += 3) {
    var combineOp = o.charAt(pos);
    var shiftDir = o.charAt(pos + 1);
    var amountChar = o.charAt(pos + 2);

    var amount;
    if (amountChar >= "a") {
      amount = amountChar.charCodeAt(0) - 87;
    } else {
      amount = Number(amountChar);
    }

    var shifted;
    if (shiftDir === "+") {
      shifted = acc >>> amount;
    } else {
      shifted = acc << amount;
    }

    if (combineOp === "+") {
      acc = (acc + shifted) & 4294967295;
    } else {
      acc = acc ^ shifted;
    }
  }
  return acc;
}
/**
 * Builds the "sign" string for the text `r`.
 *
 * Steps: shorten long input to a 30-character sample, encode it as UTF-8
 * bytes, fold the bytes into a number with the sibling helper `n`, and format
 * the result as "<p>.<p ^ m>".
 *
 * @param {string} r - Text to sign.
 * @param {string} [gtk="320305.131321201"] - Seed in "<m>.<s>" form. Optional;
 *   when omitted (or null) the value that used to be hard-coded here is used,
 *   so existing one-argument callers get the same result as before.
 * @returns {string} The sign value.
 */
function e(r, gtk) {
  var seed = gtk == null ? "320305.131321201" : String(gtk);

  // --- 1. Shorten long input to first 10 + middle 10 + last 10 characters. ---
  var pairs = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
  var text = r;
  if (pairs === null) {
    // No surrogate pairs: UTF-16 units and characters coincide.
    var unitCount = r.length;
    if (unitCount > 30) {
      var unitMid = Math.floor(unitCount / 2);
      text = r.slice(0, 10) + r.slice(unitMid - 5, unitMid + 5) + r.slice(-10);
    }
  } else {
    // Rebuild the text as a list in which every surrogate pair is one element,
    // so sampling never cuts a pair in half.
    var segments = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/);
    var chars = [];
    for (var k = 0; k < segments.length; k++) {
      if (segments[k] !== "") {
        chars.push.apply(chars, segments[k].split(""));
      }
      if (k !== segments.length - 1) {
        chars.push(pairs[k]);
      }
    }
    var charCount = chars.length;
    if (charCount > 30) {
      var charMid = Math.floor(charCount / 2);
      text = chars.slice(0, 10).join("") +
        chars.slice(charMid - 5, charMid + 5).join("") +
        chars.slice(-10).join("");
    }
  }

  // --- 2. Encode the (possibly shortened) text as UTF-8 bytes. ---
  var bytes = [];
  for (var idx = 0; idx < text.length; idx++) {
    var code = text.charCodeAt(idx);
    if (code < 128) {
      bytes.push(code);
      continue;
    }
    if (code < 2048) {
      bytes.push((code >> 6) | 192);
    } else {
      var startsPair = (code & 64512) === 55296 &&
        idx + 1 < text.length &&
        (text.charCodeAt(idx + 1) & 64512) === 56320;
      if (startsPair) {
        // Combine the surrogate pair into one code point (4-byte sequence).
        code = 65536 + ((code & 1023) << 10) + (text.charCodeAt(++idx) & 1023);
        bytes.push((code >> 18) | 240);
        bytes.push(((code >> 12) & 63) | 128);
      } else {
        bytes.push((code >> 12) | 224);
      }
      bytes.push(((code >> 6) & 63) | 128);
    }
    bytes.push((code & 63) | 128);
  }

  // --- 3. Fold the bytes into the hash, seeded by the two halves of gtk. ---
  var seedParts = seed.split(".");
  var m = Number(seedParts[0]) || 0;
  var s = Number(seedParts[1]) || 0;
  var p = m;
  for (var b = 0; b < bytes.length; b++) {
    p += bytes[b];
    p = n(p, "+-a^+6");
  }
  p = n(p, "+-3^+b+-f");
  p ^= s;
  if (p < 0) {
    // Map a negative 32-bit value to its unsigned equivalent.
    p = (p & 2147483647) + 2147483648;
  }
  p %= 1e6;
  return p.toString() + "." + (p ^ m);
}
import requests
import execjs
# Run the signing JavaScript file
def exec_js(name, js_path=r'C:\Users\ID190699\Documents\Untitled-1.js'):
    """Call the JavaScript function ``e`` with ``name`` and return its result.

    Args:
        name: Text to sign; passed as the single argument to ``e``.
        js_path: Path of the UTF-8 JavaScript file that defines ``e``.
            Defaults to the location that was previously hard-coded here, so
            existing one-argument callers behave as before.

    Returns:
        Whatever ``e(name)`` evaluates to in the JavaScript runtime.
    """
    with open(js_path, encoding="utf-8") as f:
        js_source = f.read()
    compiled = execjs.compile(js_source)  # compile the JS source
    return compiled.call("e", name)  # invoke e(name) inside the JS context
# Build the request parameters
def get_data(name):
    """Assemble the form fields for a zh -> en translation of ``name``.

    The ``sign`` field is computed by ``exec_js(name)``; every other field is a
    fixed value.

    Args:
        name: Text to translate.

    Returns:
        dict: The form fields to POST.
    """
    payload = {
        'from': 'zh',
        'to': 'en',
        'query': name,
        'transtype': 'realtime',
        'simple_means_flag': '3',
    }
    payload['sign'] = exec_js(name)
    payload['token'] = '3ace51c83513d2432da6e2a913a82257'
    payload['domain'] = 'common'
    return payload
# Fetch and print the translation result
def get_result(url, data, header):
    """POST the translation request and print the translated text.

    Args:
        url: Endpoint to POST to.
        data: Form fields for the request body.
        header: HTTP request headers to send.

    Prints the first translated string on HTTP 200, otherwise a failure
    message. Returns nothing.
    """
    # Use the arguments that were passed in. The previous version read the
    # module-level names Request_url and headers instead, so it only worked
    # when those globals happened to exist.
    response = requests.post(url=url, data=data, headers=header)
    if response.status_code == 200:
        result = response.json()
        print(result['trans_result']['data'][0]['dst'])
    else:
        print("获取失败!")
# Script entry point: translate one sample phrase and print the result.
if __name__ == '__main__':
# Endpoint that the translation request is POSTed to.
Request_url = 'https://fanyi.baidu.com/v2transapi?from=zh&to=en'
# Request headers sent with the POST: User-Agent, Cookie and Referer.
# The 'Cookie' value is one string assembled from the adjacent literals below.
# NOTE(review): the cookie looks like a captured browser session and is
# presumably paired with the fixed 'token' in get_data - confirm, and
# consider loading both from configuration instead of source.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
'Chrome/88.0.4324.190 Safari/537.36',
'Cookie': 'BIDUPSID=1FD3274E2FF3F7C35C340F43A12DE10E; PSTM=1588919751; '
'BAIDUID=1FD3274E2FF3F7C32B5BAA32525071AA:SL=0:NR=10:FG=1; REALTIME_TRANS_SWITCH=1; '
'SOUND_SPD_SWITCH=1; '
'SOUND_PREFER_SWITCH=1; HISTORY_SWITCH=1; FANYI_WORD_SWITCH=1; '
'__yjs_duid=1_7f2b3040c832212b33cfa3e60427da1e1614326336767; '
'H_PS_PSSID=33517_33256_33344_31253_33594_33392_26350_22159; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; '
'BA_HECTOR=a10lagag8h818h2l3s1g4gcpa0r; BCLID=11472529502261507793; '
'BDSFRCVID=PN_OJexroG3VC5QesaCAboR1JLweG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu'
'-EHtdogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; '
'H_BDCLCKID_SF=tR3aQ5rtKRTffjrnhPF30MCmXP6-hnjy3b7p5K5l54OHEpjPhpKayxAWbttf5q3RymJ42'
'-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvD--g3-AqBM5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3'
'-oJqCDMhI_x3J; BCLID_BFESS=11472529502261507793; '
'BDSFRCVID_BFESS=PN_OJexroG3VC5QesaCAboR1JLweG7bTDYLEOwXPsp3LGJLVJeC6EG0Pts1-dEu'
'-EHtdogKK0gOTH6KF_2uxOjjg8UtVJeC6EG0Ptf8g0M5; '
'H_BDCLCKID_SF_BFESS=tR3aQ5rtKRTffjrnhPF30MCmXP6-hnjy3b7p5K5l54OHEpjPhpKayxAWbttf5q3RymJ42'
'-39LPO2hpRjyxv4y4Ldj4oxJpOJ-bCL0p5aHl51fbbvbURvD--g3-AqBM5dtjTO2bc_5KnlfMQ_bf--QfbQ0hOhqP-jBRIE3'
'-oJqCDMhI_x3J; BAIDUID_BFESS=1FD3274E2FF3F7C32B5BAA32525071AA:SL=0:NR=10:FG=1; '
'Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1613614413,1613799762,1615344429; '
'Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1615344429; '
'__yjsv5_shitong=1.0_7_557b220c28dbfef5a72d23ca80485b2cea83_300_1615344429709_120.236.69'
'.35_e85e8313; '
'ab_sr=1.0'
'.0_ODJjM2Q0Mjc4NjgxYmRiYzRjNzY2ODZhYmU4YmY0OWFhNjNlM2QzOTc0ZTFmZjg4M2Y5YzExZDczNDVlYzIxZTk4MmU5YTFiNDA2MWNmNzY1YzY5OWVkODAwODQ1ZDQy',
'Referer': 'https://fanyi.baidu.com/?aldtype=16047',
}
# Build the form fields for the sample query, then send it and print the translation.
data = get_data("爱情")
get_result(Request_url, data, headers)