baidutranslator.py
import requests
import json
import re
import execjs
class BaiduTranslator:
    """Interactive console client for the Baidu Translate desktop web endpoint.

    The flow implemented here is: fetch the translate page once to scrape the
    ``token`` and ``gtk`` values, then for every line typed by the user detect
    its language, compute the request ``sign`` by running ``e`` from
    ``./cs.js`` through execjs, and POST the query to ``self.trans_url``.
    """

    # Seconds to wait for any single HTTP request before giving up, so a
    # stalled connection cannot hang the interactive loop forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Text currently being translated; get_sign() reads it.
        self.trans_str = None
        self.trans_url = "https://fanyi.baidu.com/v2transapi"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            # The cookie is required; without it the request returns an error payload.
            "Cookie": "BAIDUID=AFB6E3FB47D3EEA8C525D02E728E0991:FG=1; BIDUPSID=AFB6E3FB47D3EEA8C525D02E728E0991; PSTM=1556177968; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; MCITY=-131%3A; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1556507678,1557287607,1557714510,1557972796; BDSFRCVID=xr-sJeCCxG3twro9YX2saOEfCZPT14fNd2s33J; H_BDCLCKID_SF=tR3fL-08abrqqbRGKITjhPrM2hKLbMT-027OKKO2b-oobfTyDRbHXULELn6TLT_J5eobot8bthF0HPonHj85j6bQ3J; PSINO=2; delPer=0; H_PS_PSSID=1450_28937_21095_18560_29064_28518_29098_28722_28963_28836_28584_26350; locale=zh; yjs_js_security_passport=5b9f340d92cf7400bd5ba82b49a65bc0520935cc_1558490261_js; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1558493551; to_lang_often=%5B%7B%22value%22%3A%22jp%22%2C%22text%22%3A%22%u65E5%u8BED%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; from_lang_often=%5B%7B%22value%22%3A%22jp%22%2C%22text%22%3A%22%u65E5%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D",
        }
        # Captures the token value from the page's ``window['common'] = {...}`` script block.
        self.pattern = re.compile(r"window\['common'\]\W*?=\W*?{\W*?.*?token.*?:.*?'(\w+)',")
        # Captures the quoted value assigned to ``window.gtk``.
        self.pattern_gtk = re.compile(r"window.gtk\W*?=\W*?'(.*?)'")
        # Compiled cs.js context; built lazily by get_sign() and then reused.
        self._js_ctx = None

    def parse_url(self, data, url="https://fanyi.baidu.com/langdetect"):
        """POST ``data`` as a form to ``url`` and return the decoded JSON body.

        Args:
            data: Mapping of form fields to send.
            url: Endpoint to post to; defaults to the language-detection URL.

        Returns:
            The parsed JSON response.

        Raises:
            requests.RequestException: On connection failure or timeout.
            ValueError: If the response body is not valid JSON.
        """
        response = requests.post(
            url, data=data, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return json.loads(response.content.decode())

    def get_token_or_gtk(self, url="https://fanyi.baidu.com/translate"):
        """Download the translate page and return its ``(token, gtk)`` pair.

        Args:
            url: Page to scrape.

        Returns:
            Tuple ``(token, gtk)`` of strings.

        Raises:
            requests.RequestException: On connection failure or timeout.
            ValueError: If either value cannot be found in the page.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return self._extract_token_and_gtk(response.content.decode())

    def _extract_token_and_gtk(self, html):
        """Pull ``(token, gtk)`` out of page source, failing with a clear error."""
        token_match = self.pattern.search(html)
        gtk_match = self.pattern_gtk.search(html)
        if token_match is None or gtk_match is None:
            # Without this check a missing match surfaces as an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError("could not find token/gtk in the Baidu Translate page")
        return token_match.group(1), gtk_match.group(1)

    def get_sign(self, gtk):
        """Return the sign for ``self.trans_str`` by calling ``e`` from ./cs.js.

        Args:
            gtk: The gtk string scraped by get_token_or_gtk().

        Returns:
            Whatever the JavaScript function ``e(self.trans_str, gtk)`` returns.
        """
        ctx = getattr(self, "_js_ctx", None)
        if ctx is None:
            # Read and compile the script only once instead of on every query.
            with open("./cs.js", "r", encoding="utf-8") as f:
                ctx = execjs.compile(f.read())
            self._js_ctx = ctx
        return ctx.call("e", self.trans_str, gtk)

    @staticmethod
    def _language_pair(lang):
        """Return ``(from, to)``: zh -> en for Chinese input, otherwise en -> zh."""
        return ("zh", "en") if lang == "zh" else ("en", "zh")

    def run(self):
        """Run the interactive loop until the user enters ``777``."""
        token, gtk = self.get_token_or_gtk()
        print("""----------您好!欢迎来到你好骚翻译系统----------
-----------输入777退出系统---------""")
        while True:
            self.trans_str = input("请输入您想翻译的文字:\b")
            if self.trans_str == "777":
                break
            if not self.trans_str.strip():
                # Nothing to translate; ask again instead of sending an empty query.
                continue
            detection = self.parse_url({"query": self.trans_str})
            lang = detection.get("lan")
            if lang is None:
                # Detection failed: show the raw reply and keep the session alive.
                print(detection)
                continue
            source_lang, target_lang = self._language_pair(lang)
            trans_data = {
                "query": self.trans_str,
                "from": source_lang,
                "to": target_lang,
                "sign": self.get_sign(gtk),
                "token": token,
                "transtype": "translang",
                "simple_means_flag": 3,
            }
            dict_response = self.parse_url(trans_data, self.trans_url)
            print(dict_response)
if __name__ == "__main__":
    # Script entry point: build the translator and start its interactive loop.
    baidutranslator = BaiduTranslator()
    baidutranslator.run()
移动端
baidutranslatorphone.py
import requests
import json
import re
import execjs
class BaiduTranslator:
    """Interactive console client for the Baidu Translate mobile web endpoint.

    The flow implemented here is: fetch the mobile home page once to scrape
    the ``token`` and ``gtk`` values, then for every line typed by the user
    detect its language, compute the request ``sign`` by running ``e`` from
    ``./cs.js`` through execjs, and POST the query to ``self.trans_url``.
    """

    # Seconds to wait for any single HTTP request before giving up, so a
    # stalled connection cannot hang the interactive loop forever.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Text currently being translated; get_sign() reads it.
        self.trans_str = None
        self.trans_url = "https://fanyi.baidu.com/basetrans"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Mobile Safari/537.36",
            "Cookie": "BAIDUID=028CEF0300D9D86D36300716BA814A18:FG=1; BIDUPSID=028CEF0300D9D86D36300716BA814A18; PSTM=1561573459; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1561605146; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1561605146; from_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; to_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D; yjs_js_security_passport=839902afbd8121c0a7838e3c3f141e733705fd43_1561605147_js; H_PS_PSSID=26523_1457_21089_29135_29238_28518_29099_28831_29220_29072_28702; delPer=0; PSINO=2; PMS_JT=%28%7B%22s%22%3A1561606919625%2C%22r%22%3A%22https%3A//fanyi.baidu.com/translate%3Faldtype%3D16047%26query%3D%25E9%25A1%25B6%25E9%25A1%25B6%25E9%25A1%25B6+%26keyfrom%3Dbaidu%26smartresult%3Ddict%26lang%3Dauto2zh%22%7D%29",
        }
        # Captures the token value from the page's ``page.common = {...}`` script block.
        self.pattern = re.compile(r"page\.common\W*?=\W*?{\W*?.*?token.*?:.*?'(\w+)',")
        # Captures the quoted gtk value. The group is non-greedy so that another
        # quoted value later on the same line is not swallowed into the match.
        self.pattern_gtk = re.compile(r"gtk\W*?:\W*?'(.+?)'")
        # Compiled cs.js context; built lazily by get_sign() and then reused.
        self._js_ctx = None

    def parse_url(self, data, url="https://fanyi.baidu.com/langdetect"):
        """POST ``data`` as a form to ``url`` and return the decoded JSON body.

        Args:
            data: Mapping of form fields to send.
            url: Endpoint to post to; defaults to the language-detection URL.

        Returns:
            The parsed JSON response.

        Raises:
            requests.RequestException: On connection failure or timeout.
            ValueError: If the response body is not valid JSON.
        """
        response = requests.post(
            url, data=data, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return json.loads(response.content.decode())

    def get_token_or_gtk(self, url="https://fanyi.baidu.com/"):
        """Download the mobile page and return its ``(token, gtk)`` pair.

        Args:
            url: Page to scrape.

        Returns:
            Tuple ``(token, gtk)`` of strings.

        Raises:
            requests.RequestException: On connection failure or timeout.
            ValueError: If either value cannot be found in the page.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return self._extract_token_and_gtk(response.content.decode())

    def _extract_token_and_gtk(self, html):
        """Pull ``(token, gtk)`` out of page source, failing with a clear error."""
        token_match = self.pattern.search(html)
        gtk_match = self.pattern_gtk.search(html)
        if token_match is None or gtk_match is None:
            # Without this check a missing match surfaces as an opaque
            # "'NoneType' object has no attribute 'group'".
            raise ValueError("could not find token/gtk in the Baidu Translate page")
        return token_match.group(1), gtk_match.group(1)

    def get_sign(self, gtk):
        """Return the sign for ``self.trans_str`` by calling ``e`` from ./cs.js.

        Args:
            gtk: The gtk string scraped by get_token_or_gtk().

        Returns:
            Whatever the JavaScript function ``e(self.trans_str, gtk)`` returns.
        """
        ctx = getattr(self, "_js_ctx", None)
        if ctx is None:
            # Read and compile the script only once instead of on every query.
            with open("./cs.js", "r", encoding="utf-8") as f:
                ctx = execjs.compile(f.read())
            self._js_ctx = ctx
        return ctx.call("e", self.trans_str, gtk)

    @staticmethod
    def _language_pair(lang):
        """Return ``(from, to)``: zh -> en for Chinese input, otherwise en -> zh."""
        return ("zh", "en") if lang == "zh" else ("en", "zh")

    def run(self):
        """Run the interactive loop until the user enters ``777``."""
        token, gtk = self.get_token_or_gtk()
        # The banner originally read "推出" (launch); "退出" (exit) is what is meant.
        print("""----------您好!欢迎来到你好骚翻译系统----------
-----------输入777退出系统---------""")
        while True:
            self.trans_str = input("请输入您想翻译的文字:\b")
            if self.trans_str == "777":
                break
            if not self.trans_str.strip():
                # Nothing to translate; ask again instead of sending an empty query.
                continue
            detection = self.parse_url({"query": self.trans_str})
            lang = detection.get("lan")
            if lang is None:
                # Detection failed: show the raw reply and keep the session alive.
                print(detection)
                continue
            source_lang, target_lang = self._language_pair(lang)
            trans_data = {
                "query": self.trans_str,
                "from": source_lang,
                "to": target_lang,
                "sign": self.get_sign(gtk),
                "token": token,
            }
            dict_response = self.parse_url(trans_data, self.trans_url)
            print(dict_response)
if __name__ == "__main__":
    # Script entry point: build the translator and start its interactive loop.
    baidutranslator = BaiduTranslator()
    baidutranslator.run()
按常理来说,爬取移动端要比PC端更加简单。但实际情况是两者几乎一样,具体爬哪个平台按自己的需求决定即可。不过有一点需要格外注意:百度翻译除了验证签名信息以外,还针对cookie信息做了反爬虫策略,我们发送请求时必须在请求头中带上cookie信息,否则会返回一条错误的信息。也就是说必须携带cookie,但是反复使用同一个cookie又很容易被识别为爬虫,没有办法,我们必须增加开发成本,创建一个cookie池并设计调用策略(在保持使用次数大致相等的情况下随机调用)。token和gtk也需要按照实际使用情况设计更新规则进行更新。(这些功能均未实现,抱歉我懒。。。)
下面是从浏览器上爬下来的js代码(即上面 get_sign 方法读取的 cs.js 文件,其中的函数 e 用于计算 sign):
/**
 * Compute the request signature string for the text `r` using the seed `gtk`.
 *
 * Steps, in order:
 *   1. Inputs longer than 30 units are shortened to first 10 + middle 10 + last 10.
 *   2. The (possibly shortened) text is converted to a UTF-8 style byte array.
 *   3. The bytes are folded into an accumulator with the mixing helper `n`.
 *   4. The accumulator is finalised and formatted as "<p>.<p XOR m>".
 *
 * Relies on the sibling helpers `a` (array copy) and `n` (mixing rounds).
 *
 * @param {string} r   Text to sign.
 * @param {string} gtk Seed that is split on "."; presumably "<number>.<number>"
 *                     (TODO confirm against the page that supplies it).
 * @returns {string} Two decimal integers joined by ".".
 */
function e(r, gtk) {
// o: every surrogate pair (character outside the BMP) found in r, or null if there are none.
var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
if (null === o) {
// No surrogate pairs: length and slicing can work on UTF-16 units directly.
var t = r.length;
// Longer than 30: keep the first 10, 10 starting at floor(t / 2) - 5, and the last 10.
t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
} else {
// Surrogate pairs present: rebuild r as an array f in which each pair stays one
// element, by splitting on the pairs and re-inserting match o[C] after segment C.
// Note that the local `e` below shadows this function's own name inside the body.
for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
"" !== e[C] && f.push.apply(f, a(e[C].split(""))),
C !== h - 1 && f.push(o[C]);
var g = f.length;
// Same first-10 / middle-10 / last-10 shortening, counted in elements of f.
g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
}
// l evaluates to the string "gtk" and is never read again in this function;
// u simply ends up holding the gtk argument.
var u = void 0
, l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
u = gtk;
// m and s are the numeric parts of gtk before and after the first "."; each falls
// back to 0 when missing or not numeric. S collects the encoded bytes, c is the
// write index into S, v is the read index into r.
for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
var A = r.charCodeAt(v);
// UTF-8 style encoding of the code unit A:
//   A < 128  -> 1 byte
//   A < 2048 -> 2 bytes
//   high surrogate followed by a low surrogate -> the pair is combined into one
//     code point (consuming the next unit via ++v) and written as 4 bytes
//   anything else -> 3 bytes
128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
S[c++] = A >> 18 | 240,
S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
S[c++] = A >> 6 & 63 | 128),
S[c++] = 63 & A | 128)
}
// The fromCharCode chains spell the op strings F = "+-a^+6" and D = "+-3^+b+-f",
// which are interpreted by n(). Starting from p = m, each byte is added to p and
// then p is mixed with F.
for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
p += S[b],
p = n(p, F);
// Finalise: mix once more with D, XOR with s, map a negative 32-bit result to its
// unsigned value, reduce modulo 1,000,000, then return "<p>.<p XOR m>".
return p = n(p, D),
p ^= s,
0 > p && (p = (2147483647 & p) + 2147483648),
p %= 1e6,
p.toString() + "." + (p ^ m)
}
/**
 * Return a new array holding the elements of `r`.
 *
 * Real arrays are copied index by index; any other value is handed to
 * Array.from.
 *
 * @param {*} r Array or other value accepted by Array.from.
 * @returns {Array} A fresh array; never the same object as `r`.
 */
function a(r) {
    if (!Array.isArray(r)) {
        return Array.from(r);
    }
    var copy = new Array(r.length);
    for (var i = 0; i < r.length; i += 1) {
        copy[i] = r[i];
    }
    return copy;
}
/**
 * Apply the mixing rounds described by the op string `o` to the number `r`.
 *
 * `o` is read three characters at a time:
 *   - 3rd char: shift amount; "a" and above means char code minus 87,
 *     anything else is converted with Number().
 *   - 2nd char: "+" shifts r right (unsigned) by that amount, anything else
 *     shifts r left.
 *   - 1st char: "+" adds the shifted value to r and masks with 4294967295,
 *     anything else XORs it into r.
 *
 * @param {number} r Value to mix.
 * @param {string} o Op string, e.g. "+-a^+6".
 * @returns {number} The mixed value.
 */
function n(r, o) {
    var limit = o.length - 2;
    for (var pos = 0; pos < limit; pos += 3) {
        var combineOp = o.charAt(pos);
        var shiftDir = o.charAt(pos + 1);
        var amountChar = o.charAt(pos + 2);

        var amount;
        if (amountChar >= "a") {
            amount = amountChar.charCodeAt(0) - 87;
        } else {
            amount = Number(amountChar);
        }

        var shifted;
        if (shiftDir === "+") {
            shifted = r >>> amount;
        } else {
            shifted = r << amount;
        }

        if (combineOp === "+") {
            r = (r + shifted) & 4294967295;
        } else {
            r = r ^ shifted;
        }
    }
    return r;
}