抓取百度翻译的接口(通过逆向分析js获取token、gtk、sign等签名信息)

废话少说直接上代码。

pc版

baidutranslator.py

import requests
import json
import re
import execjs


class BaiduTranslator:
    """Interactive console client for Baidu Translate's desktop web endpoint.

    For every query the client detects the language via ``/langdetect``,
    computes the request signature by calling function ``e`` from the local
    ``cs.js`` file, and posts the query to ``/v2transapi``. The ``token`` and
    ``gtk`` values used for signing are scraped once from the translate page.
    """

    # Seconds to wait on a single HTTP request; without a timeout
    # ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Text currently being translated; also read by get_sign().
        self.trans_str = None
        self.trans_url = "https://fanyi.baidu.com/v2transapi"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36",
            # The Cookie header is required; without it the request returns an error payload
            "Cookie": "BAIDUID=AFB6E3FB47D3EEA8C525D02E728E0991:FG=1; BIDUPSID=AFB6E3FB47D3EEA8C525D02E728E0991; PSTM=1556177968; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; MCITY=-131%3A; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1556507678,1557287607,1557714510,1557972796; BDSFRCVID=xr-sJeCCxG3twro9YX2saOEfCZPT14fNd2s33J; H_BDCLCKID_SF=tR3fL-08abrqqbRGKITjhPrM2hKLbMT-027OKKO2b-oobfTyDRbHXULELn6TLT_J5eobot8bthF0HPonHj85j6bQ3J; PSINO=2; delPer=0; H_PS_PSSID=1450_28937_21095_18560_29064_28518_29098_28722_28963_28836_28584_26350; locale=zh; yjs_js_security_passport=5b9f340d92cf7400bd5ba82b49a65bc0520935cc_1558490261_js; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1558493551; to_lang_often=%5B%7B%22value%22%3A%22jp%22%2C%22text%22%3A%22%u65E5%u8BED%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; from_lang_often=%5B%7B%22value%22%3A%22jp%22%2C%22text%22%3A%22%u65E5%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D"
        }
        # Captures the token value from the ``window['common'] = {...}`` script block.
        self.pattern = re.compile(r"window\['common'\]\W*?=\W*?{\W*?.*?token.*?:.*?'(\w+)',")
        # Captures the gtk value from ``window.gtk = '...'`` (dot escaped so
        # it only matches a literal ".").
        self.pattern_gtk = re.compile(r"window\.gtk\W*?=\W*?'(.*?)'")
        # Compiled cs.js context, created lazily on the first get_sign() call.
        self._js_ctx = None

    def parse_url(self, data, url="https://fanyi.baidu.com/langdetect"):
        """POST ``data`` as form fields to ``url`` and return the decoded JSON body.

        Args:
            data: Mapping of form fields to send.
            url: Endpoint to post to; defaults to the language-detection endpoint.

        Returns:
            The response body parsed with ``json.loads``.
        """
        response = requests.post(
            url, data=data, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return json.loads(response.content.decode())

    def _extract_token_and_gtk(self, html):
        """Return ``(token, gtk)`` found in ``html``.

        Raises:
            ValueError: If either value cannot be located in the page text.
        """
        token_match = self.pattern.search(html)
        gtk_match = self.pattern_gtk.search(html)
        if token_match is None or gtk_match is None:
            missing = "token" if token_match is None else "gtk"
            raise ValueError(
                f"could not find {missing} in the translate page; "
                "the page layout may have changed or the Cookie header was rejected"
            )
        return token_match.group(1), gtk_match.group(1)

    def get_token_or_gtk(self, url="https://fanyi.baidu.com/translate"):
        """Fetch ``url`` and return the ``(token, gtk)`` pair embedded in it.

        Raises:
            ValueError: If the page does not contain both values.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return self._extract_token_and_gtk(response.content.decode())

    def get_sign(self, gtk):
        """Return the signature for ``self.trans_str`` computed by ``e`` in ``./cs.js``.

        The JavaScript file is read and compiled only once per instance and
        the compiled context is reused for later calls.
        """
        ctx = getattr(self, "_js_ctx", None)
        if ctx is None:
            with open("./cs.js", "r", encoding="utf-8") as f:
                ctx = execjs.compile(f.read())
            self._js_ctx = ctx
        return ctx.call("e", self.trans_str, gtk)

    def _build_trans_data(self, lang):
        """Return the query/from/to form fields for the detected language.

        Chinese input is translated to English; any other language is
        translated to Chinese, using the detected code as the source language
        instead of assuming English.
        """
        target = "en" if lang == "zh" else "zh"
        return {"query": self.trans_str, "from": lang, "to": target}

    def run(self):
        """Run the interactive prompt loop until the user enters ``777``."""
        token, gtk = self.get_token_or_gtk()
        print("""----------您好!欢迎来到你好骚翻译系统----------
                -----------输入777退出系统---------""")
        while True:
            self.trans_str = input("请输入您想翻译的文字:\b")
            if self.trans_str == "777":
                break
            if not self.trans_str.strip():
                # Nothing to translate; ask again instead of calling the API.
                continue
            lang = self.parse_url({"query": self.trans_str})["lan"]
            trans_data = self._build_trans_data(lang)
            trans_data.update({
                "sign": self.get_sign(gtk),
                "token": token,
                "transtype": "translang",
                "simple_means_flag": 3,
            })
            dict_response = self.parse_url(trans_data, self.trans_url)
            print(dict_response)


if __name__ == "__main__":
    # Script entry point: build the translator and start the interactive loop.
    BaiduTranslator().run()

移动端
baidutranslatorphone.py

import requests
import json
import re
import execjs


class BaiduTranslator:
    """Interactive console client for Baidu Translate's mobile web endpoint.

    For every query the client detects the language via ``/langdetect``,
    computes the request signature by calling function ``e`` from the local
    ``cs.js`` file, and posts the query to ``/basetrans``. The ``token`` and
    ``gtk`` values used for signing are scraped once from the home page.
    """

    # Seconds to wait on a single HTTP request; without a timeout
    # ``requests`` may block indefinitely on a stalled connection.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # Text currently being translated; also read by get_sign().
        self.trans_str = None
        self.trans_url = "https://fanyi.baidu.com/basetrans"
        self.headers = {
            "user-agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Mobile Safari/537.36",
            "Cookie": "BAIDUID=028CEF0300D9D86D36300716BA814A18:FG=1; BIDUPSID=028CEF0300D9D86D36300716BA814A18; PSTM=1561573459; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; Hm_lvt_64ecd82404c51e03dc91cb9e8c025574=1561605146; Hm_lpvt_64ecd82404c51e03dc91cb9e8c025574=1561605146; from_lang_often=%5B%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%2C%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%5D; REALTIME_TRANS_SWITCH=1; FANYI_WORD_SWITCH=1; HISTORY_SWITCH=1; SOUND_SPD_SWITCH=1; SOUND_PREFER_SWITCH=1; to_lang_often=%5B%7B%22value%22%3A%22zh%22%2C%22text%22%3A%22%u4E2D%u6587%22%7D%2C%7B%22value%22%3A%22en%22%2C%22text%22%3A%22%u82F1%u8BED%22%7D%5D; yjs_js_security_passport=839902afbd8121c0a7838e3c3f141e733705fd43_1561605147_js; H_PS_PSSID=26523_1457_21089_29135_29238_28518_29099_28831_29220_29072_28702; delPer=0; PSINO=2; PMS_JT=%28%7B%22s%22%3A1561606919625%2C%22r%22%3A%22https%3A//fanyi.baidu.com/translate%3Faldtype%3D16047%26query%3D%25E9%25A1%25B6%25E9%25A1%25B6%25E9%25A1%25B6+%26keyfrom%3Dbaidu%26smartresult%3Ddict%26lang%3Dauto2zh%22%7D%29"
        }
        # Captures the token value from the ``page.common = {...}`` script block.
        self.pattern = re.compile(r"page\.common\W*?=\W*?{\W*?.*?token.*?:.*?'(\w+)',")
        # Captures the gtk value; the group is non-greedy so it stops at the
        # first closing quote even if more quoted values follow on the line.
        self.pattern_gtk = re.compile(r"gtk\W*?:\W*?'(.+?)'")
        # Compiled cs.js context, created lazily on the first get_sign() call.
        self._js_ctx = None

    def parse_url(self, data, url="https://fanyi.baidu.com/langdetect"):
        """POST ``data`` as form fields to ``url`` and return the decoded JSON body.

        Args:
            data: Mapping of form fields to send.
            url: Endpoint to post to; defaults to the language-detection endpoint.

        Returns:
            The response body parsed with ``json.loads``.
        """
        response = requests.post(
            url, data=data, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return json.loads(response.content.decode())

    def _extract_token_and_gtk(self, html):
        """Return ``(token, gtk)`` found in ``html``.

        Raises:
            ValueError: If either value cannot be located in the page text.
        """
        token_match = self.pattern.search(html)
        gtk_match = self.pattern_gtk.search(html)
        if token_match is None or gtk_match is None:
            missing = "token" if token_match is None else "gtk"
            raise ValueError(
                f"could not find {missing} in the translate page; "
                "the page layout may have changed or the Cookie header was rejected"
            )
        return token_match.group(1), gtk_match.group(1)

    def get_token_or_gtk(self, url="https://fanyi.baidu.com/"):
        """Fetch ``url`` and return the ``(token, gtk)`` pair embedded in it.

        Raises:
            ValueError: If the page does not contain both values.
        """
        response = requests.get(
            url, headers=self.headers, timeout=self.REQUEST_TIMEOUT
        )
        return self._extract_token_and_gtk(response.content.decode())

    def get_sign(self, gtk):
        """Return the signature for ``self.trans_str`` computed by ``e`` in ``./cs.js``.

        The JavaScript file is read and compiled only once per instance and
        the compiled context is reused for later calls.
        """
        ctx = getattr(self, "_js_ctx", None)
        if ctx is None:
            with open("./cs.js", "r", encoding="utf-8") as f:
                ctx = execjs.compile(f.read())
            self._js_ctx = ctx
        return ctx.call("e", self.trans_str, gtk)

    def _build_trans_data(self, lang):
        """Return the query/from/to form fields for the detected language.

        Chinese input is translated to English; any other language is
        translated to Chinese, using the detected code as the source language
        instead of assuming English.
        """
        target = "en" if lang == "zh" else "zh"
        return {"query": self.trans_str, "from": lang, "to": target}

    def run(self):
        """Run the interactive prompt loop until the user enters ``777``."""
        token, gtk = self.get_token_or_gtk()
        # Banner typo fixed: the second line tells the user how to exit.
        print("""----------您好!欢迎来到你好骚翻译系统----------
        -----------输入777退出系统---------""")
        while True:
            self.trans_str = input("请输入您想翻译的文字:\b")
            if self.trans_str == "777":
                break
            if not self.trans_str.strip():
                # Nothing to translate; ask again instead of calling the API.
                continue
            lang = self.parse_url({"query": self.trans_str})["lan"]
            trans_data = self._build_trans_data(lang)
            trans_data.update({"sign": self.get_sign(gtk), "token": token})
            dict_response = self.parse_url(trans_data, self.trans_url)
            print(dict_response)


if __name__ == "__main__":
    # Script entry point: build the translator and start the interactive loop.
    BaiduTranslator().run()

总结

        按常理来说,爬取移动端要比PC端更加简单。但实际情况是两者几乎一样,具体爬哪个平台按自己的需求决定即可。不过有一点需要格外注意:百度翻译除了验证签名信息以外,还针对cookie信息做了反爬虫策略,我们发送请求时必须在请求头中带上cookie信息,否则会返回一条错误信息。然而一直携带同一个cookie又很容易被识别为爬虫,没有办法,我们必须增加开发成本:创建一个cookie池并设计调用策略(在保持各cookie使用次数大致相等的情况下随机调用)。token和gtk也需要按照实际使用情况设计更新规则进行更新。(这些功能均未实现,抱歉我懒。。。)

之前缺的js文件

这个是从浏览器上爬下来的js代码(即上面两个脚本在 get_sign 中读取的 ./cs.js,运行脚本时需放在当前目录下)

/**
 * Compute the request signature string for the text `r`.
 *
 * @param {string} r   Text to sign. Inputs longer than 30 units are first
 *                     reduced to a 30-unit sample (first 10, middle 10, last 10).
 * @param {string} gtk Seed string; it is split on "." and the two parts are
 *                     parsed as numbers (a missing or unparseable part becomes 0).
 * @returns {string}   "<p>.<p ^ m>", where p is the final hash reduced modulo
 *                     1e6 and m is the first numeric part of `gtk`.
 */
function e(r, gtk) {
    // Collect every surrogate pair (high surrogate followed by low surrogate) in the input.
    var o = r.match(/[\uD800-\uDBFF][\uDC00-\uDFFF]/g);
    if (null === o) {
        // No surrogate pairs: sample directly on string positions.
        var t = r.length;
        t > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(t / 2) - 5, 10) + r.substr(-10, 10))
    } else {
        // Build f as a list of units: single characters from the text between
        // surrogate pairs, interleaved with each surrogate pair kept as one entry,
        // so the 10/10/10 sampling below never cuts a pair in half.
        for (var e = r.split(/[\uD800-\uDBFF][\uDC00-\uDFFF]/), C = 0, h = e.length, f = []; h > C; C++)
            "" !== e[C] && f.push.apply(f, a(e[C].split(""))),
            C !== h - 1 && f.push(o[C]);
        var g = f.length;
        g > 30 && (r = f.slice(0, 10).join("") + f.slice(Math.floor(g / 2) - 5, Math.floor(g / 2) + 5).join("") + f.slice(-10).join(""))
    }
    // l spells "gtk" (char codes 103, 116, 107); it is not read again in this
    // function. The seed u is taken from the `gtk` argument.
    var u = void 0
        , l = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
    u = gtk;
    // m and s are the two numeric parts of the seed. The loop encodes r into
    // the byte array S using UTF-8 byte layouts (1, 2, 3 or 4 bytes per character).
    for (var d = u.split("."), m = Number(d[0]) || 0, s = Number(d[1]) || 0, S = [], c = 0, v = 0; v < r.length; v++) {
        var A = r.charCodeAt(v);
        // < 128: one byte; < 2048: two bytes; a high surrogate followed by a low
        // surrogate: combined into one code point and written as four bytes;
        // anything else: three bytes.
        128 > A ? S[c++] = A : (2048 > A ? S[c++] = A >> 6 | 192 : (55296 === (64512 & A) && v + 1 < r.length && 56320 === (64512 & r.charCodeAt(v + 1)) ? (A = 65536 + ((1023 & A) << 10) + (1023 & r.charCodeAt(++v)),
            S[c++] = A >> 18 | 240,
            S[c++] = A >> 12 & 63 | 128) : S[c++] = A >> 12 | 224,
            S[c++] = A >> 6 & 63 | 128),
            S[c++] = 63 & A | 128)
    }
    // F is the op string "+-a^+6" and D is "+-3^+b+-f" (both built from char codes).
    // Starting from m, each byte is added to p and p is then mixed with n(p, F).
    for (var p = m, F = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), D = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), b = 0; b < S.length; b++)
        p += S[b],
            p = n(p, F);
    // Final mix with D, XOR with s, map a negative result to its unsigned
    // 32-bit value, reduce modulo 1e6, and format as "<p>.<p ^ m>".
    return p = n(p, D),
        p ^= s,
    0 > p && (p = (2147483647 & p) + 2147483648),
        p %= 1e6,
    p.toString() + "." + (p ^ m)
}

/**
 * Return a new array holding the elements of `r`.
 *
 * Arrays are copied index by index; any other value is converted with
 * Array.from.
 */
function a(r) {
    if (!Array.isArray(r)) {
        return Array.from(r);
    }
    var copy = Array(r.length);
    var index = 0;
    while (index < r.length) {
        copy[index] = r[index];
        index++;
    }
    return copy;
}

/**
 * Mix the number `r` according to the op string `o`, read in groups of
 * three characters:
 *   [0] "+" adds the shifted value (result truncated with `& 4294967295`),
 *       anything else XORs it in;
 *   [1] "+" selects an unsigned right shift, anything else a left shift;
 *   [2] shift amount: a character >= "a" maps to charCode - 87 ("a" -> 10),
 *       otherwise the character is converted with Number().
 * Returns the mixed value.
 */
function n(r, o) {
    var limit = o.length - 2;
    for (var pos = 0; pos < limit; pos += 3) {
        var combineOp = o.charAt(pos);
        var shiftOp = o.charAt(pos + 1);
        var amountChar = o.charAt(pos + 2);

        var amount;
        if (amountChar >= "a") {
            amount = amountChar.charCodeAt(0) - 87;
        } else {
            amount = Number(amountChar);
        }

        var shifted;
        if ("+" === shiftOp) {
            shifted = r >>> amount;
        } else {
            shifted = r << amount;
        }

        if ("+" === combineOp) {
            r = (r + shifted) & 4294967295;
        } else {
            r = r ^ shifted;
        }
    }
    return r;
}

你可能感兴趣的:(网络爬虫)