纯 Python 实现的 Google 批量翻译!

首先声明,没有什么不良动机,因为经常会用 translate.google.cn,就想着用 Python 模拟网页提交实现文档的批量翻译。据说有 API,可是要收费。

生成 Token

Google 为防爬虫而生成 token 的代码是 Javascript 的,且是根据网站的 TKK 值和提交的文本动态生成。

网上搜到的一段 Python 代码 有点小 Bug,且缺少动态获取 TKK 的步骤。

最后还是对照 Javascript 代码自己改成 Python 了。方法很简单,先转成易懂的 Javascript,再转成 Python。Javascript 代码来自C#实现谷歌翻译API。

原始(晦涩) Javascript 代码

Python资源共享群:626017123

var b = function (a, b) {
    for (var d = 0; d < b.length - 2; d += 3) {
        var c = b.charAt(d + 2),
            c = "a" <= c ? c.charCodeAt(0) - 87 : Number(c),
            c = "+" == b.charAt(d + 1) ? a >>> c : a << c;
        a = "+" == b.charAt(d) ? a + c & 4294967295 : a ^ c
    }
    return a
}
 
var tk =  function (a,TKK) {
    for (var e = TKK.split("."), h = Number(e[0]) || 0, g = [], d = 0, f = 0; f < a.length; f++) {
        var c = a.charCodeAt(f);
        128 > c ? g[d++] = c : (2048 > c ? g[d++] = c >> 6 | 192 : (55296 == (c & 64512) && f + 1 < a.length && 56320 == (a.charCodeAt(f + 1) & 64512) ? (c = 65536 + ((c & 1023) << 10) + (a.charCodeAt(++f) & 1023), g[d++] = c >> 18 | 240, g[d++] = c >> 12 & 63 | 128) : g[d++] = c >> 12 | 224, g[d++] = c >> 6 & 63 | 128), g[d++] = c & 63 | 128)
    }
    a = h;
    for (d = 0; d < g.length; d++) a += g[d], a = b(a, "+-a^+6");
    a = b(a, "+-3^+b+-f");
    a ^= Number(e[1]) || 0;
    0 > a && (a = (a & 2147483647) + 2147483648);
    a %= 1E6;
    return a.toString() + "." + (a ^ h)
}

易懂的 Javascript 代码

function RL(a, b) {
    for (var d = 0; d < b.length - 2; d += 3) {
        var c = b.charAt(d + 2);
        c = "a" <= c ? c.charCodeAt(0) - 87 : Number(c);
        c = "+" == b.charAt(d + 1) ? a >>> c : a << c;
        a = "+" == b.charAt(d) ? a + c & 4294967295 : a ^ c;
    }
    return a
}
 
function TL(a,TKK) {
    var e = TKK.split(".");
    var h = Number(e[0]) || 0;
    var g = [];
    var d = 0;
    for (var f = 0; f < a.length; f++) {
        var c = a.charCodeAt(f);
        if (128 > c)
        {
            g[d++] = c;
        } 
        else
        {
            if (2048 > c)
            {
                g[d++] = c >> 6 | 192;
            }
            else
            {
                if (55296 == (c & 64512) && f + 1 < a.length && 56320 == (a.charCodeAt(f + 1) & 64512))
                {
                    c = 65536 + ((c & 1023) << 10) + (a.charCodeAt(++f) & 1023);
                    g[d++] = c >> 18 | 240;
                    g[d++] = c >> 12 & 63 | 128;
                }
                else
                {
                    g[d++] = c >> 12 | 224;
                    g[d++] = c >> 6 & 63 | 128;
                }
            }
            g[d++] = c & 63 | 128;
        }
    }
    a = h;
    for (var d = 0; d < g.length; d++) {
        a += g[d];
        a = b(a, "+-a^+6");
    }
    a = b(a, "+-3^+b+-f");
    a ^= Number(e[1]) || 0;
    0 > a && (a = (a & 2147483647) + 2147483648);
    a %= 1E6;
    return a.toString() + "." + (a ^ h)
}

Python 代码

def getGoogleToken(a, TKK):
    def RL(a, b):
        for d in range(0, len(b)-2, 3):
            c = b[d + 2]
            c = ord(c[0]) - 87 if 'a' <= c else int(c)
            c = a >> c if '+' == b[d + 1] else a << c
            a = a + c & 4294967295 if '+' == b[d] else a ^ c
        return a
    g = []
    f = 0
    while f < len(a):
        c = ord(a[f])
        if 128 > c:
            g.append(c)
        else:
            if 2048 > c:
                g.append((c >> 6) | 192)
            else:
                if (55296 == (c & 64512)) and (f + 1 < len(a)) and (56320 == (ord(a[f+1]) & 64512)):
                    f += 1
                    c = 65536 + ((c & 1023) << 10) + (ord(a[f]) & 1023)
                    g.append((c >> 18) | 240)
                    g.append((c >> 12) & 63 | 128)
                else:
                    g.append((c >> 12) | 224)
                    g.append((c >> 6) & 63 | 128)
            g.append((c & 63) | 128)
        f += 1
    e = TKK.split('.')
    h = int(e[0]) or 0
    t = h
    for item in g:
        t += item
        t = RL(t, '+-a^+6')
    t = RL(t, '+-3^+b+-f')
    t ^= int(e[1]) or 0
    if 0 > t:
        t = (t & 2147483647) + 2147483648
    result = t % 1000000
    return str(result) + '.' + str(result ^ h)

获取 Token Key

Google 的 TKK 可以通过访问网站 https://translate.google.cn 获取,里面有段脚本里包含了“tkk:('xxxxxx.xxxxxx')”,用正则表达式截取即可。

res = requests.get('https://translate.google.cn', timeout = 3)
    res.raise_for_status()
    result = re.search(r'tkk\:\'(\d+\.\d+)?\'', res.text).group(1)

划分文章段落

因为常从 PDF 里复制文本翻译,这样就不能依赖换行符来划分段落了。只能判断空行,作为段落的分界。

另外 Google 返回的结果 Json 里,会以英文句点作为分隔符,每一句译文均作为数组的一项分开。所以最后得合并一下,成为一个段落。

完整代码

代码不长,全文黏贴如下。

GoogleTranslator.py:

import requests
import re
import json
import time
class GoogleTranslator ():
    _host = 'translate.google.cn'
    _headers = {
        'Host': _host,
        'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Mobile Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Content-Type': 'application/x-www-form-urlencoded;charset=utf-8',
        'Referer': 'https://' + _host,
        'Connection': 'keep-alive',
        'Cache-Control': 'max-age=0'
    }
    _language = {
        'afrikaans': 'af',
        'arabic': 'ar',
        'belarusian': 'be',
        'bulgarian': 'bg',
        'catalan': 'ca',
        'czech': 'cs',
        'welsh': 'cy',
        'danish': 'da',
        'german': 'de',
        'greek': 'el',
        'english': 'en',
        'esperanto': 'eo',
        'spanish': 'es',
        'estonian': 'et',
        'persian': 'fa',
        'finnish': 'fi',
        'french': 'fr',
        'irish': 'ga',
        'galician': 'gl',
        'hindi': 'hi',
        'croatian': 'hr',
        'hungarian': 'hu',
        'indonesian': 'id',
        'icelandic': 'is',
        'italian': 'it',
        'hebrew': 'iw',
        'japanese': 'ja',
        'korean': 'ko',
        'latin': 'la',
        'lithuanian': 'lt',
        'latvian': 'lv',
        'macedonian': 'mk',
        'malay': 'ms',
        'maltese': 'mt',
        'dutch': 'nl',
        'norwegian': 'no',
        'polish': 'pl',
        'portuguese': 'pt',
        'romanian': 'ro',
        'russian': 'ru',
        'slovak': 'sk',
        'slovenian': 'sl',
        'albanian': 'sq',
        'serbian': 'sr',
        'swedish': 'sv',
        'swahili': 'sw',
        'thai': 'th',
        'filipino': 'tl',
        'turkish': 'tr',
        'ukrainian': 'uk',
        'vietnamese': 'vi',
        'yiddish': 'yi',
        'chinese_simplified': 'zh-CN',
        'chinese_traditional': 'zh-TW',
        'auto': 'auto'
    }
    _url = 'https://' + _host + '/translate_a/single'
    _params = {
            'client': 'webapp',
            'sl': 'en',
            'tl': 'zh-CN',
            'hl': 'zh-CN',
            'dt': 'at',
            'dt': 'bd',
            'dt': 'ex',
            'dt': 'ld',
            'dt': 'md',
            'dt': 'qca',
            'dt': 'rw',
            'dt': 'rm',
            'dt': 'ss',
            'dt': 't',
            'otf': '1',
            'ssel': '0',
            'tsel': '0',
            'kc': '1'
    }
    __cookies = None
    __googleTokenKey = '376032.257956'
    __googleTokenKeyUpdataTime = 600.0
    __googleTokenKeyRetireTime = time.time() + 600.0
    def __init__(self, src = 'en', dest = 'zh-CN', tkkUpdataTime = 600.0):
        if src not in self._language and src not in self._language.values():
            src = 'auto'
        if dest not in self._language and dest not in self._language.values():
            dest = 'auto'
        self._params['sl'] = src
        self._params['tl'] = dest
        self.googleTokenKeyUpdataTime = tkkUpdataTime
        self.__updateGoogleTokenKey()
    def __updateGoogleTokenKey(self):
        self.__googleTokenKey = self.__getGoogleTokenKey()
        self.__googleTokenKeyRetireTime = time.time() + self.__googleTokenKeyUpdataTime
    def __getGoogleTokenKey(self):
        """Get the Google TKK from https://translate.google.cn"""
        # TKK example: '435075.3634891900'
        result = ''
        try:
            res = requests.get('https://' + self._host, timeout = 3)
            res.raise_for_status()
            self.__cookies = res.cookies
            result = re.search(r'tkk\:\'(\d+\.\d+)?\'', res.text).group(1)
        except requests.exceptions.ReadTimeout as ex:
            print('ERROR: ' + str(ex))
            time.sleep(1)
        return result
    def __getGoogleToken(self, a, TKK):
        """Calculate Google tk from TKK """
        # https://www.cnblogs.com/chicsky/p/7443830.html
        # if text = 'Tablet Developer' and TKK = '435102.3120524463', then tk = '315066.159012'
        def RL(a, b):
            for d in range(0, len(b)-2, 3):
                c = b[d + 2]
                c = ord(c[0]) - 87 if 'a' <= c else int(c)
                c = a >> c if '+' == b[d + 1] else a << c
                a = a + c & 4294967295 if '+' == b[d] else a ^ c
            return a
        g = []
        f = 0
        while f < len(a):
            c = ord(a[f])
            if 128 > c:
                g.append(c)
            else:
                if 2048 > c:
                    g.append((c >> 6) | 192)
                else:
                    if (55296 == (c & 64512)) and (f + 1 < len(a)) and (56320 == (ord(a[f+1]) & 64512)):
                        f += 1
                        c = 65536 + ((c & 1023) << 10) + (ord(a[f]) & 1023)
                        g.append((c >> 18) | 240)
                        g.append((c >> 12) & 63 | 128)
                    else:
                        g.append((c >> 12) | 224)
                        g.append((c >> 6) & 63 | 128)
                g.append((c & 63) | 128)
            f += 1
        e = TKK.split('.')
        h = int(e[0]) or 0
        t = h
        for item in g:
            t += item
            t = RL(t, '+-a^+6')
        t = RL(t, '+-3^+b+-f')
        t ^= int(e[1]) or 0
        if 0 > t:
            t = (t & 2147483647) + 2147483648
        result = t % 1000000
        return str(result) + '.' + str(result ^ h)
    def translate(self, text):
        if time.time() > self.__googleTokenKeyRetireTime:
            self.__updateGoogleTokenKey()
        data = {'q': text}
        self._params['tk'] = self.__getGoogleToken(text, self.__googleTokenKey)
        result = ''
        try:
            res = requests.post(self._url,
                            headers = self._headers,
                            cookies = self.__cookies,
                            data = data,
                            params = self._params,
                            timeout = 6)
            res.raise_for_status()
            jsonText = res.text
            if len(jsonText)>0:
                jsonResult = json.loads(jsonText)
                if len(jsonResult[0])>0:
                    for item in jsonResult[0]:
                        result += item[0]
            return result
        except Exception as ex:
            print('ERROR: ' + str(ex))
            return ''
import time
from GoogleTranslator import GoogleTranslator
def readFile(fileName):
    with open(fileName, 'r') as f:
        paragraph = ''
        for line in f:
            if line[0]!='\n':
                paragraph += line.strip('\n')
            else:
                if len(paragraph)>0:
                    yield paragraph
                    paragraph = ''
        if len(paragraph)>0:
            yield paragraph

main.py:

def main():
    translator = GoogleTranslator()
    count = 0
    with open('C:\\dx\\python\\d.txt', 'w', encoding='utf-8') as df:
        for line in readFile('C:\\dx\\python\\s.txt'):
            if len(line) > 1:
                count += 1
                print('\r' + str(count), end = '', flush = True)
                df.write(line.strip() + "\n")
                result = translator.translate(line)
                df.write(result.strip() + "\n\n")
if __name__ == "__main__":
    startTime = time.time()
    main()
    print()
    print('%.2f seconds' % (time.time() - startTime))

结束语

求人不如求己。不能怕烦,代码都是人敲出来的,找不到现成的还得靠自己编。

你可能感兴趣的:(Python)