PDF文档全文翻译，保留原有的页面布局

PDF文档全文翻译，保留原有的PDF页面布局

pymupdf
- pip install pymupdf
- https://pymupdf.readthedocs.io/en/latest/index.html
有道翻译 API / 百度翻译 API（均通过 requests 调用）
- pip install requests

1. 采用 pymupdf 识别 pdf 的图片和文字

import fitz
import re
from pprint import pprint

# Rebuild the source PDF page by page into a new document: images are copied
# to their original bounding boxes and every text block is replaced by its
# translation, written into the bounding box of the original block.
pdf_name = 'xxx-en.pdf'
print(f'Source pdf file: {pdf_name} \n')
src_pdf = fitz.open(pdf_name)
new_pdf = fitz.open()  # empty document that receives the rebuilt pages

for p, page in enumerate(src_pdf):
    print(f'\n- translating PAGE -{p}- ...')

    # 1.1 Create a new page with the same size as the source page.
    new_page = new_pdf.new_page(width=page.rect.width, height=page.rect.height)

    blocks = page.get_text('dict')['blocks']

    # 1.2 Images: blocks of type 1 are re-inserted at the same bounding box.
    for img in (b for b in blocks if b['type'] == 1):
        new_page.insert_image(img['bbox'], stream=img['image'])

    # 1.3 Text: every block that is not an image.
    for txt in (b for b in blocks if b['type'] != 1):
        # Spans of one line are concatenated as-is, but lines are separated
        # by a space so the last word of a line is not fused with the first
        # word of the next line before translation.
        text_tmp = ' '.join(
            ''.join(s['text'] for s in l['spans']) for l in txt['lines']
        )
        # Replace symbols, quotes and control whitespace with plain spaces.
        text_tmp = re.sub('[@#$%^&*\'\"\n\r\t]', ' ', text_tmp).strip()

        if not text_tmp:
            continue

        # Placeholder translation; swap in a real translator, e.g.
        # text_translate = youdao(text_tmp)
        text_translate = '中国 ' + text_tmp

        # insert_textbox writes nothing and returns a negative value when the
        # text does not fit the rectangle, so retry with smaller fonts
        # instead of silently dropping the block.
        for fontsize in (6, 5, 4, 3):
            rc = new_page.insert_textbox(txt['bbox'], text_translate,
                                         fontsize=fontsize,
                                         fontname='simhei',
                                         # NOTE: Windows-only font location.
                                         fontfile=r'C:\Windows\Fonts\simhei.ttf')
            if rc >= 0:
                break
        else:
            print(f'!!! text does not fit {txt["bbox"]}: {text_translate}')

# Derive the output name from the extension only; a plain str.replace would
# leave names such as 'X.PDF' unchanged and make save() target the source file.
if pdf_name.lower().endswith('.pdf'):
    new_name = pdf_name[:-4] + '-zh.pdf'
else:
    new_name = pdf_name + '-zh.pdf'
new_pdf.save(new_name)

print('\n------Done!-------')

2. 有道翻译

# %%
# %%
import requests
import json
import time


def youdao(en_txt=''):
    """Translate text with the Youdao web translate endpoint.

    Args:
        en_txt: Source text to translate.

    Returns:
        The translated text: the ``tgt`` fields of every segment joined
        together, with one output line per entry of ``translateResult``.

    Raises:
        KeyError: If the JSON response has no ``translateResult`` entry.
        requests.RequestException: On network failure or timeout.
    """
    api_url = 'http://fanyi.youdao.com/translate'
    # Pass the text through ``params`` so requests URL-encodes it; formatting
    # it straight into the URL breaks on '&', '#', '%', '+' and spaces.
    query = {'i': en_txt, 'doctype': 'json'}

    res = requests.get(api_url, params=query, timeout=10.0).json()
    # Pause after every request (presumably to stay under the service's
    # rate limit - confirm before lowering).
    time.sleep(3.0)

    # ``translateResult`` is a list of segment lists (presumably one per
    # input paragraph); read all of them rather than only the first.
    zh_txt = '\n'.join(
        ''.join(seq['tgt'] for seq in paragraph)
        for paragraph in res['translateResult']
    )

    print(f'*** {en_txt} \n--> {zh_txt}')

    return zh_txt


# Quick manual check: translate one sample sentence with the Youdao helper.
en_txt = (
    'so we beat on, boats against the current, '
    'borne back ceaselessly into the past.'
)
youdao(en_txt)

3. 百度翻译

# %%
import requests
import random
import json
from hashlib import md5
import time


# Baidu Translate API credentials (placeholder values - replace with your own).
# ref: https://api.fanyi.baidu.com/doc/
appid: str = "2222222222222222"
appkey: str = "ooooooooooooooooooo"

# Helper used to build the request signature.
def make_md5(s, encoding='utf-8'):
    """Return the hexadecimal MD5 digest of *s* encoded with *encoding*."""
    hasher = md5()
    hasher.update(s.encode(encoding))
    return hasher.hexdigest()


def baidu(en_txt=''):
    """Translate English text to Chinese with the Baidu Translate API.

    Args:
        en_txt: English source text.

    Returns:
        The translated text (the ``dst`` fields of ``trans_result``
        concatenated), or ``None`` when the response contains no
        ``trans_result``; in that case the raw response is printed.

    Raises:
        requests.RequestException: On network failure or timeout.
    """
    salt = random.randint(32768, 65536)
    # The signature is the MD5 of appid + query + salt + secret key.
    sign = make_md5(appid + en_txt + str(salt) + appkey)

    api_url = 'http://api.fanyi.baidu.com/api/trans/vip/translate'
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    payload = {'appid': appid, 'q': en_txt, 'from': 'en', 'to': 'zh', 'salt': salt, 'sign': sign}

    # Send the fields as a form-encoded POST body: this matches the declared
    # Content-Type and keeps long text blocks out of the URL.
    res = requests.post(api_url, data=payload, headers=headers, timeout=3.0).json()
    # Pause after every request (presumably to respect the API's rate
    # limit - confirm before lowering).
    time.sleep(3.0)

    if 'trans_result' in res:
        zh_txt = ''.join(seq['dst'] for seq in res['trans_result'])
        print(f'*** {en_txt} \n--> {zh_txt}')
        return zh_txt

    # Any response without a translation (e.g. one carrying 'error_code')
    # is reported instead of being dropped silently.
    print(f'*** {en_txt} \n??? {res}')
    return None


# Quick manual check: translate one sample sentence with the Baidu helper.
en_txt = (
    'so we beat on, boats against the current, '
    'borne back ceaselessly into the past.'
)
baidu(en_txt)

PDF文档全文翻译，保留原有的页面布局

1. 采用 pymupdf 识别 pdf 的图片和文字

2. 有道翻译

3. 百度翻译

你可能感兴趣的:(PDF文档全文翻译，保留原有的页面布局)