不是很精确,也有点慢,但是够用,胜在免费(free)。
https://huggingface.co/models
运行此段代码,执行翻译
一些简单的设置在这里控制
pdf2chines.py
import os
import cv2
import easyocr
from PIL import Image
from PIL import Image, ImageDraw, ImageFont
import rect_dealer
from img_text import ImgText
# Directory handed to easyocr.Reader as its model storage directory.
cut_model_path = r"F:\ocr\cut_model"
detect_model_jap_path = r"F:\ocr\meta_model\manga-ocr-base" # manga text detection model; performs poorly, worse than easyocr
trans_model_path = r"F:\ocr\meta_model\m2m100_1.2B" # Meta's translation model
# Directory where rendered PDF pages are written as PNG files.
pdf2png_save_path = r"F:\ocr\pdf2png"
# Directory containing the input PDFs (the output PDF is written here too).
pdf_path = r"F:\ocr\pdfs"
# Regions whose accumulated OCR confidence is below this value are not translated.
pass_point = 0.05
blank_png_path = r"F:\ocr\blank.png"
# Upper and lower bounds applied to the font size used when drawing translated text.
DEFUALT_FONT_SIZE = 60
MIN_FONT_SIZE = 20
height_sub = 0.1 # after a text box is located, shrink its height slightly so each text block can be found
# Tolerance ratios (of a text box's height / width) used when matching text boxes to regions.
include_height_sub = 0.3
include_width_sub = 0.3
# Text file listing the page images that have already been processed, one path per line.
finished_list = "finished_list.txt"
def generate_mask(png, graph_infos):
    """
    Build a mask image: a black canvas with one white box per OCR text box.

    :param png: path of the page image; it is blacked out completely, so only
        its size and mode carry over into the mask
    :param graph_infos: OCR results; ``info[0]`` of each entry holds the four
        corner points ordered top-left, top-right, bottom-right, bottom-left
    :return: path of the saved mask, ``"<png>_filled.png"``
    """
    mask_path = "{}_filled.png".format(png)
    with Image.open(png) as image:
        im_width, im_height = image.size
        fill_image = Image.new('RGBA', (im_width, im_height), (0, 0, 0))
        image.paste(fill_image, (0, 0))  # paint the whole page black
        for info in graph_infos:
            pos_info = info[0]
            left_up_point = pos_info[0]  # e.g. [939, 791]
            right_up_point = pos_info[1]  # e.g. [1007, 791]
            right_down_point = pos_info[2]  # e.g. [1007, 805]
            x = left_up_point[0]
            y = left_up_point[1]
            width = right_up_point[0] - left_up_point[0]
            height = right_down_point[1] - right_up_point[1]
            # Use the module-level setting instead of a duplicated literal so
            # that tuning ``height_sub`` actually affects the mask.
            height_sub_num = height * height_sub
            box_w = int(width)
            box_h = int(height - height_sub_num)
            if box_w <= 0 or box_h <= 0:
                # Degenerate box: nothing to draw.
                continue
            blank_png = Image.new('RGBA', (box_w, box_h), (255, 255, 255))
            # NOTE(review): the height reduction is added to the x coordinate
            # here, exactly as before; it looks like it was meant for y.
            # Left unchanged because downstream matching was tuned against it.
            image.paste(blank_png, (int(x + height_sub_num), int(y)))
        image.save(mask_path)
    return mask_path
def merge_neighbers(png, graph_infos):
    """
    Group neighbouring OCR lines into regions.

    A mask is generated from the OCR boxes, regions are extracted from that
    mask, and every OCR entry is then offered to the regions so its text and
    confidence can be attached to one of them.

    :param png: path of the page image
    :param graph_infos: OCR results; element 0 is the box, the last two
        elements are the text and its confidence
    :return: the regions returned by ``rect_dealer.getHoleRects``
    """
    mask_file = generate_mask(png, graph_infos)
    regions = rect_dealer.getHoleRects(mask_file)
    for entry in graph_infos:
        box, text, confidence = entry[0], entry[-2], entry[-1]
        detect_include(regions, box, text, confidence)
    return regions
def detect_include(rects, pos_info, words, acc, height_ratio=None, width_ratio=None):
    """
    Attach one OCR text box to the first region that contains it.

    The matching region gets ``words`` appended, its ``acc`` updated to the
    mean confidence of all lines attached so far, and ``line_num`` incremented.
    If no region matches, nothing is changed.

    :param rects: candidate regions exposing ``x``, ``y``, ``w``, ``h``,
        ``words``, ``acc`` and ``line_num``
    :param pos_info: four corner points ordered top-left, top-right,
        bottom-right, bottom-left
    :param words: recognised text of the box
    :param acc: recognition confidence of the box
    :param height_ratio: height tolerance ratio; defaults to ``include_height_sub``
    :param width_ratio: width tolerance ratio; defaults to ``include_width_sub``
    :return: None
    """
    if height_ratio is None:
        height_ratio = include_height_sub
    if width_ratio is None:
        width_ratio = include_width_sub

    left_up_point = pos_info[0]  # e.g. [939, 791]
    right_up_point = pos_info[1]  # e.g. [1007, 791]
    right_down_point = pos_info[2]  # e.g. [1007, 805]
    left, top = left_up_point[0], left_up_point[1]
    width = right_up_point[0] - left_up_point[0]
    height = right_down_point[1] - right_up_point[1]
    # Tolerances are capped so that large boxes do not get unlimited slack.
    height_sub_num = min(height * height_ratio, 20)
    width_sub_num = min(width_ratio * width, 10)

    for rect in rects:
        big_enough = width - width_sub_num < rect.w and height - height_sub_num < rect.h
        if not big_enough:
            continue
        print("minus:{},{}".format(rect, pos_info))
        # The box's top-left corner must fall inside the region (with slack on
        # the left/top). Previously only the left/top bound was checked, so a
        # box far to the right of or below a region was wrongly assigned to it.
        starts_inside = (rect.x - width_sub_num < left < rect.x + rect.w
                         and rect.y - height_sub_num < top < rect.y + rect.h)
        if not starts_inside:
            continue
        rect.words += words
        # True running mean; the previous ``(old + new) / 2`` halved the
        # confidence of the very first line because ``acc`` starts at 0.
        rect.acc = (rect.acc * rect.line_num + float(acc)) / (rect.line_num + 1)
        rect.line_num += 1  # one more text line in this region
        return
# EasyOCR readers keyed by language tuple, so models are loaded only once.
_OCR_READERS = {}


def change_graph2words(graph_path, languages):
    """
    Run OCR on an image and return the recognised text entries.

    The reader is cached per language combination; building a new
    ``easyocr.Reader`` for every page reloads the models each time.

    :param graph_path: path of the image to read
    :param languages: list of language codes passed to ``easyocr.Reader``
    :return: the result of ``reader.readtext(graph_path)``
    """
    cache_key = tuple(languages)
    reader = _OCR_READERS.get(cache_key)
    if reader is None:
        reader = easyocr.Reader(languages, model_storage_directory=cut_model_path, download_enabled=False, gpu=True)
        _OCR_READERS[cache_key] = reader
    return reader.readtext(graph_path)
# Translation pipelines keyed by model path, so the model is loaded only once.
_TRANSLATORS = {}


def words2chinese(words, from_lang, tgt_lang):
    """
    Translate text with the model stored at ``trans_model_path``.

    The pipeline is cached after the first call; this function is invoked
    once per text region, and re-creating the pipeline each time reloads the
    whole model.

    :param words: text to translate (a string or an iterable of strings,
        joined without separator)
    :param from_lang: source language code
    :param tgt_lang: target language code
    :return: the pipeline output (callers read ``"translation_text"`` from
        each item)
    """
    translator = _TRANSLATORS.get(trans_model_path)
    if translator is None:
        from transformers import pipeline
        translator = pipeline("translation", model=trans_model_path)
        _TRANSLATORS[trans_model_path] = translator
    to_trans = "".join(words)
    output = translator(to_trans, src_lang=from_lang, tgt_lang=tgt_lang)
    print("翻译原文:{}\n翻译结果:{}".format(to_trans, output))
    return output
def pdf2png(pdf_name):
    """
    Render every page of a PDF to ``<pdf2png_save_path>/<page index>.png``.

    :param pdf_name: path of the PDF file
    :return: list of the written PNG paths, in page order
    """
    import fitz

    # Make sure the output directory exists before saving into it.
    os.makedirs(pdf2png_save_path, exist_ok=True)

    rotate = int(0)
    # A zoom factor of 1.0 keeps the page's native resolution; raising both
    # factors to 2 would quadruple the pixel count.
    zoom_x = 1.0
    zoom_y = 1.0
    trans = fitz.Matrix(zoom_x, zoom_y).prerotate(rotate)

    png_paths = []
    # Open the PDF and make sure it is closed again even if rendering fails.
    doc = fitz.open('{}'.format(pdf_name))
    try:
        for pg in range(doc.page_count):
            page = doc[pg]
            pm = page.get_pixmap(matrix=trans, alpha=False)
            graph_path = os.path.join(pdf2png_save_path, '%s.png' % pg)
            pm.save(graph_path, output="png")
            png_paths.append(graph_path)
    finally:
        doc.close()
    return png_paths
def line_sep(sentense, line_num):
    """
    Break a sentence into at most ``line_num`` lines of near-equal length.

    Each line is stripped of surrounding whitespace and lines are joined with
    ``"\\n"``. The previous implementation sliced with ``line_num + sep`` as
    the end index instead of advancing a running offset, which produced
    uneven and empty lines, and it divided by zero for ``line_num == 0``.

    :param sentense: text to split
    :param line_num: desired maximum number of lines
    :return: the text with line breaks inserted; returned unchanged when it is
        empty or ``line_num`` is not positive
    """
    if line_num <= 0 or not sentense:
        return sentense
    # Ceiling division so the text never needs more than ``line_num`` lines.
    chunk = -(-len(sentense) // line_num)
    pieces = [sentense[start:start + chunk].strip()
              for start in range(0, len(sentense), chunk)]
    return "\n".join(pieces)
def draw_text(png, infos):
    """
    Overwrite each region of the page image with its (translated) text.

    Every region is first covered with a white rectangle, then its ``words``
    are rendered at the region's top-left corner. The font size is derived
    from the region's shorter side and line count, clamped between
    ``MIN_FONT_SIZE`` and ``DEFUALT_FONT_SIZE``. The image is saved back to
    the same path.

    :param png: path of the page image (read and overwritten)
    :param infos: regions exposing ``x``, ``y``, ``w``, ``h``, ``words`` and
        ``line_num``
    """
    page = Image.open(png)
    for region in infos:
        eraser = Image.new('RGBA', (region.w, region.h), (255, 255, 255))
        page.paste(eraser, (region.x, region.y))

        shorter_side = min(region.w, region.h)
        font_size = int(shorter_side / (1.5 * region.line_num))
        font_size = min(font_size, DEFUALT_FONT_SIZE)
        font_size = max(font_size, MIN_FONT_SIZE)

        renderer = ImgText(region.words, font_size, region.w)
        renderer.draw_text(page, region.x, region.y)
    page.save("{}".format(png))
def clear_png_files():
    """Placeholder for clearing previously rendered PNG files; currently does nothing."""
    return None
def translate_a_pdf(pdf_path, detectlang: list, translate_from_lang: str, translate_to_lang: str):
    """
    Render a PDF to page images, OCR them, translate the text and draw it back.

    Pages whose image path is already listed in the ``finished_list`` file are
    skipped; each processed page is appended to that file.

    :param pdf_path: path of the PDF to translate
    :param detectlang: language codes used for OCR
    :param translate_from_lang: source language code for translation
    :param translate_to_lang: target language code for translation
    """
    # A missing list simply means nothing has been processed yet.
    try:
        with open(finished_list, "r") as f:
            finished = {line.strip() for line in f}
    except FileNotFoundError:
        finished = set()

    clear_png_files()  # intended to empty the png folder first (currently a no-op)
    # NOTE(review): pdf2png re-renders every page to the same path, so a page
    # skipped below as "finished" has already been overwritten by a fresh,
    # untranslated render.
    png_paths = pdf2png(pdf_path)
    for png in png_paths:
        if png in finished:
            continue
        img_changes = []
        words_result = change_graph2words(png, detectlang)
        print("查找到的文本:{}".format(words_result))
        rects = merge_neighbers(png, words_result)  # merge lines into regions
        for rect in rects:
            if float(rect.acc) < pass_point:
                print("认为这个词正确度{}极低,不进行翻译:{}".format(rect.acc, rect.words))
                continue
            transed_words = words2chinese(rect.words, translate_from_lang, translate_to_lang)
            rect.words = "".join(trans["translation_text"] for trans in transed_words)
            print("存储位置:{}".format(str(rect)))
            img_changes.append(rect)  # remember the region to redraw
        # Draw once per page; the previous second identical call only repeated
        # the same erase-and-draw work.
        draw_text(png, img_changes)
        print("输出图片:{}".format(png))
        with open(finished_list, "a+") as f:
            f.write(png + "\n")
from PIL import Image
import os
def _page_sort_key(path):
    """Sort key that orders ``<number>.png`` files numerically, others by name after them."""
    stem = os.path.splitext(os.path.basename(path))[0]
    try:
        return (0, int(stem), path)
    except ValueError:
        return (1, 0, path)


def combine_imgs_pdf(folder_path, pdf_file_path):
    """
    Combine the images listed in the ``finished_list`` file into one PDF.

    After the PDF is written, the finished list is emptied.

    Args:
        folder_path (str): source folder; not used by the current
            implementation, the image paths come from the finished list
        pdf_file_path (str): output path of the PDF

    Raises:
        IndexError: if the finished list contains no image paths
    """
    with open(finished_list, "r") as f:
        # Drop blank lines so they are never handed to Image.open.
        png_list = [line.strip() for line in f if line.strip()]
    if not png_list:
        raise IndexError("no page images listed in {}".format(finished_list))

    # Plain string sorting would place "10.png" before "2.png" and scramble
    # the page order, so sort on the numeric page index instead.
    png_list.sort(key=_page_sort_key)

    pages = []
    try:
        for file in png_list:
            with Image.open(file) as raw:
                # Convert anything that is not already RGB; the previous
                # check was inverted and only "converted" RGB images.
                pages.append(raw.convert("RGB") if raw.mode != "RGB" else raw.copy())
        pages[0].save(pdf_file_path, "pdf", save_all=True, append_images=pages[1:])
    finally:
        for page in pages:
            page.close()

    # Reset the list only after the PDF has been written successfully.
    with open(finished_list, "w") as f:
        f.write("")
if __name__ == '__main__':
    # Script entry: translate one Japanese PDF to Chinese, then rebuild a PDF
    # from the processed page images.
    from_lang = ["ja", "en"]  # languages the OCR step should detect
    to_lang = ["zh"]
    pdf_name = "ポーズの定理_ダイジェスト.pdf"
    source_pdf = os.path.join(pdf_path, pdf_name)
    target_pdf = os.path.join(pdf_path, "changed_" + pdf_name)
    translate_a_pdf(source_pdf, from_lang, "ja", "zh")
    combine_imgs_pdf(pdf2png_save_path, target_pdf)
处理一下一些段落,按照段落去识别
rect_dealer.py
import math
import cv2
from PIL import Image
from PIL import Image, ImageDraw, ImageFont
# 定义一个边界表示
class Rec:
    """Axis-aligned rectangle plus the text data accumulated for it."""

    def __init__(self, x, y, w, h):
        self.x, self.y = x, y
        self.w, self.h = w, h
        self.words = ""  # text collected for this rectangle
        self.acc = 0  # accumulated recognition confidence
        self.line_num = 0  # number of text lines attached

    def _describe(self):
        # Shared by __str__ and __repr__, which render identically.
        return "x:{} y:{} w:{} h:{} words: {}".format(
            str(self.x), str(self.y), str(self.w), str(self.h), str(self.words))

    def __str__(self):
        return self._describe()

    def __repr__(self):
        return self._describe()
def include_other_recs(rec_in: Rec, recs):
    """
    Tell whether ``rec_in`` encloses any other rectangle in ``recs``.

    A rectangle counts as enclosed when it lies within ``rec_in`` horizontally
    and vertically, with at least 5 units to spare at the bottom edge.

    :param rec_in: the rectangle being examined
    :param recs: rectangles to compare against (``rec_in`` itself is ignored)
    :return: True if at least one other rectangle is enclosed, else False
    """
    outer_right = rec_in.x + rec_in.w
    outer_bottom = rec_in.y + rec_in.h
    return any(
        rec_in != other
        and rec_in.x <= other.x
        and outer_right >= other.x + other.w
        and rec_in.y <= other.y
        and outer_bottom >= other.y + other.h + 5
        for other in recs
    )
def hole_select(recs):
    """Return the rectangles of ``recs`` that do not enclose any other rectangle."""
    return [candidate for candidate in recs if not include_other_recs(candidate, recs)]
class detectWords(object):
    """Dilate an image so that nearby bright areas merge into blobs."""

    def __init__(self, src_img, width_max_scale=15, height_max_scale=15):
        """
        :param src_img: image array; ``shape`` must have 2 (grayscale) or 3
            (colour, converted with BGR2GRAY) dimensions
        :param width_max_scale: width of the dilation kernel
        :param height_max_scale: height of the dilation kernel
        """
        self.src_img = src_img
        self.width_scale = width_max_scale
        self.height_scale = height_max_scale

    def run(self):
        """
        Convert the source image to grayscale if needed and dilate it once.

        :return: the dilated grayscale image
        :raises ValueError: if the image is neither 2- nor 3-dimensional
        """
        dims = len(self.src_img.shape)
        if dims == 2:  # already grayscale
            gray_img = self.src_img
        elif dims == 3:
            gray_img = cv2.cvtColor(self.src_img, cv2.COLOR_BGR2GRAY)
        else:
            # Previously this fell through and failed later on an unbound name.
            raise ValueError("unsupported image shape: {}".format(self.src_img.shape))
        # ksize is (width, height); the two values used to be passed swapped,
        # which only went unnoticed because both default to 15.
        dilated_kernel = cv2.getStructuringElement(
            cv2.MORPH_ELLIPSE, (self.width_scale, self.height_scale))
        # The third positional argument of cv2.dilate is ``dst``, not
        # ``iterations``, so the former literal 10 never requested ten passes.
        # A single pass is kept, which is what the pipeline has been running.
        dilated = cv2.dilate(gray_img.copy(), dilated_kernel)
        return dilated
# Find candidate regions: returns the bounding rectangles of all sufficiently
# large contours (the rectangles may overlap or contain one another).
def region_hole(image):
    """
    Extract bounding rectangles of contours found in ``image``.

    A preview with the contours drawn is written to ``region_table.png``.
    Contours with an area below 150 are treated as noise and skipped.

    :param image: single-channel image handed to ``cv2.findContours``
    :return: list of ``Rec`` bounding rectangles
    """
    found, _hierarchy = cv2.findContours(image, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    preview = cv2.drawContours(image.copy(), found, -1, (153, 153, 0), 2, maxLevel=2)
    cv2.imwrite("region_table.png", preview)

    boxes = []
    for outline in found:
        if cv2.contourArea(outline) < 150:
            # Too small to be a real region; ignore as stray marks.
            continue
        simplified = cv2.approxPolyDP(outline, 3, True)  # simplify the outline
        x, y, width, height = cv2.boundingRect(simplified)
        boxes.append(Rec(x, y, width, height))
    return boxes
def draw_rects(png, recs):
    """
    Save a black image with every rectangle of ``recs`` filled in white.

    :param png: path of an image that supplies the canvas; the result is
        written to ``"<png>_filled.png"``
    :param recs: rectangles exposing ``x``, ``y``, ``w`` and ``h``
    """
    canvas = Image.open(png)
    backdrop = Image.new('RGBA', canvas.size, (0, 0, 0))
    canvas.paste(backdrop, (0, 0))  # black out the whole canvas
    for region in recs:
        patch = Image.new('RGBA', (region.w, region.h), (255, 255, 255))
        canvas.paste(patch, (region.x, region.y))
    canvas.save("{}_filled.png".format(png))
def getHoleRects(png_path):
    """
    Detect the text regions in a mask image.

    The image is dilated slightly, contour bounding rectangles are collected,
    and rectangles that enclose other rectangles are discarded. The selected
    rectangles are also drawn to a debug image via ``draw_rects``.

    :param png_path: path of the mask image
    :return: list of selected ``Rec`` rectangles
    :raises FileNotFoundError: if the image cannot be read
    """
    origin_image = cv2.imread(png_path)
    if origin_image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise FileNotFoundError("cv2.imread could not read image: {}".format(png_path))
    h_dilated_img = detectWords(origin_image).run()  # dilate slightly
    recs = region_hole(h_dilated_img)  # candidate regions
    results = hole_select(recs)  # keep regions that enclose no other region
    draw_rects('region_table.png', results)
    return results
if __name__ == '__main__':
    # Manual run against a single, previously generated mask image.
    getHoleRects(r'F:\ocr\pdf2png\1.png_filled.png')
img_text.py(这段代码抄自网上,实现了图片文本自动换行的效果)
from PIL import Image, ImageDraw, ImageFont
class ImgText:
    """Wrap text to a pixel width and draw it onto an image."""

    def __init__(self, text, font_size, width):
        """
        :param text: text to render; ``"\\n"`` separates paragraphs
        :param font_size: font size passed to ``ImageFont.truetype``
        :param width: maximum line width in pixels
        """
        self.font = ImageFont.truetype(r'C:\Windows\Fonts\simhei.ttf', font_size)
        # Target width; lines are wrapped once they exceed it.
        self.width = width
        # Raw text.
        self.text = text
        # Wrapped paragraphs, total height, height of a single line.
        self.duanluo, self.note_height, self.line_height = self.split_text()

    def _char_size(self, char):
        """Return ``(width, height)`` of one character, measured from the drawing origin."""
        if hasattr(self.font, "getbbox"):
            # ``textsize``/``getsize`` were removed in Pillow 10. The right and
            # bottom edges of the bounding box are used so the result matches
            # what those removed calls reported.
            _, _, right, bottom = self.font.getbbox(char)
            return right, bottom
        return self.font.getsize(char)  # older Pillow without getbbox

    def get_duanluo(self, text):
        """
        Wrap one paragraph to ``self.width``.

        :param text: paragraph without line breaks
        :return: ``(wrapped_text, line_height, line_count)``; the wrapped text
            always ends with ``"\\n"``
        """
        # Wrapped paragraph text.
        duanluo = ""
        # Width used on the current line.
        sum_width = 0
        # Number of lines.
        line_count = 1
        # Tallest character seen.
        line_height = 0
        for char in text:
            width, height = self._char_size(char)
            sum_width += width
            if sum_width > self.width:  # line is full: start a new one
                line_count += 1
                # The character that triggered the wrap opens the new line, so
                # its width must count there; resetting to 0 let every wrapped
                # line run one character too long.
                sum_width = width
                duanluo += '\n'
            duanluo += char
            line_height = max(height, line_height)
        if not duanluo.endswith('\n'):
            duanluo += '\n'
        return duanluo, line_height, line_count

    def split_text(self):
        """
        Wrap every paragraph of ``self.text``.

        :return: ``(paragraphs, total_height, line_height)`` where
            ``paragraphs`` is a list of ``(wrapped_text, line_count)`` tuples
        """
        max_line_height, total_lines = 0, 0
        allText = []
        for text in self.text.split('\n'):
            duanluo, line_height, line_count = self.get_duanluo(text)
            max_line_height = max(line_height, max_line_height)
            total_lines += line_count
            allText.append((duanluo, line_count))
        line_height = max_line_height
        total_height = total_lines * line_height
        return allText, total_height, line_height

    def draw_text(self, note_img, x, y):
        """
        Draw the wrapped text in red onto ``note_img`` starting at ``(x, y)``.

        The image is also saved to ``result.png`` after drawing.

        :param note_img: image to draw on (modified in place)
        :param x: left coordinate of the text
        :param y: top coordinate of the first line
        """
        draw = ImageDraw.Draw(note_img)
        # Start at the top-left corner and move down paragraph by paragraph.
        for duanluo, line_count in self.duanluo:
            draw.text((x, y), duanluo, fill=(255, 0, 0), font=self.font)
            y += self.line_height * line_count
        note_img.save("result.png")
1.先用easyocr识别文本,easyocr需要下载easyocr的模型,放在cut_model文件夹里
下载地址:https://www.jaided.ai/easyocr/modelhub/ (可能需要科学上网)
2.在这里可以控制easyocr识别的文本语言:
我这里输入ja、en,代表日语(japanese)和英语(english),所以会从图片中检测出日语和英语的文本
3.简单地处理一下块,把一个段落的文本,合并起来
4.输入到翻译模型中,这里可以是任何模型,我试过下面几个模型
绿色框住的是好,其他的由于各种原因,比如太慢、比如性能太差,被我残忍抛弃,
(ps:opus-mt-XX的模型是真的好用,又小又准确,但是它!没有ja-zh,所以……好气!)
例如:m2m100_418M,这个模型在:https://toscode.gitee.com/mirrors_UKPLab/EasyNMT 可以看到,
它的节点和大小没有m2m100_1.2B多,我下载了试了试,真的不能用
这俩的翻译对比:左边m2m100_418M,右边m2m100_1.2B
性能差了很多,而且会出现奇怪的表现,速度也没有快多少。
模型排行榜:
(排行靠前的一大堆,没一个开源的,我只能说,感谢meta,小扎还是良心企业嗷)
网易有道词典小语种翻译实现思路
网易有道的小语种翻译真的很牛,微信在它面前被揍得像个弟弟,可惜模型都不公开,毕竟都是核心资源……
其他语种模型可以去下面的笑脸中心找,很牛的企业,可能需要科学上网,模型太大的话可以用迅雷下载器(或者用别的下载器),
下载器下载能快许多:
也可以用讯飞的api直接就翻译日语了
https://www.xfyun.cn/services/xftrans
给的200万字免费调用,够用一段时间了
1.速度很慢:慢的我有点受不了了
2.正确率还不够好(虽然也不太差了):
—————————————————————————————
后来换了讯飞的接口试了下,也不怎么样(调用接口还很麻烦)
讯飞翻译:
唯一好使的只有有道图片翻译,感觉错误率明显低;而且提供了任意体验的服务,真的很好,如果不是想一键pdf2pdf,那么用有道去翻译一下也可以。