Image to Text
Code 1 (file: run_batch.py)
Batch-reads all multi-page PDF files and JPG images under several directories, runs OCR on each page, and saves the results into one JSON file per directory. Dependencies: the tencentcloud-sdk-python and PyMuPDF packages.
import os
import json
import base64
import re

from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.ocr.v20181119 import ocr_client, models
import fitz  # PyMuPDF

secret_id = ""   # Tencent Cloud OCR credentials; the service includes 1,000 free calls after sign-up
secret_key = ""
def get_imges(pdf_path):
    """Yield base64-encoded page images: one per PDF page, or the image file itself."""
    if pdf_path.endswith('pdf'):
        pdf_document = fitz.open(pdf_path)
        page_count = pdf_document.page_count
        for page_num in range(page_count):
            page = pdf_document[page_num]
            # Render the page to PNG bytes (jpg_quality only applies to JPEG output)
            pix = page.get_pixmap().tobytes('png')
            yield base64.b64encode(pix).decode()
        pdf_document.close()
        print("page_count", page_count)
    else:
        with open(pdf_path, 'rb') as i_file:
            yield base64.b64encode(i_file.read()).decode()
def make_api_call(jpg_fpath, jsn_fpath, json_data):
    """Send each page image to Tencent Cloud OCR and append the detections to json_data."""
    for base64_str in get_imges(jpg_fpath):
        try:
            cred = credential.Credential(secret_id, secret_key)
            httpProfile = HttpProfile()
            httpProfile.endpoint = "ocr.tencentcloudapi.com"
            clientProfile = ClientProfile()
            clientProfile.httpProfile = httpProfile
            client = ocr_client.OcrClient(cred, "ap-guangzhou", clientProfile)
            # The request type must match the API method called below (GeneralBasicOCR)
            req = models.GeneralBasicOCRRequest()
            params = {
                'LanguageType': 'zh',
                # Pages are rasterized to PNG before upload, so the payload is an image, not a PDF
                'IsPdf': False,
                'ImageBase64': f'data:image/png;base64,{base64_str}',
            }
            req.from_json_string(json.dumps(params))
            resp = client.GeneralBasicOCR(req)
            res = json.loads(resp.to_json_string()).get('TextDetections')
            print("res length:", len(res))
            json_data['TextDetections'].append(res)
        except TencentCloudSDKException as err:
            print(err)
    if json_data:
        print("len(json_data['TextDetections']):", len(json_data['TextDetections']))
        os.makedirs(os.path.dirname(os.path.realpath(jsn_fpath)), exist_ok=True)
        with open(jsn_fpath, 'w', encoding='UTF-8') as o_file:
            o_file.write(json.dumps(json_data, ensure_ascii=False))
def ocr_api_json(jpg_dpath):
    """OCR every image file under raw.images/<jpg_dpath> and save one JSON per directory."""
    curr_dir = os.path.dirname(os.path.realpath(__file__))
    image_path = os.path.join(curr_dir, 'raw.images', jpg_dpath)
    jsn_dpath = os.path.join(curr_dir, 'api.results')
    print("jpg_dpath", jpg_dpath)
    regex = re.compile(r'[0-9]+')
    json_data = {"TextDetections": []}
    files = os.listdir(image_path)
    # Sort by the first number in each file name so pages stay in order
    files.sort(key=lambda x: int(re.search(regex, x).group()))
    for jpg_fname in files:
        jpg_fpath = os.path.join(image_path, jpg_fname)
        print("jpg_fpath", jpg_fpath)
        type_ = jpg_fpath.split('.')[-1]
        print("type_", type_)
        jsn_fpath = os.path.join(jsn_dpath, f'{jpg_dpath}.json')
        print("jsn_fpath", jsn_fpath)
        make_api_call(jpg_fpath, jsn_fpath, json_data)

if __name__ == '__main__':
    jpg_dpath = '19岁的矜持女孩'
    ocr_api_json(jpg_dpath)
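The resulting JSON holds one list per page under the TextDetections key, and each hit carries DetectedText, Confidence, and a Polygon of {X, Y} corner points; these are exactly the fields the main entry below consumes. A minimal inspection sketch, assuming the example directory name used above:

import json

with open('api.results/19岁的矜持女孩.json', encoding='UTF-8') as f:
    data = json.load(f)

for page_no, hits in enumerate(data['TextDetections']):
    for hit in hits:
        # Each hit holds the recognized text, a confidence score, and its bounding polygon
        print(page_no, hit['Confidence'], hit['DetectedText'])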
Code 2 (file: get_text_boxes.py)
Image processing and contour detection
This code extracts the main contours from an image through operations such as erosion and dilation; a small warm-up sketch follows before the file itself.
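As a quick illustration of the erode-then-dilate (morphological opening) idea that dilate_line below is built on, here is a minimal, self-contained sketch; the synthetic image and the kernel width are illustrative assumptions:

import cv2
import numpy as np

# Synthetic binary image: one long horizontal line plus an isolated noise pixel
binary = np.zeros((40, 200), dtype=np.uint8)
binary[20, 10:190] = 255   # the line we want to keep
binary[5, 50] = 255        # the speck we want to remove

# A wide, flat kernel survives erosion only where long horizontal runs exist
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 1))
eroded = cv2.erode(binary, kernel, iterations=1)    # the speck disappears, the line shrinks
dilated = cv2.dilate(eroded, kernel, iterations=1)  # the line grows back to roughly full length

print('noise pixels left:', int((dilated[5] > 0).sum()))   # 0
print('line pixels left:', int((dilated[20] > 0).sum()))   # most of the original 180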
import numpy as np
import cv2

def dilate_line(binary, type='vertical', x_scale=10, y_scale=5):
    '''
    Erode and then dilate the binary image with a line-shaped kernel,
    keeping only vertical/horizontal structures.
    '''
    rows_z, cols_z = binary.shape
    if type == 'horizontal':
        size = (cols_z // x_scale, 1)
    else:
        size = (1, rows_z // y_scale)
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, size)
    eroded = cv2.erode(binary, kernel, iterations=1)    # erosion
    dilated = cv2.dilate(eroded, kernel, iterations=1)  # dilation
    return dilated
def get_contours(image):
    srcPic = image
    # Replace black pixels with white so dark UI elements do not produce contours
    hsv = cv2.cvtColor(srcPic, cv2.COLOR_BGR2HSV)
    # Lower and upper bounds for black in HSV
    lower_black = np.array([0, 0, 0], dtype=np.uint8)
    upper_black = np.array([180, 255, 86], dtype=np.uint8)
    # Binary mask of black pixels
    black_mask = cv2.inRange(hsv, lower_black, upper_black)
    srcPic[black_mask > 0] = [255, 255, 255]
    gray = cv2.cvtColor(srcPic, cv2.COLOR_BGR2GRAY)
    # Adaptive binarization
    binPic = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2)
    kernel = np.ones((6, 6), np.uint8)
    binPic = cv2.morphologyEx(binPic, cv2.MORPH_OPEN, kernel, iterations=3)
    # Paint a dashed pattern along the right edge so bubbles touching it still close into contours
    border_size = 12
    pattern = np.array([0, 255] * (border_size // 2), dtype=np.uint8)
    binPic[:, -border_size:] = pattern
    binPic = dilate_line(binPic, 'horizontal', 120, 900)
    # Median filtering
    median = cv2.medianBlur(binPic, 5)
    # Edge detection
    cannyPic = cv2.Canny(median, 10, 200)
    # Find the outer contours
    contours, hierarchy = cv2.findContours(cannyPic, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    # Sort contours by area (largest first) and drop the small ones
    contours = sorted(contours, key=cv2.contourArea, reverse=True)
    min_contour_area = 500  # area threshold
    contours = [cnt for cnt in contours if cv2.contourArea(cnt) > min_contour_area]
    # Draw the contours and return their bounding-rectangle coordinates
    rectangles = []
    for i in range(min(26, len(contours))):
        x, y, w, h = cv2.boundingRect(contours[i])
        cv2.rectangle(srcPic, (x, y), (x + w, y + h), (0, 255, 0), 2)
        rectangles.append(((x, y), (x + w, y + h)))
        roi = srcPic[y:y + h, x:x + w]
        # (optional) preview roi or srcPic with cv2.imshow / cv2.waitKey when debugging
    return rectangles
def get_color(img):
    """Classify a cropped region as the right (green) or left (white/gray) bubble."""
    if img.shape[0] == 0 or img.shape[1] == 0:
        return "UNK"
    # Convert to HSV, which makes color ranges easier to separate
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    average_hue = np.median(hsv[:, :, 0])         # hue
    average_saturation = np.median(hsv[:, :, 1])  # saturation
    average_value = np.median(hsv[:, :, 2])       # value (brightness)
    print('hsv medians:', average_hue, average_saturation, average_value)
    green_min = [35, 43, 46]
    green_max = [77, 255, 255]
    back_min = [0, 0, 40]      # gray background
    back_max = [180, 43, 220]
    white_min = [0, 0, 239]
    white_max = [180, 30, 255]
    if green_min[0] <= average_hue <= green_max[0] and green_min[1] <= average_saturation <= green_max[1] \
            and green_min[2] <= average_value <= green_max[2]:
        print("color is green")
        return 'RIGHT'
    elif white_min[0] <= average_hue <= white_max[0] and white_min[1] <= average_saturation <= white_max[1] \
            and white_min[2] <= average_value <= white_max[2]:
        print("color is white")
        return "LEFT"
    elif back_min[0] <= average_hue <= back_max[0] and back_min[1] <= average_saturation <= back_max[1] \
            and back_min[2] <= average_value <= back_max[2]:
        print("color is gray")
        return "LEFT"
    else:
        print("not white, gray, or green:", average_hue, average_saturation, average_value)
        return "UNK"
if __name__ == "__main__":
    img = cv2.imread('123.jpg')
    rectangles = get_contours(img)
    # get_color can then be applied to a cropped rectangle to judge which side it belongs to
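To see how the two functions work together, here is a small, hypothetical usage sketch that crops each detected rectangle and classifies its bubble color; screenshot.jpg is a placeholder file name:

import cv2
from get_text_boxes import get_contours, get_color

img = cv2.imread('screenshot.jpg')   # placeholder path: a chat screenshot
clean = img.copy()                   # keep an unmarked copy; get_contours draws on its input
rectangles = get_contours(img)

for (x1, y1), (x2, y2) in rectangles:
    roi = clean[y1:y2, x1:x2]        # NumPy indexing is [row (y), column (x)]
    side = get_color(roi)            # 'RIGHT' for green, 'LEFT' for white/gray, else 'UNK'
    print(side, (x1, y1), (x2, y2))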
Code 3 (main entry)
This code turns the raw OCR text into a dialogue: using the color and position information, it decides whether each piece of text sits on the left or the right side of the chat.
import os
import re
import json
import fitz
import cv2
import numpy as np
from run_batch import ocr_api_json
from get_text_boxes import get_contours, get_color

CONF_THRES = 95  # OCR confidence threshold
ptp_max = 500    # line heights above this are treated as watermarks
ptp_min = 16     # line heights at or below this are UI chrome such as the clock or battery level
def get_pdf_page_count(pdf_paths):
    """Yield raw image bytes for each page of each PDF, or for each image file."""
    for pdf_path in pdf_paths:
        print("pdf_path:", pdf_path)
        if pdf_path.endswith('pdf'):
            pdf_document = fitz.open(pdf_path)
            for page_num in range(pdf_document.page_count):
                page = pdf_document[page_num]
                # Render the page to PNG bytes (jpg_quality only applies to JPEG output)
                yield page.get_pixmap().tobytes('png')
            pdf_document.close()
        else:
            with open(pdf_path, 'rb') as i_file:
                yield i_file.read()
def get_bbox_bounds(bbox):
    x_min = min([x['X'] for x in bbox])
    x_max = max([x['X'] for x in bbox])
    y_min = min([x['Y'] for x in bbox])
    y_max = max([x['Y'] for x in bbox])
    return x_min, x_max, y_min, y_max
def judge_side(img, bbox, rectangles, detectedtext):
    """Judge the left/right side based on the occurrence of white/green pixels."""
    x_min, x_max, y_min, y_max = get_bbox_bounds(bbox)
    side = None
    # Check whether the text box falls inside one of the bubble rectangles (with 50 px slack)
    for rect in rectangles:
        if rect[0][0] - 50 <= x_min <= rect[1][0] + 50 and rect[0][1] - 50 <= y_min <= rect[1][1] + 50 \
                and rect[0][0] - 50 <= x_max <= rect[1][0] + 50 and rect[0][1] - 50 <= y_max <= rect[1][1] + 50:
            side = rect
    if side or (detectedtext and len(detectedtext) >= 3):
        # NumPy slicing is [row (y), column (x)]; clamp to 0 so the slice cannot wrap around
        side = get_color(img[max(0, y_min - 60):y_max + 60, max(0, x_min - 60):x_max + 60])
    return side
def generate_page(jpg_fpath):
    return get_pdf_page_count(jpg_fpath)

def extract_conv(jpg_fnames, jsn_fpath):
    """Extract conversations from OCR raw results."""
    print(jpg_fnames)
    gen = generate_page(jpg_fnames)
    with open(jsn_fpath, 'r', encoding='UTF-8') as i_file:
        jsn_data = json.load(i_file)
    # Filter out irrelevant hits: clock text such as 12:30, the carrier name, and the input box
    conv_list = []
    regex = re.compile(r'[0-9]+:[0-9]+$|中国移动|输入聊天')
    print(len(jsn_data['TextDetections']), '-------------------')
    for hits in jsn_data['TextDetections']:
        pix = next(gen)
        pix = np.frombuffer(pix, np.uint8)
        img = cv2.imdecode(pix, cv2.IMREAD_COLOR)
        rectangles = get_contours(img)
        for hit in hits:
            points = hit.get('Polygon')  # corner points of the detected sentence
            text = hit["DetectedText"]
            # The peak-to-peak range of the Y coordinates approximates the line height:
            # normal horizontal text is about one character tall, watermarks are abnormally tall
            ptp = np.ptp(np.array([p.get('Y') for p in points]))
            # Filter out abnormal heights, abnormally small fonts, and clock/carrier text
            if hit['Confidence'] <= CONF_THRES or ptp > ptp_max or ptp <= ptp_min or re.search(regex, text):
                print("pass -----")
                continue
            print("text:", text)
            side = judge_side(img, hit['Polygon'], rectangles, hit["DetectedText"])
            if side == 'UNK' or not side:
                continue
            conv_list.append(f'{side}: {text}')
    return conv_list
def main_start(jpg_dpath):
    """Main entry."""
    curr_dir = os.path.dirname(os.path.realpath(__file__))
    image_path = os.path.join(curr_dir, 'raw.images', jpg_dpath)
    jsn_dpath = os.path.join(curr_dir, 'api.results')
    txt_fpath = os.path.join(jsn_dpath, f'{jpg_dpath}.txt')
    # Extract conversations from the raw OCR results
    conv_list = []
    regex = re.compile(r'[0-9]+')
    jpg_fnames = list(os.listdir(image_path))
    jpg_fnames.sort(key=lambda x: int(re.search(regex, x).group()))
    jpg_fnames = [os.path.join(curr_dir, 'raw.images', jpg_dpath, i) for i in jpg_fnames]
    jsn_fpath = os.path.join(jsn_dpath, f'{jpg_dpath}.json')
    print("jsn_fpath:", jsn_fpath)
    conv_list_addi = extract_conv(jpg_fnames, jsn_fpath)
    conv_list.extend(conv_list_addi)
    print('Output path:', txt_fpath)
    with open(txt_fpath, 'w', encoding='UTF-8') as o_file:
        o_file.write('\n'.join(conv_list) + '\n')
if __name__ == '__main__':
    curr_dir = os.path.dirname(os.path.realpath(__file__))
    paths = os.listdir(os.path.join(curr_dir, 'raw.images'))  # each subdirectory holds one chat's screenshots
    for jpg_dpath in paths:
        print(jpg_dpath)
        ocr_api_json(jpg_dpath)
        main_start(jpg_dpath)
    # single-directory test:
    # jpg_dpath = ' 女生主动买好小雨伞'
    # ocr_api_json(jpg_dpath)
    # main_start(jpg_dpath)
A detailed walkthrough of the code is available in my other blog post: https://blog.csdn.net/qq_20163065/article/details/135048872
Running Code 3 writes the result to a txt file, for example:
LEFT marks text from the left-side speaker and RIGHT marks text from the right-side speaker.
LEFT: 我通过了你的朋友验证请求,现在
LEFT: 我们可以开始聊天了
RIGHT: 你好呀
LEFT: 你好呀,
LEFT: 朋友
LEFT: 备注祁晶
RIGHT: 好嘞
RIGHT: 我叫刘晓宇
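Since every line of the txt file follows the fixed SIDE: text pattern, it is easy to load the result back into structured turns. A minimal sketch, assuming the example file name from above:

# Parse the LEFT:/RIGHT: lines back into (speaker, text) pairs
turns = []
with open('api.results/19岁的矜持女孩.txt', encoding='UTF-8') as f:
    for line in f:
        side, _, text = line.partition(': ')
        if side in ('LEFT', 'RIGHT') and text:
            turns.append((side, text.strip()))

print(turns[:3])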
Thank you all for reading! I hope this walkthrough gave you some useful knowledge and inspiration. If you have any questions along the way about AI virtual characters, image recognition, or other technical topics, feel free to ask at any time.
So far I have collected chat corpora covering over ten thousand characters this way; interested readers are welcome to message me.