如何增加AI虚拟聊天伴侣趣味性——OpenCV识别大量真实聊天图片采集高质量语料

图像转文字

代码一:(文件名:run_batch.py)

批量读取多个目录下的所有pdf多分页图片和jpg图片,并实现OCR识别图片文字,分别保存到多个json文件中

import os
import json
import base64, re
from tqdm import tqdm
from tencentcloud.common import credential
from tencentcloud.common.profile.client_profile import ClientProfile
from tencentcloud.common.profile.http_profile import HttpProfile
from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
from tencentcloud.ocr.v20181119 import ocr_client, models
import fitz  # PyMuPDF
import numpy as np

# Tencent Cloud OCR service credentials (the original author notes that the
# service can be used 1000 times for free once enabled).
# They are read from the environment so that real keys never have to be
# written into the script; the empty-string fallback keeps the module
# importable when the variables are not set.
secret_id = os.environ.get("TENCENTCLOUD_SECRET_ID", "")
secret_key = os.environ.get("TENCENTCLOUD_SECRET_KEY", "")

def get_imges(pdf_path):
    """Yield the content of ``pdf_path`` as base64-encoded strings.

    A path ending in ``pdf`` is opened with PyMuPDF and every page is
    rendered to PNG bytes and yielded, in page order, as base64 text.
    Any other path is read as raw bytes and yielded as one base64 string.

    Args:
        pdf_path: Path to a PDF file or to an image file.

    Yields:
        str: base64 text of one rendered page (PDF) or of the whole file.

    Returns:
        For PDFs, the page count (only observable through
        ``StopIteration.value``); nothing for other files.
    """
    if not pdf_path.endswith('pdf'):
        # Read first, then yield, so the file handle is not kept open while
        # the consumer is working on the data.
        with open(pdf_path, 'rb') as i_file:
            raw_bytes = i_file.read()
        yield base64.b64encode(raw_bytes).decode()
        return

    pdf_document = fitz.open(pdf_path)
    try:
        page_count = pdf_document.page_count
        for page_num in range(page_count):
            # The output format is PNG, so no JPEG quality value is passed.
            png_bytes = pdf_document[page_num].get_pixmap().tobytes('png')
            yield base64.b64encode(png_bytes).decode()
    finally:
        # Runs even when the consumer stops iterating early or rendering
        # raises, so the document is always released.
        pdf_document.close()
    print("page_count",page_count)
    return page_count
        


def _build_ocr_client():
    """Create the Tencent Cloud OCR client used by ``make_api_call``."""
    cred = credential.Credential(secret_id, secret_key)
    http_profile = HttpProfile()
    http_profile.endpoint = "ocr.tencentcloudapi.com"
    client_profile = ClientProfile()
    client_profile.httpProfile = http_profile
    return ocr_client.OcrClient(cred, "ap-guangzhou", client_profile)


def make_api_call(jpg_fpath, jsn_fpath, json_data):
    """OCR every page of ``jpg_fpath`` and dump the accumulated results.

    Each page produced by ``get_imges(jpg_fpath)`` is sent to the
    ``GeneralBasicOCR`` action. Exactly one entry per page is appended to
    ``json_data['TextDetections']``: the list of detections on success, or
    an empty list when the call fails or the response has no detections.
    Keeping one entry per page lets consumers pair results with pages by
    position. Afterwards ``json_data`` is written to ``jsn_fpath``.

    Args:
        jpg_fpath: Path of the PDF/image file to recognise.
        jsn_fpath: Path of the JSON file to (over)write.
        json_data: Dict with a ``'TextDetections'`` list; mutated in place.
    """
    try:
        # One client is enough for all pages of the file.
        client = _build_ocr_client()
    except TencentCloudSDKException as err:
        print(err)
        client = None

    for base64_str in get_imges(jpg_fpath):
        page_hits = []
        if client is not None:
            try:
                # The request model has to match the action called below.
                req = models.GeneralBasicOCRRequest()
                params = {
                    'LanguageType': 'zh',
                    'IsPdf': True,
                    "PdfPageNumber": 5,
                    'ImageBase64': f'data:image/jpeg;base64,{base64_str}',
                }
                req.from_json_string(json.dumps(params))
                resp = client.GeneralBasicOCR(req)
                # A response without the key is treated as "no detections".
                page_hits = json.loads(resp.to_json_string()).get('TextDetections') or []
                print("res lenght:",len(page_hits))
            except TencentCloudSDKException as err:
                # Best effort: report the failure and keep going.
                print(err)
        json_data['TextDetections'].append(page_hits)

    if json_data:
        print("len(json_data['TextDetections']):",len(json_data['TextDetections']))
        os.makedirs(os.path.dirname(os.path.realpath(jsn_fpath)), exist_ok=True)
        with open(jsn_fpath, 'w', encoding='UTF-8') as o_file:
            o_file.write(json.dumps(json_data, ensure_ascii=False))

def ocr_api_json(jpg_dpath):
    """Run OCR on every numbered file of one image sub-directory.

    Files are taken from ``raw.images/<jpg_dpath>`` next to this script and
    processed in ascending order of the first number found in each file
    name. All results accumulate in a single dict which ``make_api_call``
    writes to ``api.results/<jpg_dpath>.json`` after every file.

    Files whose names contain no digits cannot be ordered and are skipped
    with a message instead of crashing the sort.

    Args:
        jpg_dpath: Name of the sub-directory inside ``raw.images``.
    """
    curr_dir = os.path.dirname(os.path.realpath(__file__))
    image_path = os.path.join(curr_dir, 'raw.images', jpg_dpath)
    jsn_fpath = os.path.join(curr_dir, 'api.results', f'{jpg_dpath}.json')
    print("jpg_dpath",jpg_dpath)

    regex = re.compile(r'[0-9]+')
    numbered = []
    for fname in os.listdir(image_path):
        match = regex.search(fname)
        if match is None:
            print("skip file without a number in its name:", fname)
            continue
        numbered.append((int(match.group()), fname))
    # Sort on the number only; the sort is stable, so ties keep listing order.
    numbered.sort(key=lambda item: item[0])

    json_data = {"TextDetections":[]}
    for _, jpg_fname in numbered:
        jpg_fpath = os.path.join(image_path, jpg_fname)
        print("jpg_fpath", jpg_fpath)
        print("jsn_fpath", jsn_fpath)
        make_api_call(jpg_fpath, jsn_fpath, json_data)

if __name__ == '__main__':
    # Sub-directory of raw.images that is sent through OCR when this file
    # is executed directly.
    jpg_dpath = '19岁的矜持女孩'
    ocr_api_json(jpg_dpath=jpg_dpath)


代码二:(文件名:get_text_boxes.py)

图像处理与轮廓识别

这段代码通过对图像进行腐蚀和膨胀等操作,成功提取出图像中的主要轮廓。

import numpy as np
import cv2

def dilate_line(binary, type='vertical', x_scale=10, y_scale=5):
    """Erode, then dilate, ``binary`` with a one-pixel-thick line kernel.

    Args:
        binary: 2-D image array (``binary.shape`` is unpacked as rows, cols).
        type: ``'horizontal'`` builds a kernel of size
            ``(cols // x_scale, 1)``; any other value builds
            ``(1, rows // y_scale)``.
        x_scale: Divisor applied to the column count for horizontal kernels.
        y_scale: Divisor applied to the row count for the other kernels.

    Returns:
        The image returned by ``cv2.dilate`` after the erosion.
    """
    rows_z, cols_z = binary.shape
    # Clamp the kernel length to at least 1: for an image smaller than the
    # scale divisor the integer division gives 0, which is not a usable
    # line length. A 1x1 kernel leaves the image unchanged.
    if type == 'horizontal':
        size = (max(1, cols_z // x_scale), 1)
    else:
        size = (1, max(1, rows_z // y_scale))
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, size)
    eroded = cv2.erode(binary, kernel, iterations=1)  # erosion
    dilated = cv2.dilate(eroded, kernel, iterations=1)  # dilation
    return dilated

def get_contours(image, max_rects=26, min_contour_area=500):
    """Find large contours in a BGR image and return their bounding boxes.

    NOTE: ``image`` is modified in place. Pixels selected by the dark-pixel
    mask are overwritten with white, and every returned rectangle is drawn
    onto it in green.

    Args:
        image: BGR image array (it is converted with ``COLOR_BGR2HSV`` and
            ``COLOR_BGR2GRAY``).
        max_rects: Maximum number of rectangles returned; the contours with
            the largest area are kept.
        min_contour_area: Contours whose ``cv2.contourArea`` is not greater
            than this value are discarded.

    Returns:
        A list of ``((x, y), (x + w, y + h))`` corner pairs, ordered by
        decreasing contour area.
    """
    # Select pixels whose HSV value channel is at most 86 (any hue and
    # saturation) and paint them white.
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    lower_black = np.array([0, 0, 0], dtype=np.uint8)
    upper_black = np.array([180, 255, 86], dtype=np.uint8)
    black_mask = cv2.inRange(hsv, lower_black, upper_black)
    image[black_mask > 0] = [255, 255, 255]

    # Binarise with a local mean threshold, then apply a morphological open.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    bin_pic = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 11, 2)
    bin_pic = cv2.morphologyEx(bin_pic, cv2.MORPH_OPEN, np.ones((6, 6), np.uint8), iterations=3)

    # Overwrite the last ``border_size`` columns of every row with
    # alternating 0/255 values.
    border_size = 12
    pattern = np.array([0, 255] * (border_size // 2), dtype=np.uint8)
    bin_pic[:, -border_size:] = pattern[:bin_pic.shape[0]]

    bin_pic = dilate_line(bin_pic, 'horizontal', 120, 900)
    median = cv2.medianBlur(bin_pic, 5)  # median filter
    canny_pic = cv2.Canny(median, 10, 200)  # edge detection

    # findContours returns (contours, hierarchy) in OpenCV 4 and
    # (image, contours, hierarchy) in OpenCV 3; index -2 is the contour
    # list in both layouts.
    contours = cv2.findContours(canny_pic, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2]
    contours = [cnt for cnt in contours if cv2.contourArea(cnt) > min_contour_area]
    contours.sort(key=cv2.contourArea, reverse=True)

    # Draw each kept contour's bounding box and collect its corners.
    rectangles = []
    for cnt in contours[:max_rects]:
        x, y, w, h = cv2.boundingRect(cnt)
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
        rectangles.append(((x, y), (x + w, y + h)))
    return rectangles

def get_color(img):
    """Classify an image patch by its median HSV colour.

    Args:
        img: BGR image patch; a patch with zero rows or zero columns is
            reported as ``"UNK"`` without any colour analysis.

    Returns:
        ``'RIGHT'`` when the per-channel medians fall in the green range,
        ``"LEFT"`` when they fall in the white or the gray range, and
        ``"UNK"`` otherwise.
    """
    if not (img.shape[0] and img.shape[1]):
        return "UNK"

    # Convert to HSV so hue, saturation and value can be checked separately.
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    average_hue, average_saturation, average_value = (
        np.median(hsv[:, :, channel]) for channel in range(3)
    )
    print('RBG--hsv:',average_hue, average_saturation, average_value)
    medians = (average_hue, average_saturation, average_value)

    def within(lower, upper):
        """Return True when every median lies inside [lower, upper]."""
        return all(lo <= m <= hi for lo, m, hi in zip(lower, medians, upper))

    # (lower bound, upper bound, log message, label), tested in this order.
    rules = (
        ([35, 43, 46], [77, 255, 255], "color is green", 'RIGHT'),
        ([0, 0, 239], [180, 30, 255], "color is white", "LEFT"),
        ([0, 0, 40], [180, 43, 220], "color is gray", "LEFT"),
    )
    for lower, upper, message, label in rules:
        if within(lower, upper):
            print(message)
            return label

    print("not white and green",average_hue,average_saturation, average_value )
    return "UNK"

if __name__ == "__main__":
    img = cv2.imread('123.jpg') #20 31 64
    # cv2.imread returns None instead of raising when the file is missing
    # or unreadable; fail here with a clear message.
    if img is None:
        raise FileNotFoundError("cannot read image: 123.jpg")
    rectangles = get_contours(img)
    # img_separate = get_color(img)  # once get_contours is changed to return roi, classify the colour region the contour lies in

代码三:(主入口)

这段代码将OCR识别的文字信息转化为有趣的对话,通过颜色和位置信息,判断文字所在的位置(左侧/右侧)

import os
import re
import json
import fitz 
import cv2
import numpy as np
from run_batch import ocr_api_json
from get_text_boxes import get_contours, get_color

CONF_THRES = 95  # OCR confidence threshold

def get_pdf_page_count(pdf_paths):
    """Yield the image bytes of every page of every file in ``pdf_paths``.

    Despite its name this is a generator of image data, not a page counter.
    A path ending in ``pdf`` is opened with PyMuPDF and each page is yielded
    as PNG bytes, in page order. Any other path is read and yielded as its
    raw file bytes.

    Args:
        pdf_paths: Iterable of file paths, processed in the given order.

    Yields:
        bytes: PNG bytes of one PDF page, or the full content of a
        non-PDF file.
    """
    for pdf_path in pdf_paths:
        pdf_path = str(pdf_path)
        print("pdf_path:",pdf_path)
        if not pdf_path.endswith('pdf'):
            # Read first, then yield, so the handle is not held open while
            # the consumer processes the data.
            with open(pdf_path, 'rb') as i_file:
                raw_bytes = i_file.read()
            yield raw_bytes
            continue

        pdf_document = fitz.open(pdf_path)
        try:
            for page_num in range(pdf_document.page_count):
                # The output format is PNG, so no JPEG quality is passed.
                yield pdf_document[page_num].get_pixmap().tobytes('png')
        finally:
            # Release the document even if iteration stops early.
            pdf_document.close()
    

def get_bbox_bounds(bbox):
    """Return ``(x_min, x_max, y_min, y_max)`` for a polygon.

    Args:
        bbox: Non-empty sequence of points, each a mapping with ``'X'`` and
            ``'Y'`` entries.
    """
    xs = [point['X'] for point in bbox]
    ys = [point['Y'] for point in bbox]
    return min(xs), max(xs), min(ys), max(ys)


def judge_side(img, bbox, rectangles, detectedtext):
    """Return the colour-based side label of one OCR text box, or ``None``.

    The box is classified when it lies (within a 50-pixel tolerance) inside
    at least one of ``rectangles``, or when ``detectedtext`` has at least
    three characters. Classification crops the box plus a 60-pixel margin
    out of ``img`` and passes the crop to ``get_color``.

    Args:
        img: Image array indexed as ``[row, column]``.
        bbox: Polygon points with ``'X'`` (horizontal) and ``'Y'``
            (vertical) entries.
        rectangles: Iterable of ``((left, top), (right, bottom))`` boxes.
        detectedtext: Recognised text of the box.

    Returns:
        Whatever ``get_color`` returns for the crop, or ``None`` when the
        box is not classified.
    """
    tolerance = 50
    margin = 60
    x_min, x_max, y_min, y_max = get_bbox_bounds(bbox)

    def inside(rect):
        """True when both box corners lie within ``rect`` plus tolerance."""
        (left, top), (right, bottom) = rect
        return (
            left - tolerance <= x_min <= right + tolerance
            and left - tolerance <= x_max <= right + tolerance
            and top - tolerance <= y_min <= bottom + tolerance
            and top - tolerance <= y_max <= bottom + tolerance
        )

    has_rect = any(inside(rect) for rect in rectangles)
    long_text = bool(detectedtext) and len(detectedtext) >= 3
    if not (has_rect or long_text):
        return None

    # Image arrays are indexed [row (y), column (x)], so the Y range comes
    # first. The start is clamped to 0 because a negative slice start would
    # count from the far edge and select the wrong (usually empty) region.
    patch = img[
        max(0, y_min - margin):y_max + margin,
        max(0, x_min - margin):x_max + margin,
    ]
    return get_color(patch)

def generate_page(jpg_fpath):
    """Return the page-image generator built by ``get_pdf_page_count``.

    Args:
        jpg_fpath: Passed through unchanged to ``get_pdf_page_count``.
    """
    page_iter = get_pdf_page_count(jpg_fpath)
    return page_iter

def extract_conv(jpg_fnames, jsn_fpath, ptp_max=None, ptp_min=None):
    """Extract conversation lines from raw OCR results.

    The entries of ``TextDetections`` in the JSON file are paired, by
    position, with the page images generated from ``jpg_fnames``. For each
    page the contour rectangles are computed, and every OCR hit that passes
    the filters is labelled through ``judge_side``.

    A hit is dropped when its confidence is not above ``CONF_THRES``, when
    the vertical extent of its polygon is above ``ptp_max`` or not above
    ``ptp_min``, when its text matches the clock/status-bar pattern, or when
    no side could be determined.

    Args:
        jpg_fnames: Ordered image/PDF paths the OCR results were made from.
        jsn_fpath: Path of the JSON file holding the OCR results.
        ptp_max: Upper height limit. When omitted, a module-level
            ``ptp_max`` is used if one exists, otherwise 500.
        ptp_min: Lower height limit. When omitted, a module-level
            ``ptp_min`` is used if one exists, otherwise 16.

    Returns:
        List of ``'<side>: <text>'`` strings in reading order.
    """
    # The limits used to exist only as globals created by the __main__
    # block, which made this function raise NameError when imported.
    if ptp_max is None:
        ptp_max = globals().get('ptp_max', 500)
    if ptp_min is None:
        ptp_min = globals().get('ptp_min', 16)

    print(jpg_fnames)
    gen = generate_page(jpg_fnames)
    with open(jsn_fpath, 'r', encoding='UTF-8') as i_file:
        jsn_data = json.load(i_file)

    # filter out irrelevant hits
    conv_list = []
    regex = re.compile(r'[0-9]+:[0-9]+$|中国移动|输入聊天')
    print(len(jsn_data['TextDetections']), '-------------------')
    for hits in jsn_data['TextDetections']:
        raw_page = next(gen, None)
        if raw_page is None:
            # More OCR entries than page images: nothing left to pair with.
            print("no page image left for the remaining OCR results")
            break
        img = cv2.imdecode(np.frombuffer(raw_page, np.uint8), cv2.IMREAD_COLOR)
        if img is None:
            # cv2.imdecode returns None when the bytes are not an image.
            print("cannot decode page image, skipping its OCR results")
            continue
        rectangles = get_contours(img)
        for hit in hits or []:
            text = hit["DetectedText"]
            polygon = hit['Polygon']
            # Range of the polygon's Y coordinates, used as the text height.
            # Per the original author's note, horizontal text is about one
            # character tall while watermarks are abnormally tall.
            ptp = np.ptp(np.array([point.get('Y') for point in polygon]))
            # Drop low-confidence hits, abnormal heights and clock-like text.
            if hit['Confidence'] <= CONF_THRES or ptp > ptp_max or ptp <= ptp_min or re.search(regex, text):
                print("pass -----")
                continue
            print("text:",text)
            side = judge_side(img, polygon, rectangles, text)
            if side == 'UNK' or not side:
                continue
            conv_list.append(f'{side}: {text}')

    return conv_list


def main_start(jpg_dpath):
    """Turn the OCR results of one sub-directory into a conversation file.

    Image paths are collected from ``raw.images/<jpg_dpath>`` next to this
    script, ordered by the first number in each file name, and passed with
    ``api.results/<jpg_dpath>.json`` to ``extract_conv``. The resulting
    lines are written to ``api.results/<jpg_dpath>.txt``.

    Files whose names contain no digits cannot be ordered and are skipped
    with a message instead of crashing the sort.

    Args:
        jpg_dpath: Name of the sub-directory inside ``raw.images``.
    """
    curr_dir = os.path.dirname(os.path.realpath(__file__))
    image_path = os.path.join(curr_dir, 'raw.images', jpg_dpath)
    jsn_dpath = os.path.join(curr_dir, 'api.results')
    txt_fpath = os.path.join(jsn_dpath, f'{jpg_dpath}.txt')

    regex = re.compile(r'[0-9]+')
    numbered = []
    for fname in os.listdir(image_path):
        match = regex.search(fname)
        if match is None:
            print("skip file without a number in its name:", fname)
            continue
        numbered.append((int(match.group()), os.path.join(image_path, fname)))
    # Sort on the number only; the sort is stable, so ties keep listing order.
    numbered.sort(key=lambda item: item[0])
    jpg_fnames = [path for _, path in numbered]

    # extract conversations from OCR raw results
    jsn_fpath = os.path.join(jsn_dpath, f'{jpg_dpath}.json')
    print("jsn_fpath:", jsn_fpath)
    conv_list = extract_conv(jpg_fnames, jsn_fpath)

    print('保存路径:',txt_fpath)
    with open(txt_fpath, 'w', encoding='UTF-8') as o_file:
        o_file.write('\n'.join(conv_list) + '\n')

if __name__ == '__main__':
    curr_dir = os.path.dirname(os.path.realpath(__file__))
    raw_root = os.path.join(curr_dir, 'raw.images')  # image root directory raw.images
    ptp_max = 500  # abnormal height, e.g. watermarks
    ptp_min = 16  # abnormally small text height, e.g. clock or battery level
    for jpg_dpath in os.listdir(raw_root):
        # Only sub-directories hold images; stray files in raw.images would
        # make the directory listing in the next steps fail.
        if not os.path.isdir(os.path.join(raw_root, jpg_dpath)):
            continue
        print(jpg_dpath)  # sub-directory that stores the images
        ocr_api_json(jpg_dpath)
        main_start(jpg_dpath)
        
    # test
    # jpg_dpath = ' 女生主动买好小雨伞'
    # ocr_api_json(jpg_dpath)
    # main_start(jpg_dpath)
    

代码详细讲解可以观看我另一篇博客:https://blog.csdn.net/qq_20163065/article/details/135048872

运行代码三后将输出到txt文件,结果如下:

LEFT为左侧角色的文本,RIGHT为右侧角色的文本

LEFT: 我通过了你的朋友验证请求,现在
LEFT: 我们可以开始聊天了
RIGHT: 你好呀
LEFT: 你好呀,
LEFT: 朋友
LEFT: 备注祁晶
RIGHT: 好嘞
RIGHT: 我叫刘晓宇

非常感谢大家的热情收看!希望在这次的分享中,你们能够从中汲取到有趣的知识和灵感。如果在这个过程中,你们有任何关于AI虚拟角色、图像识别或其他技术问题的疑问,都欢迎随时提问哦!

目前已经成功收集了上万个角色的聊天内容相关语料,感兴趣的朋友可以私聊我。

 

 

 

你可能感兴趣的:(ai语料采集,人工智能)