基于Python的离线OCR图片文字识别(五)——终极版本

至此,终于迎来了离线OCR的终极版本:命令行参数既支持图像文件、图像文件夹,也支持图像型的PDF文件;既支持通过json文件进行参数配置,又提供帮助文档。easyOCR包既支持允许字符集(即只识别字符集中的字符,例如验证码识别的场合),也支持排除字符集,还支持设置批处理尺寸大小、线程数目以及是否保留分段结构(启用paragraph时,OCR结果中就没有原来单句识别时的概率值了)等。

#!/home/super/miniconda3/bin/python
#encoding=utf-8
#author: superchao1982, [email protected]

# Help text printed for -h/--help.
# It is a raw string so that the backslashes in the Windows path and in the
# JSON "\n" escape are shown literally instead of relying on Python passing
# unknown escape sequences through unchanged.
# The config.json sample lists exactly the keys the script reads and is valid
# JSON, so it can be pasted into a file as-is.
strhelp=r'''
img2txt is one program to get ocr texts from image or pdf files!

batchsize is the batch size, larger more faster but more merrory;
workernum is the number of threads, larger more faster;
maximgsize is the max height or width of the images to be passed to the ocr processing when extract from the pdf files;
paraend is the string appended after each recognized paragraph, '' means do not keep the paragraph when ocring;
langpath is the directory of the language data stored, '/home/langdata' for linux and 'C:\ocr\langdata' for win;
allowlist is chars allow to be recognized only, '' means allow all charactors;
removechar is char to be removed when ocr processing, for example '| _^~`&';
txtdir is the path to store the txt files, could be any legal absolute or relative path,'' means the same directory of the image files;

=== settings above can be changed in the file 'config.json' which stored in langpath ===
contents in config.json like:
{
    "batchsize": 2,
    "workernum": 4,
    "maximgsize": 1000,
    "paraend": "\n",
    "allowlist": "",
    "langpath": "/home/langdata",
    "removechar": "| _^~`&",
    "txtdir": ""
}
------------------------------------
e.g.
./img2txt.py img1.jpg jmg2.jpg 001.pdf 002.pdf #follow by one or more image or pdf files
./img2txt.py ./pdfs home/usr/Document/imgs #follow by one or more directory contain image or pdf files
./img2txt.py --help #output the help info
./img2txt.py --config #generate the default config.json file in the langpath
------------------------------------
'''
import sys
import json
import os
import pdf2image
import numpy as np

#------------------ Default settings ----------------------
batchsize=2         # (default = 1) - Batch_size>1 will make EasyOCR faster but use more memory
workernum=4         # (default = 0) - Number thread used in of dataloader
maximgsize=1000     # (default = 1000) - Max image width & height when using pdf
paraend='\n'        # (default = '\n') - The paragraph ending char
allowlist=''        # (string) - Force EasyOCR to recognize only subset of characters
removechar='| _^~`&'# characters stripped from the recognized text
txtdir=''           # where the .txt results are stored: '' means next to the source file, anything else is used as a directory path
# Choose the default language-data directory from the operating system.
if sys.platform.lower().startswith('linux'):
    langpath='/home/langdata'
elif sys.platform.lower().startswith('win'):
    # Raw string: '\o' and '\l' are not valid escape sequences in a normal literal.
    langpath=r'C:\ocr\langdata'
else:
    print('\tError: Unknow System!')
    sys.exit(1)  # non-zero status so calling scripts can detect the failure

# Configuration dictionary built from the defaults above; this is what
# "--config" serializes to config.json.
config={
    "batchsize": batchsize,
    "workernum": workernum,
    "maximgsize": maximgsize,
    "paraend": paraend,
    "allowlist": allowlist,
    "langpath": langpath,
    "removechar": removechar,
    "txtdir": txtdir
}

#------------------ Command-line argument handling ----------------------
# Every argument is checked before the OCR package is imported, so that a bad
# argument is reported immediately instead of after the slow model load.
for arg in sys.argv[1:]:  # argv[0] is the script itself
    if arg in ['-h', '--help']:
        print(strhelp)
        sys.exit()
    elif arg in ['-c', '--config']:
        # Write the default configuration dictionary to <langpath>/config.json.
        configpath=os.path.join(langpath,'config.json')
        try:
            # ensure_ascii=False can emit non-ASCII text, so the encoding is
            # fixed to UTF-8 rather than left to the platform default.
            with open(configpath, 'w', encoding='utf-8') as jsonfile:
                json.dump(config, jsonfile, ensure_ascii=False,indent=4)
            print('Genrerating config.json success! ---> ', configpath)
        except(Exception) as e:
            print('\tSaving config file config.json Error: ', e)
            sys.exit(1)  # report the failure through the exit status
        sys.exit()
    else:
        # Anything else must be an existing file or directory.
        if not os.path.exists(arg):
            print(arg, ' is invalid, please input the correct file or directory path!')
            sys.exit(1)

# Use <langpath>/config.json when it exists; otherwise keep the defaults above.
configfile=os.path.join(langpath,'config.json')
if os.path.exists(configfile):
    try:
        # 'utf-8-sig' reads plain UTF-8 and also tolerates a leading BOM,
        # which some editors add when the file is edited by hand.
        with open(configfile, 'r', encoding='utf-8-sig') as jsonfile:
            loaded=json.load(jsonfile)
        if not isinstance(loaded, dict):
            raise ValueError('config.json must contain a JSON object')
        # Overlay the file on the defaults so that a key missing from the
        # file falls back to its default instead of aborting with KeyError.
        config.update(loaded)
        batchsize=config['batchsize']
        workernum=config['workernum']
        maximgsize=config['maximgsize']
        paraend=config['paraend']
        langpath=config['langpath']
        allowlist=config['allowlist']
        removechar=config['removechar']
        txtdir=config['txtdir']
        print('Using the config in ', configfile)
    except(Exception) as e:
        print('\tReading config file ', configfile ,' Error: ', e)
        print('\tCheck the json file, or remove the config.json file to use defaulting configs!')
        sys.exit(1)  # non-zero status: the configuration could not be used
else:
    print('Using the default config! You can make your own config.json in ', langpath, ' by using the "--config" option')
print(config)

#------------------ Preparation before OCR ----------------------
# The language-data directory is mandatory; stop if it does not exist.
if not os.path.exists(langpath):
    print('\tError: Invalid langpath! Checking the path again!')
    sys.exit(1)

# Create the txt output directory when it is configured but missing.
if len(txtdir)>0 and not os.path.exists(txtdir):
    print('txtdir in config.json is not exists, generating ', txtdir)
    try:
        # os.makedirs raises OSError on failure, creates intermediate
        # directories and is unaffected by spaces or shell metacharacters in
        # the path. A shell "mkdir" run through os.system only returns a
        # status code, so its failure would never reach the except clause.
        os.makedirs(txtdir, exist_ok=True)
        print('Making directory: ',txtdir)
    except OSError as e:
        print('\tMaking txt directory Error: ', e)
        print('\tPlease input a legal txtdir in the config.json file and try again!')
        sys.exit(1)

# Paragraph mode is switched on exactly when a non-empty paragraph
# terminator is configured.
paragraph = len(paraend) > 0

# easyocr is imported only at this point on purpose: importing it is slow,
# and the configuration checks above should be able to fail before that cost
# is paid.
import easyocr
ocrreader = easyocr.Reader(
    ['ch_sim', 'en'],
    model_storage_directory=langpath,  # Linux: r'/home/langdata', Windows: r'C:\ocr\langdata'
)

#------------------ Run OCR ----------------------
# Upper-cased file extensions accepted as OCR input.
_OCR_EXTS=('.JPG','.JPEG','.PNG','.BMP','.PDF')


def _readtext(image):
    """Run ocrreader.readtext on one image and return its result.

    image is passed through unchanged; callers give either a file path or a
    numpy array. The allowlist argument is only forwarded when a non-empty
    allowlist is configured.
    """
    options={'batch_size': batchsize, 'workers': workernum, 'detail': 0, 'paragraph': paragraph}
    if len(allowlist)>0:
        options['allowlist']=allowlist
    return ocrreader.readtext(image, **options)


def _ocr_file(path, filext):
    """Return the recognized text of one image or PDF file as a single string.

    Every recognized piece is followed by paraend, and afterwards each entry
    of removechar is deleted from the text.
    """
    pieces=[]
    if filext.upper()=='.PDF':
        # A PDF is converted to a sequence of page images first.
        for page in pdf2image.convert_from_path(path):
            # Shrink pages whose longer side exceeds maximgsize to limit memory use.
            ratio=max(page.width, page.height)/maximgsize
            if ratio>1:
                page=page.resize((round(page.width/ratio),round(page.height/ratio)))
            pieces.extend(_readtext(np.asarray(page)))
    else:
        # Plain image files are handed to the reader by path.
        pieces.extend(_readtext(path))
    paper=''.join(w+paraend for w in pieces)
    for item in removechar:
        paper=paper.replace(item, '')
    return paper


def _save_text(txtpath, paper):
    """Write paper to txtpath, printing (not raising) any error."""
    print('saving file ---> ', txtpath)
    try:
        # UTF-8 is used explicitly so that recognized Chinese text is written
        # the same way on every platform.
        with open(txtpath, 'w', encoding='utf-8') as txtfile:
            txtfile.write(paper)
    except(Exception) as e:
        print('\t', txtpath, ' Saving txt File Error: ', e)


for argvalue in sys.argv[1:]:  # argv[0] is the script itself and is skipped
    if os.path.isfile(argvalue):
        # The argument is a single file.
        filext=os.path.splitext(argvalue)[-1]
        if filext.upper() not in _OCR_EXTS:
            print('\t', argvalue, ' 不是有效的文件格式(jpg/jpeg/png/bmp/pdf)!')
            continue
        paper=_ocr_file(argvalue, filext)
        if len(txtdir)>0:
            # NOTE(review): this keeps the original extension in the name
            # ("img.jpg.txt"), unlike the other branches ("img.txt"); left
            # unchanged so existing output names stay stable.
            txtname=os.path.basename(argvalue)+'.txt'
            txtpath=os.path.join(txtdir, txtname)
        else:
            txtpath=os.path.splitext(argvalue)[0]+'.txt'
        _save_text(txtpath, paper)

    elif os.path.isdir(argvalue):
        # The argument is a directory: process every supported file below it.
        for root, _, filenames in os.walk(argvalue):
            for imgname in filenames:
                filext=os.path.splitext(imgname)[-1]
                if filext.upper() not in _OCR_EXTS:
                    print('\t', imgname, '的后缀名不是有效的文件格式,跳过该文件!')
                    continue
                imgpath=os.path.join(root, imgname)
                paper=_ocr_file(imgpath, filext)

                txtname=os.path.splitext(imgname)[0]+'.txt'
                # Default (and fallback) location: next to the image file.
                txtpath=os.path.join(root, txtname)
                if len(txtdir)>0:
                    # Mirror the directory layout of the input below txtdir so
                    # that equally named images in different folders do not
                    # overwrite each other's txt file. os.path.relpath is used
                    # instead of slicing by len(argvalue)+1, which cut off one
                    # character too many when the argument ended with a path
                    # separator.
                    relativedir=os.path.relpath(root, argvalue)
                    txtfiledir=os.path.normpath(os.path.join(txtdir, relativedir))
                    try:
                        if not os.path.exists(txtfiledir):
                            os.makedirs(txtfiledir, exist_ok=True)
                            print('Making directory: ',txtfiledir)
                    except OSError as e:
                        # Keep the fallback path chosen above.
                        print('\tMaking txt directory Error: ', e)
                        print('\tTxt file will be storded in the image file directory!')
                    else:
                        txtpath=os.path.join(txtfiledir, txtname)
                _save_text(txtpath, paper)

最后,由于easyOCR自身的原因,总会给出一些很烦人的关于pytorch包的warnings,为了避免这些警告信息干扰,可以按照warnings中的信息对其中的_utils.py文件进行修改,路径如下:

基于Python的离线OCR图片文字识别(五)——终极版本_第1张图片

将其中的warnings对应的语句删除即可,删除后的_utils.py文件内容如下(请谨慎删除,或者先备份源文件后再删除;更稳妥的做法是不修改库文件,而是在脚本开头调用 warnings.filterwarnings('ignore') 来屏蔽这些警告):

基于Python的离线OCR图片文字识别(五)——终极版本_第2张图片

 

import functools
import inspect
import warnings
from collections import OrderedDict
from typing import Any, Dict, Optional, TypeVar, Callable, Tuple, Union

from torch import nn

from .._utils import sequence_to_str
from ._api import WeightsEnum


class IntermediateLayerGetter(nn.ModuleDict):
    """
    Wrap a model so that one forward pass yields the outputs of selected children.

    The direct children of ``model`` are collected in the order given by
    ``model.named_children()``, up to and including the last one requested in
    ``return_layers``. ``forward`` then feeds the input through these children
    one after another, so the result only matches the wrapped model when its
    children were registered in the order they are executed and no child is
    used more than once.

    Only direct children can be requested: ``model.feature1`` works,
    ``model.feature1.layer2`` does not.

    Args:
        model (nn.Module): model whose intermediate activations are wanted
        return_layers (Dict[name, new_name]): maps the name of a child module
            to the key under which its activation is returned

    Raises:
        ValueError: if a key of ``return_layers`` is not a direct child of ``model``

    Examples::

        >>> m = torchvision.models.resnet18(weights=ResNet18_Weights.DEFAULT)
        >>> # extract layer1 and layer3, naming them `feat1` and `feat2`
        >>> new_m = torchvision.models._utils.IntermediateLayerGetter(m,
        ...     {'layer1': 'feat1', 'layer3': 'feat2'})
        >>> out = new_m(torch.rand(1, 3, 224, 224))
        >>> print([(k, v.shape) for k, v in out.items()])
        [('feat1', torch.Size([1, 64, 56, 56])), ('feat2', torch.Size([1, 256, 14, 14]))]
    """

    _version = 2
    __annotations__ = {
        "return_layers": Dict[str, str],
    }

    def __init__(self, model: nn.Module, return_layers: Dict[str, str]) -> None:
        child_names = [child_name for child_name, _ in model.named_children()]
        if not set(return_layers).issubset(child_names):
            raise ValueError("return_layers are not present in model")

        # Working copy that is emptied while walking the children; the
        # caller's mapping is stored untouched afterwards.
        pending = {str(key): str(value) for key, value in return_layers.items()}
        kept = OrderedDict()
        for child_name, child in model.named_children():
            kept[child_name] = child
            pending.pop(child_name, None)
            if not pending:
                # Every requested layer has been reached; later children are dropped.
                break

        super().__init__(kept)
        self.return_layers = return_layers

    def forward(self, x):
        collected = OrderedDict()
        for layer_name, layer in self.items():
            x = layer(x)
            if layer_name in self.return_layers:
                collected[self.return_layers[layer_name]] = x
        return collected


def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int:
    """
    Round ``v`` to the nearest multiple of ``divisor``, with a lower bound.

    The rounded value is never allowed below ``min_value`` (which defaults to
    ``divisor``). If the result ends up more than 10% below ``v``, one more
    ``divisor`` is added.

    Taken from the original TensorFlow repository:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    lower_bound = divisor if min_value is None else min_value
    nearest_multiple = int(v + divisor / 2) // divisor * divisor
    result = max(lower_bound, nearest_multiple)
    # Rounding down must not lose more than 10% of the requested value.
    return result + divisor if result < 0.9 * v else result


D = TypeVar("D")


def kwonly_to_pos_or_kw(fn: Callable[..., D]) -> Callable[..., D]:
    """Let the keyword-only parameters of ``fn`` be passed positionally as well.

    This is meant for the case where a signature changed from

    .. code::

        def old_fn(foo, bar, baz=None):
            ...

    to

    .. code::

        def new_fn(foo, *, bar, baz=None):
            ...

    A call such as ``old_fn("foo", "bar", "baz")`` is no longer valid for
    ``new_fn``. Decorating ``new_fn`` keeps such calls working: surplus
    positional arguments are matched, in order, to the keyword-only parameter
    names, and a warning is issued:

    .. code::

        @kwonly_to_pos_or_kw
        def new_fn(foo, *, bar, baz=None):
            ...

        new_fn("foo", "bar", "baz")

    Raises:
        TypeError: if ``fn`` has no keyword-only parameter.
    """
    signature_params = inspect.signature(fn).parameters

    # Locate the index of the first keyword-only parameter.
    for first_kwonly, parameter in enumerate(signature_params.values()):
        if parameter.kind == parameter.KEYWORD_ONLY:
            break
    else:
        raise TypeError(f"Found no keyword-only parameter on function '{fn.__name__}'") from None

    kwonly_names = tuple(signature_params)[first_kwonly:]

    @functools.wraps(fn)
    def wrapper(*args: Any, **kwargs: Any) -> D:
        positional = args[:first_kwonly]
        surplus = args[first_kwonly:]
        if surplus:
            # Pair each surplus positional value with the keyword-only name at
            # the same position.
            remapped = dict(zip(kwonly_names, surplus))
            listed = sequence_to_str(tuple(remapped.keys()), separate_last='and ')
            warnings.warn(
                f"Using {listed} as positional "
                f"parameter(s) is deprecated since 0.13 and will be removed in 0.15. Please use keyword parameter(s) "
                f"instead."
            )
            kwargs.update(remapped)

        return fn(*positional, **kwargs)

    return wrapper


W = TypeVar("W", bound=WeightsEnum)
M = TypeVar("M", bound=nn.Module)
V = TypeVar("V")


def handle_legacy_interface(**weights: Tuple[str, Union[Optional[W], Callable[[Dict[str, Any]], Optional[W]]]]):
    """Decorate a model builder that uses the new interface so the old one keeps working.

    Two things are handled:

    1. Positional arguments are accepted again through
       :func:`kwonly_to_pos_or_kw`, which is applied to the wrapped builder.
    2. The legacy ``pretrained`` style argument is translated: a falsy value
       becomes ``weights=None`` and a truthy value becomes the default weights
       given to this decorator.

    Args:
        **weights (Tuple[str, Union[Optional[W], Callable[[Dict[str, Any]], Optional[W]]]]): keyed by the new
            weights parameter name; each value is the deprecated parameter name together with the default to
            use for the legacy ``pretrained=True``. The default may be a callable, in which case it is called
            with the dictionary of keyword arguments. The only key guaranteed to be in that dictionary is the
            deprecated parameter name; all other entries should be read with :meth:`~dict.get`.

    Raises:
        ValueError: (from the decorated builder) if the legacy argument is truthy but the resolved default is
            not a ``WeightsEnum``.
    """
    # Marker for "weights argument not supplied". ``None`` cannot be used for
    # this because it is a valid value of the weights argument.
    missing = object()

    def outer_wrapper(builder: Callable[..., M]) -> Callable[..., M]:
        @kwonly_to_pos_or_kw
        @functools.wraps(builder)
        def inner_wrapper(*args: Any, **kwargs: Any) -> M:
            for weights_param, (pretrained_param, default) in weights.items():  # type: ignore[union-attr]
                weights_arg = kwargs.get(weights_param, missing)

                # Nothing to translate when neither argument was given ...
                if weights_param not in kwargs and pretrained_param not in kwargs:
                    continue
                # ... or when the weights argument already uses the new style.
                if isinstance(weights_arg, WeightsEnum) or weights_arg is None:
                    continue
                if isinstance(weights_arg, str) and weights_arg != "legacy":
                    continue

                if weights_arg is missing:
                    pretrained_arg = kwargs[pretrained_param]
                else:
                    # The legacy argument was passed positionally, so
                    # @kwonly_to_pos_or_kw stored it under the new parameter
                    # name (it only knows the current signature). Move it to
                    # its legacy name so a callable default sees it there.
                    pretrained_arg = kwargs.pop(weights_param)
                    kwargs[pretrained_param] = pretrained_arg

                default_weights_arg = None
                if pretrained_arg:
                    default_weights_arg = default(kwargs) if callable(default) else default
                    if not isinstance(default_weights_arg, WeightsEnum):
                        raise ValueError(f"No weights available for model {builder.__name__}")

                # Replace the legacy argument with its new-style equivalent.
                del kwargs[pretrained_param]
                kwargs[weights_param] = default_weights_arg

            return builder(*args, **kwargs)

        return inner_wrapper

    return outer_wrapper


def _ovewrite_named_param(kwargs: Dict[str, Any], param: str, new_value: V) -> None:
    """Make sure ``kwargs[param]`` equals ``new_value``.

    A missing entry is set to ``new_value`` in place.

    Raises:
        ValueError: if ``param`` is already present with a different value.
    """
    if param not in kwargs:
        kwargs[param] = new_value
        return
    if kwargs[param] != new_value:
        raise ValueError(f"The parameter '{param}' expected value {new_value} but got {kwargs[param]} instead.")


def _ovewrite_value_param(param: Optional[V], new_value: V) -> V:
    """Return ``new_value`` after checking that ``param`` does not contradict it.

    Args:
        param: the value that was passed, or ``None`` when nothing was passed.
        new_value: the value that is required.

    Returns:
        ``new_value``.

    Raises:
        ValueError: if ``param`` is not ``None`` and differs from ``new_value``.
    """
    if param is not None and param != new_value:
        # Only the value is known here, not the parameter's name, so the
        # message no longer presents the value as if it were a name.
        raise ValueError(f"The parameter expected value {new_value} but got {param} instead.")
    return new_value


class _ModelURLs(dict):
    """``dict`` subclass whose item lookup defers directly to ``dict``."""

    def __getitem__(self, item):
        # Plain pass-through to the parent implementation.
        value = super().__getitem__(item)
        return value

你可能感兴趣的:(Python数据处理,数据清洗,python环境配置,python,大数据,自然语言处理)