第一句子网 > 基于Python的离线OCR图片文字识别（五）——终极版本

基于Python的离线OCR图片文字识别（五）——终极版本

时间：2020-04-18 00:53:53

至此，终于迎来了离线OCR的终极版本：命令行参数既支持图像文件、图像文件夹，也支持图像型的PDF文件；既支持通过json文件进行参数配置，又提供帮助文档。easyOCR包既支持允许字符集（即只识别该字符集内的字符，例如用于验证码识别的场合），也支持排除字符集，还支持设置批处理尺寸大小、线程数目，以及保留分段结构（启用paragraph时，OCR结果中就不再包含原来逐句识别时的识别概率值）等。

#!/home/super/miniconda3/bin/python
#encoding=utf-8
#author: superchao1982, 50903556@
"""img2txt: offline OCR of image files, image folders and PDF files via EasyOCR.

Every command line argument is an image/PDF file or a directory that is walked
recursively.  The recognised text of each input is written to a ``.txt`` file
named after the input.  Settings come from ``config.json`` in the language-data
directory when that file exists, otherwise from the defaults below.

Usage errors and failures exit with status 1; ``--help`` and ``--config``
exit with status 0.
"""
import json
import os
import sys

import numpy as np

# Help text printed by -h/--help.  A raw string so that the Windows path and
# the "\n" in the JSON sample are shown literally.
strhelp = r'''
img2txt is one program to get ocr texts from image or pdf files!
batchsize is the batch size, larger more faster but more merrory;
workernum is the number of threads, larger more faster;
maximgsize is the max height or width of the images to be passed to the ocr processing when extract from the pdf files;
paraend is the paragraph ending char, '' means do not keep the paragraph when ocring;
langpath is the directory of the language data stored, '/home/langdata' for linux and 'C:\ocr\langdata' for win;
allowlist is chars allow to be recognized only, '' means allow all charactors;
removechar is char to be removed when ocr processing, for example '| _^~`&';
txtdir is the path to store the txt files, could be any legal absolute or relative path,'' means the same directory of the image files;
=== settings above can be changed in the file 'config.json' which stored in langpath ===
contents in config.json like:
{
"batchsize": 2,
"workernum": 4,
"maximgsize": 1000,
"paraend": "\n",
"allowlist": "",
"langpath": "/home/langdata",
"removechar": "| _^~`&",
"txtdir": ""
}
------------------------------------
e.g.
./img2txt.py img1.jpg jmg2.jpg 001.pdf 002.pdf #follow by one or more image or pdf files
./img2txt.py ./pdfs home/usr/Document/imgs #follow by one or more directory contain image or pdf files
./img2txt.py --help #output the help info
./img2txt.py --config #generate the default config.json file in the langpath
------------------------------------
'''

# ------------------ default settings ----------------------
batchsize = 2      # EasyOCR batch_size: >1 is faster but uses more memory
workernum = 4      # EasyOCR workers: number of dataloader threads
maximgsize = 1000  # max width/height of a page image rendered from a PDF
paraend = '\n'     # appended after every recognised item; '' disables paragraph mode
allowlist = ''     # if non-empty, only these characters are recognised
removechar = '| _^~`&'  # characters stripped from the recognised text
txtdir = ''        # where txt files go; '' means next to the source file

# File extensions (compared upper-cased) accepted for OCR.
SUPPORTED_EXTS = ('.JPG', '.JPEG', '.PNG', '.BMP', '.PDF')


def default_langpath():
    """Return the default language-data directory for this OS.

    Returns:
        '/home/langdata' on Linux, 'C:\\ocr\\langdata' on Windows, or None
        when ``sys.platform`` matches neither.
    """
    platform = sys.platform.lower()
    if platform.startswith('linux'):
        return '/home/langdata'
    if platform.startswith('win'):
        return 'C:\\ocr\\langdata'
    return None


def default_config(langpath):
    """Build the settings dict from the module-level defaults and *langpath*."""
    return {
        "batchsize": batchsize,
        "workernum": workernum,
        "maximgsize": maximgsize,
        "paraend": paraend,
        "allowlist": allowlist,
        "langpath": langpath,
        "removechar": removechar,
        "txtdir": txtdir,
    }


def merge_config(defaults, loaded):
    """Return a copy of *defaults* overridden by the keys present in *loaded*.

    Keys missing from *loaded* keep their default, so a partial config.json
    no longer aborts the run.

    Raises:
        ValueError: if *loaded* is not a dict (config.json must hold an object).
    """
    if not isinstance(loaded, dict):
        raise ValueError('config.json must contain a JSON object')
    merged = dict(defaults)
    merged.update(loaded)
    return merged


def load_config(configfile, defaults):
    """Read *configfile* (JSON) and merge it over *defaults*.

    Raises:
        OSError: if the file cannot be read.
        ValueError: if the content is not valid JSON / not a JSON object.
    """
    # utf-8-sig also accepts files saved with a BOM (e.g. by Windows Notepad).
    with open(configfile, 'r', encoding='utf-8-sig') as jsonfile:
        return merge_config(defaults, json.load(jsonfile))


def check_cli_args(args, config, langpath):
    """Handle -h/-c and validate input paths before the slow model load.

    Exits the process for --help, --config, or a path that does not exist.
    """
    for arg in args:
        if arg in ('-h', '--help'):
            print(strhelp)
            sys.exit()
        if arg in ('-c', '--config'):
            configfile = os.path.join(langpath, 'config.json')
            try:
                with open(configfile, 'w', encoding='utf-8') as jsonfile:
                    json.dump(config, jsonfile, ensure_ascii=False, indent=4)
            except OSError as e:
                print('\tSaving config file config.json Error: ', e)
                sys.exit(1)
            print('Genrerating config.json success! ---> ', configfile)
            sys.exit()
        if not os.path.exists(arg):
            print(arg, ' is invalid, please input the correct file or directory path!')
            sys.exit(1)


def is_supported(path):
    """Return True if *path* has one of the SUPPORTED_EXTS (case-insensitive)."""
    return os.path.splitext(path)[-1].upper() in SUPPORTED_EXTS


def scaled_size(width, height, maximgsize):
    """Return the (width, height) that fits within *maximgsize*, or None.

    None means the image already fits (or *maximgsize* is not positive) and
    needs no resizing.
    """
    if maximgsize <= 0:
        return None
    ratio = max(width, height) / maximgsize
    if ratio > 1:
        return round(width / ratio), round(height / ratio)
    return None


def strip_chars(text, removechar):
    """Remove every item of *removechar* from *text*."""
    for item in removechar:
        text = text.replace(item, '')
    return text


def read_lines(reader, image, settings, paragraph):
    """Run ``reader.readtext`` on *image* and return the list of text items.

    *image* is whatever the reader accepts (this script passes a file path or
    a numpy array).  ``allowlist`` is only passed when it is non-empty.
    """
    kwargs = {
        'batch_size': settings['batchsize'],
        'workers': settings['workernum'],
        'detail': 0,
        'paragraph': paragraph,
    }
    if settings['allowlist']:
        kwargs['allowlist'] = settings['allowlist']
    return reader.readtext(image, **kwargs)


def ocr_file(reader, path, settings, paragraph):
    """Recognise one image or PDF file and return its cleaned-up text.

    PDF pages are rendered to images, shrunk to ``maximgsize`` if larger, and
    recognised page by page.  Each recognised item is followed by ``paraend``;
    afterwards the ``removechar`` characters are stripped.
    """
    if os.path.splitext(path)[-1].upper() == '.PDF':
        # Imported lazily: only needed for PDF input.
        import pdf2image
        lines = []
        for page in pdf2image.convert_from_path(path):
            # Shrink oversized pages to keep memory use bounded.
            newsize = scaled_size(page.width, page.height, settings['maximgsize'])
            if newsize is not None:
                page = page.resize(newsize)
            lines.extend(read_lines(reader, np.asarray(page), settings, paragraph))
    else:
        lines = read_lines(reader, path, settings, paragraph)
    ending = settings['paraend']
    text = ''.join(line + ending for line in lines)
    return strip_chars(text, settings['removechar'])


def output_txt_path(srcpath, txtdir, basedir=None):
    """Return the txt path for *srcpath*.

    Args:
        srcpath: path of the image/PDF file.
        txtdir: output directory; '' means next to *srcpath*.
        basedir: the directory argument being walked, if any.  The sub-folder
            structure below it is mirrored inside *txtdir* so equally named
            files from different folders do not overwrite each other.
    """
    if not txtdir:
        return os.path.splitext(srcpath)[0] + '.txt'
    if basedir is None:
        relative = os.path.basename(srcpath)
    else:
        # relpath is correct whether or not basedir ends with a separator.
        relative = os.path.relpath(srcpath, basedir)
    return os.path.join(txtdir, os.path.splitext(relative)[0] + '.txt')


def save_text(text, srcpath, txtdir, basedir=None):
    """Write *text* to the txt file belonging to *srcpath*.

    If the target directory cannot be created the file is stored next to the
    source file instead.

    Returns:
        True if the file was written, False otherwise (the error is printed).
    """
    txtpath = output_txt_path(srcpath, txtdir, basedir)
    parent = os.path.dirname(txtpath)
    if parent and not os.path.exists(parent):
        try:
            os.makedirs(parent, exist_ok=True)
            print('Making directory: ', parent)
        except OSError as e:
            print('\tMaking txt directory Error: ', e)
            print('\tTxt file will be storded in the image file directory!')
            txtpath = output_txt_path(srcpath, '')
    print('saving file ---> ', txtpath)
    try:
        with open(txtpath, 'w', encoding='utf-8') as txtfile:
            txtfile.write(text)
    except OSError as e:
        print('\t', txtpath, ' Saving txt File Error: ', e)
        return False
    return True


def process_input(reader, argvalue, settings, paragraph):
    """OCR one command line argument: a single file or a directory tree."""
    if os.path.isfile(argvalue):
        if not is_supported(argvalue):
            print('\t', argvalue, ' 不是有效的文件格式(jpg/jpeg/png/bmp/pdf)!')
            return
        text = ocr_file(reader, argvalue, settings, paragraph)
        save_text(text, argvalue, settings['txtdir'])
    elif os.path.isdir(argvalue):
        # Sub-directory names are not needed; os.walk visits them anyway.
        for root, _, filenames in os.walk(argvalue):
            for imgname in filenames:
                if not is_supported(imgname):
                    print('\t', imgname, '的后缀名不是有效的文件格式，跳过该文件！')
                    continue
                imgpath = os.path.join(root, imgname)
                text = ocr_file(reader, imgpath, settings, paragraph)
                save_text(text, imgpath, settings['txtdir'], basedir=argvalue)


def main(argv=None):
    """Entry point.

    Args:
        argv: arguments without the program name; defaults to ``sys.argv[1:]``.
    """
    args = list(sys.argv[1:] if argv is None else argv)

    langpath = default_langpath()
    if langpath is None:
        print('\tError: Unknow System!')
        sys.exit(1)
    settings = default_config(langpath)

    # Validate the arguments first so that mistakes are reported before the
    # time-consuming OCR model load.
    check_cli_args(args, settings, langpath)

    # Use config.json from the default language directory when it exists.
    configfile = os.path.join(langpath, 'config.json')
    if os.path.exists(configfile):
        try:
            settings = load_config(configfile, settings)
        except (OSError, ValueError) as e:
            print('\tReading config file ', configfile, ' Error: ', e)
            print('\tCheck the json file, or remove the config.json file to use defaulting configs!')
            sys.exit(1)
        print('Using the config in ', configfile)
    else:
        print('Using the default config! You can make your own config.json in ', langpath, ' by using the "--config" option')
    print(settings)

    # The language data is mandatory.
    langpath = settings['langpath']
    if not os.path.exists(langpath):
        print('\tError: Invalid langpath! Checking the path again!')
        sys.exit(1)

    # Create the txt output directory if it is configured but missing.
    outdir = settings['txtdir']
    if outdir and not os.path.exists(outdir):
        print('txtdir in config.json is not exists, generating ', outdir)
        try:
            os.makedirs(outdir, exist_ok=True)
            print('Making directory: ', outdir)
        except OSError as e:
            print('\tMaking txt directory Error: ', e)
            print('\tPlease input a legal txtdir in the config.json file and try again!')
            sys.exit(1)

    # A non-empty paragraph ending switches EasyOCR into paragraph mode.
    paragraph = bool(settings['paraend'])

    # Imported this late on purpose: loading EasyOCR is slow, so all cheap
    # configuration checks run first.
    import easyocr
    ocrreader = easyocr.Reader(['ch_sim', 'en'], model_storage_directory=langpath)

    for argvalue in args:
        process_input(ocrreader, argvalue, settings, paragraph)


if __name__ == '__main__':
    main()

最后，由于easyOCR自身的原因，总会给出一些很烦人的关于pytorch包的warnings，为了避免这些警告信息干扰，可以按照warnings中的信息对其中的_utils.py文件进行修改。该文件即torchvision安装目录下的models/_utils.py，具体路径以warnings信息中给出的为准：

将其中的warnings对应的语句删除即可，删除后的_utils.py文件内容如下（请谨慎删除，或者先备份源文件后再删除）：

# Local copy of torchvision's models/_utils.py.
# NOTE(review): per the surrounding article, the deprecation-warning statements
# were deleted from this copy by hand; the remaining code is otherwise meant to
# match the installed torchvision version — confirm against your installation
# (and keep a backup) before replacing the file.
import functools
import inspect
import warnings
from collections import OrderedDict
from typing import Any, Dict, Optional, TypeVar, Callable, Tuple, Union

from torch import nn

from .._utils import sequence_to_str
from ._api import WeightsEnum


class IntermediateLayerGetter(nn.ModuleDict):
    """
    Module wrapper that returns intermediate layers from a model

    It has a strong assumption that the modules have been registered
    into the model in the same order as they are used.
    This means that one should **not** reuse the same nn.Module
    twice in the forward if you want this to work.

    Additionally, it is only able to query submodules that are directly
    assigned to the model. So if `model` is passed, `model.feature1` can
    be returned, but not `model.feature1.layer2`.

    Args:
        model (nn.Module): model on which we will extract the features
        return_layers (Dict[name, new_name]): a dict containing the names
            of the modules for which the activations will be returned as
            the key of the dict, and the value of the dict is the name
            of the returned activation (which the user can specify).

    Examples::

        >>> m = torchvision.models.resnet18(weights=ResNet18_Weights.DEFAULT)
        >>> # extract layer1 and layer3, giving as names `feat1` and feat2`
        >>> new_m = torchvision.models._utils.IntermediateLayerGetter(m,
        >>>     {'layer1': 'feat1', 'layer3': 'feat2'})
        >>> out = new_m(torch.rand(1, 3, 224, 224))
        >>> print([(k, v.shape) for k, v in out.items()])
        >>>     [('feat1', torch.Size([1, 64, 56, 56])),
        >>>      ('feat2', torch.Size([1, 256, 14, 14]))]
    """

    _version = 2
    __annotations__ = {
        "return_layers": Dict[str, str],
    }

    def __init__(self, model: nn.Module, return_layers: Dict[str, str]) -> None:
        if not set(return_layers).issubset([name for name, _ in model.named_children()]):
            raise ValueError("return_layers are not present in model")
        # Keep the caller's mapping; the stringified copy below is consumed
        # (entries deleted) while scanning the children.
        orig_return_layers = return_layers
        return_layers = {str(k): str(v) for k, v in return_layers.items()}
        layers = OrderedDict()
        for name, module in model.named_children():
            layers[name] = module
            if name in return_layers:
                del return_layers[name]
            # Stop once every requested layer has been collected: children
            # after the last requested one are not registered.
            if not return_layers:
                break

        super().__init__(layers)
        self.return_layers = orig_return_layers

    def forward(self, x):
        # Feed x through the kept children in order, recording the output of
        # each requested layer under its user-chosen name.
        out = OrderedDict()
        for name, module in self.items():
            x = module(x)
            if name in self.return_layers:
                out_name = self.return_layers[name]
                out[out_name] = x
        return out


def _make_divisible(v: float, divisor: int, min_value: Optional[int] = None) -> int:
    """
    This function is taken from the original tf repo.
    It ensures that all layers have a channel number that is divisible by 8
    It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
    """
    if min_value is None:
        min_value = divisor
    # Round v to the nearest multiple of divisor, but never below min_value.
    new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
    # Make sure that round down does not go down by more than 10%.
    if new_v < 0.9 * v:
        new_v += divisor
    return new_v


D = TypeVar("D")


def kwonly_to_pos_or_kw(fn: Callable[..., D]) -> Callable[..., D]:
    """Decorates a function that uses keyword only parameters to also allow them being passed as positionals.

    For example, consider the use case of changing the signature of ``old_fn`` into the one from ``new_fn``:

    .. code::

        def old_fn(foo, bar, baz=None):
            ...

        def new_fn(foo, *, bar, baz=None):
            ...

    Calling ``old_fn("foo", "bar, "baz")`` was valid, but the same call is no longer valid with ``new_fn``. To keep BC
    and at the same time warn the user of the deprecation, this decorator can be used:

    .. code::

        @kwonly_to_pos_or_kw
        def new_fn(foo, *, bar, baz=None):
            ...

        new_fn("foo", "bar, "baz")
    """
    params = inspect.signature(fn).parameters

    try:
        # Index of the first keyword-only parameter in fn's signature.
        keyword_only_start_idx = next(
            idx for idx, param in enumerate(params.values()) if param.kind == param.KEYWORD_ONLY
        )
    except StopIteration:
        raise TypeError(f"Found no keyword-only parameter on function '{fn.__name__}'") from None

    keyword_only_params = tuple(inspect.signature(fn).parameters)[keyword_only_start_idx:]

    @functools.wraps(fn)
    def wrapper(*args: Any, **kwargs: Any) -> D:
        # Split the positionals: anything past the first keyword-only slot is
        # re-mapped to the corresponding keyword name.
        args, keyword_only_args = args[:keyword_only_start_idx], args[keyword_only_start_idx:]
        if keyword_only_args:
            keyword_only_kwargs = dict(zip(keyword_only_params, keyword_only_args))
            # NOTE: this warning is still present in this copy.
            warnings.warn(
                f"Using {sequence_to_str(tuple(keyword_only_kwargs.keys()), separate_last='and ')} as positional "
                f"parameter(s) is deprecated since 0.13 and will be removed in 0.15. Please use keyword parameter(s) "
                f"instead."
            )
            kwargs.update(keyword_only_kwargs)

        return fn(*args, **kwargs)

    return wrapper


W = TypeVar("W", bound=WeightsEnum)
M = TypeVar("M", bound=nn.Module)
V = TypeVar("V")


def handle_legacy_interface(**weights: Tuple[str, Union[Optional[W], Callable[[Dict[str, Any]], Optional[W]]]]):
    """Decorates a model builder with the new interface to make it compatible with the old.

    In particular this handles two things:

    1. Allows positional parameters again, but emits a deprecation warning in case they are used. See
        :func:`torchvision.prototype.utils._internal.kwonly_to_pos_or_kw` for details.
    2. Handles the default value change from ``pretrained=False`` to ``weights=None`` and ``pretrained=True`` to
        ``weights=Weights`` and emits a deprecation warning with instructions for the new interface.

    Args:
        **weights (Tuple[str, Union[Optional[W], Callable[[Dict[str, Any]], Optional[W]]]]): Deprecated parameter
            name and default value for the legacy ``pretrained=True``. The default value can be a callable in which
            case it will be called with a dictionary of the keyword arguments. The only key that is guaranteed to be in
            the dictionary is the deprecated parameter name passed as first element in the tuple. All other parameters
            should be accessed with :meth:`~dict.get`.
    """
    # NOTE: inner_wrapper below contains no warnings.warn call in this copy, so
    # the deprecation warning described in item 2 of the docstring is not
    # emitted here; only the argument translation is performed.

    def outer_wrapper(builder: Callable[..., M]) -> Callable[..., M]:
        @kwonly_to_pos_or_kw
        @functools.wraps(builder)
        def inner_wrapper(*args: Any, **kwargs: Any) -> M:
            for weights_param, (pretrained_param, default) in weights.items():  # type: ignore[union-attr]
                # If neither the weights nor the pretrained parameter as passed, or the weights argument already use
                # the new style arguments, there is nothing to do. Note that we cannot use `None` as sentinel for the
                # weight argument, since it is a valid value.
                sentinel = object()
                weights_arg = kwargs.get(weights_param, sentinel)
                if (
                    (weights_param not in kwargs and pretrained_param not in kwargs)
                    or isinstance(weights_arg, WeightsEnum)
                    or (isinstance(weights_arg, str) and weights_arg != "legacy")
                    or weights_arg is None
                ):
                    continue

                # If the pretrained parameter was passed as positional argument, it is now mapped to
                # `kwargs[weights_param]`. This happens because the @kwonly_to_pos_or_kw decorator uses the current
                # signature to infer the names of positionally passed arguments and thus has no knowledge that there
                # used to be a pretrained parameter.
                pretrained_positional = weights_arg is not sentinel
                if pretrained_positional:
                    # We put the pretrained argument under its legacy name in the keyword argument dictionary to have a
                    # unified access to the value if the default value is a callable.
                    kwargs[pretrained_param] = pretrained_arg = kwargs.pop(weights_param)
                else:
                    pretrained_arg = kwargs[pretrained_param]

                if pretrained_arg:
                    default_weights_arg = default(kwargs) if callable(default) else default
                    if not isinstance(default_weights_arg, WeightsEnum):
                        raise ValueError(f"No weights available for model {builder.__name__}")
                else:
                    default_weights_arg = None

                # Replace the legacy keyword with the new-style weights keyword.
                del kwargs[pretrained_param]
                kwargs[weights_param] = default_weights_arg

            return builder(*args, **kwargs)

        return inner_wrapper

    return outer_wrapper


def _ovewrite_named_param(kwargs: Dict[str, Any], param: str, new_value: V) -> None:
    """Set ``kwargs[param]`` to ``new_value`` if absent; raise ValueError if present with a different value."""
    if param in kwargs:
        if kwargs[param] != new_value:
            raise ValueError(f"The parameter '{param}' expected value {new_value} but got {kwargs[param]} instead.")
    else:
        kwargs[param] = new_value


def _ovewrite_value_param(param: Optional[V], new_value: V) -> V:
    """Return ``new_value``; raise ValueError if ``param`` is not None and differs from it."""
    if param is not None:
        if param != new_value:
            # NOTE(review): the message interpolates the value of `param` in
            # both the name and the "got" position.
            raise ValueError(f"The parameter '{param}' expected value {new_value} but got {param} instead.")
    return new_value


class _ModelURLs(dict):
    # Thin dict subclass; in this copy __getitem__ only delegates to dict.
    def __getitem__(self, item):
        return super().__getitem__(item)

本内容不代表本网观点和政治立场，如有侵犯你的权益请联系我们处理。

网友评论

网友评论仅供其表达个人看法，并不表明网站立场。