python循环读取文件夹中的图片表格内容

有几百张格式相同的图片,图片中有固定格式的表格内容,需要读取表格中的固定单元格的值,根据值将图片存放到不同的目录。以下是实现代码。

这里使用的是 img2table 库,首先需要安装 img2table(示例中用到的 TesseractOCR 还要求本机已安装 Tesseract-OCR):

 pip install img2table

1、导入所需的库

import os
from img2table.document import Image
from img2table.ocr import TesseractOCR
import shutil
import string
import random

2、读取文件夹中的所有图片文件名

# Folder holding the images to classify (the trailing slash matters: file
# names are appended to this string directly).
path_img = 'newimg/'
# Names of all entries in that folder; os.listdir() already returns a list.
img_dir = os.listdir(path_img)
# Single-threaded Tesseract OCR backend, reused for every image.
ocr = TesseractOCR(n_threads=1)

当然也可以换个其他写法。

3、定义了个随机生成字符串的方法,用来替换原来的文件名

def generate_random_string(length):
    """Return a random string made of ``length`` lowercase ASCII letters.

    A ``length`` of zero (or less) yields the empty string.
    """
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

4、循环处理图片,懒得拆分方法了,整个粘贴了!!

# Sort every image into ./data/ (amount >= 1000) or ./dataxy/ (amount < 1000)
# according to values read by OCR from fixed cells of the table in the image.
for img in img_dir:
    old_file_name = path_img + img
    if not os.path.isfile(old_file_name):
        # os.listdir() also returns sub-directories; those are not images.
        continue

    # The image is processed under a temporary ASCII-only name because
    # reading it fails when the file name contains Chinese characters.
    new_file_name = path_img + "ocrimgimg.png"
    os.rename(old_file_name, new_file_name)

    try:
        image = Image(new_file_name)
        image_tables = image.extract_tables(ocr=ocr)

        for td in image_tables:
            try:
                # Fixed cell positions of the expected table layout.
                amt = td.content[4][2].value
                jydate = td.content[7][2].value
                kjdate = td.content[7][5].value
                # company = td.content[2][2].value

                print(img, '======', amt, '====', jydate, '======', kjdate)

                amount = float(amt)
                filename = kjdate + '___' + generate_random_string(10)
            except (KeyError, IndexError, TypeError, ValueError) as e:
                # This table lacks the expected cells, or OCR returned an
                # unusable value (None / non-numeric text): try the next one.
                print(img, 'skipped table:', repr(e))
                continue

            if amount >= 1000:
                dirname = './data/'
            elif amount < 1000:
                dirname = './dataxy/'
            else:
                # NaN compares false both ways; leave the image in place.
                continue

            try:
                os.makedirs(dirname, exist_ok=True)
                shutil.move(new_file_name, dirname + filename + '.png')
            except OSError as e:
                # The image could not be filed; it is still under its
                # temporary name, so another table may be tried.
                print(img, 'move failed:', repr(e))
                continue

            # The image has been filed; further tables in the same image
            # must not trigger a second move of a file that is gone.
            break
    finally:
        # Give the image its original name back whenever it was not moved
        # away, including when OCR itself raised, so the temporary name is
        # free for the next image.
        if os.path.exists(new_file_name):
            os.rename(new_file_name, old_file_name)

因为我的图片文件名是中文的,所以需要先把原来的文件名替换成英文(否则读取图片表格时会报错)。最核心的是下面这段代码:读取图片并提取图片中的表格。

python循环读取文件夹中的图片表格内容_第1张图片

完整代码也粘贴下:

import os
from img2table.document import Image
from img2table.ocr import TesseractOCR
import shutil
import string
import random

# Folder holding the images to classify (the trailing slash matters: file
# names are appended to this string directly).
path_img = 'newimg/'
# Names of all entries in that folder; os.listdir() already returns a list.
img_dir = os.listdir(path_img)
# Single-threaded Tesseract OCR backend, reused for every image.
ocr = TesseractOCR(n_threads=1)

def generate_random_string(length):
    """Return a random string made of ``length`` lowercase ASCII letters.

    A ``length`` of zero (or less) yields the empty string.
    """
    alphabet = string.ascii_lowercase
    picked = []
    for _ in range(length):
        picked.append(random.choice(alphabet))
    return ''.join(picked)

# Sort every image into ./data/ (amount >= 1000) or ./dataxy/ (amount < 1000)
# according to values read by OCR from fixed cells of the table in the image.
for img in img_dir:
    old_file_name = path_img + img
    if not os.path.isfile(old_file_name):
        # os.listdir() also returns sub-directories; those are not images.
        continue

    # The image is processed under a temporary ASCII-only name because
    # reading it fails when the file name contains Chinese characters.
    new_file_name = path_img + "ocrimgimg.png"
    os.rename(old_file_name, new_file_name)

    try:
        image = Image(new_file_name)
        image_tables = image.extract_tables(ocr=ocr)

        for td in image_tables:
            try:
                # Fixed cell positions of the expected table layout.
                amt = td.content[4][2].value
                jydate = td.content[7][2].value
                kjdate = td.content[7][5].value
                # company = td.content[2][2].value

                print(img, '======', amt, '====', jydate, '======', kjdate)

                amount = float(amt)
                filename = kjdate + '___' + generate_random_string(10)
            except (KeyError, IndexError, TypeError, ValueError) as e:
                # This table lacks the expected cells, or OCR returned an
                # unusable value (None / non-numeric text): try the next one.
                print(img, 'skipped table:', repr(e))
                continue

            if amount >= 1000:
                dirname = './data/'
            elif amount < 1000:
                dirname = './dataxy/'
            else:
                # NaN compares false both ways; leave the image in place.
                continue

            try:
                os.makedirs(dirname, exist_ok=True)
                shutil.move(new_file_name, dirname + filename + '.png')
            except OSError as e:
                # The image could not be filed; it is still under its
                # temporary name, so another table may be tried.
                print(img, 'move failed:', repr(e))
                continue

            # The image has been filed; further tables in the same image
            # must not trigger a second move of a file that is gone.
            break
    finally:
        # Give the image its original name back whenever it was not moved
        # away, including when OCR itself raised, so the temporary name is
        # free for the next image.
        if os.path.exists(new_file_name):
            os.rename(new_file_name, old_file_name)

你可能感兴趣的:(python,前端,linux)