Windows 10上使用Python 3.7.9的paddleocr模块从健康云或者随申办截图中解析核酸检测数据

奥密克戎在上海爆发了,一个友人收集了一些健康云或者随申办的核酸检测截图做数据分析来帮助抗疫,可是人工处理太费事了,于是问我能不能帮忙写一个脚本把核酸检测结果从截图中解析出来,然后把结果导出到Excel文件。

我把它分享出来,感兴趣的同学可以参考一下。

准备工作

1,从官网下载Python 3.7.9(Windows x86_64),之所以下载这一版本的Python是我发现其他的版本在安装paddleocr模块的时候会有一些乱七八糟的报错。

Python 3.7.9 下载页面(Python.org):https://www.python.org/downloads/release/python-379/

2,安装Build Tools for Visual Studio 2022,不然会有Microsoft Visual C++ 14.0 is required报错
Visual Studio 下载页面(Build Tools for Visual Studio 2022):https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022

3,解决paddlepaddle模块依赖的numpy版本与imageio模块依赖的numpy版本冲突问题

参考:"paddlepaddle所要的numpy版本与imageio所要的numpy版本冲突了怎么办?"(PaddlePaddle/Paddle Issue #40142):https://github.com/PaddlePaddle/Paddle/issues/40142

python -m pip install paddlepaddle==0.0.0 -f https://www.paddlepaddle.org.cn/whl/windows/cpu-mkl-avx/develop.html

4,Windows命令行显示汉字

通过 chcp命令改变代码页,UTF-8的代码页为65001
chcp 65001

下面是脚本代码

# -*- coding: utf-8 -*-
import pandas as pd
import os
import re
import time
import requests
from paddleocr import PaddleOCR, draw_ocr
from pprintpp import pprint as pp

def download(url, file_path, timeout=30):
    """Download ``url`` and store the response body at ``file_path``.

    Any file already present at ``file_path`` is removed first. On failure
    (network error, invalid URL, or a non-success HTTP status) a message is
    printed and no file is left at ``file_path``, so callers can use
    ``os.path.exists(file_path)`` to tell whether the download succeeded.

    Args:
        url: Address of the resource to fetch.
        file_path: Destination path of the downloaded file.
        timeout: Seconds to wait for the server before giving up; without a
            timeout a stalled connection would block the script forever.
    """
    if os.path.exists(file_path):
        print(f"Delete the file {file_path}, as it already exists")
        os.remove(file_path)

    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
    try:
        # The context manager releases the streamed connection when done.
        with requests.get(url, headers=headers, stream=True, timeout=timeout) as get_response:
            print(get_response)
            if not get_response.ok:
                # Do not save an HTTP error page as if it were the image.
                print(f"Download of {url} failed with HTTP status {get_response.status_code}")
                return
            with open(file_path, 'wb') as f:
                for chunk in get_response.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
    except requests.RequestException as err:
        print(f"Download of {url} failed: {err}")
        # Drop a partially written file so it is not mistaken for a valid one.
        if os.path.exists(file_path):
            os.remove(file_path)

def parse_result(ocr_result):
    """Extract the sampling time and the test result from flattened OCR text.

    Args:
        ocr_result: The recognised text fragments of one screenshot joined
            into a single whitespace-separated string.

    Returns:
        A ``(sampling_time, validate_result)`` tuple of strings. A field that
        cannot be found is returned as ``''``.
    """
    sampling_time, validate_result = '', ''
    # The label may be followed by an ASCII colon, a full-width colon or
    # nothing. The value ends at the next whitespace or at the end of the
    # text, and surrounding whitespace is not part of the captured value.
    ocr_s_sampling = re.search(
        r'采样时间[::]?\s+(?P<sampling_time>\d{4}-\d{2}-\d{2}\s*\d{2}:\d{2}:\d{2})(?:\s+|$)',
        ocr_result,
    )
    if ocr_s_sampling:
        sampling_time = ocr_s_sampling.group('sampling_time')
    ocr_s_validate = re.search(
        r'检测结果[::]?\s+(?P<validate_result>.+?)(?:\s+|$)',
        ocr_result,
    )
    if ocr_s_validate:
        validate_result = ocr_s_validate.group('validate_result')

    return sampling_time, validate_result

# Build the Chinese OCR engine (with text-angle classification) once; the
# same instance is used for every screenshot further down.
ocr = PaddleOCR(lang='ch', use_angle_cls=True)

# Folder that receives the downloaded screenshots.
img_dir_name = 'images'
if os.path.exists(img_dir_name):
    print(f"Directory {img_dir_name} already exists")
else:
    os.mkdir(img_dir_name)
    print(f"Directory {img_dir_name} created")

# Worksheet to read; the sheet name is shown at the bottom left of the
# workbook in Excel. Change it to match your file.
my_sheet = 'Sheet1'
# Workbook to read; change it to the name of your Excel file.
# Another input used previously: 'data/157993202_0_出舱信息登记_447_447-1.xlsx'
data_file_path = 'data/157993202_0_出舱信息登记_604_604-1.xlsx'

# Each row is later turned into text with str() and searched with a regex,
# so the column display width is raised to 500 characters.
pd.set_option('max_colwidth', 500)
# The sheet_name keyword needs pandas 0.21.0 or newer.
df = pd.read_excel(data_file_path, sheet_name=my_sheet)

def _fetch_and_parse_report(image_url, image_path, label):
    """Download one screenshot, run OCR on it and parse the result.

    Args:
        image_url: Address of the uploaded screenshot.
        image_path: Local path the screenshot is saved to.
        label: Prefix used when printing the parsed values.

    Returns:
        A ``(sampling_time, validate_result)`` tuple of strings; both are
        ``''`` when the image is missing or OCR yields nothing.
    """
    print(f"{image_path}")
    download(image_url, image_path)
    sampling_time, validate_result = '', ''
    if os.path.exists(image_path):
        ocr_lines = ocr.ocr(image_path, cls=True)
        if ocr_lines:
            # Flatten the recognised fragments into one string separated by
            # three spaces. line[1][0] is read as the text of a fragment
            # (NOTE(review): confirm against the installed PaddleOCR version).
            ocr_text = ''.join(f"   {line[1][0]}" for line in ocr_lines)
            sampling_time, validate_result = parse_result(ocr_text)
            print(f"{label}: {sampling_time}|{validate_result}")
    return sampling_time, validate_result


# Matches the text form of one worksheet row (str() of a pandas Series).
# Each named group captures the value that follows the corresponding column
# label; the label may end with an ASCII or a full-width colon.
_ROW_PATTERN = re.compile(
    r'您的姓名[::]\s+(?P<name>.*)\s*.*'
    r'您的性别[::]\s+(?P<sex>.*)\s*.*'
    r'您的手机号码[::]\s+(?P<phone>.+)\s*.*'
    r'您的身份证号码[::]\s+(?P<id_number>.+)\s*.*'
    r'您在上海常住地址所属街道或乡镇[::]?\s+(?P<home_address>.*)\s*.*'
    r'请您上传24小时内核酸报告阴性图片[::]?\s+(?P<nucleic_acid_24h>.*)\s*.*'
    r'请您上传72小时内核酸报告阴性图片[::]?\s+(?P<nucleic_acid_72h>.*)\s*'
)

users_results_parsed = []
for index, data in df.iterrows():
    user_results_parsed = []

    row_text = str(data)
    s = _ROW_PATTERN.search(row_text)
    if s:
        name = s.group('name')
        sex = s.group('sex')
        # Remove the ".0" that appears when a number was read as a float.
        phone = re.sub(r'\.0', '', s.group('phone'))
        id_number = re.sub(r'\.0', '', s.group('id_number'))
        home_address = s.group('home_address')
        nucleic_acid_24h = s.group('nucleic_acid_24h')
        nucleic_acid_72h = s.group('nucleic_acid_72h')
        print(f"{name}|{phone}|{id_number}|{home_address}|{nucleic_acid_24h}|{nucleic_acid_72h}")

        # One sub-folder per person keeps the two screenshots together.
        user_img_dir_name = f"{img_dir_name}/{name}_{id_number}"
        if not os.path.exists(user_img_dir_name):
            os.mkdir(user_img_dir_name)
            print(f"User Directory {user_img_dir_name} created")
        else:
            print(f"User Directory {user_img_dir_name} already exists")

        # latest 24 hours
        user_img_file_24h = f"{user_img_dir_name}/1.png"
        user_sampling_time_latest_24h, user_validate_result_latest_24h = _fetch_and_parse_report(
            nucleic_acid_24h, user_img_file_24h, '24hours')

        # latest 72 hours
        user_img_file_72h = f"{user_img_dir_name}/2.png"
        user_sampling_time_latest_72h, user_validate_result_latest_72h = _fetch_and_parse_report(
            nucleic_acid_72h, user_img_file_72h, '72hours')

        # Flag rows whose two screenshots carry the same time and result.
        duplicate_24h_72h_flag = 'N'
        if (user_sampling_time_latest_24h == user_sampling_time_latest_72h
                and user_validate_result_latest_24h == user_validate_result_latest_72h):
            duplicate_24h_72h_flag = 'Y'

        user_results_parsed = [name, sex, phone, id_number, home_address, user_sampling_time_latest_24h, user_validate_result_latest_24h, user_sampling_time_latest_72h, user_validate_result_latest_72h, duplicate_24h_72h_flag]
    else:
        print(f"Row {index}: could not be parsed, an empty result row is recorded")
    # Appended for every input row; an unparsed row contributes an empty list.
    users_results_parsed.append(user_results_parsed)

    # Pause between rows before issuing the next downloads.
    time.sleep(2)
#print(users_results_parsed)

# Write the collected rows to an Excel workbook without the index column.
export_data_file = 'parsed_results.xlsx'
export_columns = [
    '姓名',
    '性别',
    '手机号码',
    '身份证号码',
    '上海常住地址',
    '24小时采样时间',
    '24小时检测结果',
    '72小时采样时间',
    '72小时检测结果',
    '是否重复',
]
df2 = pd.DataFrame(users_results_parsed, columns=export_columns)
df2.to_excel(export_data_file, index=False)

你可能感兴趣的:(Python,python,windows)