xml/html/json操作工具类,ofd发票解析

# coding=utf-8
'''
Created on 2022年5月23日

@author: 瞌睡蟲子
'''
from lxml import etree
from cssselect import GenericTranslator
from jsonpath import jsonpath
from io import BytesIO
from os import path
import json
import re
import xmltodict


def initJson(data):
    """Return a parsed JSON object from several kinds of input.

    Args:
        data: One of the following:
            * a ``dict`` or ``list`` that is already parsed -- returned as is;
            * the path of an existing file -- the file is read as bytes and
              parsed; if that fails the bytes are decoded as GBK and parsed
              again;
            * a JSON document as ``str`` or ``bytes``.

    Returns:
        The parsed JSON value.

    Raises:
        ValueError: If the content is not valid JSON (``json.JSONDecodeError``
            and ``UnicodeDecodeError`` are both subclasses of ``ValueError``).
    """
    # Already-parsed containers need no work; checking them first also keeps
    # non-path objects away from path.isfile().
    if isinstance(data, (dict, list)):
        return data
    if path.isfile(data):
        with open(data, "rb") as f:
            raw = f.read()
        try:
            # json.loads auto-detects UTF-8/16/32 when handed bytes.
            return json.loads(raw)
        except ValueError:
            # Not valid UTF-x JSON: retry assuming the file is GBK encoded.
            return json.loads(raw.decode("gbk"))
    return json.loads(data)


def jpath(exp, jsonObj):
    """Run the JSONPath expression ``exp`` against ``jsonObj``.

    The arguments are handed to ``jsonpath`` in the order it expects (object
    first, expression second) and its result is returned unchanged.
    NOTE(review): the jsonpath package is understood to return ``False``
    rather than an empty list when nothing matches -- confirm before relying
    on indexing the result.
    """
    matches = jsonpath(jsonObj, exp)
    return matches


def initDom(data, parseType="html"):
    """Public entry point for building a DOM tree; delegates to ``_parse``.

    Args:
        data: Bytes, a file path, or markup text (see ``_parse``).
        parseType: Parser selector forwarded to ``_parse``; defaults to
            ``"html"``.

    Returns:
        Whatever ``_parse`` returns for the given input.
    """
    tree = _parse(data, parseType)
    return tree


def xpath(exp, lxmlObj, args=None):
    """Evaluate an XPath expression and return every result as a string.

    Args:
        exp: The XPath expression.
        lxmlObj: Any object exposing an lxml-style ``xpath(exp, **kwargs)``
            method.
        args: Optional dict of keyword arguments forwarded to that method
            (for example ``{"namespaces": {...}}``).

    Returns:
        A list of ``str``. List results are converted item by item; a scalar
        result (number, boolean or string, as produced by expressions such as
        ``count(...)`` or ``string(...)``) is returned as a one-element list.
    """
    # A None default avoids sharing one mutable dict across calls.
    kwargs = {} if args is None else args
    found = lxmlObj.xpath(exp, **kwargs)
    if not isinstance(found, list):
        # Scalars are not iterable (float/bool) or would be split into
        # single characters (str), so wrap them instead.
        found = [found]
    return [str(item) for item in found]


def css(exp, lxmlObj, args=None):
    """Evaluate a CSS selector by translating it to XPath.

    Args:
        exp: The CSS selector.
        lxmlObj: Object to query; passed through to ``xpath``.
        args: Optional dict of keyword arguments passed through to ``xpath``.

    Returns:
        The list of strings produced by ``xpath`` for the translated selector.
    """
    translated = GenericTranslator().css_to_xpath(exp)
    # Always forward a dict (a fresh one when none was supplied) instead of
    # relying on a shared mutable default argument.
    return xpath(translated, lxmlObj, {} if args is None else args)


def replace(reg, exp, come):
    """Regex-substitute inside ``come`` using multi-line, dot-all matching.

    Args:
        reg: Regular-expression pattern; compiled with ``re.MULTILINE`` and
            ``re.DOTALL``.
        exp: The replacement. Either a callable, a plain replacement string,
            or a string containing Python source for a ``lambda`` that
            receives the match object. A string that contains the word
            "lambda" but is not valid Python is used as literal replacement
            text.
        come: The text to process.

    Returns:
        ``come`` with every match of ``reg`` replaced.
    """
    if callable(exp):
        repl = exp
    elif "lambda" in exp:
        # SECURITY: eval executes arbitrary code. Only pass replacement
        # expressions that come from a trusted source.
        try:
            repl = eval(exp, globals(), locals())
        except SyntaxError:
            # Ordinary text that merely mentions "lambda": use it verbatim.
            repl = exp
    else:
        repl = exp
    return re.sub(re.compile(reg, re.MULTILINE | re.DOTALL), repl, come)


def pyeval(exp, args=None):
    """Evaluate the Python expression ``exp`` and return its value.

    The expression can refer to the module's globals and to the names
    ``args`` and ``exp``.

    Args:
        exp: Python expression source.
        args: Arbitrary value exposed to the expression as ``args``.

    Returns:
        The value the expression evaluates to.
    """
    # SECURITY: eval executes arbitrary code. Only evaluate expressions that
    # come from a trusted source.
    # A single merged namespace is used as the globals mapping because nested
    # scopes inside the expression (lambdas, generator expressions) resolve
    # free names through globals only and would not see a separate locals
    # mapping.
    namespace = dict(globals())
    namespace["exp"] = exp
    namespace["args"] = args
    return eval(exp, namespace)


def xmlToJson(xml):
    """Convert an XML document into a JSON string.

    The XML is parsed with ``xmltodict.parse`` (encoding ``utf-8``) and the
    resulting mapping is serialised with ``json.dumps`` using a 4-space
    indent.
    """
    parsed = xmltodict.parse(xml, encoding='utf-8')
    return json.dumps(parsed, indent=4)


def jsonToXml(js):
    """Convert a JSON string into an XML document.

    Args:
        js: JSON text.

    Returns:
        The XML produced by ``xmltodict.unparse``. When the parsed value
        cannot be unparsed directly, it is wrapped in a ``root`` element and
        unparsed again.
    """
    parsed = json.loads(js)
    try:
        return xmltodict.unparse(parsed, encoding='utf-8')
    except Exception:
        # Presumably the value has no single top-level element -- wrap it.
        # ``Exception`` instead of a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate.
        return xmltodict.unparse({'root': parsed}, encoding='utf-8')


def _parse(data, parseType="html"):
    """Parse markup into an lxml tree.

    Args:
        data: ``bytes`` holding the document, the path of an existing file,
            or the document as text.
        parseType: Case-insensitive parser selector: ``"html"`` picks
            ``etree.HTMLParser``, ``"feed"`` picks ``etree.FeedParser`` and
            anything else picks ``etree.XMLParser``.

    Returns:
        The result of ``etree.parse`` for the document.
    """
    # Pick the parser class by name; only the selected attribute is looked up.
    # NOTE(review): confirm that lxml.etree actually exposes ``FeedParser``.
    mode = str.lower(parseType)
    parser_name = {"html": "HTMLParser", "feed": "FeedParser"}.get(mode, "XMLParser")
    parser = getattr(etree, parser_name)()

    # Normalise every input form to bytes. Per the original author's note,
    # feeding bytes avoids the error seen with XML that carries a utf-8
    # declaration header.
    if type(data) is bytes:
        raw = data
    elif path.isfile(data):
        with open(data, "rb") as fh:
            raw = fh.read()
    else:
        raw = data.encode()
    return etree.parse(BytesIO(raw), parser)


if __name__ == "__main__":
    # Manual smoke test: load a sample invoice JSON file and print every
    # value found under the "fp:SellerAddrTel" key.
    invoice = initJson(r"C:\Users\44413\Desktop\ofd\11.json")
    print(jpath("$..fp:SellerAddrTel", invoice))
  

UiBot:OFD 格式发票解析

// Parse an OFD invoice file.
// ofdPath: path of the .ofd file; it is decompressed as an archive below.
// Returns the "eInvoice" element of the attachment XML as a dictionary, with
// "fp:GoodsInfos" set to Null and a "header" entry holding the CustomData
// name/value pairs read from the first XML file found after extraction.
Function ofd解析(ofdPath)
    
    Dim dom,dt,dt1,dt2,keys,values
    Dim data = ""
    Dim sRet = ""
    Dim arrayRet = ""
    // Extraction directory (presumably resolved relative to the flow's res folder -- confirm)
    Dim rootPath = @res"data"
    
    // Create the temporary extraction directory (any existing one is deleted first)
    If File.FolderExists(rootPath)
        File.DeleteFolder(rootPath)
    End If
    File.CreateFolder(rootPath)
    // Unzip the OFD file (an empty password is passed)
    arrayRet = File.Decompression(ofdPath,rootPath,{"sPassword":""})
    // Find the root XML file (data[0] is assumed to be the OFD root document -- confirm)
    data = File.SearchFile(rootPath,"*.xml",False)
    
    
    
    // Read the root XML and convert it to a dictionary via JSON
    sRet = File.Read(data[0],"utf-8")
    sRet = domtools.xmlToJson(sRet)
    sRet = JSON.Parse(sRet)
    // sRet = domtools.InitJson(sRet)
    // Build the XPath namespace map: lower-cased DocType as the prefix
    // (presumably "ofd", matching the expressions below) and the xmlns:ofd value as the URI
    dt = domtools.Jpath("$..@DocType",sRet)
    dt = LCase(dt[0])
    dt1 = domtools.Jpath("$..@xmlns:ofd",sRet)
    namespaces = {"namespaces": {dt: dt1[0]}}
    Log.Info(namespaces)
    
    
    // Build the XML DOM used for the XPath queries
    dom = domtools.InitDom(data[0],"xml")
    // Find the CustomData keys (Name attributes)
    keys = domtools.Xpath("//ofd:CustomData/@Name",dom,namespaces)
    // Find the CustomData values (text content)
    values = domtools.Xpath("//ofd:CustomData/text()",dom,namespaces)
    // Convert the keys and values into a dictionary
    data = domtools.pyeval("dict(zip(args[0],args[1]))",[keys,values])
    Log.Info(data)
    // Get the path of the content XML (DocRoot, relative to the extraction directory)
    dt = domtools.Xpath("//ofd:DocRoot/text()",dom,namespaces)
    dt = rootPath & "\\" & dt[0]
    Log.Info(dt)
    // Get the path of the Attachments XML (relative to the content XML's folder)
    dom = domtools.InitDom(dt,"xml")
    dt1 = domtools.Xpath("//ofd:Attachments/text()",dom,namespaces)
    dt1 = File.ParentPath(dt) & "\\" & dt1[0]
    Log.Info(dt1)
    // Get the path of the FileLoc XML (relative to the Attachments XML's folder)
    dom = domtools.InitDom(dt1,"xml")
    dt2 = domtools.Xpath("//ofd:FileLoc/text()",dom,namespaces)
    dt2 = File.ParentPath(dt1) & "\\" & dt2[0]
    Log.Info(dt2)
    // Convert the FileLoc XML into a dictionary
    sRet = File.Read(dt2,"utf-8")
    sRet = domtools.xmlToJson(sRet)
    sRet = JSON.Parse(sRet)
    // Clean up the full OFD content: keep only eInvoice, blank out the goods list, attach the header
    sRet = sRet["eInvoice"]
    sRet["fp:GoodsInfos"] = Null
    sRet["header"] = data
    // NOTE(review): this logs the header dictionary a second time; logging sRet may have been intended -- confirm
    Log.Info(data)
    Return sRet
End Function

// Example invocation: parse a sample OFD invoice and print the resulting dictionary
ofdPath='''C:\Users\44413\Desktop\412.ofd'''
data = ofd解析(ofdPath)
TracePrint(data)


你可能感兴趣的:(xml/html/json操作工具类,ofd发票解析)