# coding=utf-8
'''
Created on 2022年5月23日
@author: 瞌睡蟲子
'''
from lxml import etree
from cssselect import GenericTranslator
from jsonpath import jsonpath
from io import BytesIO
from os import path
import json
import re
import xmltodict
def initJson(data):
    """Return ``data`` as a parsed JSON value.

    Args:
        data: One of
            * an already parsed ``dict`` or ``list`` (returned unchanged),
            * the path of an existing JSON file, or
            * a JSON document as ``str`` / ``bytes``.

    Returns:
        The decoded JSON value.

    Raises:
        ValueError: If the content is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass) or the
            file bytes cannot be decoded.
    """
    # Already parsed: nothing to do. isinstance() also accepts dict/list
    # subclasses such as OrderedDict.
    if isinstance(data, (dict, list)):
        return data

    if path.isfile(data):
        with open(data, "rb") as f:
            raw = f.read()
        try:
            # json.loads() detects UTF-8/16/32 on bytes input.
            return json.loads(raw)
        except ValueError:
            # UnicodeDecodeError and JSONDecodeError are both ValueError
            # subclasses; retry assuming the file is GBK encoded.
            return json.loads(raw.decode("gbk"))

    return json.loads(data)
def jpath(exp, jsonObj):
    """Evaluate the JSONPath expression ``exp`` against ``jsonObj``.

    Thin wrapper around ``jsonpath`` that takes the expression first and the
    document second; the value produced by ``jsonpath`` is returned as is.
    """
    matches = jsonpath(jsonObj, exp)
    return matches
def initDom(data, parseType="html"):
    """Build a document tree from ``data``.

    Public entry point that forwards both arguments to ``_parse`` and
    returns its result unchanged.
    """
    tree = _parse(data, parseType)
    return tree
def xpath(exp, lxmlObj, args=None):
    """Run the XPath expression ``exp`` on ``lxmlObj`` and stringify the results.

    Args:
        exp: XPath expression.
        lxmlObj: Object exposing an ``xpath(expression, **kwargs)`` method.
        args: Optional mapping of keyword arguments forwarded to that method
            (for example ``{"namespaces": {...}}``). ``None`` or an empty
            mapping forwards nothing.

    Returns:
        A list with ``str(item)`` for every result item. A scalar result
        yields a one-element list.
    """
    result = lxmlObj.xpath(exp, **(args or {}))
    if not isinstance(result, list):
        # Expressions such as string(...), count(...) or boolean(...) return
        # a single str/float/bool instead of a list. Iterating those would
        # split a string into characters or raise TypeError on a number.
        result = [result]
    return [str(item) for item in result]
def css(exp, lxmlObj, args=None):
    """Query ``lxmlObj`` with the CSS selector ``exp``.

    The selector is translated to XPath with
    ``GenericTranslator().css_to_xpath`` and then evaluated through
    ``xpath``.

    Args:
        exp: CSS selector.
        lxmlObj: Object to query; passed on to ``xpath``.
        args: Optional mapping of keyword arguments for the underlying XPath
            call. ``None`` means no extra arguments.

    Returns:
        Whatever ``xpath`` returns for the translated expression.
    """
    xpathExp = GenericTranslator().css_to_xpath(exp)
    # Always hand a real mapping to xpath(), so an explicit None from the
    # caller behaves like "no extra arguments".
    return xpath(xpathExp, lxmlObj, {} if args is None else args)
def replace(reg, exp, come):
    """Replace every match of the regular expression ``reg`` in ``come``.

    The pattern is compiled with ``re.MULTILINE | re.DOTALL``.

    Args:
        reg: Regular-expression pattern.
        exp: Replacement. When it contains the substring ``lambda`` it is
            evaluated with ``eval`` and the result (normally a callable that
            receives the match object) is used as the replacement. If it is
            not valid Python it is used as literal replacement text.
        come: Input text.

    Returns:
        The text with all matches replaced.

    SECURITY: ``exp`` may be executed through ``eval``; never pass
    untrusted input as the replacement.
    """
    pattern = re.compile(reg, re.MULTILINE | re.DOTALL)
    repl = exp
    if 'lambda' in exp:
        try:
            repl = eval(exp, globals(), locals())
        except SyntaxError:
            # Plain text that merely contains the word "lambda".
            repl = exp
    return pattern.sub(repl, come)
def pyeval(exp, args=None):
    """Evaluate the Python expression ``exp`` and return its value.

    The expression sees this module's globals plus two local names:
    ``exp`` (the expression text itself) and ``args`` (the optional second
    argument).

    NOTE: this is a direct ``eval``; the expression is executed as code.
    """
    scope = {"exp": exp, "args": args}
    return eval(exp, globals(), scope)
def xmlToJson(xml):
    """Convert an XML document into a JSON string.

    ``xml`` is parsed with ``xmltodict.parse`` (``encoding='utf-8'``) and the
    resulting mapping is serialized with ``json.dumps`` using a 4-space
    indent.
    """
    return json.dumps(xmltodict.parse(xml, encoding='utf-8'), indent=4)
def jsonToXml(js):
    """Convert a JSON document into XML.

    Args:
        js: JSON text, decoded with ``json.loads``.

    Returns:
        The value produced by ``xmltodict.unparse`` for the decoded data. If
        the data cannot be unparsed directly, it is wrapped in a single
        ``root`` element first.

    Raises:
        ValueError: If ``js`` is not valid JSON.
    """
    jsDict = json.loads(js)
    try:
        return xmltodict.unparse(jsDict, encoding='utf-8')
    except Exception:
        # Narrowed from a bare "except:" so KeyboardInterrupt/SystemExit are
        # no longer swallowed. The fallback gives the document one
        # synthetic root element.
        return xmltodict.unparse({'root': jsDict}, encoding='utf-8')
def _parse(data, parseType="html"):
    """Parse ``data`` with lxml and return the result of ``etree.parse``.

    Args:
        data: Markup as ``bytes``/``bytearray``, the path of an existing
            file, or markup as ``str``.
        parseType: ``"html"``, ``"feed"`` or anything else for XML
            (compared case-insensitively).

    Returns:
        The tree object returned by ``etree.parse``.
    """
    # Choose the parser.
    kind = str.lower(parseType)
    if kind == "html":
        parser = etree.HTMLParser()
    elif kind == "feed":
        # NOTE(review): confirm that the installed lxml actually exposes
        # etree.FeedParser; kept as in the original.
        parser = etree.FeedParser()
    else:
        parser = etree.XMLParser()

    # Normalize every input form to bytes so the document is parsed once.
    if isinstance(data, (bytes, bytearray)):
        # Passed in as raw bytes.
        raw = data
    elif path.isfile(data):
        # Passed in as a file path. Read it as bytes to avoid the error
        # raised for XML that carries a utf-8 declaration header.
        with open(data, "rb") as f:
            raw = f.read()
    else:
        # Passed in as a string. Encode it to bytes for the same reason.
        raw = data.encode()
    return etree.parse(BytesIO(raw), parser)
if __name__ == "__main__":
    # Manual smoke test: load a sample invoice JSON file and print every
    # "fp:SellerAddrTel" value found in it.
    sample = initJson(r"C:\Users\44413\Desktop\ofd\11.json")
    print(jpath("$..fp:SellerAddrTel", sample))
UiBot 脚本：OFD 格式发票解析
// Parse an OFD invoice file and return its invoice data as a dictionary.
// ofdPath: path of the .ofd file; it is decompressed into rootPath below.
// Returns: the "eInvoice" object read from the attachment XML, with
// "fp:GoodsInfos" set to Null and the CustomData name/value pairs stored
// under "header".
Function ofd解析(ofdPath)
Dim dom,dt,dt1,dt2,keys,values
Dim data = ""
Dim sRet = ""
Dim arrayRet = ""
Dim rootPath = @res"data"
// Create the temporary extraction directory (an existing one is deleted first)
If File.FolderExists(rootPath)
File.DeleteFolder(rootPath)
End If
File.CreateFolder(rootPath)
// Unzip the OFD file into rootPath
arrayRet = File.Decompression(ofdPath,rootPath,{"sPassword":""})
// Find the root XML file; the first match (data[0]) is used
data = File.SearchFile(rootPath,"*.xml",False)
// Read it as UTF-8 and convert XML -> JSON text -> object
sRet = File.Read(data[0],"utf-8")
sRet = domtools.xmlToJson(sRet)
sRet = JSON.Parse(sRet)
// sRet = domtools.InitJson(sRet)
// Build the namespace map for the XPath calls: the key is the lower-cased
// @DocType value, the value is the @xmlns:ofd URI.
// NOTE(review): the XPath expressions below hard-code the "ofd:" prefix, so
// this presumably relies on @DocType being "OFD" - confirm.
dt = domtools.Jpath("$..@DocType",sRet)
dt = LCase(dt[0])
dt1 = domtools.Jpath("$..@xmlns:ofd",sRet)
namespaces = {"namespaces": {dt: dt1[0]}}
Log.Info(namespaces)
// Build the XML DOM used for the queries
dom = domtools.InitDom(data[0],"xml")
// Collect the keys: the Name attribute of every CustomData element
keys = domtools.Xpath("//ofd:CustomData/@Name",dom,namespaces)
// Collect the values: the text of every CustomData element
values = domtools.Xpath("//ofd:CustomData/text()",dom,namespaces)
// Combine the keys and values into a dictionary
data = domtools.pyeval("dict(zip(args[0],args[1]))",[keys,values])
Log.Info(data)
// Get the path of the content XML (DocRoot), relative to rootPath
dt = domtools.Xpath("//ofd:DocRoot/text()",dom,namespaces)
dt = rootPath & "\\" & dt[0]
Log.Info(dt)
// Get the path of the Attachments XML, relative to the content XML's folder
dom = domtools.InitDom(dt,"xml")
dt1 = domtools.Xpath("//ofd:Attachments/text()",dom,namespaces)
dt1 = File.ParentPath(dt) & "\\" & dt1[0]
Log.Info(dt1)
// Get the path of the FileLoc XML, relative to the Attachments XML's folder
dom = domtools.InitDom(dt1,"xml")
dt2 = domtools.Xpath("//ofd:FileLoc/text()",dom,namespaces)
dt2 = File.ParentPath(dt1) & "\\" & dt2[0]
Log.Info(dt2)
// Convert the FileLoc XML into a dictionary
sRet = File.Read(dt2,"utf-8")
sRet = domtools.xmlToJson(sRet)
sRet = JSON.Parse(sRet)
// Clean up the result: keep only eInvoice, blank out the goods details and
// attach the CustomData dictionary as "header"
sRet = sRet["eInvoice"]
sRet["fp:GoodsInfos"] = Null
sRet["header"] = data
Log.Info(data)
Return sRet
End Function
// Example run: parse a sample OFD invoice and print the resulting data.
ofdPath='''C:\Users\44413\Desktop\412.ofd'''
data = ofd解析(ofdPath)
TracePrint(data)