本文主要介绍有关 Python 对 XML 文件格式的读写,及格式化,序列化,更多 Python 进阶系列文章,请参考 Python 进阶学习 玩转数据系列
内容提要:
XML: Extensible Markup Language
JSON:JavaScript Object Notation
ElementTree:用来创建和解析 XML
创建 XML:
# Minimal example: create an XML document with a single root element.
# xml.etree.cElementTree was removed in Python 3.9; xml.etree.ElementTree
# exposes the same API.
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import Element

root = Element("root")
tree = ElementTree(root)
# Serialize the tree (only the empty root element here) to results.xml.
tree.write('results.xml', encoding='utf8')
举例:
# Build a <contacts> tree from a list of records and save it to results.xml.
# xml.etree.cElementTree was removed in Python 3.9; xml.etree.ElementTree
# exposes the same API.
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import Element
from collections import namedtuple

root = Element('contacts')
tree = ElementTree(root)

# create ContactRecord class, its fields are first, last, age and email
Contact = namedtuple("ContactRecord", 'first last age email')

# Information to populate XML tree with
records = [
    Contact('Tom', 'Smith', 53, 'tsmith@boo.com'),
    Contact('Phil', 'Hammer', 42, 'phammer@boo.com'),
    Contact('Mary', 'Fast', 22, 'mfast@boo.com'),
    Contact('Jessica', 'Rest', 33, 'jrest@goo.com'),
]
# Oldest contact first.
records.sort(key=lambda a: a.age, reverse=True)
print("records:\n{}".format(records))

# Now build and append nodes to the XML tree:
for record in records:
    contact = Element('contact')
    name = Element('name')
    first = Element('first')
    last = Element('last')
    email = Element('email')

    # Attribute values are stored as strings, e.g. <name age="53">
    name.set('age', str(record.age))
    first.text = record.first
    last.text = record.last
    email.text = record.email

    # Resulting layout: <contact><name><first/><last/></name><email/></contact>
    name.append(first)
    name.append(last)
    contact.append(name)
    contact.append(email)
    root.append(contact)

# save the built XML tree as an XML file:
tree.write('results.xml', encoding='utf8')
输出:
records:
[ContactRecord(first='Tom', last='Smith', age=53, email='tsmith@boo.com'), ContactRecord(first='Phil', last='Hammer', age=42, email='phammer@boo.com'), ContactRecord(first='Jessica', last='Rest', age=33, email='jrest@goo.com'), ContactRecord(first='Mary', last='Fast', age=22, email='mfast@boo.com')]
设置和获取属性的方法:
.set(): 设置属性
element.set(name, value)
.attrib: 设置属性(attrib 是元素的属性字典,不是方法)
element.attrib = {name:value}
element.attrib[name] = value
.get(): 获取属性
element.get(name)
element.get(name, default value)
举例:
# Demonstrates the different ways to set and read an element attribute.
# xml.etree.cElementTree was removed in Python 3.9; xml.etree.ElementTree
# exposes the same API.
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import Element
from collections import namedtuple

root = Element('contacts')
tree = ElementTree(root)

# create ContactRecord class, its fields are first, last, age and email
Contact = namedtuple("ContactRecord", 'first last age email')

# Information to populate XML tree with
records = [
    Contact('Tom', 'Smith', 53, 'tsmith@boo.com'),
    Contact('Phil', 'Hammer', 42, 'phammer@boo.com'),
    Contact('Mary', 'Fast', 22, 'mfast@boo.com'),
    Contact('Jessica', 'Rest', 33, 'jrest@goo.com'),
]
# Oldest contact first.
records.sort(key=lambda a: a.age, reverse=True)
print("records:\n{}".format(records))

# Now build and append nodes to the XML tree:
for record in records:
    name = Element('name')

    # Three ways to set the "age" attribute; here they all yield the same
    # result because "age" is the only attribute on the element.
    name.set('age', str(record.age))
    # or (replaces the whole attribute dictionary)
    name.attrib = {'age': str(record.age)}
    # or
    name.attrib['age'] = str(record.age)
    # ... the rest of the code

    # Indexing attrib raises KeyError when the attribute is missing
    print("age attribute is", name.attrib['age'])
    # get() returns None when the attribute is missing
    print("age attribute is", name.get('age'))
    # get() with a second argument returns that default instead
    print("no age_foo attribute, default is", name.get('age_foo', 50))
输出:
records:
[ContactRecord(first='Tom', last='Smith', age=53, email='tsmith@boo.com'), ContactRecord(first='Phil', last='Hammer', age=42, email='phammer@boo.com'), ContactRecord(first='Jessica', last='Rest', age=33, email='jrest@goo.com'), ContactRecord(first='Mary', last='Fast', age=22, email='mfast@boo.com')]
age attribute is 53
age attribute is 53
no age_foo attribute, default is 50
age attribute is 42
age attribute is 42
no age_foo attribute, default is 50
age attribute is 33
age attribute is 33
no age_foo attribute, default is 50
age attribute is 22
age attribute is 22
no age_foo attribute, default is 50
• xml ElementTree 序列化时默认不带缩进(Python 3.9 起可以先调用 xml.etree.ElementTree.indent() 再输出)
• minidom API 可以作为一个 work-around
• lxml 版本的 ElementTree 支持漂亮的格式选项
https://lxml.de/xpathxslt.html
● 推荐使用 LXML,有丰富的解析器
● 是基于 libxml2(C 语言库)开发的
● 是一个验证解析器: 支持 schema 和 DTDs
● 支持 full XPath 语法, 和 XSLT 能力
● 安装: pip install lxml
XSLT 是 XML 样式表语言(style sheet language),利用它可以将一个 XML 文档转换成 HTML
DTD 是文档类型定义,一个 DTD 定义 XML 文档的结构、合法的元素及其属性
letree.tostring(lroot, pretty_print=True).decode('utf8')
import lxml.etree as letree
from collections import namedtuple

# Example: XML pretty printing with lxml
lroot = letree.Element("root")

Contact = namedtuple("ContactRecord", 'first last age email')
records = [
    Contact('Tom', 'Smith', 53, 'tsmith@boo.com'),
    Contact('Phil', 'Hammer', 42, 'phammer@boo.com'),
    Contact('Mary', 'Fast', 22, 'mfast@boo.com'),
    Contact('Jessica', 'Rest', 33, 'jrest@goo.com'),
]
# Oldest contact first.
records.sort(key=lambda a: a.age, reverse=True)

for record in records:
    contact = letree.Element('contact')
    name = letree.Element('name')
    first = letree.Element('first')
    last = letree.Element('last')
    email = letree.Element('email')

    # Attribute values are stored as strings, e.g. <name age="53">
    name.set('age', str(record.age))
    first.text = record.first
    last.text = record.last
    email.text = record.email

    # Resulting layout: <contact><name><first/><last/></name><email/></contact>
    name.append(first)
    name.append(last)
    contact.append(name)
    contact.append(email)
    lroot.append(contact)

# tostring() returns bytes; decode them before printing the indented document.
print(letree.tostring(lroot, pretty_print=True).decode('utf8'))
输出:
<root>
<contact>
<name age="53">
<first>Tom</first>
<last>Smith</last>
</name>
<email>tsmith@boo.com</email>
</contact>
<contact>
<name age="42">
<first>Phil</first>
<last>Hammer</last>
</name>
<email>phammer@boo.com</email>
</contact>
<contact>
<name age="33">
<first>Jessica</first>
<last>Rest</last>
</name>
<email>jrest@goo.com</email>
</contact>
<contact>
<name age="22">
<first>Mary</first>
<last>Fast</last>
</name>
<email>mfast@boo.com</email>
</contact>
</root>
pretty_xml = minidom.parseString(xml_str).toprettyxml(encoding='utf8')
# Pretty-print an ElementTree document by round-tripping it through minidom.
# xml.etree.cElementTree was removed in Python 3.9; xml.etree.ElementTree
# exposes the same API.
import xml.etree.ElementTree as etree
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import Element
from collections import namedtuple
from xml.dom import minidom

root = Element('contacts')
tree = ElementTree(root)

# create ContactRecord class, its fields are first, last, age and email
Contact = namedtuple("ContactRecord", 'first last age email')

# Information to populate XML tree with
records = [
    Contact('Tom', 'Smith', 53, 'tsmith@boo.com'),
    Contact('Phil', 'Hammer', 42, 'phammer@boo.com'),
    Contact('Mary', 'Fast', 22, 'mfast@boo.com'),
    Contact('Jessica', 'Rest', 33, 'jrest@goo.com'),
]
# Oldest contact first.
records.sort(key=lambda a: a.age, reverse=True)
print("records:\n{}".format(records))

# Now build and append nodes to the XML tree:
for record in records:
    contact = Element('contact')
    name = Element('name')
    first = Element('first')
    last = Element('last')
    email = Element('email')

    # Attribute values are stored as strings, e.g. <name age="53">
    name.set('age', str(record.age))
    first.text = record.first
    last.text = record.last
    email.text = record.email

    # Resulting layout: <contact><name><first/><last/></name><email/></contact>
    name.append(first)
    name.append(last)
    contact.append(name)
    contact.append(email)
    root.append(contact)

# Serialize compactly, then let minidom re-parse and indent the document.
xml_str = etree.tostring(root)
# With an encoding argument toprettyxml() returns bytes, not str.
pretty_xml = minidom.parseString(xml_str).toprettyxml(encoding='utf8')
print(pretty_xml.decode())

# Write the encoded bytes as-is so the file content matches the encoding
# declared in its XML header regardless of the platform's default encoding.
with open("pretty.xml", 'wb') as f:
    f.write(pretty_xml)
输出:
records:
[ContactRecord(first='Tom', last='Smith', age=53, email='tsmith@boo.com'), ContactRecord(first='Phil', last='Hammer', age=42, email='phammer@boo.com'), ContactRecord(first='Jessica', last='Rest', age=33, email='jrest@goo.com'), ContactRecord(first='Mary', last='Fast', age=22, email='mfast@boo.com')]
<?xml version="1.0" encoding="utf8"?>
<contacts>
<contact>
<name age="53">
<first>Tom</first>
<last>Smith</last>
</name>
<email>tsmith@boo.com</email>
</contact>
<contact>
<name age="42">
<first>Phil</first>
<last>Hammer</last>
</name>
<email>phammer@boo.com</email>
</contact>
<contact>
<name age="33">
<first>Jessica</first>
<last>Rest</last>
</name>
<email>jrest@goo.com</email>
</contact>
<contact>
<name age="22">
<first>Mary</first>
<last>Fast</last>
</name>
<email>mfast@boo.com</email>
</contact>
</contacts>
results.xml 内容:
<contacts><contact><name age="53"><first>Tom</first><last>Smith</last></name><email>tsmith@boo.com</email></contact><contact><name age="42"><first>Phil</first><last>Hammer</last></name><email>phammer@boo.com</email></contact><contact><name age="33"><first>Jessica</first><last>Rest</last></name><email>jrest@goo.com</email></contact><contact><name age="22"><first>Mary</first><last>Fast</last></name><email>mfast@boo.com</email></contact></contacts>
解析 results.xml 内容
# Read results.xml back into a list of ContactRecord tuples.
# xml.etree.cElementTree was removed in Python 3.9; xml.etree.ElementTree
# exposes the same API.
from xml.etree.ElementTree import ElementTree
from collections import namedtuple

Contact = namedtuple('ContactRecord', 'first last age email')

# NOTE: ElementTree.parse() returns the root element of the parsed document,
# so despite its name `tree` is the <contacts> element.
tree = ElementTree().parse('results.xml')

contacts = []
# getiterator() was removed in Python 3.9; iter() is its replacement.
for contact in tree.iter('contact'):
    first = contact.find('.//first').text
    last = contact.find('.//last').text
    # Attribute values come back as strings (age is '53', not 53).
    age = contact.find('./name').get('age')
    email = contact.find('.//email').text
    contacts.append(Contact(first, last, age, email))

print(contacts)
输出:
[ContactRecord(first='Tom', last='Smith', age='53', email='tsmith@boo.com'), ContactRecord(first='Phil', last='Hammer', age='42', email='phammer@boo.com'), ContactRecord(first='Jessica', last='Rest', age='33', email='jrest@goo.com'), ContactRecord(first='Mary', last='Fast', age='22', email='mfast@boo.com')]
# Select elements with a path expression via Element.findall().
# xml.etree.cElementTree was removed in Python 3.9; xml.etree.ElementTree
# exposes the same API.
from xml.etree.ElementTree import ElementTree

# NOTE(review): the markup of this sample was lost in the original listing
# (only the numbers survived). It is reconstructed here from the path queried
# below and from the expected output; element names outside that path
# (ItemSearchResponse, Offers/Offer/OfferListing/Price) are assumed - confirm.
xml = '''
<ItemSearchResponse>
  <Items>
    <Item>
      <ItemAttributes>
        <ListPrice>
          <Amount>2260</Amount>
        </ListPrice>
      </ItemAttributes>
      <Offers>
        <Offer>
          <OfferListing>
            <Price>
              <Amount>1853</Amount>
            </Price>
          </OfferListing>
        </Offer>
      </Offers>
    </Item>
    <Item>
      <ItemAttributes>
        <ListPrice>
          <Amount>3312</Amount>
        </ListPrice>
      </ItemAttributes>
      <Offers>
        <Offer>
          <OfferListing>
            <Price>
              <Amount>1853</Amount>
            </Price>
          </OfferListing>
        </Offer>
      </Offers>
    </Item>
  </Items>
</ItemSearchResponse>
'''

with open("xpath.xml", 'w') as f:
    f.write(xml)

# Parse by file name so no file handle is left open; parse() returns the
# root element of the document.
root = ElementTree().parse("xpath.xml")

# The path is relative to the root element and matches only the list-price
# amounts, not the <Amount> elements nested under <Offers>.
elements = root.findall('Items/Item/ItemAttributes/ListPrice/Amount')
for i in elements:
    print(i.text)
输出:
2260
3312
XML 操作可能会抛出异常,所以需要用 try-except 来处理异常。
举例:
# Read results.xml into ContactRecord tuples, handling malformed XML and
# contacts that lack an expected child element.
# xml.etree.cElementTree was removed in Python 3.9; xml.etree.ElementTree
# exposes the same API.
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import ParseError
from collections import namedtuple
import sys

try:
    # NOTE: ElementTree.parse() returns the root element of the document,
    # so despite its name `tree` is the <contacts> element.
    tree = ElementTree().parse('results.xml')
except ParseError as e:
    print('Parse error: {err}'.format(err=e))
    # Exit with a non-zero status so callers can tell that parsing failed.
    sys.exit(1)

contacts = []
Contact = namedtuple('ContactRecord', 'first last age email')

# getiterator() was removed in Python 3.9; iter() is its replacement.
for contact in tree.iter('contact'):
    try:
        # find() returns None for a missing element, so accessing .text or
        # .get on the result raises AttributeError in that case.
        first = contact.find('.//first').text
        last = contact.find('.//last').text
        age = contact.find('./name').get('age')
        email = contact.find('.//email').text
    except AttributeError as e:
        print('Element error: {err}'.format(err=e))
    else:
        contacts.append(Contact(first, last, age, email))

print(contacts)
输出:
[ContactRecord(first='Tom', last='Smith', age='53', email='tsmith@boo.com'), ContactRecord(first='Phil', last='Hammer', age='42', email='phammer@boo.com'), ContactRecord(first='Jessica', last='Rest', age='33', email='jrest@goo.com'), ContactRecord(first='Mary', last='Fast', age='22', email='mfast@boo.com')]
# Load the contacts stored in results.xml into a pandas DataFrame.
# xml.etree.cElementTree was removed in Python 3.9; xml.etree.ElementTree
# exposes the same API.
from xml.etree.ElementTree import ElementTree
from xml.etree.ElementTree import ParseError
import pandas as pd
import sys

cols = ['first', 'last', 'age', 'email']

try:
    # NOTE: ElementTree.parse() returns the root element of the document,
    # so despite its name `tree` is the <contacts> element.
    tree = ElementTree().parse('results.xml')
except ParseError as e:
    print('Parse error: {err}'.format(err=e))
    # Exit with a non-zero status so callers can tell that parsing failed.
    sys.exit(1)

# DataFrame.append() was removed in pandas 2.0 (and copied the whole frame
# on every call), so collect plain rows and build the frame once at the end.
rows = []
# getiterator() was removed in Python 3.9; iter() is its replacement.
for contact in tree.iter('contact'):
    try:
        # find() returns None for a missing element, so accessing .text or
        # .get on the result raises AttributeError in that case.
        first = contact.find('.//first').text
        last = contact.find('.//last').text
        age = contact.find('./name').get('age')
        email = contact.find('.//email').text
    except AttributeError as e:
        print('Element error: {err}'.format(err=e))
    else:
        rows.append([first, last, age, email])

xml_df = pd.DataFrame(rows, columns=cols)
print("xml_df:\n{}".format(xml_df))
输出:
xml_df:
first last age email
0 Tom Smith 53 tsmith@boo.com
1 Phil Hammer 42 phammer@boo.com
2 Jessica Rest 33 jrest@goo.com
3 Mary Fast 22 mfast@boo.com
• pip install xmltodict
• read XML to OrderedDict
有关 JSON 和 Python Object 序列化和反序列化,请参考Python JSON 操作 - JSON 与 Python 对象,自定义对象 之间的互相转化
解析 results.xml 成 JSON 格式:
<contacts><contact><name age="53"><first>Tom</first><last>Smith</last></name><email>tsmith@boo.com</email></contact><contact><name age="42"><first>Phil</first><last>Hammer</last></name><email>phammer@boo.com</email></contact><contact><name age="33"><first>Jessica</first><last>Rest</last></name><email>jrest@goo.com</email></contact><contact><name age="22"><first>Mary</first><last>Fast</last></name><email>mfast@boo.com</email></contact></contacts>
# Convert results.xml to a dictionary with xmltodict, then round-trip it
# through a JSON string.
import xmltodict
import json

# Read the file explicitly as UTF-8 rather than relying on the platform's
# default text encoding.
with open('results.xml', encoding='utf-8') as f:
    xml_input = f.read()

ordered_dict_object_from_xml = xmltodict.parse(xml_input)
print("ordered_dict_object_from_xml:\n{}".format(ordered_dict_object_from_xml))

# serialize ordered_dict_object to json str
json_str_from_xml = json.dumps(ordered_dict_object_from_xml)
print("json_str_from_xml:\n{}".format(json_str_from_xml))

# deserialize json str to python object
json_from_xml = json.loads(json_str_from_xml)
print("json_from_xml:\n{}".format(json_from_xml))
输出:
ordered_dict_object_from_xml:
OrderedDict([('contacts', OrderedDict([('contact', [OrderedDict([('name', OrderedDict([('@age', '53'), ('first', 'Tom'), ('last', 'Smith')])), ('email', 'tsmith@boo.com')]), OrderedDict([('name', OrderedDict([('@age', '42'), ('first', 'Phil'), ('last', 'Hammer')])), ('email', 'phammer@boo.com')]), OrderedDict([('name', OrderedDict([('@age', '33'), ('first', 'Jessica'), ('last', 'Rest')])), ('email', 'jrest@goo.com')]), OrderedDict([('name', OrderedDict([('@age', '22'), ('first', 'Mary'), ('last', 'Fast')])), ('email', 'mfast@boo.com')])])]))])
json_str_from_xml:
{"contacts": {"contact": [{"name": {"@age": "53", "first": "Tom", "last": "Smith"}, "email": "tsmith@boo.com"}, {"name": {"@age": "42", "first": "Phil", "last": "Hammer"}, "email": "phammer@boo.com"}, {"name": {"@age": "33", "first": "Jessica", "last": "Rest"}, "email": "jrest@goo.com"}, {"name": {"@age": "22", "first": "Mary", "last": "Fast"}, "email": "mfast@boo.com"}]}}
json_from_xml:
{'contacts': {'contact': [{'name': {'@age': '53', 'first': 'Tom', 'last': 'Smith'}, 'email': 'tsmith@boo.com'}, {'name': {'@age': '42', 'first': 'Phil', 'last': 'Hammer'}, 'email': 'phammer@boo.com'}, {'name': {'@age': '33', 'first': 'Jessica', 'last': 'Rest'}, 'email': 'jrest@goo.com'}, {'name': {'@age': '22', 'first': 'Mary', 'last': 'Fast'}, 'email': 'mfast@boo.com'}]}}