这个东西是我的朋友 Martin在处理一个老系统转换erwin的数据库表结构定义的xml文件到hibernate配置文件, VO和DAO时做的一个实验,我们本来有一个牛人写的程序来处理这些乱七八糟的表设计,可惜源代码都丢失了,而且他的转换是基于hibernate 1.x,有很多的先进特性无法利用,我们的目标是:
1,erwin版本独立,erwin这个xml文件格式随不同的版本略有不同,希望我们的东西可以做到隔离版本带来的问题。
2,高效,系统中的表定义超过2000,导出的xml文件非常庞大,动辄超过三四十兆,我们希望对于表结构变动带来的变化能够及时响应,不能(过于)影响开发。
初步的设想是先解析文件,得到最基本的表结构信息,主外键信息,然后进一步加工成我们想要得文件格式,我选择的工具是python。
现在准备工作已经完成了,下面是把xml文件解析成为python对象的代码。
下面是几个testcase 点击下载
1,erwin版本独立,erwin这个xml文件格式随不同的版本略有不同,希望我们的东西可以做到隔离版本带来的问题。
2,高效,系统中的表定义超过2000,导出的xml文件非常庞大,动辄超过三四十兆,我们希望对于表结构变动带来的变化能够及时响应,不能(过于)影响开发。
初步的设想是先解析文件,得到最基本的表结构信息,主外键信息,然后进一步加工成我们想要得文件格式,我选择的工具是python。
现在准备工作已经完成了,下面是把xml文件解析成为python对象的代码。
下面是几个testcase 点击下载
import
logging,logging.handlers
logging.basicConfig(level = logging.DEBUG,\
format = ' %(asctime)s %(levelname)s %(message)s ' ,\
filename = ' trans2.log ' ,\
filemode = ' w ' )
logger = logging.getLogger( ' trans2 ' )
class entity:
def _parse_entity_properties(self, root):
entity_attrs_map = _get_attributes_as_dict(root)
self.id = entity_attrs_map[ ' id ' ]
self.name = entity_attrs_map[ ' Name ' ]
entity_properties_map = _get_child_nodes_as_dict(root.firstChild)
if ' Physical_Name ' in entity_properties_map:
logger.debug( ' found Physical_Name in entity(%s) ' % self.id)
self.physical_name = entity_properties_map[ ' Physical_Name ' ]
else :
self.physical_name = ''
logger.debug( ' entity id = %s, name=%s, physical_name=%s ' \
% (self.id, self.name,self.physical_name))
def _parse_entity_attributes(self, root):
self.attrs = []
attr_list = root.getElementsByTagName( ' Attribute ' )
for a in attr_list:
attr_map = {}
id = _get_attributes_as_dict(a)[ ' id ' ]
name = _get_attributes_as_dict(a)[ ' Name ' ]
child_map = _get_child_nodes_as_dict(a.firstChild)
pysical_name = ''
parent_attr_id = ''
parent_relation_id = ''
master_attr_id = ''
nullable = 1
try :
# for process some special cases
if ' Physical_Name ' in child_map:
logger.debug( ' found Physical_Name element in Attribute(%s) ' % id)
pysical_name = child_map[ ' Physical_Name ' ].firstChild.data
if ' Parent_Attribute ' in child_map:
logger.debug( ' found Parent_Attribute element in Attribute(%s) ' % id)
parent_attr_id = child_map[ ' Parent_Attribute ' ].firstChild.data
if ' Parent_Relationship ' in child_map:
logger.debug( ' found Parent_Relationship element in Attribute(%s) ' % id)
parent_relation_id = child_map[ ' Parent_Relationship ' ].firstChild.data
if ' Master_Attribute ' in child_map:
logger.debug( ' found Master_Attribute element in Attribute(%s) ' % id)
master_attr_id = child_map[ ' Master_Attribute ' ].firstChild.data
if ' Null_Option ' in child_map:
logger.debug( ' found Null_Option element in Attribute(%s) ' % id)
nullable = child_map[ ' Null_Option ' ].firstChild.data
data_type = child_map[ ' Datatype ' ].firstChild.data
attr_map = { ' attr_id ' :id, ' name ' :name, ' pysical_name ' :pysical_name,\
' nullable ' :nullable, ' data_type ' :data_type, ' parent_attr_id ' :parent_attr_id\
, ' parent_relation_id ' :parent_relation_id , ' master_attr_id ' :master_attr_id }
except KeyError,msg:
print ' warring, maybe missing some attribute\ ' s infomation: % s of entity % s ' \
% (msg, self.name)
logger.warn( ' warring, maybe missing some attribute\ ' s infomation: % s of entity % s ' \
% (msg, self.name))
self.attrs.append(attr_map)
def _parse_entity_keys(self, root):
self.pk = {}
self.fks = []
key_list = root.getElementsByTagName( ' Key_Group ' )
for k in key_list:
key_id = _get_attributes_as_dict(k)[ ' id ' ]
key_name = _get_attributes_as_dict(k)[ ' Name ' ]
# process Key_GroupProps, get the key type infomation
key_type = _get_child_nodes_as_dict(k.firstChild)[ ' Key_Group_Type ' ].firstChild.data
# process Key_Group_MemberProps, get the key column property
try :
key_attr_id = _get_child_nodes_as_dict(k.lastChild.firstChild.firstChild)\
[ ' Key_Group_Member_Column ' ].firstChild.data
key_conf = { ' key_id ' :key_id, ' key_name ' :key_name,\
' key_type ' :key_type, ' key_attr_id ' :key_attr_id}
if key_conf[ ' key_type ' ] == ' PK ' :
self.pk = key_conf
else :
self.fks.append(key_conf)
logger.debug( ' key_props for Key_Group(%s):%s:%s:%s ' \
% (key_id,key_name,key_type,key_attr_id))
except KeyError, msg:
print ' error, can\ ' t find the key defination % s for % s ' \
% (msg, self.name)
logger.error( ' error, can\ ' t find the key defination % s for % s ' \
% (msg, self.name))
def _reset(self):
self.id = ''
self.name = ''
self.attrs = []
self.pk = {}
self.fks = []
def __init__ (self,entity_element):
self._reset()
self._parse_entity_properties(entity_element)
self._parse_entity_attributes(entity_element)
self._parse_entity_keys(entity_element)
def __eq__ (a,b):
return a.id == b.id
def __repr__ (self):
# print self.__dict__
return ' entity with {id:%(id)s,name:%(name)s,pk:%(pk)s ' \
% self. __dict__
class relationship:
def __init__ (self,relation_element):
self._reset()
self._parse_relationship(relation_element)
def _reset(self):
self.id = ''
self.parent_id = ''
self.child_id = ''
self.name = ''
def _parse_relationship(self, relations_element):
attr_map = _get_attributes_as_dict(relations_element)
self.id = attr_map[ ' id ' ]
self.name = attr_map[ ' Name ' ]
rel_props = _get_child_nodes_as_dict(relations_element.childNodes[0])
self.parent_id = rel_props[ ' Relationship_Parent_Entity ' ].firstChild.data
self.child_id = rel_props[ ' Relationship_Child_Entity ' ].firstChild.data
logger.debug( ' parsed relation:%s: ' % self)
def __repr__ (self):
return ' relationship with {id:%(id)s,name:%(name)s,parent_id:%(parent_id)s,child_id:%(child_id)s} ' \
% self. __dict__
def __eq__ (a, b):
return a.id == b.id
def _get_attributes_as_dict(element):
attributes = {}
if element.attributes:
for attr in element.attributes:
attributes[attr.name.strip()] = attr.value
return attributes
def _get_child_nodes_as_dict(element):
child_nodes_map = {}
if element.childNodes:
for e in element.childNodes:
if not e.nodeType == e.TEXT_NODE:
child_nodes_map[e.tagName.strip()] = e
else :
child_nodes_map[e.parentNode.tagName.strip()] = e.data
return child_nodes_map
def parseXmlFile(file_name):
from xml.dom.ext.reader import Sax2
doc = Sax2.FromXmlFile(file_name)
return doc
def _startParse(root):
entities = root.getElementsByTagName( ' Entity ' )
relations = root.getElementsByTagName( ' Relationship ' )
parsed_entities = [entity(item) for item in entities]
parsed_relations = [relationship(item) for item in relations]
return parsed_entities,parsed_relations
if __name__ == ' __main__ ' :
import sys,time
start = time.time()
print ' start@%s ' % start
root = parseXmlFile(sys.argv[ 1 ])
entities,relations = _startParse(root.documentElement)
end = time.time()
print ' stop@%s ' % end
print ' cost %s ' % (end - start)
logging.basicConfig(level = logging.DEBUG,\
format = ' %(asctime)s %(levelname)s %(message)s ' ,\
filename = ' trans2.log ' ,\
filemode = ' w ' )
logger = logging.getLogger( ' trans2 ' )
class entity:
def _parse_entity_properties(self, root):
entity_attrs_map = _get_attributes_as_dict(root)
self.id = entity_attrs_map[ ' id ' ]
self.name = entity_attrs_map[ ' Name ' ]
entity_properties_map = _get_child_nodes_as_dict(root.firstChild)
if ' Physical_Name ' in entity_properties_map:
logger.debug( ' found Physical_Name in entity(%s) ' % self.id)
self.physical_name = entity_properties_map[ ' Physical_Name ' ]
else :
self.physical_name = ''
logger.debug( ' entity id = %s, name=%s, physical_name=%s ' \
% (self.id, self.name,self.physical_name))
def _parse_entity_attributes(self, root):
self.attrs = []
attr_list = root.getElementsByTagName( ' Attribute ' )
for a in attr_list:
attr_map = {}
id = _get_attributes_as_dict(a)[ ' id ' ]
name = _get_attributes_as_dict(a)[ ' Name ' ]
child_map = _get_child_nodes_as_dict(a.firstChild)
pysical_name = ''
parent_attr_id = ''
parent_relation_id = ''
master_attr_id = ''
nullable = 1
try :
# for process some special cases
if ' Physical_Name ' in child_map:
logger.debug( ' found Physical_Name element in Attribute(%s) ' % id)
pysical_name = child_map[ ' Physical_Name ' ].firstChild.data
if ' Parent_Attribute ' in child_map:
logger.debug( ' found Parent_Attribute element in Attribute(%s) ' % id)
parent_attr_id = child_map[ ' Parent_Attribute ' ].firstChild.data
if ' Parent_Relationship ' in child_map:
logger.debug( ' found Parent_Relationship element in Attribute(%s) ' % id)
parent_relation_id = child_map[ ' Parent_Relationship ' ].firstChild.data
if ' Master_Attribute ' in child_map:
logger.debug( ' found Master_Attribute element in Attribute(%s) ' % id)
master_attr_id = child_map[ ' Master_Attribute ' ].firstChild.data
if ' Null_Option ' in child_map:
logger.debug( ' found Null_Option element in Attribute(%s) ' % id)
nullable = child_map[ ' Null_Option ' ].firstChild.data
data_type = child_map[ ' Datatype ' ].firstChild.data
attr_map = { ' attr_id ' :id, ' name ' :name, ' pysical_name ' :pysical_name,\
' nullable ' :nullable, ' data_type ' :data_type, ' parent_attr_id ' :parent_attr_id\
, ' parent_relation_id ' :parent_relation_id , ' master_attr_id ' :master_attr_id }
except KeyError,msg:
print ' warring, maybe missing some attribute\ ' s infomation: % s of entity % s ' \
% (msg, self.name)
logger.warn( ' warring, maybe missing some attribute\ ' s infomation: % s of entity % s ' \
% (msg, self.name))
self.attrs.append(attr_map)
def _parse_entity_keys(self, root):
self.pk = {}
self.fks = []
key_list = root.getElementsByTagName( ' Key_Group ' )
for k in key_list:
key_id = _get_attributes_as_dict(k)[ ' id ' ]
key_name = _get_attributes_as_dict(k)[ ' Name ' ]
# process Key_GroupProps, get the key type infomation
key_type = _get_child_nodes_as_dict(k.firstChild)[ ' Key_Group_Type ' ].firstChild.data
# process Key_Group_MemberProps, get the key column property
try :
key_attr_id = _get_child_nodes_as_dict(k.lastChild.firstChild.firstChild)\
[ ' Key_Group_Member_Column ' ].firstChild.data
key_conf = { ' key_id ' :key_id, ' key_name ' :key_name,\
' key_type ' :key_type, ' key_attr_id ' :key_attr_id}
if key_conf[ ' key_type ' ] == ' PK ' :
self.pk = key_conf
else :
self.fks.append(key_conf)
logger.debug( ' key_props for Key_Group(%s):%s:%s:%s ' \
% (key_id,key_name,key_type,key_attr_id))
except KeyError, msg:
print ' error, can\ ' t find the key defination % s for % s ' \
% (msg, self.name)
logger.error( ' error, can\ ' t find the key defination % s for % s ' \
% (msg, self.name))
def _reset(self):
self.id = ''
self.name = ''
self.attrs = []
self.pk = {}
self.fks = []
def __init__ (self,entity_element):
self._reset()
self._parse_entity_properties(entity_element)
self._parse_entity_attributes(entity_element)
self._parse_entity_keys(entity_element)
def __eq__ (a,b):
return a.id == b.id
def __repr__ (self):
# print self.__dict__
return ' entity with {id:%(id)s,name:%(name)s,pk:%(pk)s ' \
% self. __dict__
class relationship:
def __init__ (self,relation_element):
self._reset()
self._parse_relationship(relation_element)
def _reset(self):
self.id = ''
self.parent_id = ''
self.child_id = ''
self.name = ''
def _parse_relationship(self, relations_element):
attr_map = _get_attributes_as_dict(relations_element)
self.id = attr_map[ ' id ' ]
self.name = attr_map[ ' Name ' ]
rel_props = _get_child_nodes_as_dict(relations_element.childNodes[0])
self.parent_id = rel_props[ ' Relationship_Parent_Entity ' ].firstChild.data
self.child_id = rel_props[ ' Relationship_Child_Entity ' ].firstChild.data
logger.debug( ' parsed relation:%s: ' % self)
def __repr__ (self):
return ' relationship with {id:%(id)s,name:%(name)s,parent_id:%(parent_id)s,child_id:%(child_id)s} ' \
% self. __dict__
def __eq__ (a, b):
return a.id == b.id
def _get_attributes_as_dict(element):
attributes = {}
if element.attributes:
for attr in element.attributes:
attributes[attr.name.strip()] = attr.value
return attributes
def _get_child_nodes_as_dict(element):
child_nodes_map = {}
if element.childNodes:
for e in element.childNodes:
if not e.nodeType == e.TEXT_NODE:
child_nodes_map[e.tagName.strip()] = e
else :
child_nodes_map[e.parentNode.tagName.strip()] = e.data
return child_nodes_map
def parseXmlFile(file_name):
from xml.dom.ext.reader import Sax2
doc = Sax2.FromXmlFile(file_name)
return doc
def _startParse(root):
entities = root.getElementsByTagName( ' Entity ' )
relations = root.getElementsByTagName( ' Relationship ' )
parsed_entities = [entity(item) for item in entities]
parsed_relations = [relationship(item) for item in relations]
return parsed_entities,parsed_relations
if __name__ == ' __main__ ' :
import sys,time
start = time.time()
print ' start@%s ' % start
root = parseXmlFile(sys.argv[ 1 ])
entities,relations = _startParse(root.documentElement)
end = time.time()
print ' stop@%s ' % end
print ' cost %s ' % (end - start)