Martin已经看过了我的代码,并且提出了意见。我们最担心的性能问题还是凸显出来了——这并不奇怪,ERwin导出的文件确实非常庞大。
我在python邮件列表里面查找了一下,找到了几个常见的解决方案:
- 使用4suite提供的cDomlette
- 借助pywin32,通过COM调用微软的MSXML解析器
- PIRXX一个和xerces绑定的dom
import logging
import logging.handlers

# Module-wide logging: every record from DEBUG upwards is written to
# trans2.log, which is truncated on each run (filemode 'w').
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s',
                    filename='trans2.log',
                    filemode='w')
logger = logging.getLogger('trans2')
7
class entity:
    """An ``Entity`` element of the model, unpacked into plain values.

    Instance attributes:
      id, name       -- values of the element's ``id`` and ``Name`` attributes
      physical_name  -- text of the ``Physical_Name`` property ('' if absent)
      attrs          -- list with one dict per ``Attribute`` below the entity
      pk             -- dict for the key group whose type is 'PK' ({} if none)
      fks            -- list of dicts for all other key groups

    Relies on the module-level ``logger`` and on the helpers
    ``_get_attributes_as_dict`` / ``_get_child_nodes_as_dict``.
    """

    # (key in the result dict, child element name, value used when absent)
    _OPTIONAL_ATTR_FIELDS = (
        ('pysical_name', 'Physical_Name', ''),
        ('parent_attr_id', 'Parent_Attribute', ''),
        ('parent_relation_id', 'Parent_Relationship', ''),
        ('master_attr_id', 'Master_Attribute', ''),
        ('nullable', 'Null_Option', 1),
    )

    def __init__(self, entity_element):
        """Parse *entity_element* (a DOM ``Entity`` node) into this object."""
        self._reset()
        self._parse_entity_properties(entity_element)
        self._parse_entity_attributes(entity_element)
        self._parse_entity_keys(entity_element)

    def _reset(self):
        """Put every field back to its empty value."""
        self.id = ''
        self.name = ''
        self.physical_name = ''
        self.attrs = []
        self.pk = {}
        self.fks = []

    @staticmethod
    def _text_of(node):
        """Return the ``data`` of *node*'s first child, or '' if it has none."""
        child = node.firstChild
        if child is None:
            return ''
        return child.data

    def _parse_entity_properties(self, root):
        """Read id, name and the optional physical name of the entity."""
        entity_attrs_map = _get_attributes_as_dict(root)
        self.id = entity_attrs_map['id']
        self.name = entity_attrs_map['Name']

        # The first child of the entity is treated as its property container.
        entity_properties_map = _get_child_nodes_as_dict(root.firstChild)
        if 'Physical_Name' in entity_properties_map:
            logger.debug('found Physical_Name in entity(%s)', self.id)
            # Store the text, not the DOM node, like every other field.
            self.physical_name = self._text_of(
                entity_properties_map['Physical_Name'])
        else:
            self.physical_name = ''
        logger.info('entity id = %s, name=%s, physical_name=%s',
                    self.id, self.name, self.physical_name)

    def _parse_entity_attributes(self, root):
        """Fill ``self.attrs`` with one dict per ``Attribute`` of this entity.

        An attribute without a ``Datatype`` child is reported (stdout and
        log) and recorded as an empty dict, so ``self.attrs`` always has as
        many items as there are ``Attribute`` elements.
        """
        self.attrs = []
        # './/' keeps the search below this entity.  A leading '//' is an
        # absolute path: it would return the Attribute elements of the whole
        # document for every single entity.
        for a in root.xpath('.//Attribute'):
            xml_attrs = _get_attributes_as_dict(a)
            attr_id = xml_attrs['id']
            name = xml_attrs['Name']
            child_map = _get_child_nodes_as_dict(a.firstChild)

            attr_map = {'attr_id': attr_id, 'name': name}
            for key, tag, default in self._OPTIONAL_ATTR_FIELDS:
                if tag in child_map:
                    logger.debug('found %s element in Attribute(%s)',
                                 tag, attr_id)
                    attr_map[key] = self._text_of(child_map[tag])
                else:
                    attr_map[key] = default

            if 'Datatype' in child_map:
                attr_map['data_type'] = self._text_of(child_map['Datatype'])
            else:
                # Tested with 'in' instead of catching KeyError: raising is
                # the slow path.  The message keeps the quoted key name that
                # str(KeyError) used to produce.
                message = ("warring, maybe missing some attribute's "
                           "infomation: %s of entity %s"
                           % (repr('Datatype'), self.name))
                print(message)
                logger.warning(message)
                attr_map = {}
            self.attrs.append(attr_map)

    def _parse_entity_keys(self, root):
        """Fill ``self.pk`` and ``self.fks`` from the entity's key groups."""
        self.pk = {}
        self.fks = []
        # Relative path for the same reason as in _parse_entity_attributes.
        for k in root.xpath('.//Key_Group'):
            xml_attrs = _get_attributes_as_dict(k)
            key_id = xml_attrs['id']
            key_name = xml_attrs['Name']
            # Key_GroupProps (first child) carries the key type.
            group_props = _get_child_nodes_as_dict(k.firstChild)
            key_type = self._text_of(group_props['Key_Group_Type'])
            # Key_Group_MemberProps carries the key column.
            # NOTE(review): only the first member of the group is read, so a
            # composite key would keep just its first column -- confirm.
            member_props = _get_child_nodes_as_dict(
                k.lastChild.firstChild.firstChild)
            if 'Key_Group_Member_Column' in member_props:
                key_attr_id = self._text_of(
                    member_props['Key_Group_Member_Column'])
            else:
                logger.error("error, can't find the key defination %s for %s",
                             'Key_Group_Member_Column', self.name)
                key_attr_id = ''

            key_conf = {'key_id': key_id, 'key_name': key_name,
                        'key_type': key_type, 'key_attr_id': key_attr_id}
            if key_type == 'PK':
                self.pk = key_conf
            else:
                self.fks.append(key_conf)
            logger.debug('key_props for Key_Group(%s):%s:%s:%s',
                         key_id, key_name, key_type, key_attr_id)

    def __eq__(self, other):
        """Two entities are equal when their ids are equal."""
        try:
            return self.id == other.id
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Kept consistent with __eq__.
        return hash(self.id)

    def __repr__(self):
        return 'entity with {id:%(id)s,name:%(name)s,pk:%(pk)s}' \
            % self.__dict__
124
class relationship:
    """A ``Relationship`` element of the model, unpacked into plain values.

    Instance attributes:
      id, name   -- values of the element's ``id`` and ``Name`` attributes
      parent_id  -- text of the ``Relationship_Parent_Entity`` property
      child_id   -- text of the ``Relationship_Child_Entity`` property

    Relies on the module-level ``logger`` and on the helpers
    ``_get_attributes_as_dict`` / ``_get_child_nodes_as_dict``.
    """

    def __init__(self, relation_element):
        """Parse *relation_element* (a DOM ``Relationship`` node)."""
        self._reset()
        self._parse_relationship(relation_element)

    def _reset(self):
        """Put every field back to its empty value."""
        self.id = ''
        self.parent_id = ''
        self.child_id = ''
        self.name = ''

    def _parse_relationship(self, relations_element):
        """Read id, name and both entity references of the relationship.

        Raises KeyError when the ``id``/``Name`` attributes or one of the two
        entity properties is missing.
        """
        attr_map = _get_attributes_as_dict(relations_element)
        self.id = attr_map['id']
        self.name = attr_map['Name']

        # The first child of the element is treated as its property container.
        rel_props = _get_child_nodes_as_dict(relations_element.firstChild)
        self.parent_id = rel_props['Relationship_Parent_Entity'].firstChild.data
        self.child_id = rel_props['Relationship_Child_Entity'].firstChild.data
        # Lazy formatting: repr(self) is only built when DEBUG is enabled.
        logger.debug('parsed relation:%s:', self)

    def __repr__(self):
        return ('relationship with {id:%(id)s,name:%(name)s,'
                'parent_id:%(parent_id)s,child_id:%(child_id)s}'
                % self.__dict__)

    def __eq__(self, other):
        """Two relationships are equal when their ids are equal."""
        try:
            return self.id == other.id
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Kept consistent with __eq__.
        return hash(self.id)
152
153 def _get_attributes_as_dict(element):
154 attrs = {}
155 if element.attributes:
156 for attr in element.attributes.values():
157 attrs[attr.name.strip()] = attr.value
158 return attrs
159
160 def _get_child_nodes_as_dict(element):
161 child_nodes_map = {}
162 if element.childNodes:
163 for e in element.childNodes:
164 if not e.nodeType == e.TEXT_NODE:
165 child_nodes_map[e.tagName.strip()] = e
166 else :
167 child_nodes_map[e.parentNode.tagName.strip()] = e.data
168 return child_nodes_map
169
def parseXmlFile(file_name):
    """Parse the XML file *file_name* with 4Suite's cDomlette.

    Returns whatever ``Ft.Xml.cDomlette.Parse`` returns for the file's
    content (the caller reads ``documentElement`` from it).  Errors from
    opening or reading the file propagate unchanged.
    """
    from Ft.Xml.InputSource import InputSourceFactory
    from Ft.Xml.cDomlette import Parse

    # Open before the try block: if open() fails there is nothing to close,
    # and the real I/O error must not be hidden by a close() on None.
    f = open(file_name, 'rb')
    try:
        # Hand the bytes to the parser untouched.  Joining readlines() with
        # '\n' would double every line break, since each line already ends
        # with one.
        docContent = f.read()
    finally:
        f.close()
    return Parse(InputSourceFactory().fromString(docContent))
184
def _startParse(root):
    """Build the model objects for every entity and relationship.

    *root* is the DOM node the XPath queries are run on; the caller passes
    the document element.  Returns ``(entities, relations)``: a list of
    ``entity`` objects and a list of ``relationship`` objects, in the order
    the XPath queries return their nodes.
    """
    # '//' is an absolute path, so both queries cover the whole document.
    parsed_entities = [entity(item) for item in root.xpath(u'//Entity')]
    parsed_relations = [relationship(item)
                        for item in root.xpath(u'//Relationship')]
    return parsed_entities, parsed_relations
193
if __name__ == '__main__':
    # Script entry point: parse the file named on the command line and
    # report the wall-clock time spent, on stdout and in the log.
    import sys
    import time

    if len(sys.argv) < 2:
        sys.exit('usage: %s <xml file>' % sys.argv[0])

    start = time.time()
    print('start@%s' % start)
    root = parseXmlFile(sys.argv[1])
    entities, relations = _startParse(root.documentElement)
    end = time.time()
    print('stop@%s' % end)
    logger.info('cost %ss', end - start)
    print('cost %s' % (end - start))
204
除了解析器的初始化代码有改动之外,DOM的getElementsByTagName调用被XPath表达式取代了,这个变化其实也不算太大。另外,我把一些依靠异常处理的代码路径去掉了,主要是考虑到构造异常对象是一个比较耗时的操作。经过测试,最终的结果是大约加速了4倍,不知道这个结果能不能令人满意,但是从主观感觉来看,解析的时间确实大大缩短了。我使用的XPath表达式可能也影响了效率,这种问题还是交给Martin来帮我看看吧。
# Module-wide logging: every record from DEBUG upwards is written to
# trans2.log, which is truncated on each run (filemode 'w').
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s %(levelname)s %(message)s',
                    filename='trans2.log',
                    filemode='w')
logger = logging.getLogger('trans2')
7
class entity:
    """An ``Entity`` element of the model, unpacked into plain values.

    Instance attributes:
      id, name       -- values of the element's ``id`` and ``Name`` attributes
      physical_name  -- text of the ``Physical_Name`` property ('' if absent)
      attrs          -- list with one dict per ``Attribute`` below the entity
      pk             -- dict for the key group whose type is 'PK' ({} if none)
      fks            -- list of dicts for all other key groups

    Relies on the module-level ``logger`` and on the helpers
    ``_get_attributes_as_dict`` / ``_get_child_nodes_as_dict``.
    """

    # (key in the result dict, child element name, value used when absent)
    _OPTIONAL_ATTR_FIELDS = (
        ('pysical_name', 'Physical_Name', ''),
        ('parent_attr_id', 'Parent_Attribute', ''),
        ('parent_relation_id', 'Parent_Relationship', ''),
        ('master_attr_id', 'Master_Attribute', ''),
        ('nullable', 'Null_Option', 1),
    )

    def __init__(self, entity_element):
        """Parse *entity_element* (a DOM ``Entity`` node) into this object."""
        self._reset()
        self._parse_entity_properties(entity_element)
        self._parse_entity_attributes(entity_element)
        self._parse_entity_keys(entity_element)

    def _reset(self):
        """Put every field back to its empty value."""
        self.id = ''
        self.name = ''
        self.physical_name = ''
        self.attrs = []
        self.pk = {}
        self.fks = []

    @staticmethod
    def _text_of(node):
        """Return the ``data`` of *node*'s first child, or '' if it has none."""
        child = node.firstChild
        if child is None:
            return ''
        return child.data

    def _parse_entity_properties(self, root):
        """Read id, name and the optional physical name of the entity."""
        entity_attrs_map = _get_attributes_as_dict(root)
        self.id = entity_attrs_map['id']
        self.name = entity_attrs_map['Name']

        # The first child of the entity is treated as its property container.
        entity_properties_map = _get_child_nodes_as_dict(root.firstChild)
        if 'Physical_Name' in entity_properties_map:
            logger.debug('found Physical_Name in entity(%s)', self.id)
            # Store the text, not the DOM node, like every other field.
            self.physical_name = self._text_of(
                entity_properties_map['Physical_Name'])
        else:
            self.physical_name = ''
        logger.info('entity id = %s, name=%s, physical_name=%s',
                    self.id, self.name, self.physical_name)

    def _parse_entity_attributes(self, root):
        """Fill ``self.attrs`` with one dict per ``Attribute`` of this entity.

        An attribute without a ``Datatype`` child is reported (stdout and
        log) and recorded as an empty dict, so ``self.attrs`` always has as
        many items as there are ``Attribute`` elements.
        """
        self.attrs = []
        # './/' keeps the search below this entity.  A leading '//' is an
        # absolute path: it would return the Attribute elements of the whole
        # document for every single entity.
        for a in root.xpath('.//Attribute'):
            xml_attrs = _get_attributes_as_dict(a)
            attr_id = xml_attrs['id']
            name = xml_attrs['Name']
            child_map = _get_child_nodes_as_dict(a.firstChild)

            attr_map = {'attr_id': attr_id, 'name': name}
            for key, tag, default in self._OPTIONAL_ATTR_FIELDS:
                if tag in child_map:
                    logger.debug('found %s element in Attribute(%s)',
                                 tag, attr_id)
                    attr_map[key] = self._text_of(child_map[tag])
                else:
                    attr_map[key] = default

            if 'Datatype' in child_map:
                attr_map['data_type'] = self._text_of(child_map['Datatype'])
            else:
                # Tested with 'in' instead of catching KeyError: raising is
                # the slow path.  The message keeps the quoted key name that
                # str(KeyError) used to produce.
                message = ("warring, maybe missing some attribute's "
                           "infomation: %s of entity %s"
                           % (repr('Datatype'), self.name))
                print(message)
                logger.warning(message)
                attr_map = {}
            self.attrs.append(attr_map)

    def _parse_entity_keys(self, root):
        """Fill ``self.pk`` and ``self.fks`` from the entity's key groups."""
        self.pk = {}
        self.fks = []
        # Relative path for the same reason as in _parse_entity_attributes.
        for k in root.xpath('.//Key_Group'):
            xml_attrs = _get_attributes_as_dict(k)
            key_id = xml_attrs['id']
            key_name = xml_attrs['Name']
            # Key_GroupProps (first child) carries the key type.
            group_props = _get_child_nodes_as_dict(k.firstChild)
            key_type = self._text_of(group_props['Key_Group_Type'])
            # Key_Group_MemberProps carries the key column.
            # NOTE(review): only the first member of the group is read, so a
            # composite key would keep just its first column -- confirm.
            member_props = _get_child_nodes_as_dict(
                k.lastChild.firstChild.firstChild)
            if 'Key_Group_Member_Column' in member_props:
                key_attr_id = self._text_of(
                    member_props['Key_Group_Member_Column'])
            else:
                logger.error("error, can't find the key defination %s for %s",
                             'Key_Group_Member_Column', self.name)
                key_attr_id = ''

            key_conf = {'key_id': key_id, 'key_name': key_name,
                        'key_type': key_type, 'key_attr_id': key_attr_id}
            if key_type == 'PK':
                self.pk = key_conf
            else:
                self.fks.append(key_conf)
            logger.debug('key_props for Key_Group(%s):%s:%s:%s',
                         key_id, key_name, key_type, key_attr_id)

    def __eq__(self, other):
        """Two entities are equal when their ids are equal."""
        try:
            return self.id == other.id
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Kept consistent with __eq__.
        return hash(self.id)

    def __repr__(self):
        return 'entity with {id:%(id)s,name:%(name)s,pk:%(pk)s}' \
            % self.__dict__
124
class relationship:
    """A ``Relationship`` element of the model, unpacked into plain values.

    Instance attributes:
      id, name   -- values of the element's ``id`` and ``Name`` attributes
      parent_id  -- text of the ``Relationship_Parent_Entity`` property
      child_id   -- text of the ``Relationship_Child_Entity`` property

    Relies on the module-level ``logger`` and on the helpers
    ``_get_attributes_as_dict`` / ``_get_child_nodes_as_dict``.
    """

    def __init__(self, relation_element):
        """Parse *relation_element* (a DOM ``Relationship`` node)."""
        self._reset()
        self._parse_relationship(relation_element)

    def _reset(self):
        """Put every field back to its empty value."""
        self.id = ''
        self.parent_id = ''
        self.child_id = ''
        self.name = ''

    def _parse_relationship(self, relations_element):
        """Read id, name and both entity references of the relationship.

        Raises KeyError when the ``id``/``Name`` attributes or one of the two
        entity properties is missing.
        """
        attr_map = _get_attributes_as_dict(relations_element)
        self.id = attr_map['id']
        self.name = attr_map['Name']

        # The first child of the element is treated as its property container.
        rel_props = _get_child_nodes_as_dict(relations_element.firstChild)
        self.parent_id = rel_props['Relationship_Parent_Entity'].firstChild.data
        self.child_id = rel_props['Relationship_Child_Entity'].firstChild.data
        # Lazy formatting: repr(self) is only built when DEBUG is enabled.
        logger.debug('parsed relation:%s:', self)

    def __repr__(self):
        return ('relationship with {id:%(id)s,name:%(name)s,'
                'parent_id:%(parent_id)s,child_id:%(child_id)s}'
                % self.__dict__)

    def __eq__(self, other):
        """Two relationships are equal when their ids are equal."""
        try:
            return self.id == other.id
        except AttributeError:
            return NotImplemented

    def __ne__(self, other):
        # Python 2 does not derive != from ==, so spell it out.
        result = self.__eq__(other)
        if result is NotImplemented:
            return result
        return not result

    def __hash__(self):
        # Kept consistent with __eq__.
        return hash(self.id)
152
153 def _get_attributes_as_dict(element):
154 attrs = {}
155 if element.attributes:
156 for attr in element.attributes.values():
157 attrs[attr.name.strip()] = attr.value
158 return attrs
159
160 def _get_child_nodes_as_dict(element):
161 child_nodes_map = {}
162 if element.childNodes:
163 for e in element.childNodes:
164 if not e.nodeType == e.TEXT_NODE:
165 child_nodes_map[e.tagName.strip()] = e
166 else :
167 child_nodes_map[e.parentNode.tagName.strip()] = e.data
168 return child_nodes_map
169
def parseXmlFile(file_name):
    """Parse the XML file *file_name* with 4Suite's cDomlette.

    Returns whatever ``Ft.Xml.cDomlette.Parse`` returns for the file's
    content (the caller reads ``documentElement`` from it).  Errors from
    opening or reading the file propagate unchanged.
    """
    from Ft.Xml.InputSource import InputSourceFactory
    from Ft.Xml.cDomlette import Parse

    # Open before the try block: if open() fails there is nothing to close,
    # and the real I/O error must not be hidden by a close() on None.
    f = open(file_name, 'rb')
    try:
        # Hand the bytes to the parser untouched.  Joining readlines() with
        # '\n' would double every line break, since each line already ends
        # with one.
        docContent = f.read()
    finally:
        f.close()
    return Parse(InputSourceFactory().fromString(docContent))
184
def _startParse(root):
    """Build the model objects for every entity and relationship.

    *root* is the DOM node the XPath queries are run on; the caller passes
    the document element.  Returns ``(entities, relations)``: a list of
    ``entity`` objects and a list of ``relationship`` objects, in the order
    the XPath queries return their nodes.
    """
    # '//' is an absolute path, so both queries cover the whole document.
    parsed_entities = [entity(item) for item in root.xpath(u'//Entity')]
    parsed_relations = [relationship(item)
                        for item in root.xpath(u'//Relationship')]
    return parsed_entities, parsed_relations
193
if __name__ == '__main__':
    # Script entry point: parse the file named on the command line and
    # report the wall-clock time spent, on stdout and in the log.
    import sys
    import time

    if len(sys.argv) < 2:
        sys.exit('usage: %s <xml file>' % sys.argv[0])

    start = time.time()
    print('start@%s' % start)
    root = parseXmlFile(sys.argv[1])
    entities, relations = _startParse(root.documentElement)
    end = time.time()
    print('stop@%s' % end)
    logger.info('cost %ss', end - start)
    print('cost %s' % (end - start))
204