# -*- coding:utf-8 -*-
import codecs
import re
import sys
import xml.etree.ElementTree as ET
# By default Python 2 uses ASCII for implicit str/unicode conversions; switch
# the interpreter-wide default encoding to UTF-8 instead.
defaultencoding = 'utf-8'
if sys.getdefaultencoding() != defaultencoding:
    # Python 2 only: site.py removes sys.setdefaultencoding at startup, so the
    # module has to be reloaded to get it back. On Python 3 the default is
    # already UTF-8, so this branch is never entered there.
    reload(sys)
    sys.setdefaultencoding(defaultencoding)
# Convert a small TMX file into a tab-separated text file, writing one line
# per child of the body element: "<first text>\t<second text>\n".
tree = ET.parse('test.tmx')
root = tree.getroot()
# Assumes the second child of the root is <body> (the first presumably being
# <header>) -- TODO confirm against the input files.
body = root[1]
# The `with` block guarantees the output file is closed even if the loop
# aborts with an unexpected error.
with codecs.open('target.txt', 'w', 'utf-8') as txtfile:
    for item in body:
        try:
            # Text of the first grandchild under each of the first two
            # children; presumably <tu>/<tuv>/<seg> -- verify that no <prop>
            # or <note> element precedes <seg> in the files being processed.
            bs = item[0][0].text
            zh = item[1][0].text
        except IndexError as ex:
            # The unit lacks the expected children: report it and move on.
            print(ex)
            continue
        if bs is None or zh is None:
            # An element without text yields None; skip the unit instead of
            # failing on the string concatenation below.
            print("skipping unit with empty segment text")
            continue
        # ElementTree returns already-decoded text, so no .decode() call is
        # needed (it raises AttributeError on Python 3 and forces an implicit
        # encode on Python 2); codecs.open encodes to UTF-8 on write.
        txtfile.write(bs + "\t" + zh + "\n")
以上方法能够很快遍历小型的 XML 和 TMX 文件。但对于大型文件,直接整体读入并构建文档树,内存可能会被耗尽。查了一些资料,分块(增量)解析之类的方法能够解决这个问题,但作为新手我没有看懂,于是直接用最简单粗暴的方法:把文件当作普通文本一行一行读入,再用正则表达式匹配并提取内容。
# Line-oriented fallback for files too large to load as a document tree:
# read the XML as plain text, strip the markup from each matching line and
# write the remaining text to a tab-separated file.
#
# NOTE(review): both marker patterns below are empty strings in the original
# snippet. An empty pattern matches every line, so every line is treated as a
# source line and the target branch never runs. The tag text was presumably
# lost when the snippet was published -- fill in the patterns that identify
# source-language and target-language lines before relying on this script.
SOURCE_LINE_RE = re.compile('')
TARGET_LINE_RE = re.compile('')
# Non-greedy match of a single tag, compiled once instead of once per line.
TAG_RE = re.compile('<.*?>')

# Nested `with` blocks close both files even if processing fails midway.
with codecs.open("test.xml", 'r', 'utf-8') as xmlfile:
    with codecs.open("target.txt", 'w', 'utf-8') as txtfile:
        for line in xmlfile:
            # codecs.open already yields decoded text, so the lines need no
            # further .decode() call before being written back out.
            if SOURCE_LINE_RE.search(line) is not None:
                bs = TAG_RE.sub("", line).strip()
                txtfile.write(bs + "\t")
            elif TARGET_LINE_RE.search(line) is not None:
                zh = TAG_RE.sub("", line).strip()
                txtfile.write(zh + "\n")
以上方法太过粗暴,容易受很多因素影响。比较合理的办法是:先识别 tuv 开始标签,作为一对语句的开始;匹配到 tuv 结束标签时,则代表这对语句的结束。