本人最近在做 Faster R-CNN 的项目，用 labelImg 标注时无意中使用了含中文的路径，导致读取 XML 时报错，于是参照别的博客写了一个批量修改标注文件的脚本。
#!/usr/bin/python
"""Batch-clean annotation XML files in a directory.

For every ``*.xml`` file found in ``path`` the script:

* replaces the text of the first ``<folder>`` element with ``VOC2007``;
* appends the image id taken from the first ``<filename>`` element to
  ``train.txt`` and makes sure that element ends with ``.jpg``;
* removes the first ``<path>`` element, if there is one;
* rewrites the file in place, encoded as UTF-8.

The code runs unchanged on Python 2 and Python 3.
"""
import io
import os
import os.path
import sys
import xml.dom.minidom as xdm

# Non-ASCII (e.g. Chinese) text is handled by doing all file I/O explicitly
# as UTF-8 instead of changing the interpreter-wide default encoding with
# reload(sys) / sys.setdefaultencoding, which only exists on Python 2.

path = './test/'
train_txt = 'train.txt'

VOC_FOLDER = 'VOC2007'
IMAGE_SUFFIX = '.jpg'


def _emit(value):
    """Write ``value`` and a newline to stdout without dying on encoding.

    If the console cannot encode the text, non-ASCII characters are written
    as backslash escapes instead of raising ``UnicodeEncodeError``.
    """
    line = u'%s\n' % (value,)
    try:
        sys.stdout.write(line)
    except UnicodeEncodeError:
        sys.stdout.write(line.encode('ascii', 'backslashreplace').decode('ascii'))


def _sort_key(name):
    """Order numerically named files by value, all other names after them."""
    stem = os.path.splitext(name)[0]
    try:
        return (0, int(stem), name)
    except ValueError:
        return (1, 0, name)


def _first_element(root, tag):
    """Return the first descendant element called ``tag``, or ``None``."""
    found = root.getElementsByTagName(tag)
    return found[0] if found else None


def _get_text(element):
    """Return the text of ``element``'s first child, or ``u''`` if none."""
    node = element.firstChild
    if node is None or node.nodeType != node.TEXT_NODE:
        return u''
    return node.data


def _set_text(dom, element, text):
    """Set the leading text of ``element``, creating the node if needed."""
    node = element.firstChild
    if node is not None and node.nodeType == node.TEXT_NODE:
        node.data = text
    else:
        # insertBefore(new, None) appends, so this also covers <tag/>.
        element.insertBefore(dom.createTextNode(text), node)


def list_annotations(directory):
    """Return the ``.xml`` file names in ``directory`` in numeric order.

    Entries that are not regular files or do not end in ``.xml`` are
    ignored. Names whose stem is not an integer sort after the numeric ones.
    """
    names = [
        name for name in os.listdir(directory)
        if name.lower().endswith('.xml')
        and os.path.isfile(os.path.join(directory, name))
    ]
    names.sort(key=_sort_key)
    return names


def clean_annotation(xml_path):
    """Rewrite one annotation file in place and return its image id.

    The image id is the ``<filename>`` text without a ``.jpg`` suffix.
    ``None`` is returned when the file has no usable ``<filename>``.
    Parse errors from ``xml.dom.minidom.parse`` propagate to the caller.
    """
    dom = xdm.parse(xml_path)
    root = dom.documentElement

    # change folder name
    folder = _first_element(root, 'folder')
    if folder is None:
        _emit(u'warning: no <folder> element in %s' % xml_path)
    else:
        _emit(_get_text(folder))
        _set_text(dom, folder, VOC_FOLDER)

    # change image name
    image_id = None
    filename = _first_element(root, 'filename')
    current = _get_text(filename) if filename is not None else u''
    if not current:
        _emit(u'warning: no <filename> text in %s' % xml_path)
    else:
        _emit(current)
        if current.lower().endswith(IMAGE_SUFFIX):
            # Already suffixed (e.g. the script was run before): do not
            # produce "x.jpg.jpg"; the id is the name without the suffix.
            image_id = current[:-len(IMAGE_SUFFIX)]
        else:
            image_id = current
            _set_text(dom, filename, current + IMAGE_SUFFIX)

    # delete path
    stale_path = _first_element(root, 'path')
    if stale_path is not None:
        # Detach via the real parent so a nested <path> cannot raise.
        stale_path.parentNode.removeChild(stale_path)

    # Serialise before opening the file so a failure cannot truncate it, and
    # write bytes so the content matches the declared utf-8 encoding.
    payload = dom.toxml(encoding='utf-8')
    with open(xml_path, 'wb') as handle:
        handle.write(payload)
    return image_id


def process_directory(directory, list_file):
    """Clean every annotation in ``directory``; append ids to ``list_file``.

    Returns the list of processed file names. ``list_file`` is opened once
    in append mode and is left untouched when there is nothing to process.
    """
    names = list_annotations(directory)
    _emit(names)
    if not names:
        return names
    with io.open(list_file, 'a', encoding='utf-8') as out:
        for name in names:
            image_id = clean_annotation(os.path.join(directory, name))
            if image_id is not None:
                out.write(u'%s\n' % image_id)
    return names


if __name__ == '__main__':
    process_directory(path, train_txt)
参考博客：https://saicoco.github.io/object-detection-4/