做深度学习等模型训练工作时经常需要批量处理一些数据文件,为了方便查阅,笔者将会在本篇博客持续更新汇总一些常用的数据文件批量处理脚本,文中大多脚本都参考了网络现有资源加以改进,给出了多种实现方法(包括标准库和第三方库的方法以及自己编写的更接近底层的、可以根据需求灵活改变的方法),文章中将会介绍脚本作用以及需要修改的地方并给出使用示例。
Input
或判断
即可)Output
import os
def get_file_name(file_dir,file_type=''):
    """Collect the names of the files found under ``file_dir``.

    The directory tree is walked recursively with ``os.walk``. Only bare
    file names are returned, without their directory part.

    Args:
        file_dir: Directory to walk.
        file_type: Extension to keep, including the leading dot (for
            example ``'.xml'``). The default ``''`` keeps every file.

    Returns:
        A list of file names in the order ``os.walk`` yields them.
    """
    keep_everything = file_type == ''
    wanted_suffix = str(file_type)
    return [
        entry
        for _root, _dirs, entries in os.walk(file_dir)
        for entry in entries
        if keep_everything or os.path.splitext(entry)[1] == wanted_suffix
    ]
from glob import glob
# glob returns the paths of all entries that match the pattern.
# '*' matches any name, so a bare '*' selects every entry directly under file_dir.
# Change the pattern as needed to select only files that follow a given naming scheme.
# NOTE(review): file_dir is not defined in this snippet and is concatenated as-is,
# so it presumably has to end with a path separator - confirm.
src_file_list = glob(file_dir + '*')
通过glob方法还可以提取符合规定命名形式的文件,进一步了解可以看《python之glob模块以及根据路径获取文件名》。
import os
# List the names of the entries directly inside file_dir (no recursion into sub-folders).
os.listdir(file_dir)
只需要在第一个脚本的基础上,对获取到的文件进行重命名即可(文件名称包括文件显示名以及文件扩展类型名称,改变文件类型本质也是重命名),重命名需要用到Python字符串(str)的replace方法以及os库中的rename方法。
读者只需要根据需求修改Newdir的命名形式即可。
import os
import re
# Collect the names of all files of type file_type under the folder file_dir.
def file_name(file_dir,file_type=''):
    """Return the names of the files below ``file_dir``.

    Args:
        file_dir: Directory that is walked recursively.
        file_type: Extension filter including the dot (``'.xml'``); the
            default ``''`` means "every file".

    Returns:
        List of bare file names (no directory component).
    """
    names = []
    for _, _, filenames in os.walk(file_dir):
        if file_type == '':
            # No filter requested: take the directory's files as they are.
            names.extend(filenames)
        else:
            suffix = str(file_type)
            names.extend(f for f in filenames if os.path.splitext(f)[1] == suffix)
    return names
file_dir = "C:/Users/MECHREVO/Desktop/test/"
files = file_name(file_dir)

# Batch rename: change the extension of every .xml file to .txt.
# NOTE: file_name() also reports files from sub-folders but returns bare names,
# so this loop only works for files that sit directly inside file_dir.
for name in files:
    stem, ext = os.path.splitext(name)
    # Only a trailing ".xml" is an extension. str.replace(".xml", ".txt") would
    # also rewrite a ".xml" that appears in the middle of a name.
    if ext != ".xml":
        continue
    Olddir = os.path.join(file_dir, name)
    # Adapt this line to build whatever new name you need.
    Newdir = os.path.join(file_dir, stem + ".txt")
    os.rename(Olddir, Newdir)
更简便的方法:用Python复制文件的9个方法
function
Input
import shutil
import os
# Copy a file into the given directory.
# srcfile: path of the file to copy
# dstpath: destination directory
def mycopyfile(srcfile,dstpath):
    """Copy ``srcfile`` into the directory ``dstpath``.

    The destination directory is created when it does not exist yet. When
    ``srcfile`` is not an existing file a message is printed and nothing is
    copied.

    Args:
        srcfile: Path of the file to copy.
        dstpath: Destination directory, with or without a trailing separator.

    Returns:
        None.
    """
    if not os.path.isfile(srcfile):
        print ("%s not exist!"%(srcfile))
        return
    fname = os.path.basename(srcfile)
    os.makedirs(dstpath, exist_ok=True)
    # os.path.join inserts the separator when dstpath lacks a trailing one;
    # plain concatenation would silently produce "<dstpath><fname>" instead.
    dstfile = os.path.join(dstpath, fname)
    shutil.copy(srcfile, dstfile)
    print ("copy %s -> %s"%(srcfile, dstfile))
当你搜索了多个来源的Voc数据集时,可能会面临数据路径与名称无法对应的问题,需要对数据集进行融合,即对标注文件内容进行批量修改。
import os
import re
# Collect the names of all files of type file_type under the folder file_dir.
def get_file_name(file_dir,file_type=''):
    """List file names below ``file_dir``, optionally filtered by extension.

    Args:
        file_dir: Root directory; sub-directories are visited as well.
        file_type: Extension with leading dot to keep (``'.xml'``), or ``''``
            (default) to keep all files.

    Returns:
        List of file names without any directory part.
    """
    def wanted(fname):
        # An empty filter accepts everything; otherwise compare the extension.
        if file_type == '':
            return True
        return os.path.splitext(fname)[1] == str(file_type)

    collected = []
    for _current, _subdirs, fnames in os.walk(file_dir):
        collected += filter(wanted, fnames)
    return collected
file_dir = "C:/Users/MECHREVO/Desktop/test/"
files = get_file_name(file_dir)

# Extension of the files being renumbered: '.xml' for annotations, '.jpg' for images.
file_ext = '.xml'
# First number to assign. When merging several datasets, set this to the last
# number already used plus one.
start_index = 0

import re
# Escaped and anchored so that only a trailing extension is stripped. The bare
# pattern r'.xml' matches any character followed by "xml" anywhere in the name.
ext_pattern = re.compile(re.escape(file_ext) + r'$')

# Pair every file with the number in its name and sort numerically, so that
# files are renumbered in numeric order rather than directory-listing order.
numbered_files = sorted(
    (int(ext_pattern.sub('', each)), each)
    for each in files
    if each.endswith(file_ext)
)
file_name_num = [number for number, _ in numbered_files]

# Pass 1: move every file to a temporary name. Renaming straight to the final
# name could hit a name that still belongs to a file not renamed yet.
temp_names = []
for _, name in numbered_files:
    temp_name = '__renaming__' + name
    os.rename(os.path.join(file_dir, name), os.path.join(file_dir, temp_name))
    temp_names.append(temp_name)

# Pass 2: hand out consecutive numbers in sorted order.
for i, temp_name in enumerate(temp_names, start_index):
    Olddir = os.path.join(file_dir, temp_name)
    Newdir = os.path.join(file_dir, str(i) + file_ext)
    os.rename(Olddir, Newdir)
对于每一个数据集的img和ann文件,我们都需要进行上述操作,并注意改变起始标号,比如:当处理第一个数据集时,我们设置起始标号为0,一直到完毕100,那么下一个数据集的标号就应该从101开始,继续往下标号。
最后将三个数据集的文件合并到一起(此时它们已经按顺序命名):
import os
import os.path
import xml.dom.minidom
path = "C:/Users/MECHREVO/Desktop/test/"  # folder that holds the xml annotation files
# Names of the entries in that folder.
files=os.listdir(path)

# Rewrite <filename> in every annotation so that it names the image sharing
# the annotation file's base name.
count=0
for xmlFile in files:
    xml_full_path = os.path.join(path, xmlFile)
    # Test the full path: a bare entry name would be resolved against the
    # current working directory instead of `path`.
    if os.path.isdir(xml_full_path):
        continue
    # Anything that is not an xml file cannot be parsed below.
    if not xmlFile.lower().endswith('.xml'):
        continue
    # splitext keeps dots that belong to the base name; split('.')[0] would
    # cut the name at its first dot.
    name1 = os.path.splitext(xmlFile)[0]
    dom=xml.dom.minidom.parse(xml_full_path)
    root=dom.documentElement
    newfilename = root.getElementsByTagName('filename')
    newfilename[0].firstChild.data = name1+'.jpg'
    # writexml() emits no encoding declaration, so readers will assume UTF-8.
    # Write UTF-8 explicitly instead of the platform default encoding.
    with open(xml_full_path, 'w', encoding='utf-8') as fh:
        dom.writexml(fh)
    print('写入name/pose OK!')
    count = count + 1
# Rewrite <path> in every annotation so that it points at the local image file.
count=0
# Root folder of the images (where the images are stored locally).
img_dir = "C:\\Users\\MECHREVO\\data\\image\\"
for xmlFile in files:
    xml_full_path = os.path.join(path, xmlFile)
    # Test the full path: a bare entry name would be resolved against the
    # current working directory instead of `path`.
    if os.path.isdir(xml_full_path):
        continue
    # Anything that is not an xml file cannot be parsed below.
    if not xmlFile.lower().endswith('.xml'):
        continue
    # splitext keeps dots that belong to the base name; split('.')[0] would
    # cut the name at its first dot.
    name1 = os.path.splitext(xmlFile)[0]
    dom=xml.dom.minidom.parse(xml_full_path)
    root=dom.documentElement
    newpath = root.getElementsByTagName('path')
    newpath[0].firstChild.data = img_dir + name1 +'.jpg'
    # writexml() emits no encoding declaration, so readers will assume UTF-8.
    # Write UTF-8 explicitly instead of the platform default encoding.
    with open(xml_full_path, 'w', encoding='utf-8') as fh:
        dom.writexml(fh)
    print('写入name/pose OK!')
    count = count + 1
利用python的xml模块对xml文件进行操作时,写出的文件开头经常会多出一行版本声明 `<?xml version="1.0" ?>`,利用下列脚本去掉即可(不去应该也行)
# 去除版本号
import os
def listFiles(fileDir="C:/Users/MECHREVO/data/result/", header='<?xml version="1.0" ?>'):
    """Remove an XML declaration line from every file under ``fileDir``.

    Args:
        fileDir: Directory whose files (including those in sub-directories)
            are rewritten in place.
        header: Text to delete from each line. The default is the declaration
            that ``xml.dom.minidom``'s ``writexml`` emits when it is called
            without an encoding.

    Returns:
        None.
    """
    # Collect the paths first so no file is rewritten while the tree is walked.
    fileList = []
    for root, dirs, files in os.walk(fileDir):
        for fileObj in files:
            fileList.append(os.path.join(root, fileObj))
    for fileObj in fileList:
        # The context manager closes the file even when reading or writing fails.
        with open(fileObj,'r+') as f:
            all_the_lines = f.readlines()
            # Rewind and empty the file before writing the cleaned lines back.
            f.seek(0)
            f.truncate()
            f.writelines(line.replace(header, '') for line in all_the_lines)
listFiles()
至此我们就可以将多个数据集融合为一个数据集,实现标注文件与图像文件一一对应。
Dataset-Augment: https://github.com/mickkky/Dataset-Augment
需要修改:(在最底下的main函数里面)
IMG_DIR :原始数据集图片的文件夹路径
XML_DIR: 原始xml文件的文件夹路径
AUG_XML_DIR :数据增强后的xml文件的存储路径
AUG_IMG_DIR:数据增强后的图片的存储路径
AUGLOOP :每张图片增强多少次(我设的是10)
注意:使用前需要先确认原始的图片和xml文件夹内的文件是不是一一对应的,比如图片文件夹有一张abc.jpg,则xml文件夹一定有一个abc.xml文件。
import xml.etree.ElementTree as ET
import pickle
import os
from os import getcwd
import numpy as np
from PIL import Image
import shutil
import matplotlib.pyplot as plt
import imgaug as ia
from imgaug import augmenters as iaa
# Fix the seed of imgaug's random state (presumably so that repeated runs
# produce the same augmentations - confirm against the imgaug documentation).
ia.seed(1)
def read_xml_annotation(root, image_id):
    """Read every bounding box from one VOC-style annotation file.

    Args:
        root: Directory that contains the annotation file.
        image_id: File name of the annotation inside ``root`` (including its
            extension).

    Returns:
        A list with one ``[xmin, ymin, xmax, ymax]`` entry (ints) per
        ``<object>`` element, in document order. The list is empty when the
        file has no ``<object>`` element.
    """
    # Passing the path lets ElementTree open and close the file itself.
    tree = ET.parse(os.path.join(root, image_id))
    xml_root = tree.getroot()
    bndboxlist = []
    for obj in xml_root.findall('object'):
        bndbox = obj.find('bndbox')
        xmin = int(bndbox.find('xmin').text)
        xmax = int(bndbox.find('xmax').text)
        ymin = int(bndbox.find('ymin').text)
        ymax = int(bndbox.find('ymax').text)
        bndboxlist.append([xmin, ymin, xmax, ymax])
    return bndboxlist
# Example: (506.0000, 330.0000, 528.0000, 348.0000) -> (520.4747, 381.5080, 540.5596, 398.6603)
def change_xml_annotation(root, image_id, new_target):
    """Overwrite the first bounding box of an annotation file.

    Only the first ``<object>`` element is updated.

    Args:
        root: Directory that contains the annotation file (a folder, not the
            XML root element).
        image_id: Base name of the annotation, without the ``.xml`` suffix.
        new_target: Sequence ``(xmin, ymin, xmax, ymax)`` with the new
            coordinates.

    Returns:
        None. The annotation file is rewritten in place.
    """
    new_xmin = new_target[0]
    new_ymin = new_target[1]
    new_xmax = new_target[2]
    new_ymax = new_target[3]

    xml_path = os.path.join(root, str(image_id) + '.xml')
    # Passing the path lets ElementTree open and close the file itself.
    tree = ET.parse(xml_path)
    bndbox = tree.getroot().find('object').find('bndbox')
    bndbox.find('xmin').text = str(new_xmin)
    bndbox.find('ymin').text = str(new_ymin)
    bndbox.find('xmax').text = str(new_xmax)
    bndbox.find('ymax').text = str(new_ymax)
    # The previous output name was built from the builtin `id` with a "%06d"
    # format and always raised TypeError, so no intended name can be recovered.
    # NOTE(review): writing back to the source file is an assumption - confirm.
    tree.write(xml_path)
def change_xml_list_annotation(root, image_id, new_target, saveroot, id,img_name):
    """Write a copy of an annotation with every bounding box replaced.

    Args:
        root: Directory that contains the source annotation (a folder, not
            the XML root element).
        image_id: Base name of the source annotation, without ``.xml``.
        new_target: One ``[xmin, ymin, xmax, ymax]`` entry per ``<object>``
            element, in document order.
        saveroot: Existing directory that receives the new annotation.
        id: Sequence number appended to the output name as ``_%06d``.
        img_name: Base name used for the output annotation and for the image
            named in its ``<filename>`` element.

    Returns:
        None. The result is written to
        ``<saveroot>/<img_name>_<id as 6 digits>.xml``.

    Raises:
        IndexError: If ``new_target`` has fewer entries than the annotation
            has ``<object>`` elements.
    """
    # Passing the path lets ElementTree open and close the file itself.
    tree = ET.parse(os.path.join(root, str(image_id) + '.xml'))
    new_stem = img_name + str("_%06d" % int(id))

    elem = tree.find('filename')
    # Annotations without a <filename> element are still converted.
    if elem is not None:
        elem.text = (new_stem + '.jpg')

    xmlroot = tree.getroot()
    for index, obj in enumerate(xmlroot.findall('object')):
        bndbox = obj.find('bndbox')
        new_xmin = new_target[index][0]
        new_ymin = new_target[index][1]
        new_xmax = new_target[index][2]
        new_ymax = new_target[index][3]
        bndbox.find('xmin').text = str(new_xmin)
        bndbox.find('ymin').text = str(new_ymin)
        bndbox.find('xmax').text = str(new_xmax)
        bndbox.find('ymax').text = str(new_ymax)

    tree.write(os.path.join(saveroot, new_stem + '.xml'))
def mkdir(path):
    """Create the directory ``path`` unless it already exists.

    Surrounding whitespace and trailing backslashes are removed from ``path``
    before it is used. A message is printed in both cases.

    Args:
        path: Directory to create; missing parent directories are created too.

    Returns:
        True when the directory was created, False when the path already
        existed.
    """
    # Normalise: trim whitespace first, then drop any trailing "\" characters.
    path = path.strip().rstrip("\\")

    if os.path.exists(path):
        # Nothing to do; just report it.
        print(path + ' 目录已存在')
        return False

    os.makedirs(path)
    print(path + ' 创建成功')
    return True
if __name__ == "__main__":
    IMG_DIR = "C:/Users/MECHREVO/data/image"
    XML_DIR = "C:/Users/MECHREVO/data/result"

    # Folder that receives the augmented XML annotations (e.g. "./Annotations").
    AUG_XML_DIR = "C:/Users/MECHREVO/data/new_xml"
    # Start from an empty output folder; a folder that does not exist yet is fine.
    try:
        shutil.rmtree(AUG_XML_DIR)
    except FileNotFoundError:
        pass
    mkdir(AUG_XML_DIR)

    # Folder that receives the augmented images (e.g. "./JPEGImages").
    AUG_IMG_DIR = "C:/Users/MECHREVO/data/new_img"
    try:
        shutil.rmtree(AUG_IMG_DIR)
    except FileNotFoundError:
        pass
    mkdir(AUG_IMG_DIR)

    AUGLOOP = 10  # number of augmented copies generated per image

    # Augmentation pipeline.
    seq = iaa.Sequential([
        iaa.Flipud(0.5),  # vertical flip, probability 0.5
        iaa.Fliplr(0.5),  # horizontal mirror, probability 0.5
        iaa.Multiply((1.2, 1.5)),  # change brightness, doesn't affect BBs
        iaa.GaussianBlur(sigma=(0, 3.0)),  # blur with sigma drawn from [0, 3.0]
        iaa.Affine(
            translate_px={"x": 15, "y": 15},
            scale=(0.8, 0.95),
            rotate=(-30, 30)
        )  # shift 15px on x/y, scale to 80-95%, rotate -30..30 degrees; affects BBs
    ])

    for root, sub_folders, files in os.walk(XML_DIR):
        for name in files:
            # splitext handles any extension length; only annotations are processed.
            stem, ext = os.path.splitext(name)
            if ext.lower() != '.xml':
                continue
            print(name)

            img_path = os.path.join(IMG_DIR, stem + '.jpg')
            bndbox = read_xml_annotation(XML_DIR, name)
            # Keep the untouched originals alongside the augmented copies.
            shutil.copy(os.path.join(XML_DIR, name), AUG_XML_DIR)
            shutil.copy(img_path, AUG_IMG_DIR)

            # The source image is the same for every round, so load it once.
            img = np.asarray(Image.open(img_path))

            for epoch in range(AUGLOOP):
                # Freeze the random parameters so that boxes and image receive
                # the same transformation within this round.
                seq_det = seq.to_deterministic()

                # Augmented boxes of this round: [[x1, y1, x2, y2], ...]
                new_bndbox_list = []
                bbs = None
                for box in bndbox:
                    bbs = ia.BoundingBoxesOnImage([
                        ia.BoundingBox(x1=box[0], y1=box[1], x2=box[2], y2=box[3]),
                    ], shape=img.shape)

                    bbs_aug = seq_det.augment_bounding_boxes([bbs])[0]
                    aug_box = bbs_aug.bounding_boxes[0]

                    # Clamp the coordinates to [1, image width/height].
                    n_x1 = int(max(1, min(img.shape[1], aug_box.x1)))
                    n_y1 = int(max(1, min(img.shape[0], aug_box.y1)))
                    n_x2 = int(max(1, min(img.shape[1], aug_box.x2)))
                    n_y2 = int(max(1, min(img.shape[0], aug_box.y2)))
                    # Keep a box that collapsed onto the left/top edge at least 1px wide/high.
                    if n_x1 == 1 and n_x1 == n_x2:
                        n_x2 += 1
                    if n_y1 == 1 and n_y2 == n_y1:
                        n_y2 += 1
                    if n_x1 >= n_x2 or n_y1 >= n_y2:
                        print('error', name)
                    new_bndbox_list.append([n_x1, n_y1, n_x2, n_y2])

                # Save the augmented image.
                image_aug = seq_det.augment_images([img])[0]
                out_stem = stem + str("_%06d" % (epoch + 1))
                path = os.path.join(AUG_IMG_DIR, out_stem + '.jpg')
                # An annotation without boxes leaves `bbs` unset; save the
                # augmented image directly in that case.
                if bbs is None:
                    image_auged = image_aug
                else:
                    image_auged = bbs.draw_on_image(image_aug, thickness=0)
                Image.fromarray(image_auged).save(path)

                # Save the matching annotation.
                change_xml_list_annotation(XML_DIR, stem, new_bndbox_list, AUG_XML_DIR,
                                           epoch + 1, stem)
                print(out_stem + '.jpg')
数据格式的转换实际是annotation标注文件的转化
将xml文件转成json文件,训练集的所有xml文件会得到一个json文件,验证集同理。
需要修改:(在最下面)
xml_path :xml文件夹的路径(注意!是文件夹,不是文件,最后不需要反斜杠)
json_file :要导出的json文件的路径
category_name_set:指定类别名称(如果设置为空的话,那么将会按照xml中出现的顺序添加)
import xml.etree.ElementTree as ET
import os
import json
# Skeleton of the COCO-style document; the add*Item functions append to its lists.
coco = {
    'images': [],
    'type': 'instances',
    'annotations': [],
    'categories': [],
}
# Category name -> category id.
category_set = {}
# File names of the images that have already been added.
image_set = set()
# Running counters / state shared by the add*Item functions.
category_item_id = 0
image_id = ''
id_num = 0
annotation_id = 0
def addCatItem(name):
    """Register a new category and return its id.

    Increments the module-level ``category_item_id``, appends the category to
    ``coco['categories']`` and records ``name -> id`` in ``category_set``.

    Args:
        name: Category name.

    Returns:
        The id assigned to the category.
    """
    global category_item_id
    category_item_id += 1
    coco['categories'].append({
        'supercategory': 'none',
        'id': category_item_id,
        'name': name,
    })
    category_set[name] = category_item_id
    return category_item_id
def addImgItem(file_name, size):
    """Register an image and return its id.

    The id is the current value of the module-level counter ``id_num``, which
    is then incremented. The image is appended to ``coco['images']`` and its
    file name is added to ``image_set``.

    Args:
        file_name: Image file name taken from the annotation.
        size: Mapping with the keys ``'width'`` and ``'height'``.

    Returns:
        The id assigned to the image.

    Raises:
        Exception: If ``file_name``, ``size['width']`` or ``size['height']``
            is None.
    """
    global image_id,id_num
    # Validate in the same order as the messages: filename, width, height.
    if file_name is None:
        raise Exception('Could not find filename tag in xml file.')
    if size['width'] is None:
        raise Exception('Could not find width tag in xml file.')
    if size['height'] is None:
        raise Exception('Could not find height tag in xml file.')

    assigned_id = int(str(id_num))
    id_num += 1
    coco['images'].append({
        'id': assigned_id,
        'file_name': file_name,
        'width': size['width'],
        'height': size['height'],
    })
    image_set.add(file_name)
    return assigned_id
def addAnnoItem(object_name, image_id, category_id, bbox):
    """Append one annotation to ``coco['annotations']``.

    The segmentation is the rectangle of ``bbox`` listed as the corner points
    left-top, left-bottom, right-bottom, right-top. The module-level counter
    ``annotation_id`` is incremented and used as the annotation's id.

    Args:
        object_name: Name of the annotated object (not stored).
        image_id: Id of the image the annotation belongs to.
        category_id: Id of the object's category.
        bbox: ``[x, y, w, h]`` of the bounding box.
    """
    global annotation_id
    x, y, w, h = bbox[0], bbox[1], bbox[2], bbox[3]
    polygon = [
        x, y,          # left_top
        x, y + h,      # left_bottom
        x + w, y + h,  # right_bottom
        x + w, y,      # right_top
    ]
    annotation_id += 1
    coco['annotations'].append({
        'segmentation': [polygon],
        'area': w * h,
        'iscrowd': 0,
        'ignore': 0,
        'image_id': image_id,
        'bbox': bbox,
        'category_id': category_id,
        'id': annotation_id,
    })
def parseXmlFiles(xml_path):
    """Convert every ``.xml`` annotation in ``xml_path`` into COCO entries.

    For each file the image is registered through ``addImgItem`` and every
    ``<object>`` with a ``<bndbox>`` is registered through ``addAnnoItem``.
    Unknown category names are added through ``addCatItem``.

    The image is registered while handling the first element that follows
    both ``<filename>`` and ``<size>``, so those two elements must come
    before the ``<object>`` elements.

    Args:
        xml_path: Directory that contains the annotation files.

    Raises:
        Exception: If the root element is not ``annotation``, a file name
            occurs twice, or the ``<size>``/``<bndbox>`` structure is broken.
    """
    for f in os.listdir(xml_path):
        if not f.endswith('.xml'):
            continue

        bndbox = dict()
        size = dict()
        current_image_id = None
        current_category_id = None
        file_name = None
        size['width'] = None
        size['height'] = None
        size['depth'] = None

        xml_file = os.path.join(xml_path, f)
        print(xml_file)

        tree = ET.parse(xml_file)
        root = tree.getroot()
        if root.tag != 'annotation':
            raise Exception('pascal voc xml root element should be annotation, rather than {}'.format(root.tag))

        # elem is <folder>, <filename>, <size>, <object>
        for elem in root:
            current_parent = elem.tag
            current_sub = None
            object_name = None

            if elem.tag == 'folder':
                continue

            if elem.tag == 'filename':
                file_name = elem.text
                # Duplicates are tracked in image_set; category_set holds
                # category names, not file names.
                if file_name in image_set:
                    raise Exception('file_name duplicated')

            # add img item only after parse <size> tag
            elif current_image_id is None and file_name is not None and size['width'] is not None:
                if file_name not in image_set:
                    current_image_id = addImgItem(file_name, size)
                    print('add image with {} and {}'.format(file_name, size))
                else:
                    raise Exception('duplicated image: {}'.format(file_name))

            # subelem is <width>, <height>, <depth>, <name>, <bndbox>
            for subelem in elem:
                # Reset per sub-element so only a <bndbox> parsed below counts.
                bndbox ['xmin'] = None
                bndbox ['xmax'] = None
                bndbox ['ymin'] = None
                bndbox ['ymax'] = None

                current_sub = subelem.tag
                if current_parent == 'object' and subelem.tag == 'name':
                    object_name = subelem.text
                    if object_name not in category_set:
                        current_category_id = addCatItem(object_name)
                    else:
                        current_category_id = category_set[object_name]

                elif current_parent == 'size':
                    if size[subelem.tag] is not None:
                        raise Exception('xml structure broken at size tag.')
                    size[subelem.tag] = int(subelem.text)

                # option is <xmin>, <ymin>, <xmax>, <ymax>, when subelem is <bndbox>
                for option in subelem:
                    if current_sub == 'bndbox':
                        if bndbox[option.tag] is not None:
                            raise Exception('xml structure corrupted at bndbox tag.')
                        bndbox[option.tag] = int(option.text)

                # only after parse the <bndbox> of an <object>
                if bndbox['xmin'] is not None:
                    if object_name is None:
                        raise Exception('xml structure broken at bndbox tag')
                    if current_image_id is None:
                        raise Exception('xml structure broken at bndbox tag')
                    if current_category_id is None:
                        raise Exception('xml structure broken at bndbox tag')
                    bbox = []
                    # x
                    bbox.append(bndbox['xmin'])
                    # y
                    bbox.append(bndbox['ymin'])
                    # w
                    bbox.append(bndbox['xmax'] - bndbox['xmin'])
                    # h
                    bbox.append(bndbox['ymax'] - bndbox['ymin'])
                    print('add annotation with {},{},{},{}'.format(object_name, current_image_id, current_category_id, bbox))
                    addAnnoItem(object_name, current_image_id, current_category_id, bbox )
if __name__ == '__main__':
    # Register these categories first so their ids follow this order. Names
    # not listed here are added when they first appear in an xml file.
    category_name_set = ('class1','class2','class3','class4','class5')
    for each in category_name_set:
        addCatItem(each)

    xml_path = "G:/guangdian/new_data/train/ann"
    json_file = 'G:/guangdian/new_data/train/train.json'

    parseXmlFiles(xml_path)
    # The context manager flushes and closes the output file.
    with open(json_file, 'w') as fh:
        json.dump(coco, fh)