# 用于深度学习、机器学习中划分自己的数据集。例如 train_scale=0.6、val_scale=0.2、test_scale=0.2,即把该文件夹按 0.6、0.2、0.2 的比例随机划分为训练集、验证集和测试集,数值可根据需要自行更改。适用于文件夹包含多个子文件夹(每个子文件夹为一个类别)的情况,每个类别均按比例划分。
# 方式一:将图片本身按照指定比例划分(复制)到新的文件夹
# 方式二:将图片路径按照指定比例划分(写入)到新的txt文件
# ----------------------------------------- Dataset_divide -----------------------------------------------#
import os
import random
from shutil import copy2
def data_set_split(src_data_folder1, target_data_folder1, train_scale=0.6, val_scale=0.2, test_scale=0.2):
    """
    Split a class-per-subfolder dataset into train/val/test folders by copying files.

    Every sub-directory of ``src_data_folder1`` is treated as one class. For each
    class the files are shuffled and copied to
    ``<target>/train/<class>``, ``<target>/val/<class>`` and ``<target>/test/<class>``.
    The source folder is left untouched.

    :param src_data_folder1: source folder containing one sub-folder per class
    :param target_data_folder1: destination folder; created (with parents) if missing
    :param train_scale: fraction of each class copied to ``train``
    :param val_scale: fraction of each class copied to ``val``
    :param test_scale: only used in the summary message; ``test`` always receives
        whatever remains after the train and val portions
    :return: None
    """
    print("开始数据集划分")

    # Only directories are classes; a stray file in the source root (e.g. a
    # README or .DS_Store) must not be listed as if it were a class folder.
    class_names = [
        entry for entry in os.listdir(src_data_folder1)
        if os.path.isdir(os.path.join(src_data_folder1, entry))
    ]

    # Create <target>/<split>/<class> for every combination. makedirs also
    # creates the target root itself when it does not exist yet.
    for split_name in ('train', 'val', 'test'):
        split_path = os.path.join(target_data_folder1, split_name)
        os.makedirs(split_path, exist_ok=True)
        for class_name in class_names:
            os.makedirs(os.path.join(split_path, class_name), exist_ok=True)

    for class_name in class_names:
        class_dir = os.path.join(src_data_folder1, class_name)
        # copy2 cannot copy directories, so keep regular files only.
        file_names = [
            entry for entry in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, entry))
        ]
        random.shuffle(file_names)
        total = len(file_names)

        # Integer cut points. The small tolerance absorbs float noise such as
        # 0.7 + 0.1 == 0.7999999999999999, which would otherwise lose a sample.
        train_end = min(total, max(0, int(total * train_scale + 1e-9)))
        val_end = min(total, max(train_end, int(total * (train_scale + val_scale) + 1e-9)))

        train_folder = os.path.join(target_data_folder1, 'train', class_name)
        val_folder = os.path.join(target_data_folder1, 'val', class_name)
        test_folder = os.path.join(target_data_folder1, 'test', class_name)

        # Half-open slices give exactly train_end / (val_end - train_end) / rest
        # files; the previous "index <= limit" test put one extra file in train.
        portions = (
            (train_folder, file_names[:train_end]),
            (val_folder, file_names[train_end:val_end]),
            (test_folder, file_names[val_end:]),
        )
        for folder, names in portions:
            for file_name in names:
                copy2(os.path.join(class_dir, file_name), folder)

        train_num = train_end
        val_num = val_end - train_end
        test_num = total - val_end

        print("*********************************{}*************************************".format(class_name))
        print(
            "{}类按照{}:{}:{}的比例划分完成,一共{}张图片".format(
                class_name, train_scale, val_scale, test_scale, total))
        print("训练集{}:{}张".format(train_folder, train_num))
        print("验证集{}:{}张".format(val_folder, val_num))
        print("测试集{}:{}张".format(test_folder, test_num))
if __name__ == '__main__':
    # Script entry point: split the source dataset into the target folder
    # using the function's default ratios.
    source_root = r"/public2/LFX/source_data"  # input path
    target_root = r"/public2/LFX/target_data"  # output path
    data_set_split(source_root, target_root)
# ------------------------------------- 直接随机划分训练、验证、测试图片路径到txt文件 ----------------------------------------#
import os
import random
import math
# Maps each class sub-folder name to its integer label. For example, a folder
# with the two sub-folders "dog" and "cat" would use {"dog": 0, "cat": 1}.
# Edit this mapping to match your own dataset.
Tag_array = {
    "F16": 0,
    "F22": 1,
    "B52": 2,
    "F15": 3,
    "A10": 4,
    "747": 5,
    "B2": 6,
    "IL76": 7,
    "JAS39": 8,
    "E2C": 9,
}
def data_list(src_data_folder, train_txt=None, val_txt=None, test_txt=None, train_scale=None, val_scale=None):
    """
    Write shuffled ``<image path>$<label>`` lines into train/val/test list files.

    ``src_data_folder`` is walked with ``os.walk``. Every directory whose name is
    a key of the module-level ``Tag_array`` is treated as one class: its ``.jpg``
    files are shuffled, split by the given ratios and appended to the three list
    files, one line per image. Directories that are not in ``Tag_array`` are
    skipped with a notice instead of being written with another class's label.

    :param src_data_folder: root folder containing one sub-folder per class
    :param train_txt: path of the training list file
    :param val_txt: path of the validation list file
    :param test_txt: path of the test list file
    :param train_scale: fraction of each class written to the training list
    :param val_scale: fraction of each class written to the validation list;
        the test list receives the remainder (1 - train_scale - val_scale)
    :return: None

    The three list files are emptied before anything is written. A relative
    list-file name is resolved against ``src_data_folder`` (an absolute path is
    used as given).
    """
    # Resolve every list file once so the file that is emptied is the same
    # file that is appended to; os.path.join returns an absolute second
    # argument unchanged.
    list_paths = [
        os.path.join(src_data_folder, txt_path)
        for txt_path in (train_txt, val_txt, test_txt)
    ]
    for list_path in list_paths:
        with open(list_path, 'w'):  # opening in 'w' mode discards old content
            pass

    train_sum, val_sum, test_sum = 0, 0, 0
    for root_dir, sub_dirs, _ in os.walk(src_data_folder):
        for sub_dir in sub_dirs:
            if sub_dir not in Tag_array:
                # Without a label of its own the folder would be written with
                # a stale label from a previously visited class.
                print("跳过{}: 未在Tag_array中定义类别".format(sub_dir))
                continue
            label = Tag_array[sub_dir]
            class_dir = os.path.join(root_dir, sub_dir)
            file_names = [name for name in os.listdir(class_dir) if name.endswith('.jpg')]
            random.shuffle(file_names)
            total = len(file_names)

            # Integer cut points. The small tolerance absorbs float noise such
            # as 0.7 + 0.1 == 0.7999999999999999.
            train_end = min(total, max(0, math.floor(train_scale * total + 1e-9)))
            val_end = min(total, max(train_end, math.floor((train_scale + val_scale) * total + 1e-9)))
            portions = (
                file_names[:train_end],
                file_names[train_end:val_end],
                file_names[val_end:],
            )

            # One append per list file per class instead of one open() per line.
            # The path is built from class_dir so that it names the folder the
            # file was actually listed from.
            for list_path, names in zip(list_paths, portions):
                if not names:
                    continue
                with open(list_path, mode='a') as file:
                    file.writelines(
                        os.path.join(class_dir, name) + '$' + str(label) + '\n'
                        for name in names
                    )

            train, val, test = (len(names) for names in portions)
            print("{}数据集, 划分训练集{}张, 验证集{}, 测试集{}".format(sub_dir, train, val, test))
            train_sum += train
            val_sum += val
            test_sum += test
    print("\n共{}张, 划分训练集{}张, 验证集{}, 测试集{}".format(train_sum + val_sum + test_sum, train_sum, val_sum, test_sum))
if __name__ == '__main__':
    # Script entry point: write the train/val/test image lists for the dataset.
    image_root = r"/home/ubuntu/Desktop/Aircraft_dataset/Train_dataset/img"  # source image folder
    train_list = r"/home/ubuntu/Desktop/Aircraft_dataset/Label_path/train.txt"  # training list file
    val_list = r"/home/ubuntu/Desktop/Aircraft_dataset/Label_path/val.txt"  # validation list file
    test_list = r"/home/ubuntu/Desktop/Aircraft_dataset/Label_path/test.txt"  # test list file
    # The test share is the remainder: 1 - 0.6 - 0.2 = 0.2
    data_list(
        image_root,
        train_list,
        val_list,
        test_list,
        0.6,
        0.2,
    )