数据集按指定比例划分为训练集、验证集和测试集

文章目录

  • 前言
  • 代码一
  • 代码二


前言

用于为深度学习、机器学习任务划分自己的数据集。例如 train_scale=0.6、val_scale=0.2、test_scale=0.2 表示将文件夹中的数据按 0.6、0.2、0.2 的比例随机划分为训练集、验证集和测试集,数值可根据需要自行更改。适用于文件夹下包含多个子文件夹(每个子文件夹为一个类别)的情况,每个类别都会按相同比例划分。

  • 代码一

将图片本身按照指定比例划分到新的文件夹

  • 代码二

将图片路径按照指定比例划分到新的txt文件


代码一

# ----------------------------------------- Dataset_divide -----------------------------------------------#
import os
import random
from shutil import copy2


def data_set_split(src_data_folder1, target_data_folder1, train_scale=0.6, val_scale=0.2, test_scale=0.2):
    """
        Copy a class-per-folder dataset into train/val/test folders, splitting every class by ratio.

        Each sub-directory of ``src_data_folder1`` is treated as one class. The regular files inside it
        are shuffled with ``random.shuffle`` and copied (with ``shutil.copy2``) into
        ``<target>/train/<class>``, ``<target>/val/<class>`` and ``<target>/test/<class>``.
        The first ``int(n * train_scale)`` shuffled files go to train, the files up to
        ``int(n * (train_scale + val_scale))`` go to val, and everything left over goes to test.

        :param src_data_folder1:    source folder containing one sub-folder per class
        :param target_data_folder1: destination folder; created (with parents) when missing
        :param train_scale:         fraction of each class copied to the training set
        :param val_scale:           fraction of each class copied to the validation set
        :param test_scale:          only echoed in the progress report; the test set always
                                    receives whatever train and val did not take
        :return: None
    """
    print("开始数据集划分")
    # Only sub-directories are classes; a stray file in the source root would otherwise be
    # passed to os.listdir() below and abort the whole run.
    class_names = sorted(
        name for name in os.listdir(src_data_folder1)
        if os.path.isdir(os.path.join(src_data_folder1, name))
    )
    split_names = ['train', 'val', 'test']
    # makedirs(exist_ok=True) also creates the target folder itself when it does not exist yet.
    for split_name in split_names:
        split_path = os.path.join(target_data_folder1, split_name)
        os.makedirs(split_path, exist_ok=True)
        for class_name in class_names:
            os.makedirs(os.path.join(split_path, class_name), exist_ok=True)

    for class_name in class_names:
        current_class_data_path = os.path.join(src_data_folder1, class_name)
        # Keep regular files only (copy2 cannot copy a directory); sort before shuffling so the
        # outcome is reproducible when the caller seeds the random module.
        current_all_data = sorted(
            name for name in os.listdir(current_class_data_path)
            if os.path.isfile(os.path.join(current_class_data_path, name))
        )
        random.shuffle(current_all_data)
        current_data_length = len(current_all_data)

        # Half-open cut points: [0, train_end) -> train, [train_end, val_end) -> val, rest -> test.
        # The clamps keep the slices well-formed even for negative or oversized ratios.
        train_end = max(0, int(current_data_length * train_scale))
        val_end = max(train_end, int(current_data_length * (train_scale + val_scale)))

        train_folder = os.path.join(target_data_folder1, 'train', class_name)
        val_folder = os.path.join(target_data_folder1, 'val', class_name)
        test_folder = os.path.join(target_data_folder1, 'test', class_name)
        assignments = (
            (train_folder, current_all_data[:train_end]),
            (val_folder, current_all_data[train_end:val_end]),
            (test_folder, current_all_data[val_end:]),
        )
        for destination_folder, file_names in assignments:
            for file_name in file_names:
                copy2(os.path.join(current_class_data_path, file_name), destination_folder)
        train_num, val_num, test_num = (len(file_names) for _, file_names in assignments)

        print("*********************************{}*************************************".format(class_name))
        print(
            "{}类按照{}:{}:{}的比例划分完成,一共{}张图片".format(
                class_name, train_scale, val_scale, test_scale, current_data_length))
        print("训练集{}:{}张".format(train_folder, train_num))
        print("验证集{}:{}张".format(val_folder, val_num))
        print("测试集{}:{}张".format(test_folder, test_num))


if __name__ == '__main__':
    # Input: folder that holds one sub-folder of images per class.
    source_root = r"/public2/LFX/source_data"
    # Output: folder that receives the train/val/test copies.
    output_root = r"/public2/LFX/target_data"
    data_set_split(src_data_folder1=source_root, target_data_folder1=output_root)

代码二

# ------------------------------------- 直接随机划分训练、验证、测试图片路径到txt文件 ----------------------------------------#
import os
import random
import math

# Maps each class sub-folder name to the integer label written to the txt files. For example, a folder
# with the two sub-folders "dog" and "cat" would use Tag_array = {"dog": 0, "cat": 1}; edit as needed.
Tag_array = {"F16": 0, "F22": 1,  "B52": 2,   "F15": 3, "A10": 4, "747": 5,  "B2": 6, "IL76": 7, "JAS39": 8, "E2C": 9}


def data_list(src_data_folder, train_txt=None, val_txt=None, test_txt=None, train_scale=None, val_scale=None,
              tag_map=None):
    """
        Write shuffled ``<image path>$<label>`` lines for every class folder into train/val/test txt files.

        ``src_data_folder`` is walked with ``os.walk``. Every directory whose name is a key of the tag
        map is treated as a class: its ``.jpg`` files are shuffled, the first
        ``floor(train_scale * n)`` go to the train list, the files up to
        ``floor((train_scale + val_scale) * n)`` go to the val list and the remainder goes to the
        test list. Directories whose name is not in the tag map are skipped (with a notice when
        they contain ``.jpg`` files) instead of inheriting another class's label.

        The three txt files are emptied and then written at exactly the paths given.

        :param src_data_folder: source folder containing the class sub-folders
        :param train_txt:       path of the training-set txt file
        :param val_txt:         path of the validation-set txt file
        :param test_txt:        path of the test-set txt file
        :param train_scale:     fraction of each class written to the training list
        :param val_scale:       fraction of each class written to the validation list; the test
                                fraction is implicitly 1 - train_scale - val_scale
        :param tag_map:         optional ``{folder name: label}`` mapping; defaults to the
                                module-level ``Tag_array``
        :return: None
        :raises TypeError: if any txt path or either ratio is None
    """
    if train_txt is None or val_txt is None or test_txt is None:
        raise TypeError("train_txt, val_txt and test_txt must all be file paths, not None")
    if train_scale is None or val_scale is None:
        raise TypeError("train_scale and val_scale must both be numbers, not None")
    tags = Tag_array if tag_map is None else tag_map

    output_paths = (train_txt, val_txt, test_txt)
    split_lines = ([], [], [])  # lines collected for train, val, test (same order as output_paths)
    totals = [0, 0, 0]

    for root_dir, sub_dirs, _ in os.walk(src_data_folder):
        sub_dirs.sort()  # in-place sort makes os.walk visit the folders in a stable order
        for sub_dir in sub_dirs:
            class_dir = os.path.join(root_dir, sub_dir)
            file_names = sorted(name for name in os.listdir(class_dir) if name.endswith('.jpg'))
            if sub_dir not in tags:
                # Without a label the images cannot be listed correctly; do not guess one.
                if file_names:
                    print("Skipping {}: no label for '{}' in the tag map".format(class_dir, sub_dir))
                continue
            label = str(tags[sub_dir])
            random.shuffle(file_names)

            total = len(file_names)
            # Half-open cut points; the clamps keep the slices well-formed for unusual ratios.
            train_end = max(0, math.floor(train_scale * total))
            val_end = max(train_end, math.floor((train_scale + val_scale) * total))
            parts = (file_names[:train_end], file_names[train_end:val_end], file_names[val_end:])

            for lines, names in zip(split_lines, parts):
                # Build the path from the directory actually being listed so it always exists.
                lines.extend(os.path.join(class_dir, name) + '$' + label + '\n' for name in names)
            train, val, test = (len(names) for names in parts)
            print("{}数据集, 划分训练集{}张, 验证集{}, 测试集{}".format(sub_dir, train, val, test))
            totals[0] += train
            totals[1] += val
            totals[2] += test

    # Empty every output file first, then append, so the result is still complete if two of the
    # given paths happen to point to the same file.
    for path in output_paths:
        with open(path, 'w'):
            pass
    for path, lines in zip(output_paths, split_lines):
        with open(path, 'a') as file:
            file.writelines(lines)

    train_sum, val_sum, test_sum = totals
    print("\n共{}张, 划分训练集{}张, 验证集{}, 测试集{}".format(train_sum + val_sum + test_sum, train_sum, val_sum, test_sum))


if __name__ == '__main__':
    data_list(
        # Folder that holds the source images, one sub-folder per class.
        src_data_folder=r"/home/ubuntu/Desktop/Aircraft_dataset/Train_dataset/img",
        # Destination txt files for the training, validation and test lists.
        train_txt=r"/home/ubuntu/Desktop/Aircraft_dataset/Label_path/train.txt",
        val_txt=r"/home/ubuntu/Desktop/Aircraft_dataset/Label_path/val.txt",
        test_txt=r"/home/ubuntu/Desktop/Aircraft_dataset/Label_path/test.txt",
        train_scale=0.6,
        val_scale=0.2,  # the test set receives the remainder: 1 - 0.6 - 0.2 = 0.2
    )

你可能感兴趣的:(深度学习,深度学习,人工智能,python,计算机视觉)