Python将数据集划分为训练集、验证集和测试集

将数据集(位置:D:/Code/Data/centerlinedata/tem_voc/JPEGImages/)下的621张图片按照划分比例(如 训练集(train):验证集(val):测试集(test)=6:2:2)进行拆分,复制到新的文件夹(D:/Code/Data/GREENTdata/),并在该文件夹下创建train、val、test三个文件夹。

 使用random.shuffle(current_data_index_list)打乱索引list的顺序

copy2()函数用来复制图片到另一个位置

import os
import random
from shutil import copy2

# Source folder that holds the images to be split.
file_path = r"D:/Code/Data/centerlinedata/tem_voc/JPEGImages/"
# Destination root; one sub-folder per split is created beneath it.
new_file_path = r"D:/Code/Data/GREENTdata/"
# Train : val : test proportions (6:2:2), in the same order as split_names.
split_rate = [0.6, 0.2, 0.2]
# Names of the sub-folders created under new_file_path.
split_names = ['train', 'val', 'test']


def split_dataset(src_dir, dst_dir, rates=None, names=None):
    """Randomly split the files of ``src_dir`` into sub-folders of ``dst_dir``.

    The regular files directly inside ``src_dir`` are shuffled once and then
    copied (with ``copy2``) into ``dst_dir/<name>`` for each entry of
    ``names``. The destination folders are created when missing.

    Args:
        src_dir: Folder containing the files to split. Sub-directories in it
            are ignored.
        dst_dir: Root folder under which the split folders are created.
        rates: Proportion of files for each split; defaults to ``split_rate``.
        names: Folder name of each split; defaults to ``split_names``.

    Returns:
        A tuple with the number of files copied into each split, in the
        order given by ``names``.

    Raises:
        ValueError: If ``rates`` and ``names`` differ in length.
    """
    rates = split_rate if rates is None else rates
    names = split_names if names is None else names
    if len(rates) != len(names):
        raise ValueError("rates and names must have the same length")

    # Only regular files can be copied; a nested folder would make copy2 fail.
    files = [
        entry for entry in os.listdir(src_dir)
        if os.path.isfile(os.path.join(src_dir, entry))
    ]
    random.shuffle(files)
    total = len(files)

    counts = []
    start = 0
    cumulative = 0.0
    last = len(names) - 1
    for position, (name, rate) in enumerate(zip(names, rates)):
        cumulative += rate
        if position == last:
            # The final split takes whatever is left so no file is dropped.
            stop = total
        else:
            # Rounded cumulative boundary: an exact product such as
            # 10 * 0.6 gives exactly 6 files, not 7. Clamped so a boundary
            # never moves backwards or past the end of the list.
            stop = min(total, max(start, round(total * cumulative)))

        target_dir = os.path.join(dst_dir, name)
        os.makedirs(target_dir, exist_ok=True)
        for file_name in files[start:stop]:
            copy2(os.path.join(src_dir, file_name), target_dir)

        counts.append(stop - start)
        start = stop
    return tuple(counts)


if __name__ == "__main__":
    class_names = os.listdir(file_path)
    print(class_names)  # ['00000.jpg', '00001.jpg', '00002.jpg'... ]
    for split_name in split_names:
        # D:/Code/Data/GREENTdata/train, val, test
        print(os.path.join(new_file_path, split_name))

    train_num, val_num, test_num = split_dataset(
        file_path, new_file_path, split_rate, split_names
    )
    print("Done!", train_num, val_num, test_num)


将与已划分图片同名的标签文件复制到对应的标签文件夹中(例如 test 中图片的标签复制到 test_labels),代码如下:

import os
import random
from shutil import copy2

# Source folder that holds the label images.
file_path = r"D:/Code/Data/centerlinedata/tem_voc/SegmentationClassPNG/"
# Destination root; the label folders are created beneath it.
new_file_path = r"D:/Code/Data/GREENTdata/"
# Image split folder whose file names decide which labels are copied.
match_file_path = r"D:/Code/Data/GREENTdata/test/"
# Label folders created under new_file_path.
label_names = ['train_labels', 'val_labels', 'test_labels']


def copy_matching_labels(label_dir, match_dir, dst_dir):
    """Copy the labels whose base name matches a file in ``match_dir``.

    A label matches when its file name without extension equals the file
    name without extension of any entry in ``match_dir`` (for example
    ``00001.png`` matches ``00001.jpg``). Matching labels are copied with
    ``copy2`` into ``dst_dir``, which is created when missing.

    Args:
        label_dir: Folder containing all label files.
        match_dir: Folder whose entries select the labels to copy.
        dst_dir: Folder receiving the selected labels.

    Returns:
        The number of label files copied.
    """
    # A set gives constant-time lookups instead of re-scanning the image
    # list for every label.
    wanted = {os.path.splitext(entry)[0] for entry in os.listdir(match_dir)}
    os.makedirs(dst_dir, exist_ok=True)

    copied = 0
    for label_file in os.listdir(label_dir):
        src_img_path = os.path.join(label_dir, label_file)
        if os.path.splitext(label_file)[0] not in wanted:
            continue
        if not os.path.isfile(src_img_path):
            # copy2 cannot copy a directory; skip anything that is not a file.
            continue
        copy2(src_img_path, dst_dir)
        copied += 1
    return copied


if __name__ == "__main__":
    class_names = os.listdir(file_path)
    print(class_names)  # label file names, e.g. ['00000.png', '00001.png'... ]

    # Create train_labels / val_labels / test_labels under new_file_path.
    for label_name in label_names:
        os.makedirs(os.path.join(new_file_path, label_name), exist_ok=True)

    # Derive the destination from the matched split folder ("test" ->
    # "test_labels") so that pointing match_file_path at another split sends
    # its labels to the right folder.
    matched_split = os.path.basename(os.path.normpath(match_file_path))
    split_path = os.path.join(new_file_path, matched_split + "_labels")
    copy_matching_labels(file_path, match_file_path, split_path)
    print("Done!")


 

你可能感兴趣的:(python)