干货:Python 划分数据集

# -*- coding: utf-8 -*-

# Author --LiMing--

import os
import random
import shutil
import time

def copyFile(fileDir, class_name, rate=None, dest_dirs=None):
    """Copy a random train/test split of one class folder.

    The entries of ``fileDir`` are shuffled into two disjoint groups: a
    training group holding ``int(len(entries) * rate)`` names and a test
    group holding the rest. Each group is copied to
    ``<destination>/<class_name>/``; the folder is created when missing.

    Args:
        fileDir: Folder that contains the images of a single class.
        class_name: Name of the sub-folder created under every destination.
        rate: Fraction of the entries that goes to the training split.
            Defaults to the module-level ``train_rate``.
        dest_dirs: ``[train_root, test_root]`` destination folders.
            Defaults to the module-level ``save_dir``.
    """
    # Fall back to the module-level settings so existing two-argument
    # calls keep working unchanged.
    if rate is None:
        rate = train_rate
    if dest_dirs is None:
        dest_dirs = save_dir

    image_list = os.listdir(fileDir)
    train_number = int(len(image_list) * rate)
    train_sample = random.sample(image_list, train_number)

    # Everything that was not drawn for training becomes the test split.
    picked = set(train_sample)
    test_sample = [name for name in image_list if name not in picked]

    for target_root, names in zip(dest_dirs, (train_sample, test_sample)):
        # os.path.join inserts the separator itself, so the destination
        # roots work with or without a trailing '/'.
        target_dir = os.path.join(target_root, class_name)
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        for name in names:
            shutil.copy(os.path.join(fileDir, name),
                        os.path.join(target_dir, name))

if __name__ == '__main__':
    # NOTE: the settings below are deliberately module-level names;
    # copyFile reads train_rate and save_dir as globals.
    time_start = time.time()

    # Root of the original dataset (its entries are treated as class folders).
    origion_path = './data/a256'

    # Destination roots for the training and test splits.
    save_train_dir = './data3/train'
    save_test_dir = './data3/test'
    save_dir = [save_train_dir, save_test_dir]

    # Fraction of each class that is copied into the training split.
    train_rate = 0.8

    # Every sub-folder of the dataset root is one class.
    file_list = os.listdir(origion_path)

    for class_name in file_list:
        image_Dir = os.path.join(origion_path, class_name)
        if not os.path.isdir(image_Dir):
            # Stray files in the dataset root are not classes; listing
            # them as folders would raise, so skip them.
            continue
        copyFile(image_Dir, class_name)
        print('%s划分完毕!' % class_name)

    time_end = time.time()
    print('---------------')
    print('训练集和测试集划分共耗时%s!' % (time_end - time_start))

# -*- coding: utf-8 -*-

# Author --LiMing--

import os

import random

import shutil

import time

def copyFile(fileDir, class_name):
    """Copy a random train/test split of one class folder.

    Reads the module-level ``train_rate`` (fraction of entries used for
    training) and ``save_dir`` (``[train_root, test_root]``).

    Each destination is built by plain concatenation,
    ``save_dir[k] + class_name``, so every ``save_dir`` entry must end
    with a path separator.

    Args:
        fileDir: Folder that contains the images of a single class.
        class_name: Name of the sub-folder created under every destination.
    """
    image_list = os.listdir(fileDir)  # names of all entries in the class folder
    image_number = len(image_list)

    train_number = int(image_number * train_rate)
    # Randomly draw the training share; the remainder is the test split.
    train_sample = random.sample(image_list, train_number)
    test_sample = list(set(image_list) - set(train_sample))
    sample = [train_sample, test_sample]

    # Copy each split into its destination folder, creating it when missing.
    for k in range(len(save_dir)):
        target_dir = save_dir[k] + class_name
        if not os.path.isdir(target_dir):
            os.makedirs(target_dir)
        for name in sample[k]:
            shutil.copy(os.path.join(fileDir, name),
                        os.path.join(target_dir, name))

if __name__ == '__main__':
    # NOTE: the settings below are deliberately module-level names;
    # copyFile reads train_rate and save_dir as globals.
    time_start = time.time()

    # Root of the original dataset (its entries are treated as class folders).
    origion_path = '/home/room/lm_other/NWPU-RESISC45/'

    # Destination roots for the training and test splits. The trailing '/'
    # matters when copyFile joins them to the class name by concatenation.
    save_train_dir = '/home/room/lm_other/RS_45/2_8/train/'
    save_test_dir = '/home/room/lm_other/RS_45/2_8/test/'
    save_dir = [save_train_dir, save_test_dir]

    # Fraction of each class that is copied into the training split.
    train_rate = 0.2

    # Every sub-folder of the dataset root is one class.
    file_list = os.listdir(origion_path)

    for class_name in file_list:
        image_Dir = os.path.join(origion_path, class_name)
        if not os.path.isdir(image_Dir):
            # Stray files in the dataset root are not classes; listing
            # them as folders would raise, so skip them.
            continue
        copyFile(image_Dir, class_name)
        print('%s划分完毕!' % class_name)

    time_end = time.time()
    print('---------------')
    print('训练集和测试集划分共耗时%s!' % (time_end - time_start))

花了几天去找一个划分数据集的程序。这个程序参考了一篇博客,谢谢这位大佬。我改了其中一部分,因为原本代码的文件存储方式不是我想要的:大佬写的代码在我这里存储不了训练集和测试集这两个文件夹。我用的是 Caltech256 数据集,里面有三万多张图,需要按比例随机抽取,分成训练集与测试集。可能是大佬特意给我留了坑(笑),也可能是我们需要划分文件夹的方式不一样。其实是一个很简单的问题,出在路径拼接上:`save_dir[k]` 和 `class_name` 之间需要一个 `'/'`,即 `if os.path.isdir(save_dir[k] + '/' + class_name)`,而原本的代码没有这个 `'/'`。

你可能感兴趣的:(python)