数据集扫描

适用于多类别数据集目录的扫描:可以统计每个类别下的文件数量和文件类型,避免自己创建数据集时出现纰漏。使用时根据实际的目录结构稍作修改即可。

import glob
import os
import shutil
import sys

def get_all(path):
    """Return a list of every path that matches the glob pattern *path*.

    The pattern may be relative or absolute; an empty list is returned when
    nothing matches.
    """
    matches = glob.glob(path)
    return matches

def get_iterator(path):
    """Return an iterator over the paths matching the glob pattern *path*.

    Unlike a list-returning lookup, the matches are produced on demand by
    ``glob.iglob`` instead of being collected up front.
    """
    lazy_matches = glob.iglob(path)
    return lazy_matches

def _children_pattern(directory):
    """Return a glob pattern matching every entry directly inside *directory*.

    The directory part is escaped so that names containing glob
    metacharacters (``[``, ``*``, ``?``) are taken literally.
    """
    return os.path.join(glob.escape(directory), "*")


def _file_extension(file_path):
    """Return the extension of *file_path* without the dot ("" if it has none)."""
    return os.path.splitext(os.path.basename(file_path))[1].lstrip(".")


def main(controll, path=r"E:\Desktop\Zhou\data13\*"):
    """Scan a multi-class dataset tree and print file counts and file types.

    The expected layout is ``<top>/<class>/<train|test>/<files>``, where the
    top-level directories are selected by the glob pattern *path*.  For every
    top-level directory the number of class entries is printed; for the
    directories named ``sketch`` and ``views`` the train/test file totals and
    the set of file extensions found are printed as well.

    Args:
        controll: Truthy to also print, for each class, the file count and
            the extensions found in every sub-directory.
        path: Glob pattern selecting the top-level directories to scan.
            Defaults to the location the script was originally written for.

    Returns:
        None. All results are written to standard output.
    """
    for first_subpath in glob.iglob(path):  # e.g. the "sketch" and "views" directories
        first_sub_name = os.path.basename(first_subpath)
        print("\033[1;35;46m"+first_sub_name+"统计如下:\033[0m")

        class_num = 0
        # Only one top-level directory is handled per iteration, so a single
        # pair of counters is enough; the label is chosen when printing.
        split_counts = {"train": 0, "test": 0}
        total_type = set()

        for class_path in glob.iglob(_children_pattern(first_subpath)):  # one entry per class
            class_num += 1
            if controll:
                print("  "+os.path.basename(class_path)+":")

            # Each class is expected to hold a "train" and a "test" directory.
            for split_path in glob.iglob(_children_pattern(class_path)):
                split_name = os.path.basename(split_path)
                all_files = glob.glob(_children_pattern(split_path))
                per_class_type = {_file_extension(file_path) for file_path in all_files}
                total_type |= per_class_type
                if controll:
                    print("    "+split_name+":", end="")
                    print(len(all_files), end="  ")
                    print("包含的文件类型有:", per_class_type)
                # Directories other than train/test are listed but not totalled.
                if split_name in split_counts:
                    split_counts[split_name] += len(all_files)

        print("class_num===========================================================", class_num)
        if first_sub_name == "sketch":
            print("sketch_train_num====================================================", split_counts["train"])
            print("sketch_test_num=====================================================", split_counts["test"])
            print("包含的文件类型有====================================================", total_type)

        if first_sub_name == "views":
            print("views_train_num=====================================================", split_counts["train"])
            print("views_test_num======================================================", split_counts["test"])
            print("包含的文件类型有====================================================", total_type)


if __name__ == "__main__":
    # Set to 1 to also print the per-class details; 0 prints only the totals.
    controll = 0
    main(controll)

数据集扫描_第1张图片

你可能感兴趣的:(实用工具,Python)