PASCAL Visual Object Classes Challenge 2007(VOC 2007)数据集预处理

VOC 2007[1] 是一个多标签数据集,有 20 类。这里为 multi-label classification 任务做预处理,包括:

  • 将图片移到同一个目录(方便读取);
  • 数据划分(本身就已经分好 train/val 和 test 两部分);
  • 处理标签。

Prepare

[1] 有下载链接,train/val 450M,test 430M。下载下来就是 VOCtrainval_06-Nov-2007.tar 和 VOCtest_06-Nov-2007.tar 两个文件。以 test set 的文件为例,解压之后在 VOCtest_06-Nov-2007/VOCdevkit/VOC2007/ 下可以见到:

  • Annotations/:各样本对应的 .xml 标注文件,可以从中提取 label 信息,解析可参考 [5]。其中 `object` 标签下的 `difficult` 子标签与下一条的 0 tag 有对应关系,见 [2];
  • ImageSets/:只用到其中 Main/ 目录,里面是按类组织的 .txt 文件,标注每幅 image 样本是否包含此类物体,有 1/0/-1 三种标记(解释见 [2]):1 是含有,-1 是不含,0 表示 difficult。
  • JPEGImages/:图片;
  • SegmentationClass/:其它任务的,用不到;
  • SegmentationObject/:其它任务的,用不到;
  • ID, Label

    JPEGImages/ 下的图片是用 ID 命名的,可以从此获取样本 ID;而在 ImageSets/Main/ 中,又有 test.txt、train.txt、val.txt、trainval.txt 这 4 个 ID 划分文件。经验证,以两种方式获得的 ID 划分是一致的,且 train/val 与 test 无重合。
    处理 label 时,参照 [4],将 0 当成 -1,即只有 1 表示正例,0/-1 都表示负例,结果与 [3] 里每类正例数统计是对得上的。获取 label 又有两种方式:通过 Annotations/ 中的 .xml 文件,或通过 ImageSets/Main/(除了刚才的 ID 划分文件之外的).txt 文件。经验证,将 .txt 中的 0 当成 -1 处理与忽略 .xml 中 `difficult` 为 1 的 object 的效果相同。

    Code

    import os
    from os.path import join
    from xml.dom import minidom
    import numpy as np
    
    
    # http://host.robots.ox.ac.uk/pascal/VOC/voc2007/index.html
    # http://host.robots.ox.ac.uk/pascal/VOC/voc2007/htmldoc/voc.html#SECTION00090000000000000000
    
    
    # Root directory holding the extracted archives; all outputs are written here too.
    P = "E:/iTom/dataset/VOC2007"
    # Flat directory that receives a copy of every image.
    ALL_IMAGE_P = join(P, "images")
    
    # Extracted train/val archive and the three sub-directories used from it.
    TRAIN_P = join(P, "VOCtrainval_06-Nov-2007/VOCdevkit/VOC2007")
    TRAIN_IMAGE_P, TRAIN_LABEL_P, TRAIN_ANNO_P = (
        join(TRAIN_P, sub) for sub in ("JPEGImages", "ImageSets/Main", "Annotations")
    )
    
    # Extracted test archive; same layout as train/val.
    TEST_P = join(P, "VOCtest_06-Nov-2007/VOCdevkit/VOC2007")
    TEST_IMAGE_P, TEST_LABEL_P, TEST_ANNO_P = (
        join(TEST_P, sub) for sub in ("JPEGImages", "ImageSets/Main", "Annotations")
    )
    
    # ID split files (one sample ID per line).
    SPLIT_TRAIN, SPLIT_VAL, SPLIT_TRAIN_VAL = (
        join(TRAIN_LABEL_P, name) for name in ("train.txt", "val.txt", "trainval.txt")
    )
    SPLIT_TEST = join(TEST_LABEL_P, "test.txt")
    
    
    """处理 ID 划分"""
    
    # print("--- 第一种方式:从 JPEGImages/ 目录提取 ID ---")
    def file_key(s):
        """Return the numeric sample ID encoded in a file name.

        The text before the first '.' is parsed as an integer,
        e.g. "000012.jpg" -> 12.

        Raises ValueError if that prefix is not an integer literal.
        """
        return int(s.split('.')[0])
    
    
    # def get_id_list(path):
    #     id_list = os.listdir(path)
    #     id_list = list(map(file_key, id_list))
    #     print("#files:", len(id_list))
    #     id_set = set(id_list)
    #     print("#unique:", len(id_set))
    #     return id_list
    
    
    # print("- train -")
    # train_img_id = get_id_list(TRAIN_IMAGE_P)  # 5011
    # print("- test -")
    # test_img_id = get_id_list(TEST_IMAGE_P)  # 4952
    
    # print("- 验证 train/val 与 test 无重复 ID -")
    # train_img_id_set = set(train_img_id)
    # test_img_id_set = set(test_img_id)
    # # no intersection in id of train/val & test
    # print("#common in train & test:", len(train_img_id_set.intersection(test_img_id_set)))  # 0
    
    
    print("--- 第二种方式:从 ID 划分文件提取 ID ---")
    
    
    def get_id_list_from_file(_file):
        """Read the integer sample IDs listed in an ID split file.

        _file: path to a text file with one sample ID per line.

        Blank lines (for example a trailing empty line) are skipped instead of
        aborting the whole read. The total and the unique ID counts are printed
        so duplicates are easy to spot.

        Returns the IDs as a list of ints, in file order.
        Raises ValueError if a non-blank line is not an integer.
        """
        with open(_file, "r") as f:
            id_list = [int(line) for line in f if line.strip()]
        print("#id:", len(id_list))
        print("#unique id:", len(set(id_list)))
        return id_list
    
    
    # Load the ID list of every split file. The trailing number on each line is
    # the list length the original author recorded for that split.
    print("- train -")
    id_train = get_id_list_from_file(SPLIT_TRAIN)  # 2501
    print("- val -")
    id_val = get_id_list_from_file(SPLIT_VAL)  # 2510
    print("- train-val -")
    id_train_val = get_id_list_from_file(SPLIT_TRAIN_VAL)  # 5011
    print("- test -")
    id_test = get_id_list_from_file(SPLIT_TEST)  # 4952
    
    # print("- 验证 train/val 与 test 无重复 ID -")
    # train_val_id_set = set(id_train_val)
    # test_id_set = set(id_test)
    # # train/val 和 test 无重复 ID
    # print("#common in train & test:", len(train_val_id_set.intersection(test_id_set)))  # 0
    # print("- 验证两种方法获取的 ID 划分一致 -")
    # print("#common in train:", len(train_img_id_set.intersection(train_val_id_set)))  # 5011
    # print("#common in test:", len(test_img_id_set.intersection(test_id_set)))  # 4952
    
    # print("- check id complete -")
    # Both operands are still plain Python lists at this point, so "+" concatenates
    # them (it would add element-wise once they are converted to numpy arrays).
    id_all = id_train_val + id_test
    # The largest sample ID; used further down as the row count of the label matrices.
    n_id = max(id_all)
    print("#id:", len(id_all), ", max id:", n_id)
    # for i in range(1, n_id + 1):
    #     if i not in id_all:
    #         print("id absent:", i)
    # print("complete check done")
    
    
    print("- save indices -")
    # Turn the 1-based sample IDs into 0-based indices (numpy arrays from here on).
    id_train = np.array(id_train) - 1
    id_val = np.array(id_val) - 1
    id_train_val = np.array(id_train_val) - 1
    id_test = np.array(id_test) - 1
    # Sanity output: range of the shifted indices in each part.
    print("id train-val:", id_train_val.max(), id_train_val.min())
    print("id test:", id_test.max(), id_test.min())
    
    # Persist one index array per split under the dataset root P.
    np.save(join(P, "idx_train.npy"), id_train)
    np.save(join(P, "idx_val.npy"), id_val)
    np.save(join(P, "idx_train_val.npy"), id_train_val)
    np.save(join(P, "idx_test.npy"), id_test)
    
    
    """将全部 image 移到同一个目录"""
    # since all IDs are distinct
    # we can move all image into one dir
    
    # exist_ok=True replaces the check-then-create pair: no race between the
    # test and the creation, and re-running the script is a no-op.
    os.makedirs(ALL_IMAGE_P, exist_ok=True)
    
    
    def copy_image(path, dst=None):
        """Copy every entry of directory `path` into a destination directory.

        path: source directory; expected to contain only regular files.
        dst: destination directory. Defaults to ALL_IMAGE_P; created if missing.

        Prints the running index every 100 files as a progress indicator.
        Raises OSError (from shutil/os) if a file cannot be copied, instead of
        silently ignoring a failed shell command.
        """
        import shutil  # local import so this block does not depend on a file-level import
    
        if dst is None:
            dst = ALL_IMAGE_P
        os.makedirs(dst, exist_ok=True)
        for i, f in enumerate(os.listdir(path)):
            # shutil.copy works on every OS and with paths containing spaces,
            # unlike building a `copy` / `cp` shell command string.
            shutil.copy(join(path, f), dst)
            if i % 100 == 0:
                print(i)
    
    
    # Gather the train/val images and the test images via copy_image; the IDs
    # (and therefore the file names) are distinct across both parts.
    copy_image(TRAIN_IMAGE_P)
    copy_image(TEST_IMAGE_P)
    
    
    """处理 label"""
    # 2 method for processing label
    # both treat 0 tag as -1
    # http://host.robots.ox.ac.uk/pascal/VOC/voc2007/htmldoc/voc.html#SECTION00031000000000000000
    
    # Derive the class list from the per-class files whose name contains "_test";
    # the part of the name before "_test" is the class name.
    # os.listdir returns entries in arbitrary order, so sort the names to make
    # the class-name -> column mapping reproducible across platforms and runs.
    test_ls = sorted(
        f.split("_test")[0] for f in os.listdir(TEST_LABEL_P) if "_test" in f
    )
    N_CLASS = len(test_ls)
    print("#class:", N_CLASS)
    # class name -> class ID (column index in the label matrices)
    id_map = {name: num for num, name in enumerate(test_ls)}
    print(id_map)
    
    
    print("--- 第一种方式:从 ImageSets/Main/ 提取 label ---")
    # Label matrix for method 1: n_id rows by N_CLASS columns, all zeros;
    # proc_label sets entries to 1 in place.
    L_label = np.zeros((n_id, N_CLASS))
    
    
    def proc_label(path, suffix):
        """Fill the global L_label from the per-class files in `path`.

        path: directory of per-class label files (TRAIN_LABEL_P or TEST_LABEL_P).
        suffix: marker selecting which files to read ("_trainval" or "_test");
            the text before it in the file name is the class name.

        Each line of a class file is "<ID> <flag>". Only a flag greater than 0
        counts as positive, so both 0 and -1 are treated as negative.
        Prints the number of positive samples per class.
        """
        for fname in os.listdir(path):
            if suffix in fname:
                cls = fname.split(suffix)[0]
                assert cls in id_map
                col = id_map[cls]
                n_pos = 0
                with open(join(path, fname), "r") as fh:
                    for row in fh:
                        fields = row.split()
                        if int(fields[1]) <= 0:
                            continue  # 0 and -1 both count as negative
                        n_pos += 1
                        # sample IDs are 1-based, matrix rows are 0-based
                        L_label[int(fields[0]) - 1][col] = 1
                print("#{}: {}".format(cls, n_pos))
    
    
    # Fill L_label from the train/val class files, then from the test class files.
    print("- train-val label -")
    proc_label(TRAIN_LABEL_P, "_trainval")
    print("- test label -")
    proc_label(TEST_LABEL_P, "_test")
    # Column sums = number of positive samples per class over all parts.
    sum_label = L_label.sum(0)
    print("label statistics:", sum_label)
    np.save(join(P, "labels.l.npy"), L_label)
    
    
    print("--- 第二种方式:从 Annotations/ 提取 label ---")
    # https://github.com/HCPLab-SYSU/SSGRL/blob/master/datasets/voc07dataset.py
    # Label matrix for method 2: same n_id by N_CLASS shape as L_label, all zeros;
    # proc_annotation sets entries to 1 in place.
    L_anno = np.zeros((n_id, N_CLASS))
    
    
    def proc_annotation(path):
        """Fill the global L_anno from the XML annotation files in `path`.

        path: annotation directory (TRAIN_ANNO_P or TEST_ANNO_P); each file name
            starts with the sample ID.

        For every `object` element whose `difficult` child is not '1', the
        lower-cased `name` is looked up in id_map and the matching entry of
        L_anno is set to 1. Prints how many samples were newly marked positive
        per class during this call.
        """
        pos_cnt = dict.fromkeys(id_map, 0)
        for xml_name in os.listdir(path):
            row = file_key(xml_name) - 1  # 1-based sample ID -> 0-based row
            doc_root = minidom.parse(join(path, xml_name)).documentElement
            for node in doc_root.getElementsByTagName('object'):
                difficult = node.getElementsByTagName('difficult')[0].firstChild.data
                if difficult == '1':
                    continue  # skip objects flagged as difficult
                label = node.getElementsByTagName('name')[0].firstChild.data.lower()
                assert label in id_map
                col = id_map[label]
                if L_anno[row][col] != 0:
                    continue  # already counted this class for this sample
                L_anno[row][col] = 1
                pos_cnt[label] += 1
        print("pos count:", pos_cnt)
    
    
    # Fill L_anno from the train/val annotations, then from the test annotations.
    print("- train-val annotation -")
    proc_annotation(TRAIN_ANNO_P)
    print("- test annotation -")
    proc_annotation(TEST_ANNO_P)
    # Column sums = number of positive samples per class over all parts.
    sum_label = L_anno.sum(0)
    print("label statistics:", sum_label)
    np.save(join(P, "labels.a.npy"), L_anno)
    
    # Number of entries where the two label matrices disagree; the author observed 0.
    print("#diff:", (L_label != L_anno).astype(np.int8).sum())  # 0
    

    Cloud Drive

    链接:https://pan.baidu.com/s/1Mh_nX-y-ijvZEmy3lzTaNw,提取码:oq10
    PASCAL Visual Object Classes Challenge 2007(VOC 2007)数据集预处理_第1张图片

    References

    1. The PASCAL Visual Object Classes Challenge 2007
    2. 8.1.2 Classification Task Image Sets
    3. 2.1 Classification/Detection Image Sets
    4. SSGRL/datasets/voc07dataset.py
    5. 数据集:Pascal VOC 2007数据集分析

    你可能感兴趣的:(机器学习,VOC,2007,数据集,multi-label,classification,python)