自有数据集自动整理成kitti目录格式 python脚本 非常好用

一、背景

为了使用OpenPCDet的网络框架,借用kitti数据集已有的数据处理方法,我们需要把自己已标注数据进行文件夹和文件进行组织分类,便于进行预处理和加载。例如kitti的数据组织方式如下:

自有数据集自动整理成kitti目录格式 python脚本 非常好用_第1张图片

但是通常我们标注好的数据集一般包括两部分(以点云为例,图像类似,联合标定的除外),一是原始数据,包括原始激光雷达点云数据,pcd格式居多,存储的是每个点的三维坐标和对应的反射强度。第二部分是经过人工确认的标签信息,可能包括目标id,类别,坐标(x,y,z),尺寸(长,宽,高),航向角等信息,可以是csv,txt,等多种格式文件。

因此,我们需要手动组织自己的数据集,转换成kitti类似的形式,这里编写脚本根据预设的训练集验证集和测试集比例,自动组织数据

二、大致思路

1、读取工作目录下的数据文件,生成数据文件列表和总数

2、根据比例随机划分各个数据集索引

3、创建目录结构,复制对应索引的文件到指定路径

三、具体实现

import os
import numpy as np
import math
import random
import shutil
from progress.bar import Bar


def Data_PreProcess(rawpath, train_perce, val_perce, test_perce, label_form='txt', data_form='pcd2bin'):
    # 1 读取路径下数据文件,并与标签文件名比对,匹配成功生成数据列表和标签列表,以及无标签数据列表
    # 2、按预设比例生成各个数据集索引列表
    # 3、创建各个数据集文件夹,根据格式要求,重组数据

    label_num = 0
    data_num = 0

    for dir_name in os.listdir(rawpath):
        if dir_name == 'label':

            label_names = os.listdir(rawpath + 'label/')
            label_names.sort()
            label_num = len(label_names)
            print("=====label==========")
            print("Label_nums: ", label_num)
            print("Firstlabel: ", label_names[0])
            print("Lastlabel : ", label_names[label_num - 1])

        elif dir_name == 'data':
            data_names = os.listdir(rawpath + 'data/')
            data_names.sort()
            data_num = len(data_names)
            print("=====data==========")
            print("data_nums: ", data_num)
            print("Firstdata: ", data_names[0])
            print("Lastdata : ", data_names[data_num - 1])
        else:
            continue

    if data_num == 0:
        print("\nError!   can not find 'data'  in ", rawpath)
    if label_num == 0:
        print("\nError!   can not find 'label' in ", rawpath)
    if data_num != label_num:
        print("\nWarning! data_num!=label_num ")

    # 检查数据与标签名是否一致
    j = 0
    for i in range(len(label_names)):
        templabel = label_names[i][:-4]
        tempdata = data_names[i][:-4]
        if templabel != tempdata:
            print("label and data not match", templabel, tempdata, i)
            j = j + 1
    print(" not match  num :", j)

    # 按比例分配数据集
    # 训练集 /和(验证+测试)
    train_data, traindata_index, other_data, other_index = DataSplit(data_names, train_perce)
    if test_perce == 0 and train_perce + val_perce == 1:
        val_data = other_data
        valdata_index = other_index
    else:
        # 再次划分 验证集和测试集
        percent = val_perce / (val_perce + test_perce) * 1.0
        val_data, valdata_index, test_data, test_index = DataSplit(other_data, percent)
    print("=====Spil Set==========")
    print("train_num: ", len(train_data))
    print("Val_num  : ", len(val_data))
    print("test_num : ", len(test_data))

    # 将分配好的数据配置 写入“dataSets”目录下的 train.txt val.txt test.txt
    # 创建目录结构
    pre_root_path = rawpath + 'DataGroup/'
    DataSets_Path = pre_root_path + '/DataSets'
    traindata_Path = pre_root_path + '/training/data'
    trainlabel_Path = pre_root_path + '/training/label'
    testdata_Path = pre_root_path + '/testing/data'
    testlabel_Path = pre_root_path + '/testing/label'
    if not os.path.exists(DataSets_Path):
        os.makedirs(DataSets_Path)
    if not os.path.exists(traindata_Path):
        os.makedirs(traindata_Path)
    if not os.path.exists(trainlabel_Path):
        os.makedirs(trainlabel_Path)
    if not os.path.exists(testdata_Path):
        os.makedirs(testdata_Path)
    if not os.path.exists(testlabel_Path):
        os.makedirs(testlabel_Path)

    # 创建数据分配文本:
    train_filename = DataSets_Path + '/train.txt'
    val_filename = DataSets_Path + '/val.txt'
    test_filename = DataSets_Path + '/test.txt'

    train_cluster = []
    test_cluster = []

    train_file = open(train_filename, 'w')
    for i in train_data:
        index = i.rfind('.')
        name = i[:index]
        train_file.write(name + '\n')
        train_cluster.append(name)
    train_file.close()

    val_file = open(val_filename, 'w')
    for i in val_data:
        index = i.rfind('.')
        name = i[:index]
        val_file.write(name + '\n')
        train_cluster.append(name)
    val_file.close()

    test_file = open(test_filename, 'w')
    for i in test_data:
        index = i.rfind('.')
        name = i[:index]
        test_file.write(name + '\n')
        test_cluster.append(name)
    test_file.close()

    # 将分配好的数据配置 写入“prepcess”目录下的 training val testing
    # training  copy data to
    data_srcfile = rawpath + '/data/'
    label_srcfile = rawpath + '/label/'

    tra_data_dstpath = traindata_Path + '/'
    tra_label_dstpath = trainlabel_Path + '/'

    test_data_dstpath = testdata_Path + '/'
    test_label_dstpath = testlabel_Path + '/'

    bar = Bar('cluster train_data...', max=len(train_cluster), fill='#', suffix='%(percent)d%%')
    for i in train_cluster:
        if data_form=='pcd':
            pcd2bin(data_srcfile + i + '.pcd', tra_data_dstpath + i + '.bin')
        elif data_form == 'bin':
            shutil.copy(data_srcfile + i + '.bin', tra_data_dstpath + i + '.bin')  # 复制文件 到 training/data
        else:
            print("Unkown data form! 'bin' or 'pcd' can be read")

        if label_form=='csv2txt':
            #
            print(1)
        else:
            shutil.copy(label_srcfile + i + '.csv', tra_label_dstpath + i + '.csv')  # 复制文件 到 training/label
        bar.next()
    bar.finish()

    bar = Bar('cluster test_data...', max=len(test_cluster), fill='#', suffix='%(percent)d%%')
    for i in test_cluster:
        if data_form == 'pcd':# 复制文件 到 testing/data
            pcd2bin(data_srcfile + i + '.pcd', test_data_dstpath + i + '.bin')
        elif data_form == 'bin':
            shutil.copy(data_srcfile + i + '.bin', test_data_dstpath + i + '.bin')
        else:
            print("Unkown data form! 'bin' or 'pcd' can be read")

        if label_form=='csv2txt':
            #
            print(1)
        else:
            shutil.copy(label_srcfile + i + '.csv', test_label_dstpath + i + '.csv')  # 复制文件 到 testing/label
        bar.next()
    bar.finish()

def DataSplit(raw_data, get_precet):
    # 获取数据长度,并生成相同长度索引
    dataA = []
    dataB = []
    index_B = []

    length = len(raw_data)
    index_rawdata = np.arange(0, length, 1)  # 连续索引
    index_rawdata_copy = index_rawdata[:]  # 复制一分
    if get_precet < 1 and get_precet > 0:
        get_num = math.floor(length * get_precet)  # 向下取整 避免溢出

        index_A = random.sample(range(0, length), get_num)
        for j in index_A:
            dataA.append(raw_data[j])
            index_rawdata_copy[j] = -1
        for k in index_rawdata_copy:
            if k != -1:
                index_B.append(k)
                dataB.append(raw_data[k])
    else:
        print('Precentage out of range [0,1]')
    return dataA, index_A, dataB, index_B


def read_pcd(filepath):
    lidar = []
    i=0
    with open(filepath, 'r') as f:
        line = f.readline().strip()
        while line:
            linestr = line.split(" ")
            i = i + 1
            if i>10:#skip pcd file head
                if len(linestr) == 3:# only x,y,z
                    linestr_convert = list(map(float, linestr))
                    linestr_convert.append(0)
                    lidar.append(linestr_convert)
                if len(linestr) == 4:# x,y,z,i
                    linestr_convert = list(map(float, linestr))
                    lidar.append(linestr_convert)
            line = f.readline().strip()
    return np.array(lidar)

def pcd2bin(pcd_fullname, bin_fullname):
    pl = read_pcd(pcd_fullname)
    pl = pl.reshape(-1, 4).astype(np.float32) # x,y,z,i
    pl.tofile(bin_fullname)



if __name__ == '__main__':
    Data_PreProcess("/data/OpenPCDet/data/rawdata/", 0.5, 0.3, 0.2, 'txt', 'bin')

四、最终形成的目录结构  

自有数据集自动整理成kitti目录格式 python脚本 非常好用_第2张图片

你可能感兴趣的:(深度学习,深度学习,机器学习,人工智能)