207 迁移并构建数据集到 205

  1. 从 Dataturks 导出 Retail Products Dataset.json

  2. 从 Retail Products Dataset.json 中抽离出 Retail Cigar Dataset.json,并做以下操作:

  • 将筛选出的图片从 207 的旧路径复制(cp)到 205 的新路径
  • 更新每一行(line)中的 content 字段,使其指向新的图片路径
def filter_cigars_to_json(old_dir='/nfs/xs/retail',
                          new_dir='/nfs/xs/docker/vipaturks',
                          old_projectId='2c9180826d47a650016d5e359eaf0004',
                          new_projectId='2c9180836ea666e4016ea8019d5e0001',
                          in_path=None,
                          out_path=None):
    """Extract the cigar images from a Dataturks export into a new export file.

    The input is read as one JSON object per line. A line is kept when its
    ``metadata.evaluation`` is ``'CORRECT'`` and at least one annotation's first
    label ends with ``_`` followed by a single ASCII letter. For every kept line
    the image is copied from ``old_dir`` to ``new_dir`` (unless the target file
    already exists) and the ``content`` path is rewritten from ``old_projectId``
    to ``new_projectId`` before the line is written to ``out_path``.

    Args:
        old_dir: Root directory prepended to the original ``content`` path.
        new_dir: Root directory prepended to the rewritten ``content`` path.
        old_projectId: Project id substring to replace inside ``content``
            (default: Retail Products Dataset).
        new_projectId: Replacement project id (default: CigarSeg).
        in_path: Export to read; defaults to
            ``data/Retail Products Dataset.json``.
        out_path: Export to write; defaults to
            ``data/Retail Cigar Dataset.json``.

    Raises:
        OSError: If the input cannot be read, or an image cannot be copied.
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
    """
    if in_path is None:
        in_path = os.path.join('data', 'Retail Products Dataset.json')
    if out_path is None:
        out_path = os.path.join('data', 'Retail Cigar Dataset.json')

    # Anchor the match at the end of the label, otherwise labels such as
    # BIG_ROLL would also be treated as cigars.
    cigar_label = re.compile(r'^.+_[A-Za-z]$')

    # Read the input BEFORE opening the output for writing, so that a missing
    # or unreadable input does not truncate an existing result file.
    with open(in_path, 'r', encoding='utf-8') as fr:
        lines = fr.readlines()
    print('total img:', len(lines))

    cigar_cnt = 0
    cp_cnt = 0
    with open(out_path, 'w', encoding='utf-8') as fw:
        for line in lines:
            if not line.strip():
                continue  # tolerate blank lines in the export
            product_dict = json.loads(line)

            # keep only images that have been evaluated CORRECT
            metadata = product_dict.get('metadata') or {}
            if metadata.get('evaluation') != 'CORRECT':
                continue
            # skip images without annotations
            anns = product_dict.get('annotation')
            if not anns:
                continue

            has_cigar = False
            for ann in anns:
                labels = ann.get('label')
                if not labels:
                    continue  # annotation without any label cannot be a cigar
                if cigar_label.match(labels[0]):
                    has_cigar = True
                    break
            if not has_cigar:
                continue
            cigar_cnt += 1

            # 1. copy the image from the old location (207) to the new one (205)
            old_content = product_dict['content']
            new_content = old_content.replace(old_projectId, new_projectId)
            # content starts with '/uploads', so os.path.join would discard the
            # root directory; plain concatenation is intentional here.
            old_img_path = old_dir + old_content
            new_img_path = new_dir + new_content
            if not os.path.exists(new_img_path):
                target_dir = os.path.dirname(new_img_path)
                if target_dir:
                    os.makedirs(target_dir, exist_ok=True)
                shutil.copyfile(old_img_path, new_img_path)
                cp_cnt += 1
                print('\rcp', cp_cnt, end='')  # printing the basename is too slow

            # 2. rewrite the content path and write the line out. Editing the
            # raw text keeps the exporter's exact JSON formatting.
            new_line = line.replace(old_content, new_content)
            if new_line == line and new_content != old_content:
                # The path is escaped in the raw text (e.g. '\/' or '\uXXXX'),
                # so the textual replace missed it: re-serialize instead.
                product_dict['content'] = new_content
                new_line = json.dumps(product_dict, ensure_ascii=False) + '\n'
            fw.write(new_line)

    print('\ncigar img:', cigar_cnt)
  3. 构建数据集。注意根据任务修改 dataset_utils.py 中的 convert_to_coco():不同任务转化得到的 box 不同
def build_top_k_dataset(dataset, top_k=None):
    """Split and save ``dataset`` for ``top_k`` classes, then dump a summary config.

    The split is delegated to ``split_and_save_coco_dataset`` (writing into the
    module-level ``dataset_dir``). Its five return values are stored in a small
    JSON config named ``[<top_k>_]<dataset_name>_cfg.json`` in the same
    directory; the ``<top_k>_`` prefix is only added when ``top_k`` is truthy.

    Args:
        dataset: Dataset object forwarded unchanged to the split helper.
        top_k: Forwarded to the split helper as ``top_k``; ``None`` presumably
            means "all classes" (confirm against the helper).
    """
    (kept_cats, kept_cats_num,
     n_train, n_val, n_test) = split_and_save_coco_dataset(dataset, dataset_dir=dataset_dir, top_k=top_k)

    # Only top-k builds get their k prepended to the config file name.
    cfg_name = '{}_cfg.json'.format(dataset_name)
    if top_k:
        cfg_name = '{}_'.format(top_k) + cfg_name

    summary = {}
    summary['name'] = 'Cigar Rotated Box'
    summary['cats_num'] = kept_cats_num
    summary['classes'] = len(kept_cats)
    summary['train'] = n_train
    summary['valid'] = n_val
    summary['test'] = n_test

    dump_json(summary, out_path=os.path.join(dataset_dir, cfg_name))


def build_rbox_dataset():
    """Build the rotated-box datasets for several top-k settings and for all classes.

    Loads ``data/Retail Cigar Dataset.json`` through
    ``create_dataset_from_dataturks_json`` once, then calls
    ``build_top_k_dataset`` for ``top_k`` = 20, 40 and finally ``None``
    (the default, i.e. no top-k restriction requested).
    """
    cigar_json = os.path.join('data', 'Retail Cigar Dataset.json')
    dataset = create_dataset_from_dataturks_json(dataturks_json_path=cigar_json)

    # The top-k builds run first; the trailing None is the all-classes build.
    for k in (20, 40, None):
        build_top_k_dataset(dataset, top_k=k)
  4. 如果想在 Dataturks 中更新 CigarSeg,可以按以下步骤操作:
  • 清空数据集
def delete_d_hits_by_name(project_name):
    """Delete every ``d_hits`` row that belongs to the named project.

    The project id is resolved with ``query_projectId_by_name`` and passed to
    the DELETE statement as a bound parameter instead of being formatted into
    the SQL text, so quoting is handled by the database driver.

    Args:
        project_name: Name of the project whose ``d_hits`` rows are removed.

    Raises:
        Exception: Whatever the session raises while executing or committing;
            the transaction is rolled back before the error is re-raised.
    """
    project_id = query_projectId_by_name(project_name)
    # Same plain-string statement style as before; ':project_id' is a named
    # bind parameter filled from the dict below.
    sql = "delete from d_hits where projectId=:project_id"
    try:
        db.session.execute(sql, {'project_id': project_id})
        db.session.commit()
    except Exception:
        # Do not leave the shared session in a failed transaction state.
        db.session.rollback()
        raise
  • 重新上传 Retail Cigar Dataset.json

至此,Cigar 数据迁移完毕。

你可能感兴趣的:(207 迁移并构建数据集到 205)