[资料整理]将场景文字数据集ICDAR2013,ICDAR2015转换为PASCAL_VOC数据集格式

写在前面:常见的目标检测框架,如faster-rcnn,SSD等,一般都提供了对pascal_voc数据集的读取接口。而将目标检测的框架应用到场景文字检测上,往往可以取得不错的效果。那么,接下来,我就介绍一下,将几种常见的场景文字数据集,如ICDAR2013,ICDAR2015,ICDAR2017,coco-text等转换为pascal_voc数据集的方式。

 

1.ICDAR2013

ICDAR2013包括四个文件夹,分别是:

训练图像集:Challenge2_Training_Task12_Images

训练标注集:Challenge2_Training_Task1_GT

测试图像集:Challenge2_Test_Task12_Images

测试标注集:Challenge2_Test_Task1_GT

标注格式:

xmin, ymin, xmax, ymax, text

举例:38, 43, 920, 215, "Tiredness"(测试集标注以逗号分隔;训练集标注以空格分隔,例如:38 43 920 215 "Tiredness",下面的转换代码对两种分隔方式分别做了处理)

准备工作:新建文件夹VOC2007,目录如下:

VOC2007

-VOC2007/Annotations

-VOC2007/ImageSets

-VOC2007/ImageSets/Main

-VOC2007/ImageSets/Main/test.txt

-VOC2007/ImageSets/Main/trainval.txt

-VOC2007/JPEGImages

转换代码如下,其中转换的结果在VOC2007文件夹下面:

 

#! /usr/bin/python
#coding:utf-8

import os, sys
import glob
from PIL import Image

# target dir
base_dir = "VOC2007"

target_img_dir = base_dir + "/" + "JPEGImages"
target_ann_dir = base_dir + "/" + "Annotations"
target_set_dir = base_dir + "/" + "ImageSets"

# source train dir
train_img_dir = "Challenge2_Training_Task12_Images"
train_txt_dir = "Challenge2_Training_Task1_GT"

# source test dir
test_img_dir = "Challenge2_Test_Task12_Images"
test_txt_dir = "Challenge2_Test_Task1_GT"


def move_images(src_dir, prefix):
    """Move every file of `src_dir` into `target_img_dir`, prepending `prefix`.

    The prefix (e.g. "ICDAR2013_Train_") is what later tells training and
    test images apart once they share one folder.
    """
    for file_name in os.listdir(src_dir):
        os.rename(os.path.join(src_dir, file_name),
                  os.path.join(target_img_dir, prefix + file_name))


def gt_path_for(img_name):
    """Return the ground-truth file path for a renamed image stem.

    `img_name` is the file name without extension, e.g.
    "ICDAR2013_Train_100" or "ICDAR2013_Test_img_1".  The number after the
    last underscore selects the ground-truth file.  Returns None when the
    name carries neither the "Train" nor the "Test" marker.
    """
    index = img_name.split('_')[-1]
    if 'Train' in img_name:
        return train_txt_dir + '/gt_' + index + '.txt'
    if 'Test' in img_name:
        return test_txt_dir + '/gt_img_' + index + '.txt'
    return None


def parse_box(label_line):
    """Return [xmin, ymin, xmax, ymax] (as strings) from one label line.

    Training labels are space separated and test labels comma separated, so
    commas are treated as blanks before splitting; this also removes the
    stray blanks that a plain split(',') leaves in front of each number.
    Returns None for blank or truncated lines.
    """
    fields = label_line.replace(',', ' ').split()
    if len(fields) < 4:
        return None
    return fields[:4]


def annotation_xml(file_name, width, height, boxes):
    """Build the PASCAL VOC annotation document for one image.

    `boxes` is an iterable of (xmin, ymin, xmax, ymax).  Every box is
    emitted as an object of class "text".  The depth is always written as 3.
    The returned string has no trailing newline.
    """
    lines = [
        '<annotation>',
        '    <folder>VOC2007</folder>',
        '    <filename>' + str(file_name) + '</filename>',
        '    <size>',
        '        <width>' + str(width) + '</width>',
        '        <height>' + str(height) + '</height>',
        '        <depth>3</depth>',
        '    </size>',
    ]
    for xmin, ymin, xmax, ymax in boxes:
        lines.extend([
            '    <object>',
            '        <name>text</name>',
            '        <pose>Unspecified</pose>',
            '        <truncated>0</truncated>',
            '        <difficult>0</difficult>',
            '        <bndbox>',
            '            <xmin>' + str(xmin) + '</xmin>',
            '            <ymin>' + str(ymin) + '</ymin>',
            '            <xmax>' + str(xmax) + '</xmax>',
            '            <ymax>' + str(ymax) + '</ymax>',
            '        </bndbox>',
            '    </object>',
        ])
    lines.append('</annotation>')
    return '\n'.join(lines)


def main():
    """Convert ICDAR2013 (train + test) into the VOC2007 folder layout."""
    # rename and move img to target_img_dir
    move_images(train_img_dir, "ICDAR2013_Train_")
    move_images(test_img_dir, "ICDAR2013_Test_")

    # create annotations in target_ann_dir
    for img_path in sorted(glob.glob(target_img_dir + '/*.jpg')):
        img_name = os.path.splitext(os.path.basename(img_path))[0]
        gt_path = gt_path_for(img_name)
        if gt_path is None:
            # not an image moved by this script: nothing to annotate
            continue

        with Image.open(img_path) as im:
            width, height = im.size

        # open the corresponding txt file
        with open(gt_path) as gt_file:
            boxes = []
            for label_line in gt_file.read().splitlines():
                box = parse_box(label_line)
                if box is not None:
                    boxes.append(box)

        with open(target_ann_dir + '/' + img_name + '.xml', 'w') as xml_file:
            xml_file.write(
                annotation_xml(img_name + '.jpg', width, height, boxes))

    # write info into target_set_dir
    ann_names = sorted(
        os.path.splitext(os.path.basename(xml_path))[0]
        for xml_path in glob.glob(target_ann_dir + '/*.xml'))

    with open(target_set_dir + "/Main/trainval.txt", 'w') as train_fd:
        with open(target_set_dir + "/Main/test.txt", 'w') as test_fd:
            for name in ann_names:
                if 'Train' in name:
                    train_fd.write(name + '\n')
                if 'Test' in name:
                    test_fd.write(name + '\n')


if __name__ == "__main__":
    main()
    

 

 

 

 

 

2.ICDAR2015

ICDAR2015包含三个文件夹:

训练图像集:ch4_training_images

训练标注集:ch4_training_localization_transcription_gt

测试图像集:ch4_test_images

其中,ICDAR2015不包含测试标注集,但提供了测试web接口。因此,这里只对训练集进行了转换。

标注格式:x1,y1,x2,y2,x3,y3,x4,y4,text

其中,x1,y1为左上角坐标,x2,y2为右上角坐标,x3,y3为右下角坐标,x4,y4为左下角坐标。‘###’表示text难以辨认。

377,117,463,117,465,130,378,130,Genaxis Theatre

374,155,409,155,409,170,374,170,###

创建文件夹的准备工作同上。

转换代码如下,转换结果同样保存在VOC2007文件夹中(本例中为ICDAR2015/VOC2007)。由于没有测试标注,只为训练图像生成标注文件和trainval.txt。

 

#! /usr/bin/python
#coding:utf-8

import os, sys
import glob
from PIL import Image
import cv2
import numpy as np

# target dir
base_dir = "ICDAR2015/VOC2007"

target_img_dir = base_dir + "/" + "JPEGImages/"
target_ann_dir = base_dir + "/" + "Annotations/"
target_set_dir = base_dir + "/" + "ImageSets/"

# source train dir
train_img_dir = "ch4_training_images/"
train_txt_dir = "ch4_training_localization_transcription_gt/"

test_img_dir = "ch4_test_images"

# prefixes added while moving, used to tell the two subsets apart afterwards
TRAIN_PREFIX = "ICDAR2015_Train_"
TEST_PREFIX = "ICDAR2015_Test_"


def move_images(src_dir, prefix):
    """Move every file of `src_dir` into `target_img_dir`, prepending `prefix`."""
    for file_name in os.listdir(src_dir):
        os.rename(os.path.join(src_dir, file_name),
                  os.path.join(target_img_dir, prefix + file_name))


def gt_path_for(file_name):
    """Return the ground-truth path of a renamed training image.

    "ICDAR2015_Train_img_12.jpg" maps to "<train_txt_dir>gt_img_12.txt".
    Returns None for files that do not carry the training prefix; test
    images have no ground truth.
    """
    if not file_name.startswith(TRAIN_PREFIX):
        return None
    original_stem = os.path.splitext(file_name[len(TRAIN_PREFIX):])[0]
    return train_txt_dir + 'gt_' + original_stem + '.txt'


def parse_gt_line(gt_line):
    """Parse one "x1,y1,x2,y2,x3,y3,x4,y4,text" line.

    Returns (pt1, pt2, pt3, difficult), where the points are (x, y) integer
    tuples for the first three corners and difficult is 1 when the
    transcription contains "###", else 0.  The fourth corner is not needed
    by the box conversion and is dropped.  Returns None for blank or
    truncated lines.
    """
    fields = gt_line.strip().split(',')
    if len(fields) < 9:
        return None
    # The first field may start with a UTF-8 byte-order mark; keep only the
    # characters that can belong to an integer.
    fields[0] = ''.join(
        ch for ch in fields[0] if ch.isdigit() or ch == '-')
    coords = [int(value) for value in fields[:8]]
    # The transcription may itself contain commas.
    transcription = ','.join(fields[8:])
    difficult = 1 if '###' in transcription else 0
    return ((coords[0], coords[1]),
            (coords[2], coords[3]),
            (coords[4], coords[5]),
            difficult)


def quad_to_rotated_box(pt1, pt2, pt3):
    """Convert three consecutive quadrilateral corners to a rotated box.

    Returns (x_ctr, y_ctr, width, height, angle).  The width is the longer
    of the edges pt1-pt2 and pt2-pt3, and the angle (in degrees) is measured
    along that longer edge.  Angles below -45 are shifted by 180, which
    puts the result in [-45, 135).  The centre is the midpoint of the
    pt1-pt3 diagonal.
    """
    edge1 = np.sqrt((pt1[0] - pt2[0]) ** 2 + (pt1[1] - pt2[1]) ** 2)
    edge2 = np.sqrt((pt2[0] - pt3[0]) ** 2 + (pt2[1] - pt3[1]) ** 2)

    if edge1 > edge2:
        width, height = edge1, edge2
        dx, dy = pt1[0] - pt2[0], pt1[1] - pt2[1]
    else:
        width, height = edge2, edge1
        dx, dy = pt2[0] - pt3[0], pt2[1] - pt3[1]

    if dx != 0:
        # image y grows downwards, hence the sign flip
        angle = -np.degrees(np.arctan(float(dy) / float(dx)))
    else:
        angle = 90.0
    if angle < -45.0:
        angle = angle + 180

    x_ctr = float(pt1[0] + pt3[0]) / 2
    y_ctr = float(pt1[1] + pt3[1]) / 2
    return x_ctr, y_ctr, width, height, angle


def annotation_xml(file_name, width, height, objects):
    """Build the VOC-style annotation document for one image.

    `objects` is an iterable of
    (x_ctr, y_ctr, box_width, box_height, angle, difficult).
    The returned string has no trailing newline.
    """
    lines = [
        '<annotation>',
        '    <folder>VOC2007</folder>',
        '    <filename>' + str(file_name) + '</filename>',
        '    <size>',
        '        <width>' + str(width) + '</width>',
        '        <height>' + str(height) + '</height>',
        '        <depth>3</depth>',
        '    </size>',
    ]
    for x_ctr, y_ctr, box_w, box_h, angle, difficult in objects:
        # NOTE(review): rotated boxes are not part of the standard VOC
        # schema and the published listing lost its tag names; x/y/w/h/theta
        # are a reconstruction - confirm they match your data loader.
        lines.extend([
            '    <object>',
            '        <name>text</name>',
            '        <pose>Unspecified</pose>',
            '        <truncated>0</truncated>',
            '        <difficult>' + str(difficult) + '</difficult>',
            '        <bndbox>',
            '            <x>' + str(x_ctr) + '</x>',
            '            <y>' + str(y_ctr) + '</y>',
            '            <w>' + str(box_w) + '</w>',
            '            <h>' + str(box_h) + '</h>',
            '            <theta>' + str(angle) + '</theta>',
            '        </bndbox>',
            '    </object>',
        ])
    lines.append('</annotation>')
    return '\n'.join(lines)


def main():
    """Convert the ICDAR2015 training set into the VOC2007 folder layout."""
    # rename and move img to target_img_dir
    move_images(train_img_dir, TRAIN_PREFIX)
    move_images(test_img_dir, TEST_PREFIX)

    for file_name in sorted(os.listdir(target_img_dir)):
        gt_name = gt_path_for(file_name)
        if gt_name is None:
            # Test images have no ground truth; annotating them with the
            # training file of the same index would produce wrong labels.
            continue

        with Image.open(target_img_dir + file_name) as im:
            imgwidth, imgheight = im.size

        objects = []
        with open(gt_name) as gt_file:
            for gt_line in gt_file:
                parsed = parse_gt_line(gt_line)
                if parsed is None:
                    continue
                pt1, pt2, pt3, difficult = parsed
                objects.append(
                    quad_to_rotated_box(pt1, pt2, pt3) + (difficult,))

        xml_path = target_ann_dir + os.path.splitext(file_name)[0] + '.xml'
        with open(xml_path, 'w') as xml_file:
            xml_file.write(
                annotation_xml(file_name, imgwidth, imgheight, objects))

    # write info into target_set_dir
    ann_names = sorted(
        os.path.splitext(os.path.basename(xml_path))[0]
        for xml_path in glob.glob(os.path.join(target_ann_dir, '*.xml')))
    with open(os.path.join(target_set_dir, "Main/trainval.txt"), 'w') as train_fd:
        for name in ann_names:
            train_fd.write(name + '\n')


if __name__ == "__main__":
    main()

 

 

 

 

 

3.ICDAR2017-Chinese

 

 

ICDAR2017-Chinese包含两个文件夹:

训练集:icdar2017rctw_train_v1.2

-icdar2017rctw_train_v1.2/train

训练图像文件和训练标注文件都放在了icdar2017rctw_train_v1.2/train文件夹中。

标注格式:x1,y1,x2,y2,x3,y3,x4,y4,difficult,text

其中,x1,y1为左上角坐标,x2,y2为右上角坐标,x3,y3为右下角坐标,x4,y4为左下角坐标。difficult为1表示text难以辨认,0表示容易辨认。

举例:390,902,1856,902,1856,1225,390,1225,0,"金氏眼镜"

测试集:icdar2017rctw_test

创建文件夹的准备工作同上。

转换代码如下,转换结果同样保存在VOC2007文件夹中(本例中为ICDAR2017/VOC2007)。

 

#! /usr/bin/python
#coding:utf-8

import os, sys
import glob
from PIL import Image
import cv2
import numpy as np

# target dir
base_dir = "ICDAR2017/VOC2007"

target_img_dir = base_dir + "/" + "JPEGImages/"
target_ann_dir = base_dir + "/" + "Annotations/"
target_set_dir = base_dir + "/" + "ImageSets/"

# source train dir (images and ground-truth txt files share one folder)
train_img_dir = "icdar2017rctw_train_v1.2/train/"
train_txt_dir = "icdar2017rctw_train_v1.2/train/"

# prefix added while moving the training images
TRAIN_PREFIX = "ICDAR2017_Train_"


def is_jpg(file_name):
    """Return True when `file_name` has a .jpg extension (any letter case)."""
    return os.path.splitext(file_name)[1].lower() == '.jpg'


def move_images(src_dir, prefix):
    """Move the .jpg files of `src_dir` into `target_img_dir`, adding `prefix`.

    Only images are moved: the ground-truth .txt files live in the same
    source folder and must stay there, because they are read from
    `train_txt_dir` afterwards.
    """
    for file_name in os.listdir(src_dir):
        if not is_jpg(file_name):
            continue
        os.rename(os.path.join(src_dir, file_name),
                  os.path.join(target_img_dir, prefix + file_name))


def gt_path_for(file_name):
    """Return the ground-truth path of a renamed training image.

    "ICDAR2017_Train_image_0.jpg" maps to "<train_txt_dir>image_0.txt".
    Returns None for files that do not carry the training prefix.
    """
    if not file_name.startswith(TRAIN_PREFIX):
        return None
    original_stem = os.path.splitext(file_name[len(TRAIN_PREFIX):])[0]
    return train_txt_dir + original_stem + '.txt'


def parse_gt_line(gt_line):
    """Parse one "x1,y1,x2,y2,x3,y3,x4,y4,difficult,text" line.

    Returns (pt1, pt2, pt3, difficult), where the points are (x, y) integer
    tuples for the first three corners and difficult is the flag exactly as
    written in the file (a string).  The fourth corner is not needed by the
    box conversion and is dropped.  Returns None for blank or truncated
    lines.
    """
    fields = gt_line.strip().split(',')
    if len(fields) < 9:
        return None
    # Tolerate a UTF-8 byte-order mark in front of the first number.
    fields[0] = ''.join(
        ch for ch in fields[0] if ch.isdigit() or ch == '-')
    coords = [int(value) for value in fields[:8]]
    difficult = fields[8].strip()
    return ((coords[0], coords[1]),
            (coords[2], coords[3]),
            (coords[4], coords[5]),
            difficult)


def quad_to_rotated_box(pt1, pt2, pt3):
    """Convert three consecutive quadrilateral corners to a rotated box.

    Returns (x_ctr, y_ctr, width, height, angle).  The width is the longer
    of the edges pt1-pt2 and pt2-pt3, and the angle (in degrees) is measured
    along that longer edge.  Angles below -45 are shifted by 180, which
    puts the result in [-45, 135).  The centre is the midpoint of the
    pt1-pt3 diagonal.
    """
    edge1 = np.sqrt((pt1[0] - pt2[0]) ** 2 + (pt1[1] - pt2[1]) ** 2)
    edge2 = np.sqrt((pt2[0] - pt3[0]) ** 2 + (pt2[1] - pt3[1]) ** 2)

    if edge1 > edge2:
        width, height = edge1, edge2
        dx, dy = pt1[0] - pt2[0], pt1[1] - pt2[1]
    else:
        width, height = edge2, edge1
        dx, dy = pt2[0] - pt3[0], pt2[1] - pt3[1]

    if dx != 0:
        # image y grows downwards, hence the sign flip
        angle = -np.degrees(np.arctan(float(dy) / float(dx)))
    else:
        angle = 90.0
    if angle < -45.0:
        angle = angle + 180

    x_ctr = float(pt1[0] + pt3[0]) / 2
    y_ctr = float(pt1[1] + pt3[1]) / 2
    return x_ctr, y_ctr, width, height, angle


def annotation_xml(file_name, width, height, objects):
    """Build the VOC-style annotation document for one image.

    `objects` is an iterable of
    (x_ctr, y_ctr, box_width, box_height, angle, difficult).
    The returned string has no trailing newline.
    """
    lines = [
        '<annotation>',
        '    <folder>VOC2007</folder>',
        '    <filename>' + str(file_name) + '</filename>',
        '    <size>',
        '        <width>' + str(width) + '</width>',
        '        <height>' + str(height) + '</height>',
        '        <depth>3</depth>',
        '    </size>',
    ]
    for x_ctr, y_ctr, box_w, box_h, angle, difficult in objects:
        # NOTE(review): rotated boxes are not part of the standard VOC
        # schema and the published listing lost its tag names; x/y/w/h/theta
        # are a reconstruction - confirm they match your data loader.
        lines.extend([
            '    <object>',
            '        <name>text</name>',
            '        <pose>Unspecified</pose>',
            '        <truncated>0</truncated>',
            '        <difficult>' + str(difficult) + '</difficult>',
            '        <bndbox>',
            '            <x>' + str(x_ctr) + '</x>',
            '            <y>' + str(y_ctr) + '</y>',
            '            <w>' + str(box_w) + '</w>',
            '            <h>' + str(box_h) + '</h>',
            '            <theta>' + str(angle) + '</theta>',
            '        </bndbox>',
            '    </object>',
        ])
    lines.append('</annotation>')
    return '\n'.join(lines)


def main():
    """Convert the ICDAR2017 (RCTW) training set into the VOC2007 layout."""
    # rename and move img to target_img_dir
    move_images(train_img_dir, TRAIN_PREFIX)

    for file_name in sorted(os.listdir(target_img_dir)):
        gt_name = gt_path_for(file_name)
        if gt_name is None or not is_jpg(file_name):
            continue
        print(file_name)

        with Image.open(target_img_dir + file_name) as im:
            imgwidth, imgheight = im.size

        objects = []
        with open(gt_name) as gt_file:
            for gt_line in gt_file:
                parsed = parse_gt_line(gt_line)
                if parsed is None:
                    continue
                pt1, pt2, pt3, difficult = parsed
                objects.append(
                    quad_to_rotated_box(pt1, pt2, pt3) + (difficult,))

        xml_path = target_ann_dir + os.path.splitext(file_name)[0] + '.xml'
        with open(xml_path, 'w') as xml_file:
            xml_file.write(
                annotation_xml(file_name, imgwidth, imgheight, objects))

    # write info into target_set_dir
    ann_names = sorted(
        os.path.splitext(os.path.basename(xml_path))[0]
        for xml_path in glob.glob(os.path.join(target_ann_dir, '*.xml')))
    with open(os.path.join(target_set_dir, "Main/trainval.txt"), 'w') as train_fd:
        for name in ann_names:
            train_fd.write(name + '\n')


if __name__ == "__main__":
    main()

 

 

 

 

 

 

你可能感兴趣的:(场景文字检测与识别,场景文字检测与识别)