从VOC,ImageNet,COCO,OpenImage等数据集中挑选自己需要的类别数据

       VOC,ImageNet,COCO,OpenImage是CV行业常用的开源数据集,其中VOC比较古老了,ImageNet名气最响,种类和图片量较多,OpenImage种类和图片量最大,COCO介于它们之间吧。很多时候我们并不需要这些数据集中的所有数据,而只需要其中一小部分,因此需要从原始数据中挑选出自己需要的类别数据,下面是我在这些数据集中挑选所需类别的一点小结。

      首先要说明的是,这些数据集的标记文件都是不一样的,具体如下:

     VOC:xml。标记中直接使用目标类别名称,和绝对尺寸的标记框

     ImageNet:xml。标记中直接使用目标类别词条索引,和绝对尺寸的标记框

     COCO:json。标记中使用目标类别id(通过categories字段映射到类别名称),和绝对尺寸的标记框(格式为[x, y, 宽, 高])

    OpenImage:csv。标记中直接使用目标类别词条索引,和相对尺寸的标记框

       因为我还需要自己标注一些数据,采用的是labelimg这个软件标注的,为了方便数据查看和格式统一,这里我把挑选的数据标注文件格式都统一转化为和VOC相同的xml格式,数据具体结构也和VOC大致一样,不过为了方便管理增添了一些信息,这个我在下面挑选VOC数据集时会说明一下。

      为了简洁,我直接附上代码吧。另外为了便于数据管理,我采用“类别名+图片编号”的方式来命名图片和标注文件,之所以采用这种格式,主要基于下面几点考虑(最根本的原因是方便图片管理):

      还是先上一张VOC图片命名方式(年号+图像编号)的图片吧,有图有真相

从VOC,ImageNet,COCO,OpenImage等数据集中挑选自己需要的类别数据_第1张图片

       再上一张我的图片命名样式图吧

从VOC,ImageNet,COCO,OpenImage等数据集中挑选自己需要的类别数据_第2张图片

这两种命名方式有什么不同呢?你看第一种命名方式,如果不细看图片内容,你能知道图片里有啥吗?鬼知道呀。但我采用的是“类别+当前类别图片编号”的命名方式,如果一张图片有多个类别,优先级高的类别具有图片命名权,比如我在VOC中挑选的类别有car和bird,其中car的优先级最高,因此当一张图片中同时含有car和bird时,car具有图片命名权。此时你一看文件名就知道这张图片至少有个啥目标。这么做主要有以下几点考虑:

1、方便数据管理。你不用统计你特别关注的类别样本数,只要通过最后一个文件的名字就能估计出当前数据库最少有多少个样本

2、方便数据管理。切分train,val,test时不用担心你特别关注的类别样本全都划到train、val或test其中一个里去了

3、方便数据管理。检测效果不好,你需要补充数据,你知道急需补充的是哪些

嗯,好像大致就这些

首先附上VOC格式的xml操作处理函数,下面挑选数据和转化需要用到。因为函数名基本说明了各个函数是干什么用的,在此我就不多解释,有点基础一看就知道了。下面的程序是用python2写的,转到python3会有些问题,主要是print语句,还有python3中去掉了字典的has_key等方法。有些地方现在看不是那么优雅,算了懒得改了,反正在python2中能用。

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 2018/08/15 by DQ

import os,sys
reload(sys)
sys.setdefaultencoding('UTF-8')

from xml.etree.ElementTree import Element, SubElement, ElementTree
try:
	import xml.etree.cElementTree as ET
except ImportError:
	import xml.etree.ElementTree as ET
	
##get object annotation bndbox loc start
def GetImAnnotBoxs(AnotPath,BBoxAreaTol=50*50):
    """Collect the bounding boxes of a VOC-style XML annotation, grouped by object name.

    Objects whose ``<difficult>`` flag is non-zero, objects without a
    ``<bndbox>`` element and boxes whose area is below ``BBoxAreaTol`` are
    left out.

    Args:
        AnotPath: path of the XML annotation file.
        BBoxAreaTol: minimum area, (xmax-xmin)*(ymax-ymin), for a box to be kept.

    Returns:
        dict mapping each object name to a list of ``[xmin, ymin, xmax, ymax]``
        integer boxes. Names with no surviving box are absent.
    """
    def _ReadCoord(BndBox, Tag):
        # Go through float() so that both '12' and '12.0' are accepted.
        return int(float(BndBox.find(Tag).text))

    Root = ET.ElementTree(file=AnotPath).getroot()  # open xml
    ObjBndBoxSet = {}
    for Object in Root.findall('object'):
        # A missing or empty <difficult> tag counts as "not difficult".
        DifficultText = (Object.findtext('difficult') or '').strip()
        if DifficultText and int(DifficultText):
            continue

        BndBox = Object.find('bndbox')
        if BndBox is None:
            # Nothing to measure for an object that carries no box.
            continue

        ObjName = Object.find('name').text
        x1 = _ReadCoord(BndBox, 'xmin')
        y1 = _ReadCoord(BndBox, 'ymin')
        x2 = _ReadCoord(BndBox, 'xmax')
        y2 = _ReadCoord(BndBox, 'ymax')

        if (x2-x1)*(y2-y1) >= BBoxAreaTol:
            # `in`/setdefault instead of dict.has_key(), which Python 3 removed.
            ObjBndBoxSet.setdefault(ObjName, []).append([x1, y1, x2, y2])
    return ObjBndBoxSet
##get object annotation bndbox loc end

def GetAnnotImWHD(AnotPath):
    """Return the ``(width, height, depth)`` integers stored in the annotation's ``<size>`` element."""
    SizeNode = ET.parse(AnotPath).getroot().find('size')
    # Read the three texts first, convert afterwards.
    SizeTexts = [SizeNode.find(Tag).text for Tag in ('width', 'height', 'depth')]
    return tuple(int(Text) for Text in SizeTexts)

def GetAnnotImOriginalName(AnotPath):
    """Return the text of ``<source>/<image>`` in the annotation file at ``AnotPath``."""
    SourceNode = ET.parse(AnotPath).getroot().find('source')
    return SourceNode.find('image').text

# Image extension; VOCXml2MyXml appends it to the annotation's base name to
# build the <filename> entry.
ImExpName = '.jpg'
# Annotation-file extension.
AnnotExpName = '.xml'

def WriteObject(Object, ObjectName, Bndbox):
    """Fill an ``<object>`` element with its name, flags and bounding box.

    Appends ``name``, ``truncated`` ('0'), ``difficult`` ('0') and a ``bndbox``
    child holding ``xmin``/``ymin``/``xmax``/``ymax`` taken from ``Bndbox[0..3]``,
    each truncated to an integer.
    """
    for Tag, Text in (('name', ObjectName), ('truncated', '0'), ('difficult', '0')):
        SubElement(Object, Tag).text = Text

    BoxNode = SubElement(Object, 'bndbox')
    for Idx, Tag in enumerate(('xmin', 'ymin', 'xmax', 'ymax')):
        SubElement(BoxNode, Tag).text = str(int(Bndbox[Idx]))


def VOCXml2MyXml(AnotName, ImSource, ObjectSet, AnotWriteFolder,ImWHD,
                 AnnotImFolder='VOC2012',CurY_M_D=None,NotesDict=None):
    """Write a VOC-style XML annotation extended with owner, date and note fields.

    Args:
        AnotName: file name of the annotation to write (e.g. 'xxx.xml'); the
            image file name stored in <filename> is its base name + ImExpName.
        ImSource: text stored in <source>/<image>.
        ObjectSet: dict mapping object name -> list of [xmin, ymin, xmax, ymax]
            boxes; may be empty or None, in which case no <object> is written.
        AnotWriteFolder: existing folder the XML file is written into.
        ImWHD: (width, height, depth) of the image.
        AnnotImFolder: text stored in <folder>.
        CurY_M_D: date string stored in <source>/<date>; empty when omitted.
        NotesDict: optional dict mapping object name -> remark; remarks of the
            names present in ObjectSet are numbered and stored in <source>/<note>.
    """
    # splitext keeps dots inside the base name, matching the callers, which
    # build the image name by cutting only the trailing extension.
    ImName = os.path.splitext(AnotName)[0] + ImExpName

    Root = Element('annotation')
    SubElement(Root, 'folder').text = AnnotImFolder
    SubElement(Root, 'filename').text = ImName

    Source = SubElement(Root, 'source')
    SubElement(Source, 'database').text = 'Foreigner Database 2019'
    SubElement(Source, 'image').text = ImSource
    SubElement(Source, 'owner').text = 'DQ'
    # An omitted date must not put a non-string into the tree: that would make
    # the final write() fail.
    SubElement(Source, 'date').text = CurY_M_D if CurY_M_D else ''

    if NotesDict and ObjectSet:
        NoteLines = []
        for NoteKey, NoteText in NotesDict.items():
            if NoteKey in ObjectSet:
                NoteLines.append(u'{}.{}:{};\n'.format(len(NoteLines) + 1, NoteKey, NoteText))
        if NoteLines:
            SubElement(Source, 'note').text = u''.join(NoteLines)

    Size = SubElement(Root, 'size')
    SubElement(Size, 'width').text = str(ImWHD[0])
    SubElement(Size, 'height').text = str(ImWHD[1])
    SubElement(Size, 'depth').text = str(ImWHD[2])

    SubElement(Root, 'segmented').text = '0'

    for ObjectName, BndboxSet in (ObjectSet or {}).items():
        for Bndbox in BndboxSet:
            WriteObject(SubElement(Root, 'object'), ObjectName, Bndbox)

    ElementTree(Root).write(os.path.join(AnotWriteFolder, AnotName))
    

这四个数据集的挑选函数原本写在同一个文件中,下面我把它们分开来陈述:

先附上各个子函数开头需要的公用部分:

#!/usr/bin/python
# -*- coding: UTF-8 -*-
# 2019/09/19 by DQ

import os
import datetime
import shutil
from XmlOptFuns import GetImAnnotBoxs,GetAnnotImWHD,VOCXml2MyXml #Xml标注文件的常用操作函数

# Take the clock reading once so that every field below describes the same
# instant, even if the module is imported right at a minute or day boundary.
_Now = datetime.datetime.now()
Year = _Now.year
Month = str(_Now.month).zfill(2)
Day = str(_Now.day).zfill(2)
Hour = str(_Now.hour).zfill(2)
Minute = str(_Now.minute).zfill(2)
Y_M_D = '{}-{}-{}'.format(Year, Month, Day)
Y_M_D_H_M = '{}-{}-{}-{}-{}'.format(Year, Month, Day, Hour, Minute)
ImExpName = '.jpg'  # extension of stored images; every other format is converted to it
AnnotExpName = '.xml'  # extension of stored annotations; every other format is converted to it
SelectDataMainDir = '/data/dataset/SelfSelectDataSet'
ObjIdLen = 6  # zero-padded width of the per-class image counter used in file names
BBoxAreaTol = 50*50  # area threshold: smaller annotation boxes are dropped

1、抽取VOC上的数据集并筛选出我需要的类数据

def VOC2012Data2Mine():
    """Pick the wanted classes out of VOC2012 and store them in the unified layout.

    For every VOC annotation that holds at least one sufficiently large box of
    a wanted class, a new XML (wanted classes only) and a copy of the image are
    written under ``<SelectDataMainDir>/SelectVOC/<naming class>/AnnotSet|ImSet``.
    Files are named ``VOC-<class><counter>_<original name>``. A per-class count
    is written to ``SelectVOCInfo.txt``.
    """
    # Wanted classes in priority order: when an image holds several of them,
    # the first one present names the files and picks the output folder.
    ExpectClss=('person','car','bird')
    ClsCounter=dict.fromkeys(ExpectClss,0)

    VOCMainDir='/data/dataset/VOCdevkit2012/VOC2012'
    SelectDataSubDir = os.path.join(SelectDataMainDir, 'SelectVOC')
    VOCAnnotDir=os.path.join(VOCMainDir,'Annotations')
    VOCImDir=os.path.join(VOCMainDir,'JPEGImages')
    # The summary file is written even when nothing gets selected, so its
    # folder has to exist regardless.
    if not os.path.exists(SelectDataSubDir):
        os.makedirs(SelectDataSubDir)

    # Sorted so that the per-class numbering is reproducible between runs.
    for AnnotName in sorted(os.listdir(VOCAnnotDir)):
        if not AnnotName.endswith(AnnotExpName):
            continue
        AnnotPath=os.path.join(VOCAnnotDir,AnnotName)
        ObjBndBoxs=GetImAnnotBoxs(AnnotPath,BBoxAreaTol)

        # Keep only the wanted classes, accepting both 'car' and 'Car' spellings.
        SelectObjBoxs={}
        for ExpectCls in ExpectClss:
            Boxs=list(ObjBndBoxs.get(ExpectCls,[]))
            CapName=ExpectCls.capitalize()
            if CapName!=ExpectCls:
                Boxs+=ObjBndBoxs.get(CapName,[])
            if Boxs:
                SelectObjBoxs[ExpectCls]=Boxs
        if not SelectObjBoxs:
            continue

        # The image may carry a lower- or upper-case extension.
        ImStem=os.path.splitext(AnnotName)[0]
        ImPath=None
        for Ext in (ImExpName,ImExpName.upper()):
            Candidate=os.path.join(VOCImDir,ImStem+Ext)
            if os.path.exists(Candidate):
                ImPath=Candidate
                break
        if ImPath is None:
            continue

        # Highest-priority class present in this image names the files.
        NameCls=next(Cls for Cls in ExpectClss if Cls in SelectObjBoxs)
        ClsCounter[NameCls]+=1
        ExpectClsStr=NameCls+str(ClsCounter[NameCls]).zfill(ObjIdLen)
        NewAnnotName='VOC-{}_{}'.format(ExpectClsStr,AnnotName)

        ImSource='flickr'
        ImWHD = GetAnnotImWHD(AnnotPath)
        SelectAnnotDir = os.path.join(SelectDataSubDir, NameCls, 'AnnotSet')  # annotation folder
        if not os.path.exists(SelectAnnotDir):
            os.makedirs(SelectAnnotDir)
        # VOC format plus extra bookkeeping fields (entry date, owner, notes).
        VOCXml2MyXml(NewAnnotName, ImSource, SelectObjBoxs, SelectAnnotDir, ImWHD,
                     AnnotImFolder='VOC2012', CurY_M_D=Y_M_D, NotesDict={})

        SelectImDir = os.path.join(SelectDataSubDir, NameCls, 'ImSet')  # image folder
        if not os.path.exists(SelectImDir):
            os.makedirs(SelectImDir)
        NewImName=os.path.splitext(NewAnnotName)[0]+ImExpName
        NewImPath=os.path.join(SelectImDir,NewImName)
        shutil.copy(ImPath,NewImPath)
        MsgStr='{}--->{}\n'.format(ImPath,NewImPath)
        print(MsgStr)

    # Summary of how many images were collected per naming class.
    ImClsInfoTxtPath=os.path.join(SelectDataSubDir,'SelectVOCInfo.txt')
    with open(ImClsInfoTxtPath,'w') as FId:
        for ExpectCls in ExpectClss:
            FId.write('{}ImNum={}\n'.format(ExpectCls,ClsCounter[ExpectCls]))

2、抽取COCO上的数据集并筛选出我需要的类数据

"""
#coco数据集格式示例如下
{"images": 
[
{"id": 44, "file_name": "000044.jpg", "width": 600, "height": 338},
{"id": 48, "file_name": "000048.jpg", "width": 600, "height": 338}, 
{"id": 54, "file_name": "000054.jpg", "width": 600, "height": 338}
], 
"type": "instances", 
"annotations": 
[
{"ignore": 0, "image_id": 44, "segmentation": [], "bbox": [109, 274, 29, 27], "area": 783, "category_id": 2, "iscrowd": 0, "id": 1}, 
{"ignore": 0, "image_id": 48, "segmentation": [], "bbox": [301, 39, 80, 48], "area": 3840, "category_id": 1, "iscrowd": 0, "id": 2}, 
{"ignore": 0, "image_id": 48, "segmentation": [], "bbox": [197, 117, 73, 51], "area": 3723, "category_id": 1, "iscrowd": 0, "id": 3}, 
{"ignore": 0, "image_id": 48, "segmentation": [], "bbox": [239, 75, 81, 55], "area": 4455, "category_id": 1, "iscrowd": 0, "id": 4}, 
{"ignore": 0, "image_id": 48, "segmentation": [], "bbox": [158, 159, 76, 54], "area": 4104, "category_id": 1, "iscrowd": 0, "id": 5},
{"ignore": 0, "image_id": 48, "segmentation": [], "bbox": [95, 204, 85, 56], "area": 4760, "category_id": 1, "iscrowd": 0, "id": 6}, 
{"ignore": 0, "image_id": 48, "segmentation": [], "bbox": [14, 265, 81, 72], "area": 5832, "category_id": 1, "iscrowd": 0, "id": 7}, 
{"ignore": 0, "image_id": 48, "segmentation": [], "bbox": [374, 8, 44, 34], "area": 1496, "category_id": 2, "iscrowd": 0, "id": 8}, 
{"ignore": 0, "image_id": 54, "segmentation": [], "bbox": [409, 96, 36, 42], "area": 1512, "category_id": 2, "iscrowd": 0, "id": 9}], 
"categories": 
[
{"supercategory": "none", "id": 1, "name": "Class1"}, #没有写具体的类别
{"supercategory": "none", "id": 3, "name": "Class2"}, 
{"supercategory": "none", "id": 2, "name": "Class3"}, 
{"supercategory": "none", "id": 0, "name": "Class4"}
]
}
"""
#抽取COCO上的数据集并筛选出我需要的类数据
def COCOData2Mine():
    """Pick the wanted classes out of COCO 2014 and store them in the unified layout.

    For each split (train2014, val2014) every image with at least one
    sufficiently large box of a wanted class gets a VOC-style XML and an image
    copy under ``<SelectDataMainDir>/SelectCOCO/<Train|Val>/<naming class>/``.
    A per-class and total count is written to ``SelectCOCO(<split>)Info.txt``.
    """
    from pycocotools.coco import COCO  # official COCO annotation reader

    def COCOJson2MyXml(AnotName, ImSource, ObjectSet, AnotWriteFolder, ImWHD,
                       AnnotImFolder, CurY_M_D):
        # Same writer as for the other datasets, without notes.
        VOCXml2MyXml(AnotName, ImSource, ObjectSet, AnotWriteFolder, ImWHD,
                     AnnotImFolder, CurY_M_D, NotesDict={})

    # Priority order: the first class present in an image names its files.
    ExpectClss = ('truck','car','person','bird')
    DataTypes = ('train2014','val2014')
    COCODir = '/data/dataset/coco'
    for DataType in DataTypes:
        ImDir = os.path.join(COCODir, 'images', DataType)
        AnnotPath = '{}/annotations/instances_{}.json'.format(COCODir, DataType)
        coco = COCO(AnnotPath)
        SubDirName=DataType[:-4].capitalize()
        SelectDataSubDir = os.path.join(SelectDataMainDir, 'SelectCOCO',SubDirName)
        # The summary file below needs this folder even if nothing is selected.
        if not os.path.exists(SelectDataSubDir):
            os.makedirs(SelectDataSubDir)

        ClsCounter = dict.fromkeys(ExpectClss, 0)
        # Sorted so that the per-class numbering is reproducible between runs.
        for ImName in sorted(os.listdir(ImDir)):
            # File names end in '_<image id>', e.g. COCO_train2014_000000000009.jpg
            ImIdStr=os.path.splitext(ImName)[0].split('_')[-1]
            if not ImIdStr.isdigit():
                continue  # not a COCO image file
            ImId=int(ImIdStr)

            SelectObjBoxs={}
            for AnnotInfo in coco.loadAnns(coco.getAnnIds(imgIds=ImId)):
                CatName=coco.loadCats(AnnotInfo['category_id'])[0]['name']
                if CatName not in ExpectClss:
                    continue
                # COCO boxes are [x, y, width, height]; convert to corner form.
                bbox=AnnotInfo['bbox']
                BBox=[int(float(bbox[0])),int(float(bbox[1])),
                      int(float(bbox[0]+bbox[2])),int(float(bbox[1]+bbox[3]))]
                if (BBox[2]-BBox[0])*(BBox[3]-BBox[1])>=BBoxAreaTol:
                    SelectObjBoxs.setdefault(CatName,[]).append(BBox)
            if not SelectObjBoxs:
                continue

            # Looked up only now: an image that has annotations is known to the
            # JSON, whereas a stray file in the folder would make loadImgs fail.
            ImInfo=coco.loadImgs(ImId)[0]

            NameCls=next(Cls for Cls in ExpectClss if Cls in SelectObjBoxs)
            ClsCounter[NameCls] += 1
            ExpectClsStr = NameCls + str(ClsCounter[NameCls]).zfill(ObjIdLen)
            NewImIdStr = str(ImId).zfill(ObjIdLen)
            NewImName = 'COC_{}_{}{}.jpg'.format(ExpectClsStr,SubDirName[0],NewImIdStr)

            AnotName=NewImName[:-4]+'.xml'
            ImWHD=[ImInfo['width'],ImInfo['height'],3]
            SelectAnnotDir = os.path.join(SelectDataSubDir, NameCls, 'AnnotSet')
            if not os.path.exists(SelectAnnotDir):
                os.makedirs(SelectAnnotDir)
            COCOJson2MyXml(AnotName, ImName, SelectObjBoxs, SelectAnnotDir, ImWHD,
                           AnnotImFolder='COCO2014', CurY_M_D=Y_M_D)

            SelectImDir = os.path.join(SelectDataSubDir, NameCls, 'ImSet')
            if not os.path.exists(SelectImDir):
                os.makedirs(SelectImDir)
            ImPath=os.path.join(ImDir,ImName)
            NewImPath=os.path.join(SelectImDir,NewImName)
            shutil.copy(ImPath,NewImPath)
            MsgStr = '{}--->{}\n'.format(ImPath, NewImPath)
            print(MsgStr)

        ImClsInfoTxtName='SelectCOCO({})Info.txt'.format(SubDirName)
        ImClsInfoTxtPath = os.path.join(SelectDataSubDir,ImClsInfoTxtName)
        with open(ImClsInfoTxtPath, 'w') as FId:
            TotalImNum=0
            for ExpectCls in ExpectClss:
                TotalImNum+=ClsCounter[ExpectCls]
                FId.write('{}ImNum={}\n'.format(ExpectCls, ClsCounter[ExpectCls]))
            FId.write('TotalImNum={}\n'.format(TotalImNum))

3、抽取ImageNet上的ILSVRC2016_LOC数据集并筛选出我需要的类数据

def ImageNetData2Mine():
    """Pick the wanted classes out of ILSVRC2016_LOC (train) and store them in the unified layout.

    For each listed synset folder, every annotated image with at least one
    sufficiently large wanted box gets a VOC-style XML and an image copy under
    ``<SelectDataMainDir>/SelectImageNet/<naming class>/AnnotSet|ImSet``. All
    other images of the folder are copied to a ``NoAnnotImSet`` folder of that
    class. Counts are written to ``SelectImageNetInfo.txt``.
    """
    # Priority order: the first class present in an image names its files.
    ExpectClss = ('forklift','pickup truck','tractor', 'car', 'person', 'bird')
    # (synset id, class name, Chinese name) of the synset folders to process.
    ClsIdNameChs = \
        [
        ('n02814533', 'beach wagon', u'轿车'),
        ('n03384352', 'forklift', u'叉车'),
        ('n03770679', 'minivan', u'面包车'),
        ('n03930630', 'pickup truck', u'皮卡'),
        ('n04465501', 'tractor', u'拖拉机'),
        ]
    # Synsets that are renamed, here merged into the generic 'car' class.
    NeedCvtClss=\
    [('n02814533', 'car', u'汽车'),
     ('n03770679', 'car', u'汽车')
    ]

    Overrides=dict((Item[0],Item) for Item in NeedCvtClss)
    ClsIdNameChs=[Overrides.get(Item[0],Item) for Item in ClsIdNameChs]
    ClsId2ClsNames=dict((Item[0],Item[1]) for Item in ClsIdNameChs)

    ImageNetDir = '/data/dataset/ImageNet/ILSVRC2016_LOC'
    ImMainDir =os.path.join(ImageNetDir,'Data/CLS-LOC/train')
    AnnotMainDir = os.path.join(ImageNetDir, 'Annotations/CLS-LOC/train')
    SelectDataSubDir=os.path.join(SelectDataMainDir, 'SelectImageNet')
    # The summary file is written even when nothing gets selected.
    if not os.path.exists(SelectDataSubDir):
        os.makedirs(SelectDataSubDir)

    ClsCounter = dict.fromkeys(ExpectClss, 0)

    for ClsIdNameCh in ClsIdNameChs:
        ClsId=ClsIdNameCh[0]
        ImDir=os.path.join(ImMainDir,ClsId)
        AnnotDir=os.path.join(AnnotMainDir,ClsId)
        if not os.path.isdir(ImDir):
            continue  # no images for this synset, nothing to select or copy

        AnnotIms=[]  # images of this synset that were stored with an annotation
        AnnotNames=sorted(os.listdir(AnnotDir)) if os.path.isdir(AnnotDir) else []
        for AnnotName in AnnotNames:
            AnnotPath=os.path.join(AnnotDir,AnnotName)
            if not (os.path.isfile(AnnotPath) and AnnotName.endswith(AnnotExpName)):
                continue
            ImStem=AnnotName[:-len(AnnotExpName)]
            ImName=ImStem+'.JPEG'
            ImPath=os.path.join(ImDir,ImName)
            if not os.path.exists(ImPath):
                continue

            # Annotations name objects by synset id; translate them to class
            # names. Several ids may share one name ('car'), so boxes are
            # merged; ids outside the table are ignored.
            ObjBndBoxs={}
            for ObjId,Boxs in GetImAnnotBoxs(AnnotPath, BBoxAreaTol).items():
                ClsName=ClsId2ClsNames.get(ObjId)
                if ClsName is None:
                    continue
                ObjBndBoxs.setdefault(ClsName,[]).extend(Boxs)

            SelectObjBoxs = {}
            for ExpectCls in ExpectClss:
                Boxs=list(ObjBndBoxs.get(ExpectCls,[]))
                CapName=ExpectCls.capitalize()
                if CapName!=ExpectCls:
                    Boxs+=ObjBndBoxs.get(CapName,[])
                if Boxs:
                    SelectObjBoxs[ExpectCls]=Boxs
            if not SelectObjBoxs:
                continue

            AnnotIms.append(ImName)
            # Sanity check: file names start with the id of their synset folder.
            SplitStr=ImStem.split('_')
            assert SplitStr[0]==ClsId,'ClsName is not same'

            NameCls=next(Cls for Cls in ExpectClss if Cls in SelectObjBoxs)
            ClsCounter[NameCls] += 1
            ExpectClsStr = NameCls + str(ClsCounter[NameCls]).zfill(ObjIdLen)
            NewAnnotName = 'ImNt-{}_{}'.format(ExpectClsStr,ImStem+AnnotExpName)

            ImWHD = GetAnnotImWHD(AnnotPath)
            SelectAnnotDir = os.path.join(SelectDataSubDir, NameCls, 'AnnotSet')
            if not os.path.exists(SelectAnnotDir):
                os.makedirs(SelectAnnotDir)
            VOCXml2MyXml(NewAnnotName, ImName, SelectObjBoxs, SelectAnnotDir, ImWHD,
                         AnnotImFolder='ImageNet2016', CurY_M_D=Y_M_D, NotesDict={})

            SelectImDir = os.path.join(SelectDataSubDir, NameCls, 'ImSet')
            if not os.path.exists(SelectImDir):
                os.makedirs(SelectImDir)
            NewImName = NewAnnotName[:-len(AnnotExpName)] + ImExpName
            NewImPath = os.path.join(SelectImDir, NewImName)
            shutil.copy(ImPath, NewImPath)
            MsgStr = '{}--->{}\n'.format(ImPath, NewImPath)
            print(MsgStr)

        # Every image of the synset that was not stored above is set aside.
        DirName = '{}({})_NoAnnot'.format(ClsId, ClsId2ClsNames[ClsId])
        SaveImDir = os.path.join(SelectDataSubDir, ClsId2ClsNames[ClsId], 'NoAnnotImSet', DirName)
        if not os.path.isdir(SaveImDir):
            os.makedirs(SaveImDir)
        NoAnnotIms=set(os.listdir(ImDir)).difference(AnnotIms)
        for NoAnnotIm in NoAnnotIms:
            ImPath=os.path.join(ImDir,NoAnnotIm)
            if os.path.isfile(ImPath):
                shutil.copy(ImPath,SaveImDir)

    ImClsInfoTxtName = 'SelectImageNetInfo.txt'
    ImClsInfoTxtPath = os.path.join(SelectDataSubDir, ImClsInfoTxtName)
    with open(ImClsInfoTxtPath, 'w') as FId:
        TotalImNum = 0
        for ExpectCls in ExpectClss:
            TotalImNum += ClsCounter[ExpectCls]
            FId.write('{}ImNum={}\n'.format(ExpectCls, ClsCounter[ExpectCls]))
        FId.write('TotalImNum={}\n'.format(TotalImNum))

4、抽取OpenImage上的数据集并筛选出我需要的类数据

def OpenImageData2Mine():
    """Pick the wanted classes out of a downloaded OpenImage subset and store them in the unified layout.

    For each split (train, test, validation) every image with at least one
    sufficiently large box of a wanted class gets a VOC-style XML and a .jpg
    image under ``<SelectDataMainDir>/SelectOpenImage/<Split>/<naming class>/``.
    Counts are written to ``SelectOpenImage(<Split>)Info.txt``.

    The annotation rows are grouped by image id once per split instead of
    rescanning the whole CSV for each image, so all boxes of an image are used
    (the earlier scan stopped after a guessed maximum number of rows).
    """
    import csv,cv2
    from collections import defaultdict

    def GetRealClsNames(ClsDescptBoxPath):
        # Map the label code of each class to its readable name.
        RealClsNames = {}
        with open(ClsDescptBoxPath, 'r') as FId:
            for ClsDescpt in csv.reader(FId):
                if len(ClsDescpt) >= 2:
                    RealClsNames[ClsDescpt[0]] = ClsDescpt[1]
        return RealClsNames

    def GetAnnotInfos(AnnotPath, WantedImIds):
        # Group the annotation rows by image id, keeping only wanted images
        # so the index stays small compared with the full CSV.
        AnnotInfos = defaultdict(list)
        with open(AnnotPath, 'r') as FId:
            for AnnotInfo in csv.reader(FId):
                if AnnotInfo and AnnotInfo[0] in WantedImIds:
                    AnnotInfos[AnnotInfo[0]].append(AnnotInfo)
        return AnnotInfos

    ImCls='Truck'
    DataTypes = ('train', 'test', 'validation')
    # Priority order: the first class present in an image names its files.
    ExpectClss = ('crane', 'forklift', ImCls.lower(),'pickup truck', 'tractor', 'van', 'car', 'person', 'bird')
    OpenImDir='/data/dataset/OpenImage/WeNeedData'
    CSVMainDir = os.path.join(OpenImDir,ImCls,'csv_folder')
    ImMainDir = os.path.join(OpenImDir,ImCls,'Dataset')

    ClsDescptBoxPath = os.path.join(CSVMainDir, 'class-descriptions-boxable.csv')
    RealClsNames = GetRealClsNames(ClsDescptBoxPath)

    for DataType in DataTypes:
        ClsCounter = dict.fromkeys(ExpectClss, 0)
        SubDirName = DataType.capitalize()
        SelectDataSubDir = os.path.join(SelectDataMainDir, 'SelectOpenImage',SubDirName)
        if not os.path.exists(SelectDataSubDir):
            os.makedirs(SelectDataSubDir)

        AnnotName = '{}-annotations-bbox.csv'.format(DataType)
        AnnotPath = os.path.join(CSVMainDir, AnnotName)
        ImDir = os.path.join(ImMainDir, DataType, ImCls)
        # Sorted so that the per-class numbering is reproducible between runs.
        ImNames = sorted(os.listdir(ImDir))
        WantedImIds = set(os.path.splitext(Name)[0] for Name in ImNames)
        AnnotInfos = GetAnnotInfos(AnnotPath, WantedImIds)

        for ImName in ImNames:
            ImPath = os.path.join(ImDir, ImName)
            if not os.path.isfile(ImPath):
                continue
            CurImName,CurImExpName=os.path.splitext(ImName)
            ImAnnotInfos = AnnotInfos.get(CurImName)
            if not ImAnnotInfos:
                continue  # no annotation row: skip before decoding the image

            Im = cv2.imread(ImPath)
            if Im is None:
                print('can not read {}, skipped\n'.format(ImPath))
                continue
            ImH, ImW, ImD= Im.shape

            SelectObjBoxs = {}
            for AnnotInfo in ImAnnotInfos:
                # Columns 4..7 are XMin, XMax, YMin, YMax as fractions of the
                # image size; scale them to pixel corner coordinates.
                XMin, XMax, YMin, YMax = [float(Val) for Val in AnnotInfo[4:8]]
                BBox = [int(XMin * ImW), int(YMin * ImH), int(XMax * ImW), int(YMax * ImH)]
                if (BBox[2] - BBox[0]) * (BBox[3] - BBox[1]) < BBoxAreaTol:
                    continue
                RealClsName = RealClsNames.get(AnnotInfo[2], '').lower()
                if RealClsName in ExpectClss:
                    SelectObjBoxs.setdefault(RealClsName, []).append(BBox)
            if not SelectObjBoxs:
                continue

            NameCls = next(Cls for Cls in ExpectClss if Cls in SelectObjBoxs)
            ClsCounter[NameCls] += 1
            ExpectClsStr = NameCls + str(ClsCounter[NameCls]).zfill(ObjIdLen)
            NewAnnotName = 'OpIm-{}_{}'.format(ExpectClsStr, CurImName + AnnotExpName)

            ImWHD =(ImW,ImH,ImD)
            SelectAnnotDir = os.path.join(SelectDataSubDir, NameCls, 'AnnotSet')
            if not os.path.exists(SelectAnnotDir):
                os.makedirs(SelectAnnotDir)
            VOCXml2MyXml(NewAnnotName, ImName, SelectObjBoxs, SelectAnnotDir, ImWHD,
                         AnnotImFolder='OpenImage2019', CurY_M_D=Y_M_D, NotesDict={})

            SelectImDir = os.path.join(SelectDataSubDir, NameCls, 'ImSet')
            if not os.path.exists(SelectImDir):
                os.makedirs(SelectImDir)
            NewImName = NewAnnotName[:-len(AnnotExpName)] + ImExpName
            NewImPath = os.path.join(SelectImDir, NewImName)
            if CurImExpName==ImExpName:
                shutil.copy(ImPath, NewImPath)
                MsgStr = '{}--->{}\n'.format(ImPath, NewImPath)
            else:
                # Other formats are re-encoded to the common image extension.
                cv2.imwrite(NewImPath,Im)
                MsgStr = 'imwrite Im--->{}\n'.format(NewImPath)
            print(MsgStr)

        ImClsInfoTxtName = 'SelectOpenImage({})Info.txt'.format(SubDirName)
        ImClsInfoTxtPath = os.path.join(SelectDataSubDir, ImClsInfoTxtName)
        with open(ImClsInfoTxtPath, 'w') as FId:
            TotalImNum = 0
            for ExpectCls in ExpectClss:
                TotalImNum += ClsCounter[ExpectCls]
                FId.write('{}ImNum={}\n'.format(ExpectCls, ClsCounter[ExpectCls]))
            FId.write('TotalImNum={}\n'.format(TotalImNum))

好了就写到这里吧。

你可能感兴趣的:(Data,process,数据处理)