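# Small helper for handling long-tailed data, written for a CVPR 2021 classification competition.
# Each class is sampled without replacement, with picknumber as the maximum number of samples
# drawn per class, so that the gap between head classes and tail classes does not get too large.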
import os, random, shutil
def rand_sampling(pathDir, n):
    """Draw up to ``n`` items from ``pathDir`` at random, without replacement.

    :param pathDir: list of candidate items (file names in this script).
        The list is left unmodified.
    :param n: maximum number of items to draw. Values larger than
        ``len(pathDir)`` are clamped to the list length; values <= 0 yield
        an empty list.
    :return: a new list holding the drawn items, in random order.
    """
    # Clamp the request so an oversized n returns every item instead of raising.
    count = max(0, min(n, len(pathDir)))
    # random.sample selects distinct positions, so no item is drawn twice and
    # the caller's list does not have to be consumed to avoid repeats.
    return random.sample(pathDir, count)
def moveFile(source, aim, picknumber=300):
    """Copy at most ``picknumber`` randomly chosen files from ``source`` to ``aim``.

    Despite the name, files are copied, not moved; ``source`` is left intact.

    :param source: directory to sample files from.
    :param aim: destination directory; it is created if it does not exist.
    :param picknumber: maximum number of files to copy. When ``source`` holds
        no more than this many files, all of them are copied.
    :return: None
    """
    # shutil.copyfile cannot copy directories, so only regular files qualify.
    candidates = [
        name for name in os.listdir(source)
        if os.path.isfile(os.path.join(source, name))
    ]
    # Count before sampling so the printed size is that of the whole folder.
    total = len(candidates)
    if total <= picknumber:
        sample = candidates
    else:
        sample = rand_sampling(candidates, picknumber)
    print(source)
    print("len:", total)
    os.makedirs(aim, exist_ok=True)
    for name in sample:
        # os.path.join yields a valid path whether or not the directory
        # arguments carry a trailing separator.
        shutil.copyfile(os.path.join(source, name), os.path.join(aim, name))
    return
if __name__ == '__main__':
    fileDir = "/home1/zy/train_zy/train/"  # source image folder
    tarDir = '/home1/zy/train_zy/train_ib_60/'  # destination folder for the sampled copies
    # Only the immediate sub-folders of fileDir are processed. The paths below
    # are built from fileDir, so descending further with os.walk would produce
    # source paths that do not exist. The default keeps a missing fileDir a
    # silent no-op, as os.walk itself would.
    _, class_dirs, _ = next(os.walk(fileDir), (fileDir, [], []))
    for d in class_dirs:
        # Trailing separator kept so that plain "directory + filename" string
        # concatenation also produces a valid path.
        source = os.path.join(fileDir, d) + '/'
        aim = os.path.join(tarDir, d) + '/'
        # Create the per-class destination folder before copying into it.
        os.makedirs(aim, exist_ok=True)
        moveFile(source, aim)