# ===== File: 多进程读取图片并压缩.py (multi-process image reading and compression) =====
import readImgMultiProcessing, os, random, gc, time
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from imutils import paths
import numpy as np
# Seed shared by `random` and by the train/val/test splits so runs are repeatable.
seed = 300
random.seed(seed)

# Number of worker processes used to read images.
coreNum = 23

# NOTE(review): the three path literals below were truncated in the original
# source (`r`, `r'`, `r`) and did not parse. The values here are placeholders
# only -- TODO replace them with the real image folder and cache file locations.
imgPath = r'images'
dataPath = os.path.join('tmp', 'babyFaceTrainVal.data')
dataTestPath = os.path.join('tmp', 'babyFaceTest.data')
dataEncoderPath = r'babyFaceLabelEncoder.data'

# Settings passed to the image reader; only "imageResize" is read by the code
# in this script.
config = {"epochs": 10, "batch_size": 128, 'useIDG': True, "imageResize": (600, 600), "lr": 1e-3}

# exist_ok avoids the check-then-create race of testing os.path.exists first.
os.makedirs(r'tmp', exist_ok=True)
if __name__ == '__main__':
    # Reuse the pickled arrays from an earlier run only when every cache file
    # that is read below exists (the label encoder file included).
    cacheFiles = (dataPath, dataTestPath, dataEncoderPath)
    if all(os.path.exists(p) for p in cacheFiles):
        (X_train, X_val, y_train, y_val) = readImgMultiProcessing.readFile(dataPath)
        (X_test, Y_test) = readImgMultiProcessing.readFile(dataTestPath)
        class_le = readImgMultiProcessing.readFile(dataEncoderPath)
    else:
        tst = time.time()
        classNames = ['睡', '醒']
        class_le = LabelEncoder()
        class_le.fit(classNames)

        def parentDirName(path):
            """Return the name of the directory that directly contains *path*."""
            return path.split(os.path.sep)[-2]

        # Collect every image path under imgPath.
        imagePaths = sorted(list(paths.list_images(imgPath)))
        # Keep only the images whose parent folder is one of the two labels.
        faceImagePaths = [p for p in imagePaths if parentDirName(p) in classNames]
        # Shuffle the order.
        random.shuffle(faceImagePaths)
        # Optionally read only a subset when memory is insufficient:
        # faceImagePaths = faceImagePaths[:234]
        # Split into train / validation / test. Note: the PATHS are split here,
        # the pixels are loaded afterwards.
        trainImgPaths, testImgPaths = train_test_split(faceImagePaths, test_size=0.2, random_state=seed)
        trainImgPaths, valImgPaths = train_test_split(trainImgPaths, test_size=0.2, random_state=seed)
        print('数据列表划分完成')

        # Images.
        X_train = readImgMultiProcessing.readImgMultiProcessing(trainImgPaths, coreNum, config)
        print('训练集读入完成')
        X_val = readImgMultiProcessing.readImgMultiProcessing(valImgPaths, coreNum, config)
        print('验证集读入完成')
        X_test = readImgMultiProcessing.readImgMultiProcessing(testImgPaths, coreNum, config)
        print('测试集读入完成')

        # The reader's workers already divide the pixels by 255, so dividing
        # again here (as the previous version did) scaled the data twice.
        # np.float64 replaces the np.float alias removed in NumPy 1.24.
        X_train = np.asarray(X_train, dtype=np.float64)
        print('训练集处理完成')
        X_val = np.asarray(X_val, dtype=np.float64)
        print('验证集处理完成')
        X_test = np.asarray(X_test, dtype=np.float64)
        print('测试集处理完成')

        # Labels: the parent folder name of each image, integer-encoded.
        y_train = class_le.transform([parentDirName(p) for p in trainImgPaths])
        y_val = class_le.transform([parentDirName(p) for p in valImgPaths])
        Y_test = class_le.transform([parentDirName(p) for p in testImgPaths])

        # Save the data.
        readImgMultiProcessing.toFile(class_le, dataEncoderPath)
        readImgMultiProcessing.toFile((X_train, X_val, y_train, y_val), dataPath)
        readImgMultiProcessing.toFile((X_test, Y_test), dataTestPath)
        print('数据集压缩成功,数据保存完毕')
        print(len(trainImgPaths), X_train.shape, len(y_train))
        print(len(valImgPaths), X_val.shape, len(y_val))
        print(len(testImgPaths), X_test.shape, len(Y_test))
        print('用时', time.time() - tst)  # the original author noted 934s
# ===== File: readImgMultiProcessing.py =====
from multiprocessing import Process,Queue,Pool,Pipe,Manager
import os,time,random
from imutils import paths
import numpy as np
from PIL import Image
import pickle
import cv2
def toFile(data, path, protocol=4):
    """Pickle *data* into the file at *path*, replacing any existing file.

    Args:
        data: Any picklable object.
        path: Destination file path.
        protocol: Pickle protocol number. Defaults to 4, which supports
            objects larger than 4 GiB; pass another value to trade
            compatibility against features.
    """
    with open(path, 'wb') as f:
        pickle.dump(data, f, protocol=protocol)
def readFile(path):
    """Load and return the object pickled in the file at *path*.

    Args:
        path: Path of a file containing a pickled object.

    Returns:
        The unpickled object.
    """
    # NOTE: unpickling can execute arbitrary code; only load files you trust.
    # The context manager closes the file even when pickle.load raises.
    with open(path, 'rb') as f:
        return pickle.load(f)
def resize_img_keep_ratio(img_name, target_size):
    """Load an image, scale it to fit inside *target_size*, and pad it to that size.

    1. The image is resized by a single ratio (the smaller of the height and
       width ratios), so the aspect ratio is kept and the result fits inside
       the target.
    2. The remaining space is filled with black borders, split as evenly as
       possible between top/bottom and left/right.

    Args:
        img_name: Path of the image file.
        target_size: ``(height, width)`` of the returned image.

    Returns:
        The padded RGB image as a ``numpy.ndarray`` with three channels.

    Raises:
        Exception: Whatever PIL raises when the file cannot be opened or
            decoded; the path and error are printed before re-raising.
    """
    try:
        # PIL is used for reading so that non-ASCII paths and PNG files work.
        # Converting to RGB up front gives every image three channels
        # (grayscale, palette and RGBA inputs included), and the context
        # manager closes the file handle.
        with Image.open(img_name) as im:
            im_array = np.array(im.convert('RGB'))
    except Exception as e:
        # Report which file failed, then re-raise: continuing would only hit
        # an unbound `im_array` a few lines below.
        print(img_name, e)
        raise

    old_size = im_array.shape[0:2]
    ratio = min(float(t) / o for t, o in zip(target_size, old_size))
    # Keep each side at least 1 px so extreme aspect ratios do not produce a
    # zero-sized resize request.
    new_size = tuple(max(1, int(side * ratio)) for side in old_size)
    # cv2.resize takes (width, height); cubic interpolation as before.
    img = cv2.resize(im_array, (new_size[1], new_size[0]), interpolation=cv2.INTER_CUBIC)

    pad_w = target_size[1] - new_size[1]
    pad_h = target_size[0] - new_size[0]
    top, bottom = pad_h // 2, pad_h - (pad_h // 2)
    left, right = pad_w // 2, pad_w - (pad_w // 2)
    # Pad with black borders.
    return cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, None, (0, 0, 0))
def getData(num, paths, return_dict, config):
    """Load a batch of images, scale the pixels by 1/255 and store the result.

    Intended as the target of a worker process: instead of returning, the
    array is written to ``return_dict[num]``.

    Args:
        num: Key under which the result is stored in *return_dict*.
        paths: Iterable of image file paths to load.
        return_dict: Mutable mapping that receives the resulting array.
        config: Dict whose ``"imageResize"`` entry holds the target
            ``(height, width)``.
    """
    target_size = (config["imageResize"][0], config["imageResize"][1])
    images = [resize_img_keep_ratio(img_path, target_size) for img_path in paths]
    # np.float was removed in NumPy 1.24; np.float64 is the type it aliased.
    batch = np.array(images, dtype=np.float64)
    batch /= 255.0
    return_dict[num] = batch
def readImgMultiProcessing(imagePaths, coreNum, config):
    """Read images in parallel and return them as one array, in input order.

    The paths are cut into at most *coreNum* contiguous chunks; each non-empty
    chunk is handed to its own process running ``getData``, and the per-chunk
    arrays are concatenated in chunk order.

    Args:
        imagePaths: Sequence of image file paths.
        coreNum: Maximum number of worker processes (must be >= 1).
        config: Dict forwarded to ``getData`` (reads ``"imageResize"``).

    Returns:
        ``numpy.ndarray`` holding all images; an empty float64 array when
        *imagePaths* is empty.

    Raises:
        ValueError: If *coreNum* is smaller than 1.
        RuntimeError: If a worker process exits without delivering its chunk.
    """
    if coreNum < 1:
        raise ValueError("coreNum must be at least 1")
    if len(imagePaths) == 0:
        # Nothing to read: return the same empty array the workers would have
        # produced, without spawning any process.
        return np.empty((0,), dtype=np.float64)

    # Split the paths into contiguous chunks (same chunk size as before).
    lenPerSt = len(imagePaths) // coreNum + 1
    chunks = [imagePaths[i * lenPerSt:(i + 1) * lenPerSt] for i in range(coreNum)]

    # The manager dict collects the workers' results; the `with` block shuts
    # the manager's server process down once the results have been copied out.
    with Manager() as manager:
        return_dict = manager.dict()
        jobs = []
        for i, chunk in enumerate(chunks):
            if len(chunk) == 0:
                # Trailing chunks can be empty; starting a process for them
                # would only cost start-up time.
                continue
            key = str(i)
            p = Process(target=getData, args=(key, chunk, return_dict, config))
            jobs.append((key, p))
            p.start()
        for _, proc in jobs:
            proc.join()

        # Merge the data in chunk order.
        parts = []
        for key, proc in jobs:
            if proc.exitcode != 0 or key not in return_dict:
                raise RuntimeError(
                    "image reader worker %s failed (exit code %s)" % (key, proc.exitcode)
                )
            parts.append(np.asarray(return_dict[key]))

    # One concatenate call instead of growing the array chunk by chunk.
    return parts[0] if len(parts) == 1 else np.concatenate(parts)
# Example usage (left commented out):
# if __name__ == '__main__':
#     imgPath = 'E:\\新的数据集\\'  # a raw string literal cannot end with a backslash
#     imagePaths = sorted(list(paths.list_images(imgPath)))[:100]
#     config = {"epochs": 10, "batch_size": 128, 'useIDG': True,  # False True
#               "imageResize": (600, 600), "lr": 1e-3}
#     coreNum = 10
#     data = readImgMultiProcessing(imagePaths, coreNum, config)
#     print(data.shape)