import numpy as np
import pandas as pd

train_data = pd.read_csv("../data/classify-leaves/train.csv")
test_data = pd.read_csv("../data/classify-leaves/test.csv")
classes = sorted(list(set(train_data['label'])))
classes_to_num = dict(zip(classes, range(len(classes))))
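The inverse mapping is handy later for turning predicted class indices back into label strings; a one-line sketch (the name num_to_classes is ours, not from the original):
# inverse of classes_to_num: numeric id -> label string
num_to_classes = {v: k for k, v in classes_to_num.items()}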
The LeaveDataset class. Its __init__ takes the csv file path, the shared image directory, the mode, the height and width to resize to, and a transform.
from PIL import Image
from torch.utils.data import Dataset, DataLoader

class LeaveDataset(Dataset):
    def __init__(self, csv_path, img_path, mode, height=224,
                 width=224, valid_ratio=0.2, transform=None):
        super(LeaveDataset, self).__init__()
        self.csv_path = csv_path
        self.resize_height = height
        self.resize_width = width
        self.transform = transform
        self.img_path = img_path
        self.mode = mode
Read the csv with pd.read_csv(header=None): the header row is kept as row 0 of data_info and skipped later via iloc[1:].
        self.data_info = pd.read_csv(csv_path, header=None)
Analyze the csv data: there is no separate validation set, so the training csv is split into training and validation parts, and from its structure we extract each image path and label.
        self.data_len = len(self.data_info.index) - 1  # row 0 is the header, so subtract 1
        self.train_len = int(self.data_len * (1 - valid_ratio))
        # split off a validation set and return a different slice for each mode
        if mode == 'train':
            # the first column holds the image file names, the second the labels
            self.img_arr = np.asarray(self.data_info.iloc[1:self.train_len, 0])
            self.label_arr = np.asarray(self.data_info.iloc[1:self.train_len, 1])
        elif mode == 'valid':
            self.img_arr = np.asarray(self.data_info.iloc[self.train_len:, 0])
            self.label_arr = np.asarray(self.data_info.iloc[self.train_len:, 1])
        elif mode == 'test':
            self.img_arr = np.asarray(self.data_info.iloc[1:, 0])
        self.real_len = len(self.img_arr)
        print('Finished reading the {} set of Leaves Dataset ({} samples found)'
              .format(self.mode, self.real_len))
    def __getitem__(self, item):
        single_image_name = self.img_arr[item]
        img = Image.open(self.img_path + single_image_name)
        img = self.transform(img)
        if self.mode == 'test':
            return img
        else:
            # map the string label to its numeric id
            label = self.label_arr[item]
            num_label = classes_to_num[label]
            return (img, num_label)

    def __len__(self):
        return self.real_len
test_data is essentially the same as train_data, only without the label column.
train_dataset = LeaveDataset(csv_path="../data/classify-leaves/train.csv", img_path=Img_PATH, mode='train', transform=train_transform)
valid_dataset = LeaveDataset(csv_path="../data/classify-leaves/train.csv", img_path=Img_PATH, mode='valid', transform=val_test_transform)
test_dataset = LeaveDataset(csv_path="../data/classify-leaves/test.csv", img_path=Img_PATH, mode='test', transform=val_test_transform)
train_loader = DataLoader(train_dataset, batch_size, shuffle=True, num_workers=5)
valid_loader = DataLoader(valid_dataset, batch_size, shuffle=False, num_workers=5)
test_loader = DataLoader(test_dataset, batch_size, shuffle=False, num_workers=5)
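The test set has no labels, so test_loader is used for inference once the network below is trained; a minimal sketch that maps predicted indices back to label strings via num_to_classes (net and device as in the training loop):
net.eval()
predictions = []
with torch.no_grad():  # no gradients needed at inference time
    for imgs in test_loader:
        outputs = net(imgs.to(device))
        predictions.extend(num_to_classes[i] for i in outputs.argmax(dim=1).tolist())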
step = 0
for epoch in range(num_epoch):
    net.train()
    loss_sum = 0
    loss_correct = 0
    for i, data in enumerate(train_loader):
        inputs, labels = data
        inputs = inputs.to(device)
        labels = labels.to(device)
        outputs = net(inputs)
        loss = loss_func(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # take the index of the highest logit as the prediction
        _, pred = torch.max(outputs.data, dim=1)
        correct = pred.eq(labels.data).cpu().sum()
        loss_sum += loss.item()
        loss_correct += correct.item()
        step += 1
    print("train epoch", epoch + 1, "train loss is: ", loss_sum * 1.0 / len(train_loader), "train correct is: ",
          loss_correct * 100.0 / len(train_loader) / batch_size)
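After each training epoch a validation pass over valid_loader is usually run; a minimal sketch reusing the same names (net, loss_func, device, valid_dataset) as above:
net.eval()
val_loss, val_correct = 0, 0
with torch.no_grad():  # disable gradient tracking for evaluation
    for inputs, labels in valid_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = net(inputs)
        val_loss += loss_func(outputs, labels).item()
        val_correct += (outputs.argmax(dim=1) == labels).sum().item()
print("valid loss:", val_loss / len(valid_loader), "valid acc:", 100.0 * val_correct / len(valid_dataset))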
label_name = [
"airplane",
"automobile",
"bird",
"cat",
"deer",
"dog",
"frog",
"horse",
"ship",
"truck"
]
train_list = glob.glob(r"C:\document\python\pythonDemo\pytorchTest\data\cifar-10-python\cifar-10-batches-py\data_batch_*")
test_list = glob.glob(r"C:\document\python\pythonDemo\pytorchTest\data\cifar-10-python\cifar-10-batches-py\test_batch*")
save_path = "/pytorchTest/data/cifar-10-python/cifar-10-batches-py/test"
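The loop below calls an unpickle helper that is not shown above; the standard version from the CIFAR-10 download page (lightly restyled), together with the imports this snippet needs, looks like this:
import glob
import os
import pickle

import cv2
import numpy as np

def unpickle(file):
    # deserialize one CIFAR-10 batch file into a dict with byte-string keys
    with open(file, 'rb') as fo:
        return pickle.load(fo, encoding='bytes')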
for l in test_list:
    l_dict = unpickle(l)  # deserialize the batch file
    # l_dict.keys(): dict_keys([b'batch_label', b'labels', b'data', b'filenames'])
    # l_dict[b'filenames'] holds names such as:
    # b'king_charles_spaniel_s_000029.png', b'fawn_s_001506.png', b'compact_car_s_001759.png', ...
    for im_idx, im_data in enumerate(l_dict[b'data']):
        im_label = l_dict[b'labels'][im_idx]
        im_name = l_dict[b'filenames'][im_idx]
        im_label_name = label_name[im_label]
        # each row holds 3072 bytes stored plane by plane (R, G, B):
        # reshape to CHW, then transpose to HWC for OpenCV
        im_data = np.reshape(im_data, [3, 32, 32])
        im_data = np.transpose(im_data, (1, 2, 0))
        # OpenCV writes BGR, so convert from RGB before saving
        im_data = cv2.cvtColor(im_data, cv2.COLOR_RGB2BGR)
        # cv2.imshow("im_data", cv2.resize(im_data, (200, 200)))
        # cv2.waitKey(0)
        if not os.path.exists("{}/{}".format(save_path, im_label_name)):
            os.mkdir("{}/{}".format(save_path, im_label_name))
        cv2.imwrite("{}/{}/{}".format(save_path, im_label_name, im_name.decode("utf-8")), im_data)
label_name = ["airplane", "automobile", "bird",
              "cat", "deer", "dog",
              "frog", "horse", "ship", "truck"]
label_dict = {}
for idx, name in enumerate(label_name):
    label_dict[name] = idx
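The same mapping can be built in one line with a dict comprehension:
label_dict = {name: idx for idx, name in enumerate(label_name)}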
default_loader specifies how each image file is opened (converted to RGB):
def default_loader(path):
    return Image.open(path).convert("RGB")
from torchvision import transforms

train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(90),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, hue=0.2),
    transforms.RandomGrayscale(0.2),
    # note: this yields 28x28 training inputs, while test_transform below
    # center-crops to 32x32; a fixed-input-size network needs these to match
    transforms.RandomCrop(28),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                         (0.2023, 0.1994, 0.2010)),
])
test_transform = transforms.Compose([
    transforms.CenterCrop((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                         (0.2023, 0.1994, 0.2010)),
])
Takes a list of image paths and a transform describing the data-augmentation scheme.
**Since the folder name is the class, split() on the path gives each image's label,** which label_dict converts to a number; each [path, label] pair is appended to the imgs list.
class MyDataset(Dataset):
    def __init__(self, im_list,
                 transform=None,
                 loader=default_loader):
        super(MyDataset, self).__init__()
        imgs = []
        for im_item in im_list:
            # e.g. "...\\cifar-10-batches-py\\train\\airplane\\aeroplane_s_000021.png":
            # the second-to-last path component is the class folder name
            im_label_name = im_item.split("\\")[-2]
            imgs.append([im_item, label_dict[im_label_name]])
        self.imgs = imgs
        self.transform = transform
        self.loader = loader
From the stored imgs list (image path plus class id), __getitem__ opens the image with the loader, applies the augmentations, and returns the augmented image together with its class.
    def __getitem__(self, index):
        im_path, im_label = self.imgs[index]
        im_data = self.loader(im_path)
        if self.transform is not None:
            im_data = self.transform(im_data)
        return im_data, im_label

    def __len__(self):
        return len(self.imgs)
im_train_list = glob.glob("C:\\document\\python\\pythonDemo\\pytorchTest\\data\\cifar-10-python\\cifar-10-batches-py\\train\\*\\*.png")
im_test_list = glob.glob("C:\\document\\python\\pythonDemo\\pytorchTest\\data\\cifar-10-python\\cifar-10-batches-py\\test\\*\\*.png")
train_dataset = MyDataset(im_train_list,transform=train_transform)
test_dataset = MyDataset(im_test_list,transform =test_transform)
train_loader = DataLoader(dataset=train_dataset,batch_size=128,shuffle=True,num_workers=4)
test_loader = DataLoader(dataset=test_dataset,batch_size=128,shuffle=False,num_workers=4)
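A quick sanity check on the loaders (with num_workers > 0 on Windows, DataLoader iteration has to run under a main guard):
if __name__ == '__main__':
    imgs, labels = next(iter(train_loader))
    print(imgs.shape, labels.shape)  # with RandomCrop(28): torch.Size([128, 3, 28, 28]) torch.Size([128])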
# -*- coding: utf-8 -*-
"""
将数据集划分为训练集,验证集,测试集
"""
import os
import random
import shutil
# create a folder for the saved images if it does not already exist
def makedir(new_dir):
    if not os.path.exists(new_dir):
        os.makedirs(new_dir)

random.seed(1)  # fix the random seed so the split is reproducible
# 1. path of the original image dataset
dataset_dir = r"C:\document\python\pythonDemo\pytorchTest\data\Rice_Image_Dataset\Rice_Image_Dataset"
# 2. path where the split dataset is saved
split_dir = r"C:\document\python\pythonDemo\pytorchTest\data\Rice_Image_Dataset"
train_dir = os.path.join(split_dir, "train")
valid_dir = os.path.join(split_dir, "val")
test_dir = os.path.join(split_dir, "test")
# 3. ratios for the train / validation / test split (they must sum to 1,
#    otherwise the cut points below leave the test set empty)
train_pct = 0.8
valid_pct = 0.1
test_pct = 0.1
# 4. do the split
for root, dirs, files in os.walk(dataset_dir):
    for sub_dir in dirs:  # iterate over the class folders
        imgs = os.listdir(os.path.join(root, sub_dir))  # list all file names in the class folder
        imgs = list(filter(lambda x: x.endswith('.jpg'), imgs))  # keep only .jpg files; change this if the image format differs
        random.shuffle(imgs)  # shuffle the image order
        img_count = len(imgs)  # number of images
        train_point = int(img_count * train_pct)  # end of the training slice
        valid_point = int(img_count * (train_pct + valid_pct))  # end of the validation slice
        for i in range(img_count):
            if i < train_point:  # images 0..train_point-1 go to the training set
                out_dir = os.path.join(train_dir, sub_dir)
            elif i < valid_point:  # images train_point..valid_point-1 go to the validation set
                out_dir = os.path.join(valid_dir, sub_dir)
            else:  # the remaining images go to the test set
                out_dir = os.path.join(test_dir, sub_dir)
            makedir(out_dir)  # create the output folder
            target_path = os.path.join(out_dir, imgs[i])  # destination path
            src_path = os.path.join(dataset_dir, sub_dir, imgs[i])  # source image path
            shutil.copy(src_path, target_path)  # copy the image
        print('Class:{}, train:{}, valid:{}, test:{}'.format(sub_dir, train_point, valid_point - train_point,
                                                             img_count - valid_point))
glob.glob returns a list of all file paths that match a pattern. It takes a single argument, pathname, which defines the matching rule and can be either an absolute or a relative path. An example of using glob.glob:
import glob
# get all png images two levels under the given directory
print(glob.glob(r"/home/qiaoyunhao/*/*.png"), "\n")  # the r prefix keeps the string from being escape-processed
# get all .py files in the parent directory
print(glob.glob(r'../*.py'))  # relative path
np.asarray converts the input to an ndarray.
Parameters:
a: array-like. Input data, in any form that can be converted to an array, such as lists, lists of tuples, tuples, tuples of tuples, tuples of lists, and ndarrays.
dtype: data type, optional. By default it is inferred from the input data.
order: {'C', 'F'}, optional. Row-major (C-style) or column-major (Fortran-style) memory layout. Defaults to row-major.
Return value:
out: ndarray. The array interpretation of a. No copy is performed if the input is already an ndarray matching the dtype and order arguments; if the input is a subclass of ndarray, a base-class ndarray is returned.
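A short example of the no-copy behavior:
import numpy as np

a = np.array([1, 2, 3])
b = np.asarray(a)  # same dtype and order, so no copy: b is a itself
print(b is a)  # True
c = np.asarray([1, 2, 3])  # a list is converted into a new ndarray
print(type(c))  # <class 'numpy.ndarray'>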
The enumerate() function wraps an iterable (such as a list, tuple, or string) into an index sequence, yielding both the index and the value; it is typically used in for loops.
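For example:
seasons = ['Spring', 'Summer', 'Fall', 'Winter']
for idx, name in enumerate(seasons):
    print(idx, name)  # prints 0 Spring, 1 Summer, 2 Fall, 3 Winter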