对应的流程也就是:
# 创建Dateset(可以自定义)
dataset = face_dataset #face_dataset是上面定义过的Dataset类
# Dataset传递给DataLoader
dataloader = torch.utils.data.DataLoader(dataset, batch_size=64, shuffle=False, num_workers=8)
# DataLoader迭代产生训练数据提供给模型
for i in range(epoch):
for index,(img,label) in enumerate(dataloader):
pass
DataLoader(dataset, batch_size=1, shuffle=False, sampler=None,
batch_sampler=None, num_workers=0, collate_fn=None,
pin_memory=False, drop_last=False, timeout=0,
worker_init_fn=None)
(1)dataset (Dataset) :传入实例化好的dataset数据集(定义的是dataset子类,对其进行实例化后才可以传入)
(2)batch_size = int:传入batch的容量,默认值是1
(3)shuffle =T/F: 每一个epoch的batch样本是相同还是随机 (default: False),是否将数据集洗牌之后再打包切分成batch。
(4)sampler (Sampler, optional) :决定数据集中采样的方法. 如果有,则shuffle参数必须为False,如果要设置特定的采样方法,就不要提前洗牌了。
(5)batch_sampler (Sampler, optional) :和 sampler 类似,但是一次返回的是一个batch内所有样本的index
(6)num_workers (python:int, optional) – 用多少个子进程同时工作来加载数据,多进程可以加快数据读取。 (default: 0,表示数据在主进程中加载)
sampler 重点参数,采样器,是一个迭代器。PyTorch提供了多种采样器,用户也可以自定义采样器。
class Sampler(object):
    """Abstract base class for all samplers.

    Every subclass must provide ``__iter__`` (a way to iterate over the
    indices of dataset elements) and ``__len__`` (how many indices that
    iterator yields).
    """

    def __init__(self, data_source):
        # The base class keeps no state; subclasses decide what to store.
        pass

    def __iter__(self):
        raise NotImplementedError

    def __len__(self):
        raise NotImplementedError
SequentialSampler 很好理解就是顺序采样器。
1、在 __init__(self, data_source) 中拿到和原数据一样的数据资源。
2、在__iter__方法中首先得到一个和data_source一样长度的range可迭代器;每次只会返回一个索引值。
源码如下:
class SequentialSampler(Sampler):
    """Sample elements sequentially, always in the same order.

    Yields the indices ``0 .. len(data_source) - 1``, one at a time.

    Arguments:
        data_source (Dataset): dataset to sample from.
    """

    def __init__(self, data_source):
        self.data_source = data_source

    def __iter__(self):
        # Walk the index range in order; each step yields a single index.
        for index in range(len(self.data_source)):
            yield index

    def __len__(self):
        return len(self.data_source)
RandomSampler 采样
参数:
class RandomSampler(Sampler):
    """Sample elements randomly.

    Without replacement (the default) a random permutation of the whole
    dataset is produced.  With ``replacement=True`` the user may specify
    ``num_samples`` to control how many indices are drawn (indices may
    repeat).

    Arguments:
        data_source (Dataset): dataset to sample from.
        replacement (bool): samples are drawn with replacement if ``True``.
            Default: ``False``.
        num_samples (int): number of samples to draw; defaults to
            ``len(data_source)``.  May only be specified together with
            ``replacement=True``.
    """

    def __init__(self, data_source, replacement=False, num_samples=None):
        self.data_source = data_source
        self.replacement = replacement
        self.num_samples = num_samples

        # num_samples only makes sense with replacement: without it the
        # sampler always emits one full permutation of the dataset.
        if self.num_samples is not None and replacement is False:
            raise ValueError("With replacement=False, num_samples should not be specified, "
                             "since a random permute will be performed.")

        if self.num_samples is None:
            self.num_samples = len(self.data_source)

        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
            # Fixed typo in the original message ("integeral" -> "integer").
            raise ValueError("num_samples should be a positive integer "
                             "value, but got num_samples={}".format(self.num_samples))
        if not isinstance(self.replacement, bool):
            raise ValueError("replacement should be a boolean value, but got "
                             "replacement={}".format(self.replacement))

    def __iter__(self):
        n = len(self.data_source)
        if self.replacement:
            # Independent uniform draws in [0, n); indices may repeat.
            return iter(torch.randint(high=n, size=(self.num_samples,), dtype=torch.int64).tolist())
        return iter(torch.randperm(n).tolist())

    def __len__(self):
        # BUG FIX: the reported length must match what __iter__ yields,
        # i.e. num_samples.  The original returned len(self.data_source),
        # which is wrong whenever replacement=True with a custom
        # num_samples.  (num_samples == len(data_source) otherwise, so
        # behavior is unchanged in the default case.)
        return self.num_samples
num_workers 参数表示同时参与数据读取的子进程数量,多进程技术可以加快数据读取,提高GPU/CPU利用率。
import torchvision
# Prepare the CIFAR-10 test split.
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter

test_data = torchvision.datasets.CIFAR10(
    "dataset_test", train=False, transform=torchvision.transforms.ToTensor())
# drop_last=False keeps the final batch even when it is smaller than batch_size.
test_loader = DataLoader(dataset=test_data, batch_size=64, shuffle=True,
                         num_workers=0, drop_last=False)

# Peek at the first sample (image tensor + class target) of the test set.
img, target = test_data[0]
# print(img.shape)
# print(target)

writer = SummaryWriter("dataloader")
step = 0
for data in test_loader:
    imgs, target = data
    # Log each batch of images under one tag, one step per batch.
    writer.add_images("test_data", imgs, global_step=step)
    step += 1
writer.close()
import torchvision
from torch.utils.tensorboard import SummaryWriter

# Compose bundles several transforms into one callable; here it only
# converts each PIL image to a tensor at load time.
dataset_transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor()
])
trans_set = torchvision.datasets.CIFAR10(
    root="./dataset_test", train=True, transform=dataset_transform, download=False)
test_set = torchvision.datasets.CIFAR10(
    root="./dataset_test", train=False, transform=dataset_transform, download=False)
# print(test_set[0])
# print(test_set.classes)
# img, target = test_set[0]
# print(img)
# img.show()
# print(target)
# print(test_set.classes[target])

# Log the first ten test images to TensorBoard, one per step.
writer = SummaryWriter("p10")
for i in range(10):
    img, target = test_set[i]
    writer.add_image("test_set", img, i)
writer.close()
重点在下面:
dataset_transform = torchvision.transforms.Compose([
torchvision.transforms.ToTensor()
])
trans_set= torchvision.datasets.CIFAR10(root="./dataset_test",train=True,transform=dataset_transform,download=False)
test_set=torchvision.datasets.CIFAR10(root="./dataset_test",train=False,transform=dataset_transform,download=False)
#如想要对数据集集体进行变化,就书写在transform里面,transform转变的意思
#这里的torchvision.transforms.Compose就是将一些transform的操作打包封装
torchvision.transforms.Compose([...]) 在内部可以将 transform 的多种方法进行组合,就相当于一个函数方法,可以在其他的地方复用,尤其是对接 torchvision.datasets.CIFAR10 内部的 transform 参数的时候。
torchvision是pytorch的一个图形库,它服务于PyTorch深度学习框架的,主要用来构建计算机视觉模型。torchvision.transforms主要是用于常见的一些图形变换。
在内部可以将transform的多种方法进行组合,就相当于一个函数方法,可以在其他的地方释放使用,尤其对接torchvision.datasets.CIFAR10内部的transform参数时候