Using MNIST image generation as a running example, this post walks through every step of the diffusion process in detail.
A diffusion model consists of two processes: a forward process and a reverse process, where the forward process is also called the diffusion process. Both the forward and reverse processes are parameterized Markov chains; the reverse process is the one used to generate data, and it is modeled and solved via variational inference.
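For reference, each forward step adds a small amount of Gaussian noise and each reverse step is a learned Gaussian; in standard DDPM notation (consistent with the schedule code later in this post):

q(x_t \mid x_{t-1}) = \mathcal{N}\big(x_t;\ \sqrt{1-\beta_t}\, x_{t-1},\ \beta_t \mathbf{I}\big)
p_\theta(x_{t-1} \mid x_t) = \mathcal{N}\big(x_{t-1};\ \mu_\theta(x_t, t),\ \sigma_t^2 \mathbf{I}\big)

Here \beta_t is the variance schedule discussed below, and \mu_\theta is what the neural network will effectively learn.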
Another point worth noting is that the diffusion process is usually fixed, i.e. it uses a predefined variance schedule; DDPM, for instance, uses a linear variance schedule.
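Concretely, a linear schedule just interpolates beta_t from a small starting value to a larger end value over the T steps. A minimal sketch (the values 1e-4 and 0.02 match the betas used by the training code below):

import torch

T = 400
beta1, beta2 = 1e-4, 0.02
beta_t = (beta2 - beta1) * torch.arange(0, T + 1, dtype=torch.float32) / T + beta1
alpha_t = 1 - beta_t
alphabar_t = torch.cumprod(alpha_t, dim=0)  # \bar{\alpha}_t = \prod_{s<=t} \alpha_s
print(beta_t[1].item(), beta_t[T].item())   # small noise at the start, larger at the end
print(alphabar_t[T].item())                 # close to 0: x_T is almost pure noise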
An important property of the diffusion process is that we can sample x_t for any timestep t directly from the original data x_0:

x_t = \sqrt{\bar{\alpha}_t}\, x_0 + \sqrt{1 - \bar{\alpha}_t}\, \epsilon,  where \epsilon \sim \mathcal{N}(0, \mathbf{I}) and \bar{\alpha}_t = \prod_{s=1}^{t} (1 - \beta_s)

Keep this formula in mind; it is important because it lets us jump straight from the original image to the image after t noising steps.
Reverse process
Diffusion turns data into noise, so the reverse process is a step-by-step denoising process. If we knew the true noise distribution at every reverse step, we could start from pure random noise and generate a realistic sample; the reverse process is therefore the data-generation process, and we use a neural network to estimate this noise distribution.
The derivation is fairly involved, so I will just quote the result from bubbliiiing's blog here. One reverse step from x_t to x_{t-1} is:

x_{t-1} = \frac{1}{\sqrt{\alpha_t}} \left( x_t - \frac{1 - \alpha_t}{\sqrt{1 - \bar{\alpha}_t}}\, \epsilon_\theta(x_t, t) \right) + \sqrt{\beta_t}\, z,  where z \sim \mathcal{N}(0, \mathbf{I}) and \epsilon_\theta is the noise predicted by the network.
In other words, when adding noise we can jump in a single step from the original image to the result after N noising steps, but when restoring the image we have to iterate: at each step we estimate the noise, remove it, and gradually recover the image step by step.
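As a quick sanity check of that claim, here is a small sketch (not part of the training code) that noises a batch of constant "pixels" step by step and compares the empirical statistics with what the closed-form formula predicts:

import torch

T = 400
beta = torch.linspace(1e-4, 0.02, T)       # a linear schedule, same idea as below
alpha = 1 - beta
alphabar = torch.cumprod(alpha, dim=0)

x0 = torch.ones(10000)                     # many copies of a pixel with value 1
x = x0.clone()
for t in range(T):                         # iterative forward noising
    x = torch.sqrt(alpha[t]) * x + torch.sqrt(beta[t]) * torch.randn_like(x)

print(x.mean().item(), torch.sqrt(alphabar[-1]).item())      # both close to sqrt(alphabar_T)
print(x.std().item(), torch.sqrt(1 - alphabar[-1]).item())   # both close to sqrt(1 - alphabar_T)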
Let's first implement the U-Net that will be used to predict the noise.
The model has two inputs: the timestep, which says how many noising steps have been applied, and the (noised) image. The timestep is just an integer such as 0, 1, 2, ..., so we have to convert it into a tensor (an embedding vector) before it can be added onto the model's feature maps.
The process of turning an integer into such a tensor is called embedding; in more complex tasks the same idea is used to turn words or phrases into tensors.
Here we implement the embedding layer by hand (it simply maps a timestep such as 1, 2, 3, ... to a vector):
from typing import Dict, Tuple
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import models, transforms
from torchvision.datasets import MNIST
from torchvision.utils import save_image, make_grid
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, PillowWriter
import numpy as np
class EmbedFC(nn.Module):
def __init__(self, input_dim, emb_dim):
super(EmbedFC, self).__init__()
'''
generic one layer FC NN for embedding things
'''
self.input_dim = input_dim
layers = [
nn.Linear(input_dim, emb_dim),
nn.GELU(),
nn.Linear(emb_dim, emb_dim),
]
self.model = nn.Sequential(*layers)
def forward(self, x):
x = x.view(-1, self.input_dim)
return self.model(x)
## a quick test to check that the embedding layer works as expected
if __name__ == "__main__":
    # current timestep is 80, total number of steps is 400
N = 80
total_T = 400
x1 = torch.tensor([N/total_T])
    # map the 1-dimensional input to a 128-dimensional embedding vector
model1 = EmbedFC(1,128)
y1 = model1(x1)
print(y1.shape)
The output of the above is:
torch.Size([1, 128])
Next we implement the U-Net part. The U-Net is a symmetric encoder-decoder network, originally developed for image segmentation; many blogs cover it in detail (see bubbliiiing's blog, for example). Here we use it to predict our noise image.
You can also just grab an existing implementation and adapt it:
class ResidualConvBlock(nn.Module):
def __init__(
self, in_channels: int, out_channels: int, is_res: bool = False
) -> None:
super().__init__()
'''
standard ResNet style convolutional block
'''
self.same_channels = in_channels==out_channels
self.is_res = is_res
self.conv1 = nn.Sequential(
nn.Conv2d(in_channels, out_channels, 3, 1, 1),
nn.BatchNorm2d(out_channels),
nn.GELU(),
)
self.conv2 = nn.Sequential(
nn.Conv2d(out_channels, out_channels, 3, 1, 1),
nn.BatchNorm2d(out_channels),
nn.GELU(),
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
if self.is_res:
x1 = self.conv1(x)
x2 = self.conv2(x1)
# this adds on correct residual in case channels have increased
if self.same_channels:
out = x + x2
else:
out = x1 + x2
return out / 1.414
else:
x1 = self.conv1(x)
x2 = self.conv2(x1)
return x2
class UnetDown(nn.Module):
def __init__(self, in_channels, out_channels):
super(UnetDown, self).__init__()
'''
process and downscale the image feature maps
'''
layers = [ResidualConvBlock(in_channels, out_channels), nn.MaxPool2d(2)]
self.model = nn.Sequential(*layers)
def forward(self, x):
return self.model(x)
class UnetUp(nn.Module):
def __init__(self, in_channels, out_channels):
super(UnetUp, self).__init__()
'''
process and upscale the image feature maps
'''
layers = [
nn.ConvTranspose2d(in_channels, out_channels, 2, 2),
ResidualConvBlock(out_channels, out_channels),
ResidualConvBlock(out_channels, out_channels),
]
self.model = nn.Sequential(*layers)
def forward(self, x, skip):
x = torch.cat((x, skip), 1)
x = self.model(x)
return x
class Unet(nn.Module):
def __init__(self, in_channels, n_feat = 256, n_classes=10):
super(Unet, self).__init__()
self.in_channels = in_channels
self.n_feat = n_feat
self.n_classes = n_classes
self.init_conv = ResidualConvBlock(in_channels, n_feat, is_res=True)
self.down1 = UnetDown(n_feat, n_feat)
self.down2 = UnetDown(n_feat, 2 * n_feat)
self.to_vec = nn.Sequential(nn.AvgPool2d(7), nn.GELU())
self.timeembed1 = EmbedFC(1, 2*n_feat)
self.timeembed2 = EmbedFC(1, 1*n_feat)
self.up0 = nn.Sequential(
# nn.ConvTranspose2d(6 * n_feat, 2 * n_feat, 7, 7), # when concat temb and cemb end up w 6*n_feat
nn.ConvTranspose2d(2 * n_feat, 2 * n_feat, 7, 7), # otherwise just have 2*n_feat
nn.GroupNorm(8, 2 * n_feat),
nn.ReLU(),
)
self.up1 = UnetUp(4 * n_feat, n_feat)
self.up2 = UnetUp(2 * n_feat, n_feat)
self.out = nn.Sequential(
nn.Conv2d(2 * n_feat, n_feat, 3, 1, 1),
nn.GroupNorm(8, n_feat),
nn.ReLU(),
nn.Conv2d(n_feat, self.in_channels, 3, 1, 1),
)
def forward(self, x,t):
x = self.init_conv(x)
down1 = self.down1(x)
down2 = self.down2(down1)
hiddenvec = self.to_vec(down2)
temb1 = self.timeembed1(t).view(-1, self.n_feat * 2, 1, 1)
temb2 = self.timeembed2(t).view(-1, self.n_feat, 1, 1)
up1 = self.up0(hiddenvec)
# up2 = self.up1(up1, down2) # if want to avoid add and multiply embeddings
up2 = self.up1(up1+ temb1, down2) # add and multiply embeddings
up3 = self.up2(up2+ temb2, down1)
out = self.out(torch.cat((up3, x), 1))
return out
if __name__ == "__main__":
x = torch.rand(1,1,28,28)
t = torch.tensor([80/400])
model = Unet(in_channels=1,n_feat=128)
y = model(x,t)
print(y.shape)
The output is: torch.Size([1, 1, 28, 28])
We have now wired the EmbedFC module into the U-Net so that the timestep information is added onto the feature maps; during training the network therefore sees noise that varies with the timestep. The network itself is complete at this point. Next we implement the noising and denoising procedures used for training.
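To make "adding the time embedding onto the feature map" concrete, here is a minimal sketch of the shapes involved (the numbers match the n_feat=128 test above):

import torch

n_feat = 128
temb = torch.randn(1, 2 * n_feat)           # output of EmbedFC(1, 2*n_feat)
temb = temb.view(-1, 2 * n_feat, 1, 1)      # reshape to (B, C, 1, 1), as in Unet.forward
feat = torch.randn(1, 2 * n_feat, 7, 7)     # an intermediate U-Net feature map
out = feat + temb                           # broadcasts over the 7x7 spatial positions
print(out.shape)                            # torch.Size([1, 256, 7, 7])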
The first step is to use the closed-form formula above to go directly from the original image to the image after N noising steps.
The implementation is:
# first pick a random timestep for each image in the batch; x_t will be the original
# image mixed with noise after that many noising steps
n_T = 400  # total number of noising steps
# x: a batch of original images, shape (B, 1, 28, 28)
# randomly sampled noising timesteps, one per image
_ts = torch.randint(1, n_T + 1, (x.shape[0],))
# the noise image
noise = torch.randn_like(x)  # eps ~ N(0, 1); this is the eps in the formula
# alphabar_t is the cumulative product of (1 - beta_t); it is precomputed by the
# ddpm_schedules helper shown in the integrated code below
sqrtab = torch.sqrt(alphabar_t)
sqrtmab = torch.sqrt(1 - alphabar_t)
x_t = (
    sqrtab[_ts, None, None, None] * x
    + sqrtmab[_ts, None, None, None] * noise
)
## x_t is the noised image generated from x after _ts noising steps
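To build some intuition for what x_t actually looks like, here is a small standalone sketch that noises a single MNIST digit at a few timesteps and saves the results as a grid. It assumes the MNIST data lives under ./data as in the training code below, that sqrtab and sqrtmab come from the ddpm_schedules helper defined further down, and the output file name forward_noising.png is just an example:

import torch
from torchvision import transforms
from torchvision.datasets import MNIST
from torchvision.utils import save_image, make_grid

n_T = 400
sched = ddpm_schedules(1e-4, 0.02, n_T)     # helper defined later in this post
sqrtab, sqrtmab = sched["sqrtab"], sched["sqrtmab"]

dataset = MNIST("./data", train=True, download=True, transform=transforms.ToTensor())
x0, _ = dataset[0]                          # one digit, shape (1, 28, 28)

imgs = [x0]
for t in [100, 200, 300, 400]:
    noise = torch.randn_like(x0)
    imgs.append(sqrtab[t] * x0 + sqrtmab[t] * noise)
save_image(make_grid(torch.stack(imgs), nrow=5), "forward_noising.png")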
This gives us the composite image after N noising steps. The U-Net takes this noised image together with the timestep information and predicts the noise eps that was added; that noise is simply a sample from the standard normal distribution N(0, 1).
So we use these two pieces to compute the loss:
loss = nn.MSELoss()(noise, unet(x_t, _ts / n_T))
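Putting the noising step and the loss together, one training iteration boils down to something like the following sketch; the DDPM class in the full code below does exactly this, just with the schedule tensors registered as buffers (dataloader, n_T, sqrtab and sqrtmab are assumed to be set up as elsewhere in this post):

unet = Unet(in_channels=1, n_feat=128)
optim = torch.optim.Adam(unet.parameters(), lr=1e-4)

for x, _ in dataloader:                     # x: a batch of MNIST images, (B, 1, 28, 28)
    _ts = torch.randint(1, n_T + 1, (x.shape[0],))
    noise = torch.randn_like(x)
    x_t = sqrtab[_ts, None, None, None] * x + sqrtmab[_ts, None, None, None] * noise
    loss = nn.MSELoss()(noise, unet(x_t, _ts / n_T))
    optim.zero_grad()
    loss.backward()
    optim.step()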
That settles the loss computation; next we work out the recovery (sampling) process.
Since the image is recovered iteratively, we apply the reverse-step formula above at every step to compute the previous image from the current one.
The code is as follows:
First we generate a random noise image and then denoise it step by step, iterating from t = n_T down to t = 1:
# total number of steps
n_T = 400
n_sample = 4        # how many images to generate at once
guide_w = 0.0       # guidance weight (0.0 means no guidance)
# start from pure Gaussian noise, x_T ~ N(0, 1)
x_i = torch.randn(n_sample, 1, 28, 28)
# load the trained noise-prediction network
nn_model = Unet(in_channels=1, n_feat=128)
nn_model.load_state_dict(torch.load("./train_model.pth"))
nn_model.eval()
for i in range(n_T, 0, -1):  # iterate backwards, from t = T down to t = 1
    t_is = torch.tensor([i / n_T])
    t_is = t_is.repeat(n_sample, 1, 1, 1)
    # double the batch: two forward passes are mixed with the guidance weight below
    x_i = x_i.repeat(2, 1, 1, 1)
    t_is = t_is.repeat(2, 1, 1, 1)
    z = torch.randn(n_sample, 1, 28, 28) if i > 1 else 0
    eps = nn_model(x_i, t_is)
    eps1 = eps[:n_sample]
    eps2 = eps[n_sample:]
    eps = (1 + guide_w) * eps1 - guide_w * eps2
    x_i = x_i[:n_sample]
    # one reverse step; oneover_sqrta, mab_over_sqrtmab and sqrt_beta_t come from the
    # ddpm_schedules helper defined in the integrated code below
    x_i = (
        oneover_sqrta[i] * (x_i - eps * mab_over_sqrtmab[i])
        + sqrt_beta_t[i] * z
    )
After the iteration completes, we have recovered an image from pure noise.
Below, all the pieces are integrated into a single script:
from typing import Dict, Tuple
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import models, transforms
from torchvision.datasets import MNIST
from torchvision.utils import save_image, make_grid
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation, PillowWriter
import numpy as np
import os
class ResidualConvBlock(nn.Module):
def __init__(
self, in_channels: int, out_channels: int, is_res: bool = False
) -> None:
super().__init__()
'''
standard ResNet style convolutional block
'''
self.same_channels = in_channels==out_channels
self.is_res = is_res
self.conv1 = nn.Sequential(
nn.Conv2d(in_channels, out_channels, 3, 1, 1),
nn.BatchNorm2d(out_channels),
nn.GELU(),
)
self.conv2 = nn.Sequential(
nn.Conv2d(out_channels, out_channels, 3, 1, 1),
nn.BatchNorm2d(out_channels),
nn.GELU(),
)
def forward(self, x: torch.Tensor) -> torch.Tensor:
if self.is_res:
x1 = self.conv1(x)
x2 = self.conv2(x1)
# this adds on correct residual in case channels have increased
if self.same_channels:
out = x + x2
else:
out = x1 + x2
return out / 1.414
else:
x1 = self.conv1(x)
x2 = self.conv2(x1)
return x2
class UnetDown(nn.Module):
def __init__(self, in_channels, out_channels):
super(UnetDown, self).__init__()
'''
process and downscale the image feature maps
'''
layers = [ResidualConvBlock(in_channels, out_channels), nn.MaxPool2d(2)]
self.model = nn.Sequential(*layers)
def forward(self, x):
return self.model(x)
class UnetUp(nn.Module):
def __init__(self, in_channels, out_channels):
super(UnetUp, self).__init__()
'''
process and upscale the image feature maps
'''
layers = [
nn.ConvTranspose2d(in_channels, out_channels, 2, 2),
ResidualConvBlock(out_channels, out_channels),
ResidualConvBlock(out_channels, out_channels),
]
self.model = nn.Sequential(*layers)
def forward(self, x, skip):
x = torch.cat((x, skip), 1)
x = self.model(x)
return x
class EmbedFC(nn.Module):
def __init__(self, input_dim, emb_dim):
super(EmbedFC, self).__init__()
'''
generic one layer FC NN for embedding things
'''
self.input_dim = input_dim
layers = [
nn.Linear(input_dim, emb_dim),
nn.GELU(),
nn.Linear(emb_dim, emb_dim),
]
self.model = nn.Sequential(*layers)
def forward(self, x):
x = x.view(-1, self.input_dim)
return self.model(x)
class Unet(nn.Module):
def __init__(self, in_channels, n_feat = 256, n_classes=10):
super(Unet, self).__init__()
self.in_channels = in_channels
self.n_feat = n_feat
self.n_classes = n_classes
self.init_conv = ResidualConvBlock(in_channels, n_feat, is_res=True)
self.down1 = UnetDown(n_feat, n_feat)
self.down2 = UnetDown(n_feat, 2 * n_feat)
self.to_vec = nn.Sequential(nn.AvgPool2d(7), nn.GELU())
self.timeembed1 = EmbedFC(1, 2*n_feat)
self.timeembed2 = EmbedFC(1, 1*n_feat)
self.contextembed1 = EmbedFC(n_classes, 2*n_feat)
self.contextembed2 = EmbedFC(n_classes, 1*n_feat)
self.up0 = nn.Sequential(
# nn.ConvTranspose2d(6 * n_feat, 2 * n_feat, 7, 7), # when concat temb and cemb end up w 6*n_feat
nn.ConvTranspose2d(2 * n_feat, 2 * n_feat, 7, 7), # otherwise just have 2*n_feat
nn.GroupNorm(8, 2 * n_feat),
nn.ReLU(),
)
self.up1 = UnetUp(4 * n_feat, n_feat)
self.up2 = UnetUp(2 * n_feat, n_feat)
self.out = nn.Sequential(
nn.Conv2d(2 * n_feat, n_feat, 3, 1, 1),
nn.GroupNorm(8, n_feat),
nn.ReLU(),
nn.Conv2d(n_feat, self.in_channels, 3, 1, 1),
)
def forward(self, x,t):
        # x is the (noisy) image, t is the (normalized) timestep
x = self.init_conv(x)
down1 = self.down1(x)
down2 = self.down2(down1)
hiddenvec = self.to_vec(down2)
        # embed the time step
temb1 = self.timeembed1(t).view(-1, self.n_feat * 2, 1, 1)
temb2 = self.timeembed2(t).view(-1, self.n_feat, 1, 1)
# could concatenate the context embedding here instead of adaGN
# hiddenvec = torch.cat((hiddenvec, temb1, cemb1), 1)
up1 = self.up0(hiddenvec)
# up2 = self.up1(up1, down2) # if want to avoid add and multiply embeddings
up2 = self.up1(up1+ temb1, down2) # add and multiply embeddings
up3 = self.up2(up2+ temb2, down1)
out = self.out(torch.cat((up3, x), 1))
return out
def ddpm_schedules(beta1, beta2, T):
"""
Returns pre-computed schedules for DDPM sampling, training process.
"""
assert beta1 < beta2 < 1.0, "beta1 and beta2 must be in (0, 1)"
beta_t = (beta2 - beta1) * torch.arange(0, T + 1, dtype=torch.float32) / T + beta1
sqrt_beta_t = torch.sqrt(beta_t)
alpha_t = 1 - beta_t
log_alpha_t = torch.log(alpha_t)
alphabar_t = torch.cumsum(log_alpha_t, dim=0).exp()
sqrtab = torch.sqrt(alphabar_t)
oneover_sqrta = 1 / torch.sqrt(alpha_t)
sqrtmab = torch.sqrt(1 - alphabar_t)
mab_over_sqrtmab_inv = (1 - alpha_t) / sqrtmab
return {
"alpha_t": alpha_t, # \alpha_t
"oneover_sqrta": oneover_sqrta, # 1/\sqrt{\alpha_t}
"sqrt_beta_t": sqrt_beta_t, # \sqrt{\beta_t}
"alphabar_t": alphabar_t, # \bar{\alpha_t}
"sqrtab": sqrtab, # \sqrt{\bar{\alpha_t}}
"sqrtmab": sqrtmab, # \sqrt{1-\bar{\alpha_t}}
"mab_over_sqrtmab": mab_over_sqrtmab_inv, # (1-\alpha_t)/\sqrt{1-\bar{\alpha_t}}
}
class DDPM(nn.Module):
def __init__(self, nn_model, betas, n_T, device, drop_prob=0.1):
super(DDPM, self).__init__()
self.nn_model = nn_model.to(device)
# register_buffer allows accessing dictionary produced by ddpm_schedules
# e.g. can access self.sqrtab later
for k, v in ddpm_schedules(betas[0], betas[1], n_T).items():
self.register_buffer(k, v)
self.n_T = n_T
self.device = device
self.drop_prob = drop_prob
self.loss_mse = nn.MSELoss()
def forward(self, x,t):
"""
this method is used in training, so samples t and noise randomly
"""
_ts = torch.randint(1, self.n_T+1, (x.shape[0],)).to(self.device) # t ~ Uniform(0, n_T)
noise = torch.randn_like(x) # eps ~ N(0, 1)
x_t = (
self.sqrtab[_ts, None, None, None] * x
+ self.sqrtmab[_ts, None, None, None] * noise
) # This is the x_t, which is sqrt(alphabar) x_0 + sqrt(1-alphabar) * eps
# We should predict the "error term" from this x_t. Loss is what we return.
# dropout context with some probability
#context_mask = torch.bernoulli(torch.zeros_like(c)+self.drop_prob).to(self.device)
# return MSE between added noise, and our predicted noise
return self.loss_mse(noise, self.nn_model(x_t, _ts / self.n_T))
def sample(self, n_sample, size, device, guide_w = 0.0):
# we follow the guidance sampling scheme described in 'Classifier-Free Diffusion Guidance'
# to make the fwd passes efficient, we concat two versions of the dataset,
# one with context_mask=0 and the other context_mask=1
# we then mix the outputs with the guidance scale, w
# where w>0 means more guidance
        x_i = torch.randn(n_sample, *size).to(device)  # x_T ~ N(0, 1), sample initial noise
# context_mask = context_mask.repeat(2)
# context_mask[n_sample:] = 1. # makes second half of batch context free
x_i_store = [] # keep track of generated steps in case want to plot something
print()
for i in range(self.n_T, 0, -1):
print(f'sampling timestep {i}',end='\r')
t_is = torch.tensor([i / self.n_T]).to(device)
t_is = t_is.repeat(n_sample,1,1,1)
# double batch
x_i = x_i.repeat(2,1,1,1)
t_is = t_is.repeat(2,1,1,1)
z = torch.randn(n_sample, *size).to(device) if i > 1 else 0
# split predictions and compute weighting
eps = self.nn_model(x_i, t_is)
eps1 = eps[:n_sample]
eps2 = eps[n_sample:]
eps = (1+guide_w)*eps1 - guide_w*eps2
x_i = x_i[:n_sample]
x_i = (
self.oneover_sqrta[i] * (x_i - eps * self.mab_over_sqrtmab[i])
+ self.sqrt_beta_t[i] * z
)
if i%20==0 or i==self.n_T or i<8:
x_i_store.append(x_i.detach().cpu().numpy())
x_i_store = np.array(x_i_store)
return x_i, x_i_store
def train_mnist():
# hardcoding these here
n_epoch = 1
batch_size = 1
n_T = 400 # 500
device = "cuda:0"
n_classes = 10
n_feat = 128 # 128 ok, 256 better (but slower)
lrate = 1e-4
save_model = False
    save_dir = './data/diffusion_outputs10/'
    os.makedirs(save_dir, exist_ok=True)  # make sure the output directory exists
ws_test = [0.0, 0.5, 2.0] # strength of generative guidance
ddpm = DDPM(nn_model=Unet(in_channels=1, n_feat=n_feat, n_classes=n_classes), betas=(1e-4, 0.02), n_T=n_T, device=device, drop_prob=0.1)
ddpm.to(device)
# optionally load a model
# ddpm.load_state_dict(torch.load("./data/diffusion_outputs/ddpm_unet01_mnist_9.pth"))
tf = transforms.Compose([transforms.ToTensor()]) # mnist is already normalised 0 to 1
dataset = MNIST("./data", train=True, download=True, transform=tf)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=5)
optim = torch.optim.Adam(ddpm.parameters(), lr=lrate)
for ep in range(n_epoch):
print(f'epoch {ep}')
ddpm.train()
# linear lrate decay
optim.param_groups[0]['lr'] = lrate*(1-ep/n_epoch)
pbar = tqdm(dataloader)
loss_ema = None
for x, c in pbar:
optim.zero_grad()
x = x.to(device)
c = c.to(device)
loss = ddpm(x, c)
loss.backward()
if loss_ema is None:
loss_ema = loss.item()
else:
loss_ema = 0.95 * loss_ema + 0.05 * loss.item()
pbar.set_description(f"loss: {loss_ema:.4f}")
optim.step()
# for eval, save an image of currently generated samples (top rows)
# followed by real images (bottom rows)
ddpm.eval()
with torch.no_grad():
n_sample = 4*n_classes
for w_i, w in enumerate(ws_test):
x_gen, x_gen_store = ddpm.sample(n_sample, (1, 28, 28), device, guide_w=w)
# append some real images at bottom, order by class also
x_real = torch.Tensor(x_gen.shape).to(device)
for k in range(n_classes):
for j in range(int(n_sample/n_classes)):
try:
idx = torch.squeeze((c == k).nonzero())[j]
except:
idx = 0
x_real[k+(j*n_classes)] = x[idx]
x_all = torch.cat([x_gen, x_real])
grid = make_grid(x_all*-1 + 1, nrow=10)
save_image(grid, save_dir + f"image_ep{ep}_w{w}.png")
print('saved image at ' + save_dir + f"image_ep{ep}_w{w}.png")
if ep%5==0 or ep == int(n_epoch-1):
# create gif of images evolving over time, based on x_gen_store
fig, axs = plt.subplots(nrows=int(n_sample/n_classes), ncols=n_classes,sharex=True,sharey=True,figsize=(8,3))
def animate_diff(i, x_gen_store):
print(f'gif animating frame {i} of {x_gen_store.shape[0]}', end='\r')
plots = []
for row in range(int(n_sample/n_classes)):
for col in range(n_classes):
axs[row, col].clear()
axs[row, col].set_xticks([])
axs[row, col].set_yticks([])
# plots.append(axs[row, col].imshow(x_gen_store[i,(row*n_classes)+col,0],cmap='gray'))
plots.append(axs[row, col].imshow(-x_gen_store[i,(row*n_classes)+col,0],cmap='gray',vmin=(-x_gen_store[i]).min(), vmax=(-x_gen_store[i]).max()))
return plots
ani = FuncAnimation(fig, animate_diff, fargs=[x_gen_store], interval=200, blit=False, repeat=True, frames=x_gen_store.shape[0])
ani.save(save_dir + f"gif_ep{ep}_w{w}.gif", dpi=100, writer=PillowWriter(fps=5))
print('saved image at ' + save_dir + f"gif_ep{ep}_w{w}.gif")
# optionally save model
if save_model and ep == int(n_epoch-1):
torch.save(ddpm.state_dict(), save_dir + f"model_{ep}.pth")
print('saved model at ' + save_dir + f"model_{ep}.pth")
if __name__ == "__main__":
train_mnist()
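Once train_mnist() has been run with save_model=True, new digits can be generated from the saved weights by reloading the DDPM wrapper and calling sample. A minimal sketch, assuming it runs in the same script where Unet, DDPM, save_image and make_grid are already defined; the file name model_0.pth matches the n_epoch=1 setting above, and samples.png is just an example output path:

device = "cuda:0" if torch.cuda.is_available() else "cpu"
ddpm = DDPM(nn_model=Unet(in_channels=1, n_feat=128, n_classes=10),
            betas=(1e-4, 0.02), n_T=400, device=device)
ddpm.load_state_dict(torch.load("./data/diffusion_outputs10/model_0.pth", map_location=device))
ddpm.to(device)
ddpm.eval()
with torch.no_grad():
    x_gen, x_gen_store = ddpm.sample(16, (1, 28, 28), device)
save_image(make_grid(x_gen * -1 + 1, nrow=4), "samples.png")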