Preface: 2D convolution is fairly simple, so the theory is only sketched here.
Still, a quick recap: with in_channel = 2 and out_channel = 3 there are 6 kernels in total; each output channel is obtained by convolving every input channel with its own kernel and summing those per-channel results, and all output channels are then stacked together.
With stride 1, output size = input size - kernel size + 2*padding + 1.
So to make the output the same size as the input, the required padding can be computed from this formula.
padding='same' computes the padding automatically.
This part is simple, so the explanation is skipped.
import torch
import torch.nn.functional as F
in_channel = 1
out_channel = 1
kernel_size = 3
bias = False
height = 4
width = 4
batch_size = 1
input_size = [batch_size, in_channel, height, width]  # [1, 1, 4, 4]
# instantiate a 2D convolution layer
conv_layer = torch.nn.Conv2d(in_channel, out_channel, kernel_size, bias=bias)
conv_input = torch.randn(input_size)
print(conv_input, "# conv_input")
conv_output = conv_layer(conv_input)
print(conv_output, "# conv_output")
print(conv_layer.weight.shape, "# [out_ch, in_ch, h, w]")
# using the functional API
conv_F_output = F.conv2d(conv_input, conv_layer.weight)
print(conv_F_output, "# conv_F_output")
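As a quick check of the output-size formula and padding='same' mentioned above, a minimal sketch reusing the variables from the code above (assumes a PyTorch version that supports padding='same'; with kernel_size=3 and stride=1 the formula gives padding=1):
same_layer = torch.nn.Conv2d(in_channel, out_channel, kernel_size, padding='same', bias=bias)
manual_layer = torch.nn.Conv2d(in_channel, out_channel, kernel_size, padding=1, bias=bias)
# both keep the 4x4 spatial size: 4 - 3 + 2*1 + 1 = 4
print(same_layer(conv_input).shape, manual_layer(conv_input).shape)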
Input shape: batch_size, in_channel, h, w
Kernel shape: out_channel, in_channel, kernel_h, kernel_w
The bias is one scalar per output channel, i.e. a vector of length out_channel.
Five nested loops (this is only the most naive implementation):
loop over batch_size, out_channel, in_channel, h and w.
For each input channel, slide over the input and compute one region at a time; the per-input-channel results are then summed into the out_channel dimension, so the loop order is h and w innermost, then in_channel, then out_channel. Since that only covers one sample, the outermost loop adds the batch_size dimension.
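A small sketch confirming the kernel count and the per-output-channel bias mentioned above (the 2-in/3-out sizes follow the earlier example and are otherwise arbitrary):
tmp_conv = torch.nn.Conv2d(2, 3, kernel_size=3)  # in_channel=2, out_channel=3
print(tmp_conv.weight.shape)  # torch.Size([3, 2, 3, 3]) -> 3*2 = 6 kernel slices
print(tmp_conv.bias.shape)    # torch.Size([3]) -> one scalar bias per output channel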
# implement 2D convolution with raw matrix operations (naive nested loops)
import math
import torch
import torch.nn.functional as F
batch_size = 2
in_channel = 2
inh = 5
inw = 5
out_channel = 3
kh = 3
kw = 3
tmp_input = torch.ones(batch_size, in_channel, inh, inw)
tmp_kernel = torch.ones(out_channel, in_channel, kh, kw) / 2
# the bias is one scalar per output channel
tmp_bias = torch.randn(out_channel)
def matrix_multiplication_for_conv2d_full(input_tensor: torch.Tensor,
                                          kernel: torch.Tensor, stride=1,
                                          padding=0, bias=None):
    """
    Takes both the batch_size and channel dimensions into account;
    input and kernel are both 4-dimensional tensors.
    """
    if padding > 0:
        # the batch_size and channel dimensions are not padded
        input_tensor = F.pad(input_tensor, (padding, padding, padding, padding, 0, 0, 0, 0))
    if bias is None:
        bias = torch.zeros(kernel.shape[0])
# input shape: batch_size, in channel, h, w
bs, in_ch, input_h, input_w = input_tensor.shape
# kernel shape: out_channel, in_channel, kernel_h, kernel_w
out_ch, in_ch, kernel_h, kernel_w = kernel.shape
output_h = math.floor((input_h - kernel_h)/stride) + 1
output_w = math.floor((input_w - kernel_w)/stride) + 1
    # initialize the output tensor
output = torch.zeros(bs, out_ch, output_h, output_w)
    # 5 nested loops over batch_size, out_channel, in_channel, h, w
for ind in range(bs):
for oc in range(out_ch):
for ic in range(in_ch):
for i in range(0, input_h - kernel_h + 1, stride):
for j in range(0, input_w - kernel_w + 1, stride):
                        # slice out the input region
region = input_tensor[ind, ic, i:i+kernel_h, j:j+kernel_w]
                        # element-wise multiply the region with the kernel and sum; the output
                        # channel accumulates over all input channels (hence +=), using kernel[oc, ic]
output[ind, oc, int(i/stride), int(j/stride)] \
+= torch.sum(region * kernel[oc, ic])
            # add the bias of this output channel
output[ind, oc] += bias[oc]
return output
res1 = matrix_multiplication_for_conv2d_full(tmp_input, tmp_kernel, padding=1,
bias=tmp_bias, stride=2)
res2 = F.conv2d(tmp_input, tmp_kernel,
padding=1, bias=tmp_bias, stride=2)
flag = torch.allclose(res1, res2)
print(flag)
print(res1)
print(res2)
A 2D convolution can be turned into a matrix multiplication: each convolved region is flattened into a row vector and the kernel into a column vector, so the product for one region is a scalar, which becomes one output element.
Another approach is to pad the kernel to the same shape as the input and multiply directly; this is also the basis of the later idea of transposed convolution.
The first approach is implemented here.
# 2D convolution via flattened regions (im2col-style matrix multiplication)
tmp_input = torch.ones(5, 5)
tmp_kernel = torch.ones(3, 3) / 2
tmp_bias = torch.randn(1)
def matrix_multiplication_for_conv2d_flatten(input_tensor: torch.Tensor,
kernel: torch.Tensor, stride=1,
padding=0, bias=0):
"""
不考虑batch_size维度和channel维度,flatten版本
"""
if padding > 0:
input_tensor = F.pad(input_tensor, (padding, padding, padding, padding))
input_h, input_w = input_tensor.shape
kernel_h, kernel_w = kernel.shape
output_h = math.floor((input_h - kernel_h)/stride) + 1
output_w = math.floor((input_w - kernel_w)/stride) + 1
    # initialize the output matrix
output = torch.zeros(output_h, output_w)
    # holds the flattened input regions, one row per region
region_matrix = torch.zeros(output.numel(), kernel.numel())
    # reshape the kernel into a column vector
kernel_matrix = kernel.reshape((kernel.numel(), 1))
row_index = 0
    # slide the window over the input
for i in range(0, input_h - kernel_h + 1, stride):
for j in range(0, input_w - kernel_w + 1, stride):
            # slice out the input region
region = input_tensor[i:i+kernel_h, j:j+kernel_w]
region_vector = torch.flatten(region)
region_matrix[row_index] = region_vector
row_index += 1
    # one matrix multiplication produces all output elements
output_matrix = region_matrix @ kernel_matrix
output = output_matrix.reshape((output_h, output_w)) + bias
return output
res1 = matrix_multiplication_for_conv2d_flatten(tmp_input, tmp_kernel, padding=1,
bias=tmp_bias)
res2 = F.conv2d(tmp_input.reshape(1,1,tmp_input.shape[0], tmp_input.shape[1]),
tmp_kernel.reshape(1,1,tmp_kernel.shape[0], tmp_kernel.shape[1]),
padding=1, bias=tmp_bias)
flag = torch.allclose(res1, res2)
print(flag)
print(res1)
print(res2)
This can also be done with the nn.Unfold API. Unfold does essentially the same thing as the flatten code above: it extracts the sliding local blocks from a batch of images, i.e. the windows the kernel would slide over during convolution.
The kernel window size is [2, 3].
The input shape (batch, in_channel, h, w) is 2, 5, 3, 4.
Each convolved region, taken out as one block, has in_channel * kernel_h * kernel_w = 5 * 2 * 3 = 30 elements.
With a 3×4 input, a 2×3 kernel, the default stride=1 and padding=0, there are 4 blocks, i.e. one convolution produces a 2×2 output.
So the Unfold output has shape [2, 30, 4].
How is this used? Ignoring the batch_size dimension, the Unfold output is [30, 4]; transposing gives [4, 30], which serves as a.
Assume out_channel = 1, so the kernel has shape [1, 5, 2, 3] (out_ch=1, in_ch=5, kernel_h=2, kernel_w=3); reshape it to [1, 30] and transpose to [30, 1], which serves as b.
The result is then just a matrix multiplication: res = a @ b.
import torch.nn as nn
unfold = nn.Unfold(kernel_size=(2, 3))
input = torch.randn(2, 5, 3, 4)
output = unfold(input)
# each patch contains 30 values (2x3=6 vectors, each of 5 channels)
# 4 blocks (2x3 kernels) in total in the 3x4 input
output.size()
# Convolution is equivalent with Unfold + Matrix Multiplication + Fold (or view to output shape)
inp = torch.randn(1, 3, 10, 12)
w = torch.randn(2, 3, 4, 5)
inp_unf = torch.nn.functional.unfold(inp, (4, 5))
out_unf = inp_unf.transpose(1, 2).matmul(w.view(w.size(0), -1).t()).transpose(1, 2)
out = torch.nn.functional.fold(out_unf, (7, 8), (1, 1))
# or equivalently (and avoiding a copy),
# out = out_unf.view(1, 2, 7, 8)
(torch.nn.functional.conv2d(inp, w) - out).abs().max()
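A minimal sketch of the per-sample recipe described above (out_channel = 1; the tensor names here are arbitrary):
x = torch.randn(2, 5, 3, 4)                 # batch, in_channel, h, w
k = torch.randn(1, 5, 2, 3)                 # out_channel=1, in_channel=5, kernel 2x3
blocks = F.unfold(x, kernel_size=(2, 3))    # [2, 30, 4]
a = blocks[0].transpose(0, 1)               # first sample: [30, 4] -> [4, 30]
b = k.reshape(1, -1).t()                    # [1, 30] -> [30, 1]
res = (a @ b).reshape(2, 2)                 # [4, 1] -> the 2x2 output map
print(torch.allclose(res, F.conv2d(x, k)[0, 0], atol=1e-5))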
A transposed convolution (also called a deconvolution) performs upsampling: it restores the spatial shape of the input.
Expand the kernel and flatten the input: a 4×4 input is flattened to 16×1, and for every sliding position the kernel is padded into a length-16 vector; stacking these vectors gives a matrix of shape [(input_h - kernel_h + 1) * (input_w - kernel_w + 1), input_h * input_w]. Multiplying this matrix ([4, 16]) with the flattened input ([16, 1]) also produces the result of a 2D convolution ([4, 1]).
Transposing the expanded kernel matrix ([16, 4]) and multiplying it with the output ([4, 1]) gives a [16, 1] result, which acts as upsampling in terms of shape, i.e. it restores the shape of the input (only the shape).
# expand the kernel into a matrix
def get_kernel_matrix(kernel, input_size):
"""先不考虑batch,channel,padding. 并假设stride=1
得到kernel矩阵,将kernel拉长、填充
如3×3的kernel变成5×5的向量
将所有的向量堆叠起来成一个矩阵
"""
kernel_h, kernel_w = kernel.shape
input_h, input_w = input_size.shape
num_out_feat_map = (input_h - kernel_h + 1) * (input_w - kernel_w + 1)
    # initialize the result matrix
result = torch.zeros((num_out_feat_map, input_h*input_w))
    # loop over the height and width positions
count = 0
for i in range(0, input_h - kernel_h + 1, 1):
for j in range(0, input_w - kernel_w + 1, 1):
            # pad with zeros, left/right first and then top/bottom, so the padded kernel matches the input size
padded_kernel = F.pad(kernel,[j, input_w - kernel_w - j, i, input_h - kernel_h - i])
            # flatten each padded kernel and store it as one row of result
result[count] = padded_kernel.flatten()
count += 1
return result
def test_get_kernel_matrix():
kernel = torch.randn(3, 3)
input = torch.randn(4, 4)
kernel_matrix = get_kernel_matrix(kernel, input)
# print(kernel_matrix, '\n', kernel_matrix.shape)
mm_conv2d_output = kernel_matrix @ input.reshape((-1, 1))
pytorch_conv2d_output = F.conv2d(input.unsqueeze(0).unsqueeze(0),
kernel.unsqueeze(0).unsqueeze(0))
print(mm_conv2d_output, "\n", pytorch_conv2d_output)
test_get_kernel_matrix()
def test_transpose_conv2d_demo():
    # transposed convolution as upsampling
kernel = torch.randn(3, 3)
input = torch.randn(4, 4)
kernel_matrix = get_kernel_matrix(kernel, input)
mm_conv2d_output = kernel_matrix @ input.reshape((-1, 1))
pytorch_conv2d_output = F.conv2d(input.unsqueeze(0).unsqueeze(0),
kernel.unsqueeze(0).unsqueeze(0))
# [16,4] @ [4,1]
mm_transposed_conv2d_output = kernel_matrix.transpose(-1, -2) @ mm_conv2d_output
py_transpose_conv2d_output = F.conv_transpose2d(pytorch_conv2d_output,
kernel.unsqueeze(0).unsqueeze(0))
print(mm_transposed_conv2d_output.reshape(4,4))
print(py_transpose_conv2d_output)
test_transpose_conv2d_demo()
Note that F.pad specifies padding from the innermost (last) dimension outward, and for each dimension the amount is given as left then right:
F.pad(input, pad), where pad is a flat sequence such as (left, right, top, bottom, ...)
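A tiny sketch of that ordering (the numbers are arbitrary):
t = torch.zeros(1, 1, 2, 2)
# last dim (width): left=1, right=2; second-to-last dim (height): top=3, bottom=4
print(F.pad(t, (1, 2, 3, 4)).shape)  # torch.Size([1, 1, 9, 5])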
Input shape:
input shape = [batch, in_channel, i_h, i_w]
Kernel shape:
kernel shape = [out_channel, in_channel, k_h, k_w]
Output height:
out_h = floor((ih - kh)/stride) + 1
Output width:
out_w = floor((iw - kw)/stride) + 1
Output shape = [bs, oc, oh, ow]
Number of blocks: out_num = out_h * out_w (each block holds in_channel * k_h * k_w values)
Sliding-window traversal:
i in range(0, ih - kh + 1, stride)
j in range(0, iw - kw + 1, stride)
Region slicing:
region = input[bs, ic, i:i+k_h, j:j+k_w]
Kernel indexing:
kernel[oc, ic]
Output position for the region:
output[bs, oc, int(i/stride), int(j/stride)]
Convolution step:
sum(region * kernel[oc, ic])
Convolution as matrix multiplication, input-flattening view:
region_flatten_vector shape = [k_n,]
output = region_flatten_matrix @ kernel_vector (shape = [out_n, k_n] @ [k_n, 1] = [out_n, 1])
output.reshape((oh, ow))
Convolution as matrix multiplication, kernel-expansion view:
input_vector shape = [-1, 1] ([ih * iw, 1])
kernel_padded = F.pad(kernel, [j, iw-kw-j, i, ih-kh-i]).flatten() (shape = [ih * iw,])
kernel_matrix[count] = kernel_padded
kernel_matrix shape = [out_n, ih * iw]
output = kernel_matrix @ input_vector (shape = [out_n, 1])
output.reshape((oh, ow))
Upsampling with transposed (de-)convolution:
kernel_matrix shape = [out_n, ih * iw]
output shape = [out_n, 1]
transposed_map = kernel_matrix.transpose(-1, -2) @ output (shape = [ih * iw, out_n] @ [out_n, 1])
This is equivalent to using the API directly, with no need to build kernel_matrix by hand:
F.conv_transpose2d(output, kernel)
Principle, from the matrix point of view:
y = w @ x
∂y/∂x = w^T
y' = w^T @ y
and y' then has the same shape as x:
y[m, n] = w[m, p] @ x[p, n]
w^T[p, m] @ y[m, n] = y'[p, n]
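A minimal numerical sketch of that shape argument, using the same sizes as the 4×4-input / 3×3-kernel example (the variable names are arbitrary):
w_mat = torch.randn(4, 16)   # m=4, p=16, like the expanded kernel matrix
x_vec = torch.randn(16, 1)   # the flattened input
y_vec = w_mat @ x_vec        # [4, 1]
y_up = w_mat.t() @ y_vec     # [16, 1] -- same shape as x_vec; only the shape is recovered
print(y_vec.shape, y_up.shape)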
Dilation (atrous convolution)
The dilation argument of nn.Conv2d defaults to 1.
An ordinary convolution takes a kernel-sized patch from the input feature map, i.e. dilation=1: adjacent elements of the patch are 1 index apart.
With dilation=2, the first and second sampled elements are 2 indices apart.
In other words, dilation controls whether the sampled region is contiguous.
The purpose is to enlarge the receptive field while keeping the amount of computation unchanged.
def dilation_demo():
a = torch.randn(7,7)
print(a)
# dilation = 1
print(a[0:3, 0:3])
# dilation = 2
print(a[0:5:2, 0:5:2])
    # dilation = 3
    print(a[0:7:3, 0:7:3])
dilation_demo()
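As a sanity check of the effective kernel size (dilation-1)*(kh-1) + kh that the final implementation below relies on, a short sketch (shapes only, values are arbitrary):
x = torch.randn(1, 1, 7, 7)
k = torch.randn(1, 1, 3, 3)
# effective kernel size = (2-1)*(3-1) + 3 = 5, so the output is 7 - 5 + 1 = 3
print(F.conv2d(x, k, dilation=2).shape)  # torch.Size([1, 1, 3, 3])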
With groups=1, every input channel is convolved and the per-channel results are summed into each output channel.
With groups>1, one large convolution is split into several smaller ones.
Suppose in_channel = 2 and out_channel = 4. With groups=1 there are 8 kernels. With groups=2 the input and output channels are split into two groups, so sub in_channel = 1 and sub out_channel = 2; each group then has 2 kernels, and with two groups there are 4 kernels in total, i.e. the convolution parameters are halved.
This introduces an inductive bias (a prior assumption): we only model interactions within a small subset of channels instead of between every channel and all the others. With groups=1, each output channel mixes all input channels; with groups>1, one convolution mixes only the channels of its own group, another group is handled by another convolution, and the results are concatenated. Channel mixing is therefore incomplete: channels are mixed only within a group, never across groups.
A final 1×1 point-wise convolution can then mix information across all channels, as the sketch below shows.
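A minimal sketch of the parameter saving and the point-wise mixing step (the layer sizes follow the 2-in/4-out example above and are otherwise arbitrary):
grouped = torch.nn.Conv2d(2, 4, kernel_size=3, groups=2, bias=False)
full = torch.nn.Conv2d(2, 4, kernel_size=3, groups=1, bias=False)
print(grouped.weight.shape, full.weight.shape)  # [4, 1, 3, 3] vs [4, 2, 3, 3]: half the parameters
pointwise = torch.nn.Conv2d(4, 4, kernel_size=1)  # 1x1 conv mixes information across all channels
x = torch.randn(1, 2, 8, 8)
print(pointwise(grouped(x)).shape)  # torch.Size([1, 4, 6, 6])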
The final version: padding first, same as the ordinary convolution; then handle groups by checking that ic and oc are divisible by groups, reshape the input and kernel to split out the groups dimension, compute the (dilated) kernel height and width and then the output height and width; after that the traversal is the usual convolution loop.
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
def matrix_multiplication_for_conv2d_final(input, kernel, bias=None, stride=1,
padding=0, dilation=1, groups=1):
if padding > 0:
input = F.pad(input, (padding, padding, padding, padding, 0,0,0,0))
# batch_size, in_channel, input h, input w
bs, ic, ih, iw = input.shape
# out_channel, _, kernel h, kernel w
oc, _ic, kh, kw = kernel.shape
if bias is None:
bias = torch.zeros(oc)
    # handle groups: both ic and oc must be divisible by groups
    assert oc % groups == 0 and ic % groups == 0, "in_channel and out_channel must both be divisible by groups!"
    # reshape the input and kernel to split out the groups dimension
    input = input.reshape((bs, groups, ic//groups, ih, iw))
    kernel = kernel.reshape((groups, oc//groups, ic//groups, kh, kw))
    # dilation inserts dilation-1 gaps between adjacent taps; with kh-1 gaps the
    # effective kernel size becomes (dilation-1)*(kh-1) + kh
    kh = (dilation - 1) * (kh - 1) + kh
    kw = (dilation - 1) * (kw - 1) + kw
    # output height and width; dilation is already folded into kh and kw above
    oh = int(math.floor((ih - kh)/stride)) + 1
    ow = int(math.floor((iw - kw)/stride)) + 1
    output_shape = (bs, groups, oc//groups, oh, ow)
    # initialize the output
output = torch.zeros(output_shape)
    # traversal over batch, groups, channels and spatial positions
    for ind in range(bs):  # loop over the batch
        for g in range(groups):  # loop over groups
            for oc_ind in range(oc//groups):  # output channels within this group
                for ic_ind in range(ic//groups):  # input channels within this group
                    for i in range(0, ih-kh+1, stride):  # height
                        for j in range(0, iw-kw+1, stride):  # width
                            # slice out the dilated region
                            region = input[ind, g, ic_ind, i:i+kh:dilation, j:j+kw:dilation]
                            output[ind, g, oc_ind, int(i/stride), int(j/stride)] += torch.sum(region * kernel[g, oc_ind, ic_ind])
                # bias: add the bias of the corresponding flat output channel
                output[ind, g, oc_ind] += bias[g*(oc//groups) + oc_ind]
    # reshape back to 4 dimensions
    output = output.reshape((bs, oc, oh, ow))
return output
def test_conv2d_final():
bs, ic, ih, iw = 2, 2, 5, 5
kh, kw = 3, 3
oc = 4
groups, dilation, stride = 2, 2, 2
padding = 1
input = torch.randn(bs, ic, ih, iw)
    # with groups > 1, the kernel's input-channel dimension shrinks to ic//groups
kernel = torch.randn(oc, ic//groups, kh, kw)
bias = torch.randn(oc)
py_res = F.conv2d(input, kernel, bias=bias, padding=padding, stride=stride,
dilation=dilation, groups=groups)
my_res = matrix_multiplication_for_conv2d_final(
input, kernel, bias=bias,padding=padding, stride=stride,
dilation=dilation, groups=groups)
flag = torch.allclose(py_res, my_res)
print(flag)
test_conv2d_final()