由于最近一直在研究上下文注意力机制,当对特征图FM提取patch,需要用到函数extract_image_patches,特此记录一下。温馨提示:提笔计算一下,更容易理解。
参考1 https://blog.csdn.net/qq_38675570/article/details/81239774
参考2 http://www.voidcn.com/article/p-svnrphay-btg.html
def extract_image_patches(
images, # 输入的特征图[b,h,w,c]
ksizes=None, # patch大小,图像这块一般常用1,3,5。参数为[1, k_size, k_size, 1]
strides=None,# 步长,[1, stride, stride, 1],滑动的像素
rates=None, # 参考扩张卷积,相邻像素填0个数
padding=None,# “VALID”,每个补丁必须完全包含在图像中;“SAME”,允许补丁不完整
name=None,
sizes=None):
这里以特征图bg=fg_in=[8,32,32,192]为例:
# 先从背景区域中提取块
patch1 = tf.extract_image_patches(bg, [1, k_size, k_size, 1], [1, stride, stride, 1], [1, 1, 1, 1], 'VALID')
print("patch1 size",patch1.shape) # [8,30,30,1728]
# 再从前景区域中提取块,添加了padding
patch2 = tf.extract_image_patches(fg_in, [1,k_size,k_size,1], [1,1,1,1], [1,1,1,1], 'SAME')
print("patch2 size", patch2.shape) # [8,32,32,1728]
输出维度分析:
h=32,stride=1,k_size=3。当padding=0时,计算出的块个数为c=900;当padding=1时,块的个数为32*32=1024。
提出的patch的维度为1728 = 192 * 3 * 3,也就是输出的最后一维是c_in * k_size * k_size。在对patch计算余弦相似度时,处理的就是这一部分。
kn = int((k_size - 1) / 2)
c = 0 # 计算块的个数
for p in range(kn, h - kn, stride):
for q in range(kn, w - kn, stride):
c += 1
参考1 https://blog.csdn.net/LoseInVain/article/details/88139435
上面这条链接将网络层的torch.nn.unfold()函数讲得很透彻,看明白之后大概就理解了。
# extract patches
def extract_patches(x, kernel=3, stride=1):
if kernel != 1:
x = nn.ZeroPad2d(1)(x)
x = x.permute(0, 2, 3, 1)
all_patches = x.unfold(1, kernel, stride).unfold(2, kernel, stride)
return all_patches
调用函数,输入的维度x2=[8,192,32,32],ksize=3,stride=1,提取patch后输出的维度为[8,32,32,192,3,3],和tensorflow的用法差不多,将tensor进行view一下可以达到一样的效果。
w = extract_patches(x2, kernel=self.ksize, stride=self.stride)
print("w",w.size()) # [8,32,32,192,3,3]
下面给出2019 CVPR PEPSI : Fast Image Inpainting With Parallel Decoding Network论文中注意力模块,有做图像修复的朋友可以瞅一瞅,我自己修改了一些添加了一些代码。
论文:地址
翻译:https://blog.csdn.net/baidu_33256174/article/details/102570361
code:https://github.com/Forty-lock/PEPSI
from __future__ import division
# from ops import *
import tensorflow as tf
from tensorflow.keras.layers import Conv2D,ELU
def softmax(input):
k = tf.exp(input - 3)
k = tf.reduce_sum(k, 3, True)
# k = k - num * tf.ones_like(k)
ouput = tf.exp(input - 3) / k
return ouput
def reduce_var(x, axis=None, keepdims=False):
"""Variance of a tensor, alongside the specified axis.
# Arguments
x: A tensor or variable.
axis: An integer, the axis to compute the variance.
keepdims: A boolean, whether to keep the dimensions or not.
If `keepdims` is `False`, the rank of the tensor is reduced
by 1. If `keepdims` is `True`,
the reduced dimension is retained with length 1.
# Returns
A tensor with the variance of elements of `x`.
"""
m = tf.reduce_mean(x, axis=axis, keepdims=True)
devs_squared = tf.square(x - m)
return tf.reduce_mean(devs_squared, axis=axis, keepdims=keepdims)
def reduce_std(x, axis=None, keepdims=False):
"""Standard deviation of a tensor, alongside the specified axis.
# Arguments
x: A tensor or variable.
axis: An integer, the axis to compute the standard deviation.
keepdims: A boolean, whether to keep the dimensions or not.
If `keepdims` is `False`, the rank of the tensor is reduced
by 1. If `keepdims` is `True`,
the reduced dimension is retained with length 1.
# Returns
A tensor with the standard deviation of elements of `x`.
"""
return tf.sqrt(reduce_var(x, axis=axis, keepdims=keepdims))
def contextual_block(bg_in, fg_in, mask, k_size, lamda, name, stride=1):
with tf.variable_scope(name):
b, h, w, dims = [i.value for i in bg_in.get_shape()]
temp = tf.image.resize_nearest_neighbor(mask, (h, w)) # 进行插值扩张到(h,w)
temp = tf.expand_dims(temp[:, :, :, 0], 3) # b 128 128 1
mask_r = tf.tile(temp, [1, 1, 1, dims]) # b 128 128 128 ,复制128次
bg = bg_in * mask_r
kn = int((k_size - 1) / 2)
c = 0 # 计算块的个数
for p in range(kn, h - kn, stride):
for q in range(kn, w - kn, stride):
c += 1
# 先从背景区域中提取块
patch1 = tf.extract_image_patches(bg, [1, k_size, k_size, 1], [1, stride, stride, 1], [1, 1, 1, 1], 'VALID')
print("patch1 size",patch1.shape) # [8,30,30,1728]
# 推算一下
patch1 = tf.reshape(patch1, (b, 1, c, k_size*k_size*dims)) # [b,out_rows,out_cols,k_size*k_size*c]
patch1 = tf.reshape(patch1, (b, 1, 1, c, k_size * k_size * dims))
patch1 = tf.transpose(patch1, [0, 1, 2, 4, 3]) # 扩展成5维
print("patch1 size", patch1.shape) # [8,1,1,1728,900]
# 再从前景区域中提取块,添加了padding
patch2 = tf.extract_image_patches(fg_in, [1,k_size,k_size,1], [1,1,1,1], [1,1,1,1], 'SAME')
print("patch2 size", patch2.shape) # [8,32,32,1728]
ACL = []
for ib in range(b): # 一张一张图的计算
k1 = patch1[ib, :, :, :, :] # 一张图像的背景
print("k1",k1.shape) # [1,1,1728,900]
k1d = tf.reduce_sum(tf.square(k1), axis=2)
print("k1d",k1d.shape) # [1,1,900]
k2 = tf.reshape(k1, (k_size, k_size, dims, c))
print("k2",k2.shape) # (3, 3, 192, 900)
ww = patch2[ib, :, :, :]
print("ww", ww.shape) # [32,32,1728]
wwd = tf.reduce_sum(tf.square(ww), axis=2, keepdims=True)
print("wwd",wwd.shape) # [32,32,1]
ft = tf.expand_dims(ww, 0) # [1,32,32,1728]
CS = tf.nn.conv2d(ft, k1, strides=[1, 1, 1, 1], padding='SAME')
print("CS",CS.shape) # [1,32,32,900]
tt = k1d + wwd
print("tt",tt.shape) # [1,32,32,900]
# 在axis=0处增加一个为1的维度
DS1 = tf.expand_dims(tt, 0) - 2 * CS
# 规范化处理
DS2 = (DS1 - tf.reduce_mean(DS1, 3, True)) / reduce_std(DS1, 3, True)
print('DS2',DS2.shape) # [1,32,32,900]
DS2 = -1 * tf.nn.tanh(DS2)
CA = softmax(lamda * DS2) # 计算得到相似性分数
print("CA",CA.shape) # [1,32,32,900]
# K2为卷积核
ACLt = tf.nn.conv2d_transpose(CA, k2, output_shape=[1, h, w, dims], strides=[1, 1, 1, 1], padding='SAME')
print("ACLt",ACLt.shape) # (1, 32, 32, 192)
ACLt = ACLt / (k_size ** 2)
if ib == 0:
ACL = ACLt
else:
ACL = tf.concat((ACL, ACLt), 0)
ACL = bg + ACL * (1.0 - mask_r) # mask,0表示缺失
con1 = tf.concat([bg_in, ACL], 3) # 整合背景和重构的前景
ACL2 = Conv2D(dims, [1, 1], strides=[1, 1], padding='VALID')(con1)
ACL2 = tf.nn.elu(ACL2)
return ACL2
def ca_test(args):
from PIL import Image
import matplotlib.pyplot as plt
import os
import numpy as np
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
def float_to_uint8(img):
img = img * 255
return img.astype('uint8')
def default_loader(path):
with open(path, 'rb') as f:
image = Image.open(f).convert('RGB')
return image
rate = 2
stride = 2
grid = rate * stride
# b = default_loader(args.imageA)
# w, h = b.size
# # b = b.resize((w//grid*grid//2, h//grid*grid//2), Image.ANTIALIAS)
# b = b.resize((w // (grid * grid*2), h // (grid * grid*2)), Image.ANTIALIAS)
# print('Size of imageA: {}'.format(b.size))
#
# f = default_loader(args.imageB)
# w, h = f.size
# f = f.resize((w // (grid *grid*2), h // (grid *grid*2)), Image.ANTIALIAS)
# # print('Size of imageB: {}'.format(f.size))
b = np.zeros([8,32,32,192],dtype=np.float32)
print(type(b))
mask = default_loader(args.mask)
print("mask",mask.size)
w, h = mask.size
mask = mask.resize((w // (grid*2), h // (grid*2)), Image.ANTIALIAS)
mask = mask.convert("L")
print("Size of mask:{}".format(mask.size))
# variable
batch_size, Height, Width = 8,32,32
bg_var = tf.placeholder(tf.float32, [batch_size, Height, Width, 192])
fg_var = tf.placeholder(tf.float32, [batch_size, Height, Width, 192])
mask_var = tf.placeholder(tf.float32, [batch_size, Height, Width, 1])
y = contextual_block(bg_in=bg_var, fg_in=fg_var, mask=mask_var, k_size=3, lamda=80, name='CA')
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.9 # 限制GPU使用率为40%
config.gpu_options.allow_growth = True # 动态申请显存
with tf.Session(config=config) as sess:
sess.run(tf.global_variables_initializer())
# b = tf.expand_dims(b, axis=0).eval()
# f = tf.expand_dims(f, axis=0).eval()
mask = tf.expand_dims(mask,axis=2)
mask = tf.expand_dims(mask, axis=0)
mask = tf.tile(mask, [8, 1, 1, 1]).eval()
print(type(mask))
print("b:",b.shape,"f:",b.shape,"mask:",mask.shape)
y = sess.run([y],feed_dict={bg_var:b,fg_var:b,mask_var:mask})
# plt.imshow(y)
# plt.show()
if __name__ == '__main__':
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--imageA', default='../image/000147.jpg.jpg', type=str,
help='Image A as background patches to reconstruct image B.')
parser.add_argument('--imageB', default='../image/000201.jpg.jpg', type=str,
help='Image B is reconstructed with image A.')
parser.add_argument('--mask', default='../image/mask_0001.jpg', type=str)
parser.add_argument('--imageOut', default='../image/result.png', type=str,
help='Image B is reconstructed with image A.')
args = parser.parse_args()
ca_test(args)