I can never keep straight when to use dim=0 versus dim=1 and always mix them up. While reading Vision Transformer code I noticed someone using einops, searched it on Baidu, and found that it is really handy!
You can study it alongside these two posts:
https://zhuanlan.zhihu.com/p/342675997
https://blog.csdn.net/weixin_43135178/article/details/118877384
If you don't have it yet, first run pip install einops.
Once it is installed, you can follow my code below and run it step by step. I worked through the second link myself and re-ran everything to deepen my understanding.
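To see why named axes cure the dim=0 / dim=1 confusion, here is a tiny comparison I wrote myself (it is not from the linked posts): the same channel-first conversion with plain NumPy axis numbers and with einops.
import numpy as np
from einops import rearrange
img = np.random.randn(30, 40, 3)
# plain numpy: you have to remember which axis number means what
print(img.transpose(2, 0, 1).shape)
# Output
# (3, 30, 40)
# einops: the axes are named, nothing to memorize
print(rearrange(img, 'h w c -> c h w').shape)
# Output
# (3, 30, 40)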
import numpy as np
from einops import rearrange, repeat
# suppose we have a set of 32 images in "h w c" format (height-width-channel)
images = [np.random.randn(30, 40, 3) for _ in range(32)]
# stack along first (batch) axis, output is a single array :(32, 30, 40, 3)
print(rearrange(images, 'b h w c -> b h w c').shape)
# Output
# (32, 30, 40, 3)
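The identity pattern above looks like a no-op, but because the input is a Python list, rearrange also stacks it into one array; a quick check I added to convince myself it matches np.stack:
print(np.allclose(rearrange(images, 'b h w c -> b h w c'), np.stack(images)))
# Output
# True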
# concatenate images along height (vertical axis), 960 = 32 * 30 :(960, 40, 3)
print(rearrange(images, 'b h w c -> (b h) w c').shape)
# Output
# (960, 40, 3)
# concatenated images along horizontal axis, 1280 = 32 * 40 :(30, 1280, 3)
print(rearrange(images, 'b h w c -> h (b w) c').shape)
# Output
# (30, 1280, 3)
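These two patterns are exactly the axis=0 and axis=1 concatenations I keep mixing up in plain NumPy; here is my own sanity check:
print(np.allclose(rearrange(images, 'b h w c -> (b h) w c'), np.concatenate(images, axis=0)))
print(np.allclose(rearrange(images, 'b h w c -> h (b w) c'), np.concatenate(images, axis=1)))
# Output
# True
# True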
# reordered axes to "b c h w" format for deep learning :(32, 3, 30, 40)
print(rearrange(images, 'b h w c -> b c h w').shape)
# Output
# (32, 3, 30, 40)
# Here (h1 h) (w1 w) factors the height and width axes, so h and w shrink to 1/h1 and 1/w1 of their original size
# (the block index h1 / w1 must come first inside the group to get contiguous quadrants rather than interleaved rows and columns)
# split each image into 4 smaller (top-left, top-right, bottom-left, bottom-right), 128 = 32 * 2 * 2 :(128, 15, 20, 3)
print(rearrange(images, 'b (h1 h) (w1 w) c -> (b h1 w1) h w c', h1=2, w1=2).shape)
# Output
# (128, 15, 20, 3)
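To make sure the quadrant ordering is what I think it is, I also checked that the first output patch is the top-left quarter of the first image (my own check, not from the tutorial):
patches = rearrange(images, 'b (h1 h) (w1 w) c -> (b h1 w1) h w c', h1=2, w1=2)
print(np.allclose(patches[0], images[0][:15, :20, :]))
# Output
# True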
# space-to-depth operation :(32, 15, 20, 12)
print(rearrange(images, 'b (h h1) (w w1) c -> b h w (c h1 w1)', h1=2, w1=2).shape)
# Output
# (32, 15, 20, 12)
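Space-to-depth is just a reshuffle, so it should be perfectly invertible: swap the two sides of the pattern and you get the original batch back (einops infers c = 12 / (h1 * w1) = 3 on its own).
s2d = rearrange(images, 'b (h h1) (w w1) c -> b h w (c h1 w1)', h1=2, w1=2)
back = rearrange(s2d, 'b h w (c h1 w1) -> b (h h1) (w w1) c', h1=2, w1=2)
print(np.allclose(back, np.stack(images)))
# Output
# True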
import numpy as np
from einops import rearrange, repeat, reduce
# a grayscale image (of shape height x width)
image = np.random.randn(30, 40)
# change it to RGB format by repeating in each channel:(30, 40, 3)
print(repeat(image, 'h w -> h w c', c=3).shape)
# Output
# (30, 40, 3)
# repeat image 2 times along height (vertical axis):(60, 40)
print(repeat(image, 'h w -> (repeat h) w', repeat=2).shape)
# Output
# (60, 40)
# repeat image 3 times along width:(30, 120)
print(repeat(image, 'h w -> h (repeat w)', repeat=3).shape)
# Output
# (30, 120)
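One thing I only understood after trying it myself: the order inside the parentheses matters. '(repeat h)' lays down whole copies of the image (like np.tile), while '(h repeat)' duplicates each row in place (like np.repeat):
print(np.allclose(repeat(image, 'h w -> (repeat h) w', repeat=2), np.tile(image, (2, 1))))
print(np.allclose(repeat(image, 'h w -> (h repeat) w', repeat=2), np.repeat(image, 2, axis=0)))
# Output
# True
# True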
# convert each pixel to a small square 2x2. Upsample image by 2x:(60, 80)
print(repeat(image, 'h w -> (h h2) (w w2)', h2=2, w2=2).shape)
# Output
# (60, 80)
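Because h2 and w2 sit on the inner side of the groups here, every pixel becomes a 2x2 block; in plain NumPy this is two nested repeats (again just my own comparison):
up = repeat(image, 'h w -> (h h2) (w w2)', h2=2, w2=2)
print(np.allclose(up, image.repeat(2, axis=0).repeat(2, axis=1)))
# Output
# True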
# pixelate image first by downsampling by 2x, then upsampling:(30, 40)
downsampled = reduce(image, '(h h2) (w w2) -> h w', 'mean', h2=2, w2=2)
print(repeat(downsampled, 'h w -> (h h2) (w w2)', h2=2, w2=2).shape)
# Output
# (30, 40)
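The intermediate array is worth printing too: reduce first averages every 2x2 block down to (15, 20), and repeat then blows it back up to (30, 40), so the result keeps the original shape but only a quarter of the detail.
print(downsampled.shape)
# Output
# (15, 20)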
import numpy as np
from einops import rearrange, reduce
x = np.random.randn(100, 32, 64)
# perform max-reduction on the first axis:(32, 64)
print(reduce(x, 't b c -> b c', 'max').shape)
# Output
# (32, 64)
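This is the same as x.max(axis=0), just without having to remember that axis 0 is the time axis (my own check):
print(np.allclose(reduce(x, 't b c -> b c', 'max'), x.max(axis=0)))
# Output
# True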
# same as previous, but with clearer axis names, which makes it much easier to read:(32, 64)
print(reduce(x, 'time batch channel -> batch channel', 'max').shape)
# Output
# (32, 64)
x = np.random.randn(10, 20, 30, 40)
# 2d max-pooling with kernel size = 2 * 2 for image processing:(10, 20, 15, 20)
y1 = reduce(x, 'b c (h1 h2) (w1 w2) -> b c h1 w1', 'max', h2=2, w2=2)
print(y1.shape)
# Output
# (10, 20, 15, 20)
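For comparison, the same 2x2 max-pooling written with a plain NumPy reshape plus a max over the two block axes, which I find much harder to read:
y2 = x.reshape(10, 20, 15, 2, 20, 2).max(axis=(3, 5))
print(np.allclose(y1, y2))
# Output
# True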
# Global average pooling:(10, 20)
print(reduce(x, 'b c h w -> b c', 'mean').shape)
# Output
# (10, 20)
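Two small extras I tried myself: the global average pooling above is the same as x.mean(axis=(2, 3)), and, at least in the einops versions I have used, you can keep singleton spatial axes (like keepdims=True) by writing literal 1s in the output pattern:
print(np.allclose(reduce(x, 'b c h w -> b c', 'mean'), x.mean(axis=(2, 3))))
# Output
# True
# keep the spatial axes as size-1 dims:(10, 20, 1, 1)
print(reduce(x, 'b c h w -> b c 1 1', 'mean').shape)
# Output
# (10, 20, 1, 1)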