Here we use pyopencl, a library that lets you run OpenCL from Python, to give a straightforward OpenCL implementation of five operators: convolution, pooling, activation, fully connected, and batch normalization. No further algorithmic or memory optimizations are applied.
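The host wrappers below assume an OpenCL context `ctx`, a command queue `queue`, and the flag shorthand `mf` already exist, and that each `%%cl_kernel` cell (pyopencl's IPython magic, loaded via `%load_ext pyopencl.ipython_ext`) compiles its source against `ctx` and exposes the kernels as Python callables of the same name. A minimal setup sketch, not part of the original listing:
# Minimal setup sketch (assumed): creates the context/queue used by the host
# wrappers and notes the pyopencl IPython extension that provides %%cl_kernel.
import time
import numpy as np
import pyopencl as cl
import torch
import torch.nn as nn

ctx = cl.create_some_context()   # pick an OpenCL platform/device
queue = cl.CommandQueue(ctx)     # command queue shared by all wrappers
mf = cl.mem_flags                # shorthand for buffer-creation flags

# In the notebook: %load_ext pyopencl.ipython_ext
# After that, each %%cl_kernel cell is built against ctx and its kernels
# (Conv2D, BatchNorm2D, ...) become callables in the user namespace.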
%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void Conv2D(__global const float *ift,
                     __global float *weight, __global float *bias,
                     __global int *output_channel, __global int *output_height, __global int *output_width,
                     __global int *input_channel, __global int *input_height, __global int *input_width,
                     __global int *feature_height, __global int *feature_width,
                     __global float *oft) {
    int Co = *output_channel, Ho = *output_height, Wo = *output_width;
    int Ci = *input_channel, Hi = *input_height, Wi = *input_width;
    int Hf = *feature_height, Wf = *feature_width;
    int posc = get_global_id(0), posh = get_global_id(1), posw = get_global_id(2);
    int So = Wo*Ho, Sf = Wf*Hf, Si = Wi*Hi;
    int Vf = Sf*Ci;
    int i = posc*So + (posh*Wo + posw);
    oft[i] = bias[posc];
    for(int l = 0; l < Hf; l++) {
        for(int m = 0; m < Wf; m++) {
            for(int n = 0; n < Ci; n++) {
                oft[i] += ift[(n*Si)+((posh+l)*Wi)+(posw+m)]*weight[(posc*Vf)+(n*Sf)+(l*Wf)+m];
            }
        }
    }
}
%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void BatchNorm2D(__global const float *ift,
                          __global float *mean, __global float *std,
                          __global float *eps, __global float *weight, __global float *bias,
                          __global int *channel, __global int *height, __global int *width,
                          __global float *oft) {
    int c = *channel, h = *height, w = *width;
    int posc = get_global_id(0), posh = get_global_id(1), posw = get_global_id(2);
    int i = posc*(w*h) + (posh*w+posw);
    float e = *eps;
    float res1 = ift[i] - mean[posc];
    float res2 = sqrt(std[posc] + e);
    float res3 = res1 / res2;
    oft[i] = res3 * weight[posc] + bias[posc];
}
%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void Linear(__global const float *ift,
                     __global float *weight, __global float *bias,
                     __global int *output_channel, __global int *input_channel,
                     __global float *oft) {
    int Co = *output_channel, Ci = *input_channel;
    int posCo = get_global_id(0);
    oft[posCo] = bias[posCo];
    for(int k = 0; k < Ci; k++) {
        oft[posCo] += ift[k]*weight[posCo*Ci+k];
    }
}
%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void MaxPool2D(__global const float *ift,
                        __global int *size, __global int *stride,
                        __global int *channel,
                        __global int *input_height, __global int *input_width,
                        __global int *output_height, __global int *output_width,
                        __global float *oft) {
    int sz = *size, sd = *stride;
    int C = *channel, Hi = *input_height, Wi = *input_width, Ho = *output_height, Wo = *output_width;
    int posc = get_global_id(0), posh = get_global_id(1), posw = get_global_id(2);
    int So = Ho*Wo, Si = Hi*Wi;
    int i = (posc*So)+(posh*Wo)+posw;
    int startX = posw*sd, startY = posh*sd;
    oft[i] = ift[(posc*Si)+(startY*Wi)+startX];
    for(int y = 0; y < sz; y++) {
        for(int x = 0; x < sz; x++) {
            oft[i] = max(oft[i], ift[(posc*Si)+((startY+y)*Wi)+(startX+x)]);
        }
    }
}
ReLU is written as two kernels, one for 1-D inputs and one for 3-D inputs:
%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void ReluD1(__global const float *ift, __global float *oft) {
    int i = get_global_id(0);
    oft[i] = max((float)0, ift[i]);
}
%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void ReluD3(__global const float *ift, __global float *oft,
                     __global int *channel, __global int *height, __global int *width) {
    int c = *channel;
    int h = *height;
    int w = *width;
    int posc = get_global_id(0);
    int posh = get_global_id(1);
    int posw = get_global_id(2);
    int i = posc*(w*h) + (posh*w+posw);
    oft[i] = max((float)0, ift[i]);
}
def conv2d(input_numpy, kernel_weight_numpy, kernel_bias_numpy, padding=0):
    # Pre-processing on the CPU: apply padding and allocate output memory
    weight_cpu = kernel_weight_numpy
    bias_cpu = kernel_bias_numpy
    Ci, Hi, Wi = input_numpy.shape
    input_cpu = np.zeros((Ci, Hi+2*padding, Wi+2*padding)).astype(np.float32)
    if padding > 0:
        input_cpu[:, padding:-padding, padding:-padding] = input_numpy
    else:
        input_cpu = input_numpy
    Ci, Hi, Wi = input_cpu.shape
    Co, Ci, Hf, Wf = weight_cpu.shape
    Ho, Wo = Hi - Hf + 1, Wi - Wf + 1
    output_cpu = np.zeros((Co, Ho, Wo)).astype(np.float32)
    # Copy data from host to device
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=input_cpu)
    kernel_weight_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=weight_cpu)
    kernel_bias_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=bias_cpu)
    output_channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Co))
    output_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Ho))
    output_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Wo))
    input_channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Ci))
    input_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Hi))
    input_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Wi))
    feature_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Hf))
    feature_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Wf))
    # The kernel accumulates into oft (read-modify-write), so the output
    # buffer must allow reads: READ_WRITE rather than WRITE_ONLY
    output_gpu = cl.Buffer(ctx, mf.READ_WRITE, output_cpu.nbytes)
    # Launch on the device: one work-item per output element, global size (Co, Ho, Wo)
    Conv2D(queue, output_cpu.shape, None,
           input_gpu,
           kernel_weight_gpu, kernel_bias_gpu,
           output_channel_gpu, output_height_gpu, output_width_gpu,
           input_channel_gpu, input_height_gpu, input_width_gpu,
           feature_height_gpu, feature_width_gpu,
           output_gpu)
    # Read the result back from device to host
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    return output_cpu
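As a quick sanity check, the wrapper can be compared against PyTorch's convolution. This is an illustrative sketch; the shapes are arbitrary and `-cl-fast-relaxed-math` may introduce small float32 differences:
import numpy as np
import torch
import torch.nn.functional as F

x = np.random.randn(3, 8, 8).astype(np.float32)
w = np.random.randn(4, 3, 5, 5).astype(np.float32)
b = np.random.randn(4).astype(np.float32)

out_cl = conv2d(x, w, b, padding=2)
out_pt = F.conv2d(torch.from_numpy(x).unsqueeze(0),
                  torch.from_numpy(w), torch.from_numpy(b), padding=2)[0].numpy()
print(np.abs(out_cl - out_pt).max())  # expect only a small float32 rounding difference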
def batchnorm2d(input_numpy, eps, weight, bias):
    # Pre-processing on the CPU: per-channel mean and (biased) variance;
    # the kernel adds eps to this value and takes the square root
    mean = input_numpy.mean(axis=(1, 2))
    std = input_numpy.std(axis=(1, 2))**2
    output_cpu = np.empty_like(input_numpy)
    # Copy data from host to device
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=input_numpy)
    mean_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=mean)
    std_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=std)
    # eps must be passed as float32: the kernel reads it as a float, and
    # np.int32(eps) would truncate a small eps such as 1e-5 to 0
    eps_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.float32(eps))
    weight_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=weight)
    bias_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=bias)
    channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(input_numpy.shape[0]))
    height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(input_numpy.shape[1]))
    width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(input_numpy.shape[2]))
    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)
    # Launch on the device
    BatchNorm2D(queue, input_numpy.shape, None, input_gpu,
                mean_gpu, std_gpu,
                eps_gpu, weight_gpu, bias_gpu,
                channel_gpu, height_gpu, width_gpu,
                output_gpu)
    # Read the result back from device to host
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    return output_cpu
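A quick numpy cross-check of the normalization (illustrative values; note that, like the wrapper, it uses the batch statistics of the input rather than the model's running statistics):
import numpy as np

x = np.random.randn(6, 10, 10).astype(np.float32)
gamma = np.random.randn(6).astype(np.float32)
beta = np.random.randn(6).astype(np.float32)
eps = 1e-5

out_cl = batchnorm2d(x, eps, gamma, beta)
mu = x.mean(axis=(1, 2), keepdims=True)
var = x.var(axis=(1, 2), keepdims=True)
out_np = (x - mu) / np.sqrt(var + eps) * gamma[:, None, None] + beta[:, None, None]
print(np.abs(out_cl - out_np).max())  # expect only a small float32 rounding difference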
def relu(input_numpy):
    output_cpu = np.empty_like(input_numpy)
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=input_numpy)
    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)
    if len(input_numpy.shape) == 3:
        channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(input_numpy.shape[0]))
        height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(input_numpy.shape[1]))
        width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(input_numpy.shape[2]))
        ReluD3(queue, input_numpy.shape, None, input_gpu, output_gpu, channel_gpu, height_gpu, width_gpu)
    else:
        ReluD1(queue, input_numpy.shape, None, input_gpu, output_gpu)
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    return output_cpu
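A trivial check for the ReLU wrapper (illustrative):
import numpy as np

x = np.random.randn(100).astype(np.float32)
print(np.allclose(relu(x), np.maximum(x, 0)))  # expect True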
def maxpool2d(input_numpy, size=2, stride=2):
    # Pre-processing on the CPU: zero-pad the input and allocate output memory
    C, Hi, Wi = input_numpy.shape
    Ho, Wo = int(np.floor(Hi/stride)), int(np.floor(Wi/stride))
    row_remainder, col_remainder = Hi % stride, Wi % stride
    Ho += int(row_remainder != 0)
    Wo += int(col_remainder != 0)
    input_cpu = np.zeros((C, Hi+size-row_remainder, Wi+size-col_remainder)).astype(np.float32)
    input_cpu[:, :Hi, :Wi] = input_numpy
    C, Hi, Wi = input_cpu.shape
    output_cpu = np.zeros((C, Ho, Wo)).astype(np.float32)
    # Copy data from host to device
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=input_cpu)
    size_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(size))
    stride_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(stride))
    channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(C))
    input_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Hi))
    input_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Wi))
    output_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Ho))
    output_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Wo))
    # The kernel reads back oft when taking the running max, so use READ_WRITE
    output_gpu = cl.Buffer(ctx, mf.READ_WRITE, output_cpu.nbytes)
    # Launch on the device: one work-item per output element, global size (C, Ho, Wo)
    MaxPool2D(queue, output_cpu.shape, None,
              input_gpu,
              size_gpu, stride_gpu,
              channel_gpu,
              input_height_gpu, input_width_gpu,
              output_height_gpu, output_width_gpu,
              output_gpu)
    # Read the result back from device to host
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    return output_cpu
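Another illustrative cross-check, using a spatial size divisible by the stride so the zero padding added by the wrapper never enters a pooling window:
import numpy as np
import torch
import torch.nn.functional as F

x = np.random.randn(4, 8, 8).astype(np.float32)
out_cl = maxpool2d(x, size=2, stride=2)
out_pt = F.max_pool2d(torch.from_numpy(x).unsqueeze(0), kernel_size=2, stride=2)[0].numpy()
print(np.abs(out_cl - out_pt).max())  # expect exactly 0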
def linear(input_numpy, weight_numpy, bias_numpy):
    # Light pre-processing on the CPU
    input_cpu = input_numpy
    weight_cpu = weight_numpy
    bias_cpu = bias_numpy
    Co, Ci = weight_cpu.shape
    output_cpu = np.zeros((Co,)).astype(np.float32)
    # Copy data from host to device
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=input_cpu)
    weight_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=weight_cpu)
    bias_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=bias_cpu)
    output_channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Co))
    input_channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(Ci))
    # The kernel accumulates into oft, so the output buffer is READ_WRITE
    output_gpu = cl.Buffer(ctx, mf.READ_WRITE, output_cpu.nbytes)
    # Launch on the device: one work-item per output neuron
    Linear(queue, output_cpu.shape, None,
           input_gpu,
           weight_gpu, bias_gpu,
           output_channel_gpu, input_channel_gpu,
           output_gpu)
    # Read the result back from device to host
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    return output_cpu
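And a short numpy reference for the linear layer (illustrative shapes):
import numpy as np

x = np.random.randn(400).astype(np.float32)
w = np.random.randn(120, 400).astype(np.float32)
b = np.random.randn(120).astype(np.float32)
print(np.abs(linear(x, w, b) - (w @ x + b)).max())  # expect only a small float32 rounding difference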
class LeNet(nn.Module):
    def __init__(self, num_classes):
        super(LeNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(6),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.fclayer = nn.Sequential(
            nn.Linear(16*5*5, 120),
            nn.ReLU(),
            nn.Linear(120, 84),
            nn.ReLU(),
            nn.Linear(84, num_classes)
        )

    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x = x.view(-1, 16*5*5)
        x = self.fclayer(x)
        return x
model = LeNet(10)
model.load_state_dict(torch.load('model.pth'))
def opencl_LeNet(model, input_numpy):
    layer1_Conv2d = model.layer1[0]
    layer1_BatchNorm2d = model.layer1[1]
    layer1_ReLU = model.layer1[2]
    layer1_MaxPool2d = model.layer1[3]
    layer2_Conv2d = model.layer2[0]
    layer2_BatchNorm2d = model.layer2[1]
    layer2_ReLU = model.layer2[2]
    layer2_MaxPool2d = model.layer2[3]
    fclayer_Linear_1 = model.fclayer[0]
    fclayer_ReLU_1 = model.fclayer[1]
    fclayer_Linear_2 = model.fclayer[2]
    fclayer_ReLU_2 = model.fclayer[3]
    fclayer_Linear_3 = model.fclayer[4]
    layer1_Conv2d_numpy_out = conv2d(input_numpy, layer1_Conv2d.weight.detach().numpy(), layer1_Conv2d.bias.detach().numpy(), padding=2)
    layer1_BatchNorm2d_numpy_out = batchnorm2d(layer1_Conv2d_numpy_out, layer1_BatchNorm2d.eps, layer1_BatchNorm2d.weight.detach().numpy(), layer1_BatchNorm2d.bias.detach().numpy())
    layer1_ReLU_numpy_out = relu(layer1_BatchNorm2d_numpy_out)
    layer1_MaxPool2d_numpy_out = maxpool2d(layer1_ReLU_numpy_out, 2)
    layer2_Conv2d_numpy_out = conv2d(layer1_MaxPool2d_numpy_out, layer2_Conv2d.weight.detach().numpy(), layer2_Conv2d.bias.detach().numpy(), padding=0)
    layer2_BatchNorm2d_numpy_out = batchnorm2d(layer2_Conv2d_numpy_out, layer2_BatchNorm2d.eps, layer2_BatchNorm2d.weight.detach().numpy(), layer2_BatchNorm2d.bias.detach().numpy())
    layer2_ReLU_numpy_out = relu(layer2_BatchNorm2d_numpy_out)
    layer2_MaxPool2d_numpy_out = maxpool2d(layer2_ReLU_numpy_out, 2)
    layer2_MaxPool2d_numpy_out = layer2_MaxPool2d_numpy_out.reshape(-1, 16*5*5)
    fclayer_Linear_1_numpy_out = linear(layer2_MaxPool2d_numpy_out, fclayer_Linear_1.weight.detach().numpy(), fclayer_Linear_1.bias.detach().numpy())
    fclayer_ReLU_1_numpy_out = relu(fclayer_Linear_1_numpy_out)
    fclayer_Linear_2_numpy_out = linear(fclayer_ReLU_1_numpy_out, fclayer_Linear_2.weight.detach().numpy(), fclayer_Linear_2.bias.detach().numpy())
    fclayer_ReLU_2_numpy_out = relu(fclayer_Linear_2_numpy_out)
    fclayer_Linear_3_numpy_out = linear(fclayer_ReLU_2_numpy_out, fclayer_Linear_3.weight.detach().numpy(), fclayer_Linear_3.bias.detach().numpy())
    return fclayer_Linear_3_numpy_out
input = torch.randn(1, 1, 28, 28)
input_numpy = input.numpy()[0, :, :, :]
# OpenCL timing test
time1 = time.time()
out = opencl_LeNet(model, input_numpy)
time2 = time.time()
print('{} ms'.format(1000*(time2-time1)))
print(out)
# PyTorch CPU timing test
time1 = time.time()
out = model(input)
time2 = time.time()
print('{} ms'.format(1000*(time2-time1)))
print(out.detach().numpy())
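Finally, a small optional check that the two paths also agree numerically (any difference should come only from float32 rounding and -cl-fast-relaxed-math):
# Optional agreement check between the OpenCL path and the PyTorch path
out_cl = opencl_LeNet(model, input_numpy)
out_pt = model(input).detach().numpy()[0]
print(np.abs(out_cl - out_pt).max())  # expected to be small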