Hand-Written CPU Convolution Kernels for Accelerating Neural Network Computation (2): Simple OpenCL Implementations of Convolution, Pooling, Activation, Fully Connected, and Batch Normalization (in Python)

This post uses pyopencl, a library that lets you run OpenCL from Python, to implement five operators — convolution, pooling, activation, fully connected, and batch normalization — in a straightforward way, with no algorithmic or memory optimizations.
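
The host wrappers in section 6 assume that a pyopencl context (`ctx`), a command queue (`queue`), the `mf` alias for the memory flags, and the compiled kernel objects (`Conv2D`, `BatchNorm2D`, `Linear`, `MaxPool2D`, `ReluD1`, `ReluD3`) already exist. In a notebook they come from the `%%cl_kernel` cell magic of `pyopencl.ipython_ext`; outside a notebook, a minimal setup sketch might look like the following, where `kernel_src` is a placeholder for the concatenated OpenCL source of the kernels below:

import time

import numpy as np
import pyopencl as cl
import torch
import torch.nn as nn

# Create the context/queue that the wrappers refer to as `ctx` and `queue`;
# `mf` is the usual shorthand for pyopencl's memory flags.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)
mf = cl.mem_flags

# `kernel_src` is assumed to hold the OpenCL source of the six kernels below.
prg = cl.Program(ctx, kernel_src).build(options="-cl-fast-relaxed-math")
Conv2D = prg.Conv2D
BatchNorm2D = prg.BatchNorm2D
Linear = prg.Linear
MaxPool2D = prg.MaxPool2D
ReluD1 = prg.ReluD1
ReluD3 = prg.ReluD3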

1 Conv2d

%%cl_kernel -o "-cl-fast-relaxed-math"

__kernel void Conv2D(__global const float *ift, 
                     __global float *weight, __global float *bias,
                     __global int *output_channel, __global int *output_height, __global int *output_width,
                     __global int *input_channel, __global int *input_height, __global int *input_width,
                     __global int *feature_height, __global int *feature_width,
                     __global float *oft) {
    int Co = *output_channel, Ho = *output_height, Wo = *output_width;
    int Ci = *input_channel, Hi = *input_height, Wi = *input_width;
    int Hf = *feature_height, Wf = *feature_width;
    // one work-item per output element: (posc, posh, posw) ranges over (Co, Ho, Wo)
    int posc = get_global_id(0), posh = get_global_id(1), posw = get_global_id(2);
    int So = Wo*Ho, Sf = Wf*Hf, Si = Wi*Hi;  // plane sizes of output, filter and input
    int Vf = Sf*Ci;                          // number of weights in one filter
    int i = posc*(So) + (posh*Wo+posw);      // flat CHW-style index of this output element
    
    oft[i] = bias[posc];
    for(int l = 0; l < Hf; l++) {
        for(int m = 0; m < Wf; m++) {
            for(int n = 0; n < Ci; n++) {
                oft[i] += ift[(n*Si)+((posh+l)*Wi)+(posw+m)]*weight[(posc*Vf)+(n*Sf)+(l*Wf)+(m)];
            }
        }
    }
}
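
The kernel launches one work-item per output element over a (Co, Ho, Wo) global range and assumes stride 1, with any padding already applied on the host. As a sanity check of the index arithmetic, the same computation written as a plain NumPy reference (a sketch, not part of the OpenCL path) is:

def conv2d_reference(ift, weight, bias):
    # ift: (Ci, Hi, Wi), weight: (Co, Ci, Hf, Wf), bias: (Co,); stride 1, no padding
    Co, Ci, Hf, Wf = weight.shape
    _, Hi, Wi = ift.shape
    Ho, Wo = Hi - Hf + 1, Wi - Wf + 1
    oft = np.zeros((Co, Ho, Wo), dtype=np.float32)
    for c in range(Co):
        for h in range(Ho):
            for w in range(Wo):
                window = ift[:, h:h + Hf, w:w + Wf]
                oft[c, h, w] = bias[c] + np.sum(window * weight[c])
    return oft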

2 BatchNorm2D

%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void BatchNorm2D(__global const float *ift, 
        __global float *mean, __global float *std,
        __global float *eps, __global float *weight, __global float *bias,
        __global int *channel, __global int *height, __global int *width,
        __global float *oft) {
    int c = *channel, h = *height, w = *width;
    int posc = get_global_id(0), posh = get_global_id(1), posw = get_global_id(2);
    int i = posc*(w*h) + (posh*w+posw);
    float e = *eps;
    float res1 = ift[i] - mean[posc];
    float res2 = sqrt(std[posc] + e);
    float res3 = res1 / res2;
    oft[i] = res3 * weight[posc] + bias[posc];
}
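
For reference, every work-item evaluates the standard batch-norm transform on one element. Note that the `std` argument actually holds the per-channel variance, because the kernel adds `eps` before taking the square root (the host wrapper in section 6 passes it accordingly):

$$ y_{c,h,w} = \frac{x_{c,h,w} - \mu_c}{\sqrt{\sigma_c^2 + \epsilon}} \cdot \gamma_c + \beta_c $$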

3 Linear

%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void Linear(__global const float *ift, 
                     __global float *weight, __global float *bias,
                     __global int *output_channel, __global int *input_channel,
                     __global float *oft) {
    int Co = *output_channel, Ci = *input_channel;
    int posCo = get_global_id(0);
    
    oft[posCo] = bias[posCo];
    for(int k = 0; k < Ci; k++) {
        oft[posCo] += ift[k]*weight[posCo*Ci+k];
    }
}

4 MaxPool2D

%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void MaxPool2D(__global const float *ift, 
                        __global int *size, __global int *stride,
                        __global int *channel, 
                        __global int *input_height, __global int *input_width, 
                        __global int *output_height, __global int *output_width, 
                        __global float *oft) {
    int sz = *size, sd = *stride;
    int C = *channel, Hi = *input_height, Wi = *input_width, Ho = *output_height, Wo = *output_width;
    int posc = get_global_id(0), posh = get_global_id(1), posw = get_global_id(2);
    
    int So = Ho*Wo, Si = Hi*Wi;
    int i = (posc*(So))+(posh*Wo)+(posw);
    int startX = posw*sd, startY = posh*sd;  // top-left corner of this pooling window
    
    oft[i] = ift[(posc*(Si))+(startY*Wi)+startX];  // initialise the max with the first element of the window
    for(int y = 0; y < sz; y++) {
        for(int x = 0; x < sz; x++) {
            oft[i] = max(oft[i], ift[(posc*(Si))+((startY+y)*Wi)+(startX+x)]);
        }
    }
}

5 ReLU

ReLU is split into two kernels, one for 1-D tensors and one for 3-D tensors:

%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void ReluD1(__global const float *ift, __global float *oft) {
    int i = get_global_id(0);
    oft[i] = max((float)0, ift[i]);
}

%%cl_kernel -o "-cl-fast-relaxed-math"
__kernel void ReluD3(__global const float *ift, __global float *oft,
                  __global int *channel, __global int *height, __global int *width) {
    int c = *channel;
    int h = *height;
    int w = *width;
    int posc = get_global_id(0);
    int posh = get_global_id(1);
    int posw = get_global_id(2);
    int i = posc*(w*h) + (posh*w+posw);
    oft[i] = max((float)0, ift[i]);
}
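
Since the index computed in ReluD3, posc*(w*h) + posh*w + posw, is exactly the flat offset into a contiguous CHW array, the 1-D kernel alone would also be enough if the host flattened the tensor first. A sketch of that alternative (using the ctx/queue/mf/ReluD1 names from the setup above):

def relu3d_via_1d(x):
    # Flatten the CHW tensor, run the 1-D kernel, and restore the shape.
    flat = np.ascontiguousarray(x, dtype=np.float32).ravel()
    out = np.empty_like(flat)
    in_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=flat)
    out_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, out.nbytes)
    ReluD1(queue, flat.shape, None, in_gpu, out_gpu)
    cl.enqueue_copy(queue, out, out_gpu)
    return out.reshape(x.shape)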

6 Host-side wrappers for the OpenCL kernels (the data handling here leaves room for optimization)

def conv2d(input_numpy, kernel_weight_numpy, kernel_bias_numpy, padding = 0):
    # Preprocessing before the convolution: zero padding and output allocation on the host
    weight_cpu = kernel_weight_numpy
    bias_cpu = kernel_bias_numpy

    Ci, Hi, Wi = input_numpy.shape
    input_cpu = np.zeros((Ci, Hi+2*padding, Wi+2*padding)).astype(np.float32)
    if padding > 0:
        input_cpu[:, padding:-padding, padding:-padding] = input_numpy
    else:
        input_cpu = input_numpy

    Ci, Hi, Wi = input_cpu.shape
    Co, Ci, Hf, Wf = weight_cpu.shape
    Ho, Wo = Hi - Hf + 1, Wi - Wf + 1
    output_cpu = np.zeros((Co,Ho,Wo)).astype(np.float32)
    # Copy the data from the host to the device
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = input_cpu)

    kernel_weight_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = weight_cpu)
    kernel_bias_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = bias_cpu)

    output_channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Co))
    output_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Ho))
    output_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Wo))
    input_channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Ci))
    input_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Hi))
    input_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Wi))
    feature_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Hf))
    feature_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Wf))

    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)
    # Launch the kernel on the device (one work-item per output element)
    Conv2D(queue, output_cpu.shape, None, 
           input_gpu, 
           kernel_weight_gpu, kernel_bias_gpu,
           output_channel_gpu, output_height_gpu, output_width_gpu,
           input_channel_gpu, input_height_gpu, input_width_gpu,
           feature_height_gpu, feature_width_gpu,
           output_gpu)
    # Read the result back from the device to the host
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    # Return the result
    return output_cpu
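
A quick way to check the wrapper against PyTorch's own convolution (a verification sketch; the shapes match the first LeNet layer, and torch.nn.functional is imported as F):

import torch.nn.functional as F

x = np.random.randn(1, 28, 28).astype(np.float32)
w = np.random.randn(6, 1, 5, 5).astype(np.float32)
b = np.random.randn(6).astype(np.float32)
ours = conv2d(x, w, b, padding=2)
ref = F.conv2d(torch.from_numpy(x).unsqueeze(0), torch.from_numpy(w),
               torch.from_numpy(b), padding=2)[0].numpy()
print(np.abs(ours - ref).max())   # expect only a tiny floating-point difference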

def batchnorm2d(input_numpy, eps, weight, bias):
    # Host-side preprocessing: compute the per-channel batch statistics
    mean = input_numpy.mean(axis=(1,2))
    std = input_numpy.std(axis=(1,2))**2    # actually the variance; the kernel adds eps and takes sqrt
    output_cpu = np.empty_like(input_numpy)
    # Copy the data from the host to the device
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = input_numpy)

    mean_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = mean)
    std_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = std)

    eps_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.float32(eps))  # eps is declared as float* in the kernel
    weight_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = weight)
    bias_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = bias)

    channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[0]))
    height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[1]))
    width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[2]))

    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)
    # Launch the kernel on the device
    BatchNorm2D(queue, input_numpy.shape, None, input_gpu, 
                mean_gpu, std_gpu, 
                eps_gpu, weight_gpu, bias_gpu, 
                channel_gpu, height_gpu, width_gpu, 
                output_gpu)
    # Read the result back from the device to the host
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    # Return the result
    return output_cpu
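
Because this wrapper recomputes mean and variance from the current input, it behaves like BatchNorm in training mode. To reproduce PyTorch's eval-mode behaviour, the same kernel could instead be fed the layer's stored running statistics; a sketch (the data flow mirrors batchnorm2d() above, only the statistics change):

def batchnorm2d_eval(input_numpy, bn_layer):
    # Use the layer's running statistics (eval-mode behaviour) instead of batch statistics.
    mean = np.ascontiguousarray(bn_layer.running_mean.detach().numpy(), dtype=np.float32)
    var = np.ascontiguousarray(bn_layer.running_var.detach().numpy(), dtype=np.float32)
    weight = bn_layer.weight.detach().numpy()
    bias = bn_layer.bias.detach().numpy()
    output_cpu = np.empty_like(input_numpy)

    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = input_numpy)
    mean_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = mean)
    var_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = var)
    eps_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.float32(bn_layer.eps))
    weight_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = weight)
    bias_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = bias)
    channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[0]))
    height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[1]))
    width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[2]))
    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)

    BatchNorm2D(queue, input_numpy.shape, None, input_gpu,
                mean_gpu, var_gpu, eps_gpu, weight_gpu, bias_gpu,
                channel_gpu, height_gpu, width_gpu, output_gpu)
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    return output_cpu

Note that this only matches the end-to-end PyTorch output if the baseline model is also run in eval() mode.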

def relu(input_numpy):
    output_cpu = np.empty_like(input_numpy)
    
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = input_numpy)
    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)
    if len(input_numpy.shape) == 3:
        channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[0]))
        height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[1]))
        width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(input_numpy.shape[2]))
        
        ReluD3(queue, input_numpy.shape, None, input_gpu, output_gpu, channel_gpu, height_gpu, width_gpu)
    else:
        ReluD1(queue, input_numpy.shape, None, input_gpu, output_gpu)
        
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    return output_cpu

def maxpool2d(input_numpy,size=2,stride=2):
    # Host side: zero-pad so every pooling window stays in range, and allocate the output
    # (zero padding may differ from PyTorch's ceil_mode when a window holds only negative values)
    C, Hi, Wi = input_numpy.shape
    Ho, Wo = int(np.floor(Hi/stride)),int(np.floor(Wi/stride))
    row_remainder,col_remainder = Hi%stride, Wi%stride
    Ho += int(row_remainder!=0)
    Wo += int(col_remainder!=0)
    input_cpu = np.zeros((C, Hi+size-row_remainder, Wi+size-col_remainder)).astype(np.float32)
    input_cpu[:, :Hi, :Wi] = input_numpy

    C, Hi, Wi = input_cpu.shape

    output_cpu = np.zeros((C,Ho,Wo)).astype(np.float32)
    # Copy the data from the host to the device
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = input_cpu)

    size_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(size))
    stride_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(stride))

    channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(C))
    input_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Hi))
    input_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Wi))
    output_height_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Ho))
    output_width_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Wo))

    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)
    # Launch the kernel on the device
    MaxPool2D(queue, output_cpu.shape, None, 
               input_gpu, 
               size_gpu, stride_gpu,
               channel_gpu,
               input_height_gpu, input_width_gpu,
               output_height_gpu, output_width_gpu,
               output_gpu)
    # Read the result back from the device to the host
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    # Return the result
    return output_cpu

def linear(input_numpy, weight_numpy, bias_numpy):
    # Minimal host-side preprocessing
    input_cpu = input_numpy
    weight_cpu = weight_numpy
    bias_cpu = bias_numpy

    Co, Ci = weight_cpu.shape

    output_cpu = np.zeros((Co,)).astype(np.float32)
    # Copy the data from the host to the device
    input_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = input_cpu)

    weight_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = weight_cpu)
    bias_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = bias_cpu)

    output_channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Co))
    input_channel_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf = np.int32(Ci))

    output_gpu = cl.Buffer(ctx, mf.WRITE_ONLY, output_cpu.nbytes)
    # Launch the kernel on the device
    Linear(queue, output_cpu.shape, None, 
               input_gpu, 
               weight_gpu, bias_gpu,
               output_channel_gpu, input_channel_gpu,
               output_gpu)
    # Read the result back from the device to the host
    cl.enqueue_copy(queue, output_cpu, output_gpu)
    # Return the result
    return output_cpu
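
A one-line NumPy cross-check of the fully connected wrapper (sketch; the shapes match the first LeNet FC layer):

x = np.random.randn(400).astype(np.float32)
w = np.random.randn(120, 400).astype(np.float32)
b = np.random.randn(120).astype(np.float32)
print(np.abs(linear(x, w, b) - (w @ x + b)).max())   # expect only a tiny floating-point difference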

7 Load the PyTorch model and extract its parameters

class LeNet(nn.Module):
    def __init__(self, num_classes):
        super(LeNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(6),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.layer2 = nn.Sequential(
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1 ),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.fclayer = nn.Sequential(
            nn.Linear(16*5*5, 120),
            nn.ReLU(),
            nn.Linear(120,84),
            nn.ReLU(),
            nn.Linear(84,num_classes)
        ) 
    def forward(self, x):
        x = self.layer1(x)
        x = self.layer2(x)
        x=x.view(-1, 16*5*5)
        x=self.fclayer(x)
        return x

model = LeNet(10)
model.load_state_dict(torch.load('model.pth'))

8 LeNet inference with the OpenCL operators

def opencl_LeNet(model, input_numpy):
    layer1_Conv2d = model.layer1[0]
    layer1_BatchNorm2d = model.layer1[1]
    layer1_ReLU = model.layer1[2]
    layer1_MaxPool2d = model.layer1[3]

    layer2_Conv2d = model.layer2[0]
    layer2_BatchNorm2d = model.layer2[1]
    layer2_ReLU = model.layer2[2]
    layer2_MaxPool2d = model.layer2[3]

    fclayer_Linear_1 = model.fclayer[0]
    fclayer_ReLU_1 = model.fclayer[1]
    fclayer_Linear_2 = model.fclayer[2]
    fclayer_ReLU_2 = model.fclayer[3]
    fclayer_Linear_3 = model.fclayer[4]

    
    
    layer1_Conv2d_numpy_out = conv2d(input_numpy, layer1_Conv2d.weight.detach().numpy(), layer1_Conv2d.bias.detach().numpy(), padding = 2)
    layer1_BatchNorm2d_numpy_out = batchnorm2d(layer1_Conv2d_numpy_out, layer1_BatchNorm2d.eps, layer1_BatchNorm2d.weight.detach().numpy(), layer1_BatchNorm2d.bias.detach().numpy())
    layer1_ReLU_numpy_out = relu(layer1_BatchNorm2d_numpy_out)
    layer1_MaxPool2d_numpy_out = maxpool2d(layer1_ReLU_numpy_out, 2)

    layer2_Conv2d_numpy_out = conv2d(layer1_MaxPool2d_numpy_out, layer2_Conv2d.weight.detach().numpy(), layer2_Conv2d.bias.detach().numpy(), padding = 0)
    layer2_BatchNorm2d_numpy_out = batchnorm2d(layer2_Conv2d_numpy_out, layer2_BatchNorm2d.eps, layer2_BatchNorm2d.weight.detach().numpy(), layer2_BatchNorm2d.bias.detach().numpy())
    layer2_ReLU_numpy_out = relu(layer2_BatchNorm2d_numpy_out)
    layer2_MaxPool2d_numpy_out = maxpool2d(layer2_ReLU_numpy_out, 2)

    layer2_MaxPool2d_numpy_out = layer2_MaxPool2d_numpy_out.reshape(-1, 16*5*5)

    fclayer_Linear_1_numpy_out = linear(layer2_MaxPool2d_numpy_out, fclayer_Linear_1.weight.detach().numpy(), fclayer_Linear_1.bias.detach().numpy())
    fclayer_ReLU_1_numpy_out = relu(fclayer_Linear_1_numpy_out)
    fclayer_Linear_2_numpy_out = linear(fclayer_ReLU_1_numpy_out, fclayer_Linear_2.weight.detach().numpy(), fclayer_Linear_2.bias.detach().numpy())
    fclayer_ReLU_2_numpy_out = relu(fclayer_Linear_2_numpy_out)
    fclayer_Linear_3_numpy_out = linear(fclayer_ReLU_2_numpy_out, fclayer_Linear_3.weight.detach().numpy(), fclayer_Linear_3.bias.detach().numpy())

    return fclayer_Linear_3_numpy_out

input = torch.randn(1,1,28,28)
input_numpy = input.numpy()[0,:,:,:]

# OpenCL timing test
time1 = time.time()
out = opencl_LeNet(model,input_numpy)
time2 = time.time()
print('{} ms'.format(1000*(time2-time1)))
print(out)

# PyTorch CPU baseline
time1 = time.time()
out = model(input)
time2 = time.time()
print('{} ms'.format(1000*(time2-time1)))
print(out.detach().numpy())
  • The batch dimension is dropped. An OpenCL NDRange can only be 1-, 2-, or 3-dimensional, and in NCHW the C, H, W dimensions are the large ones, so parallelizing over them gives the best payoff; the N dimension can simply be handled by pushing samples through one at a time, so it was removed.
  • Every layer takes host-side (CPU) data in and returns host-side data. Each layer needs a fair amount of preprocessing that would be awkward to do in OpenCL, so every wrapper preprocesses on the CPU, copies the data to the device, runs the kernel, copies the result back to the host, and returns it. As a consequence, data is shuttled between host and device at every layer, and that is essentially where the bottleneck is: the pure device execution time across all layers is about 6 ms, roughly twice as slow as the PyTorch CPU run. An obvious optimization is to do all the preprocessing on the CPU up front and hand the data to the device in one go, keeping intermediate results on the device, which should give a noticeable speedup; see the sketch after this list.
  • Each operator is a direct transcription of the earlier for-loop implementation; there are still plenty of details inside each kernel that could be optimized.
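
As an illustration of the last two points, one way to cut the host/device traffic is to keep intermediate feature maps on the device and only copy the network input in and the final result out. A minimal sketch covering a single conv + ReLU pair (same ctx/queue/kernels as above; the small ibuf() helper exists only for brevity here):

def conv_relu_on_device(x, w, b, padding):
    # Host-side padding and shape bookkeeping, as before.
    Ci, Hi, Wi = x.shape
    xp = np.zeros((Ci, Hi + 2*padding, Wi + 2*padding), dtype=np.float32)
    xp[:, padding:padding + Hi, padding:padding + Wi] = x
    Co, Ci, Hf, Wf = w.shape
    Ho, Wo = xp.shape[1] - Hf + 1, xp.shape[2] - Wf + 1

    def ibuf(v):
        # The kernels take scalars as int pointers, so wrap each one in a tiny buffer.
        return cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np.int32(v))

    in_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=xp)
    w_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=w)
    b_gpu = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=b)
    nbytes = Co * Ho * Wo * np.dtype(np.float32).itemsize
    conv_out = cl.Buffer(ctx, mf.READ_WRITE, nbytes)   # stays on the device
    relu_out = cl.Buffer(ctx, mf.READ_WRITE, nbytes)

    Conv2D(queue, (Co, Ho, Wo), None, in_gpu, w_gpu, b_gpu,
           ibuf(Co), ibuf(Ho), ibuf(Wo),
           ibuf(Ci), ibuf(xp.shape[1]), ibuf(xp.shape[2]),
           ibuf(Hf), ibuf(Wf), conv_out)
    ReluD3(queue, (Co, Ho, Wo), None, conv_out, relu_out, ibuf(Co), ibuf(Ho), ibuf(Wo))

    out = np.empty((Co, Ho, Wo), dtype=np.float32)
    cl.enqueue_copy(queue, out, relu_out)   # only the final result is copied back
    return out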
