Implementing a CUDA Kernel: Matrix Convolution

This post implements a 2D matrix convolution operation. It is beginner-level work, so comments and corrections are welcome.
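
For reference, given an input map that has already been padded, a kernel and a stride, the GPU kernel below fills each result element with a sliding dot product:

    res[i][j] = sum over (ii, jj) of map[i*stride + ii][j*stride + jj] * ker[ii][jj]

where ii runs over the kernel rows and jj over the kernel columns (this is the correlation-style convolution commonly used in CNNs, i.e. the kernel is not flipped).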


# -*- coding: utf-8 -*-
"""
Created on Sun Feb 20 15:50:20 2022
@author: xuning
"""

import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
import numpy as np
import cv2
import torch
import torch.nn as nn
from scipy import signal
from scipy import misc
import sys
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

print("python ready\n")
print(f'The version of PyCUDA: {pycuda.VERSION}')
print(f'The version of Python: {sys.version}')

gpu_code = r"""
/* gpuConv: GPU kernel for 2D matrix convolution
*  Only single-channel convolution is implemented, i.e. the kernel, the input map and the result matrix each have one channel
*  The input map must already be padded before it is passed to this function
*  The kernel's row and column counts must not exceed those of the (padded) input map
*  Rectangular (non-square) kernels are supported
*  If the leftover rows/columns at the edges are too small for one more stride, they are simply not convolved, so the caller has to plan the sizes accordingly
*  map: pointer to the image or feature-map matrix to be convolved
*  ker: pointer to the convolution kernel matrix
*  res: pointer to the result matrix; its size can be derived from the other parameters, and the caller must make sure the result buffer it passes in matches that size
*  stride: sliding step of the convolution kernel
*/
void __global__ gpuConv(float *map, float *ker, float *res, unsigned int *map_row, unsigned int *map_col, 
    unsigned int *ker_row, unsigned int *ker_col, unsigned int *res_row, unsigned int *res_col, unsigned int *stride)
{
    /*
    2D matrix convolution: the kernel slides over the input map with step `stride`, left to right and top to bottom;
    at each position the overlapping region and the kernel are multiplied element-wise and the products are accumulated.
    */
    
    unsigned int uiThdCountInBlock=0, uiBlkCountInGrid=0, uiThdCountInGrid, uiNoBlkInGrid=0, uiNoThdInBlock=0, uiNoThdInGrid=0;
    unsigned int uiMapRow=0, uiMapCol=0, uiKerRow=0, uiKerCol=0, uiResRow=0, uiResCol=0, uiStride=0, uiCalRow=0, uiCalCol=0;
    unsigned int i=0, j=0, ii=0, jj=0;
    unsigned uiBaseRow = 0, uiBaseCol = 0;
    unsigned int uiThdCountOfPreStrides = 0; // number of result elements already covered by earlier passes of the thread group (total thread count * completed passes)
    
    /// { some basic bookkeeping
    // number of threads in one block
    uiThdCountInBlock = blockDim.x * blockDim.y * blockDim.z;
    // number of blocks in one grid
    uiBlkCountInGrid = gridDim.x * gridDim.y * gridDim.z;
    // number of threads in one grid (this is the reuse step of the whole thread group; note this "stride" is not the convolution stride)
    uiThdCountInGrid = uiThdCountInBlock * uiBlkCountInGrid;
    // linear index of this block within its grid (numbered from 0)
    uiNoBlkInGrid = blockIdx.x + blockIdx.y*gridDim.x + blockIdx.z*(gridDim.x*gridDim.y);
    // linear index of this thread within its block (numbered from 0)
    uiNoThdInBlock = threadIdx.x + threadIdx.y*blockDim.x + threadIdx.z*(blockDim.x*blockDim.y);
    // linear index of this thread within its grid (numbered from 0)
    uiNoThdInGrid = uiNoThdInBlock + uiThdCountInBlock*uiNoBlkInGrid;
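    // Worked example with the launch configuration used in the host code below (one block, block=(3,2,2)):
    //   uiThdCountInBlock = 3*2*2 = 12, uiBlkCountInGrid = 1, uiThdCountInGrid = 12,
    //   and the thread with threadIdx=(1,1,1) gets uiNoThdInBlock = 1 + 1*3 + 1*(3*2) = 10 = uiNoThdInGrid.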
    
    uiMapRow = (unsigned int)(map_row[0]);
    uiMapCol = (unsigned int)(map_col[0]);
    uiKerRow = (unsigned int)(ker_row[0]);
    uiKerCol = (unsigned int)(ker_col[0]);
    uiResRow = (unsigned int)(res_row[0]);
    uiResCol = (unsigned int)(res_col[0]);
    uiStride = (unsigned int)(stride[0]);
    
    // if the kernel is larger than the (padded) input map, the operation is not supported
    if( uiMapRow < uiKerRow || uiMapCol < uiKerCol )
    {
        return;
    }
    
    // threads whose linear index is beyond the number of result elements have nothing to do
    if( uiNoThdInGrid >= uiResRow*uiResCol )
    {
        //printf("uiNoThdInGrid:%d >= %d*%d\n", uiNoThdInGrid, uiResRow, uiResCol);
        return;
    }
    
    // if there are fewer threads than result elements, some threads compute several multiply-accumulate groups
    // (the case where the two counts are exactly equal is included as well)
    uiThdCountOfPreStrides = 0;
    while( uiNoThdInGrid + uiThdCountOfPreStrides < uiResRow*uiResCol )
    {
        // row index in the result matrix
        i = (uiNoThdInGrid + uiThdCountOfPreStrides) / uiResCol;
        // column index in the result matrix
        //j = (uiNoThdInGrid + uiThdCountOfPreStrides) - i * uiResCol;
        j = (uiNoThdInGrid + uiThdCountOfPreStrides) % uiResCol;
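        // Worked example with the test case below: 12 threads in total and a 5x4 result matrix (20 elements),
        // so thread 3 handles element 3 -> (i,j)=(0,3) on its first pass and element 3+12=15 -> (i,j)=(3,3) on its second pass.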
        
        //printf( "one muladd_group: GridDim:(%d,%d,%d), BlockDim:(%d,%d,%d), BlockIdx:(%d,%d,%d), ThreadIdx(%d,%d,%d), Map(%d,%d), kernel(%d,%d), ResX(%d,%d), stride:%d, uiNoThdInGrid=%d, i=%d, j=%d\n",
        //    gridDim.x, gridDim.y, gridDim.z, blockDim.x, blockDim.y, blockDim.z, blockIdx.x, blockIdx.y, blockIdx.z, threadIdx.x, threadIdx.y, threadIdx.z, uiMapRow, uiMapCol, uiKerRow, uiKerCol, uiResRow, uiResCol, uiStride, uiNoThdInGrid, i, j);

        // result element (i,j) = dot product of the kernel with the region of the input map it currently overlaps, accumulated
        // (indexing the buffers as 2D arrays seems to cause a compile error, so flat indexing is used)
        res[i*uiResCol+j] = 0.0;
        uiBaseRow = i*uiStride, uiBaseCol = j*uiStride;
        for( ii=0; ii<uiKerRow; ii++ )
        {
            for( jj=0; jj<uiKerCol; jj++ )
            {
                res[i*uiResCol+j] += map[(uiBaseRow+ii)*uiMapCol + (uiBaseCol+jj)] * ker[ii*uiKerCol+jj];
            }
        }
        
        // advance by one full pass of the thread group and check whether this thread has another element to compute
        uiThdCountOfPreStrides += uiThdCountInGrid;
    }
    return;
}
"""

mod = SourceModule(gpu_code)
gconv = mod.get_function("gpuConv")


# my_conv: host-side wrapper that pads the input map, copies the data to the GPU and launches gpuConv
# only padding = 1 is really supported for now; how to pad when padding > 1 has not been worked out yet
def my_conv( map, kernel, stride=1, padding=0, padtype='0'):
    if stride <= 0:
        print( 'stride {} is invalid, must > 0, so do nothing'.format(stride) )
        return None

    if padding < 0:
        print( 'padding {} is invalid, must >= 0, so do nothing'.format(padding) )
        return None
    elif padding > 0: # for now only padding = 1 is supported
        if type(padding) != int:
            # print( '!! ERR: type of padding par is invalid, {}', type(padding) )
            # return None
            print( '!! Warning: type of padding par is invalid, {}, regarded as 1'.format(type(padding)) )
            padding = 1

        if padtype == 'same':
            # catV = np.zeros((padding, map.shape[1]))
            catV = map[0, :]
            # print(catV)
            map = np.vstack((catV, map))
            # print(map)
            catV = map[-1:, :]
            # print(catV)
            map = np.vstack((map, catV))
            # print(map)

            # the hstack-based version below still needs fixing
            # catH = np.zeros((map.shape[0], padding))
            # catH = (map.T)[padding]
            # catH = catH.reshape(map.shape[0],1).T
            # print(catH)
            # map = np.hstack((catH, map))
            # map = np.hstack((map, catH))

            # transpose, pad the columns with vstack as well, then transpose back
            map = map.T
            catH = map[0,:]
            # print(catH)
            map = np.vstack((catH, map))
            # print(map)
            catH = map[-1,:]
            # print(catH)
            map = np.vstack((map, catH))
            # print(map)
            map = map.T
            # print(map)
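            # Example of this replicate-style padding with padding=1:
            # [[1,2],[3,4]] becomes [[1,1,2,2],[1,1,2,2],[3,3,4,4],[3,3,4,4]] (edge rows/columns duplicated)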
        else:
            catV = np.zeros((padding, map.shape[1]))
            map = np.vstack((catV, map))
            map = np.vstack((map, catV))
            catH = np.zeros((map.shape[0], padding))
            map = np.hstack((catH, map))
            map = np.hstack((map, catH))

        print('after padding, map is\n{}'.format(map))

    res_row = (map.shape[0]-kernel.shape[0]) // stride + 1  # using "/" here would give a float and the later parameters would no longer match
    res_col = (map.shape[1]-kernel.shape[1]) // stride + 1
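    # e.g. for the 5x4 test matrix A below, a 3x3 kernel, padding=1 and stride=1, the padded map is 7x6,
    # so res_row = (7-3)//1 + 1 = 5 and res_col = (6-3)//1 + 1 = 4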
    # print(padding)
    # print(stride)
    # print( res_row )
    # print( res_col )
    res = np.zeros( (res_row, res_col), dtype=np.float32 )

    # (float *map, float *ker, float *res, unsigned int *map_row, unsigned int *map_col, 
    # unsigned int *ker_row, unsigned int *ker_col, unsigned int *res_row, unsigned int *res_col, unsigned int *stride)
    map_g = gpuarray.to_gpu( map.astype(np.float32) )
    kernel_g = gpuarray.to_gpu( kernel.astype(np.float32) )
    res_g = gpuarray.to_gpu( res.astype(np.float32) )
    map_row_g = gpuarray.to_gpu( np.array([map.shape[0]], dtype=np.uint32) )
    map_col_g = gpuarray.to_gpu( np.array([map.shape[1]], dtype=np.uint32) )
    ker_row_g = gpuarray.to_gpu( np.array([kernel.shape[0]], dtype=np.uint32) )
    ker_col_g = gpuarray.to_gpu( np.array([kernel.shape[1]], dtype=np.uint32) )
    res_row_g = gpuarray.to_gpu( np.array([res.shape[0]], dtype=np.uint32) )
    res_col_g = gpuarray.to_gpu( np.array([res.shape[1]], dtype=np.uint32) )
    stride_g = gpuarray.to_gpu( np.array([stride], dtype=np.uint32) )

    gconv( map_g, kernel_g, res_g, map_row_g, map_col_g, ker_row_g, ker_col_g, res_row_g, res_col_g, stride_g, grid=(1,), block=(3,2,2) )
    return res_g.get()
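
# For a quick host-side cross-check of gpuConv, the same sliding dot product can be written as a plain
# NumPy loop over an already padded map. This is only a minimal sketch added for illustration;
# my_conv_cpu is not part of the original script (it reuses the numpy imported above).
def my_conv_cpu(map_padded, kernel, stride=1):
    # reference implementation of the same sliding dot product as gpuConv, on a pre-padded input map
    res_row = (map_padded.shape[0] - kernel.shape[0]) // stride + 1
    res_col = (map_padded.shape[1] - kernel.shape[1]) // stride + 1
    res = np.zeros((res_row, res_col), dtype=np.float32)
    for i in range(res_row):
        for j in range(res_col):
            patch = map_padded[i*stride:i*stride+kernel.shape[0], j*stride:j*stride+kernel.shape[1]]
            res[i, j] = np.sum(patch * kernel)
    return res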


m = 5
n = 4
k1 = 3
k2 = 3
s = 1
A = np.array([
    [1,2,3,4],
    [5,6,7,8],
    [9,1,2,3],
    [4,5,6,7],
    [8,9,1,2]
])
B = np.array([
    [-1,-1,-1],
    [0,0,0],
    [1,1,1]
])
# A = np.ones((m,n))
# B = np.ones((k1,k2))*3
# A = np.random.randint(10, 1000, size=(m, n))
# B = np.random.randint(1,100, size=(k1, k2))
# A = np.random.rand(m, n)
# B = np.random.rand(k1, k2)
A = A.astype(np.float32)
B = B.astype(np.float32)
print('A(shape {})=\n{}'.format(A.shape, A))
print('B(shape {})=\n{}'.format(B.shape, B))

C = my_conv( A, B, stride=s, padding=1, padtype='same')
# print('after my_conv, A(shape {})=\n{}'.format(A.shape, A))
# print('after my_conv, B(shape {})=\n{}'.format(B.shape, B))
# print("my_conv result:")

# CC = signal.convolve2d(A, B, mode='valid') # boundary='symm'
CC = signal.convolve2d(A, B, mode='same') # boundary='symm'

# AA = torch.tensor(A)
# BB = torch.tensor(B)
# CC = torch.matmul(AA, BB)

CCC = cv2.filter2D(A, -1, B)

# np.convolve() only works on 1-D arrays
# nn.Conv2d() does not work here either; I could not pass the kernel weights in directly
# CCC = np.dot(A,B)
# print('CCC(shape {})=\n{}'.format(CCC.shape, CCC))

# print('A(shape {})=\n{}'.format(A.shape, A))
# print('B(shape {})=\n{}'.format(B.shape, B))
print('C(shape {})=\n{}'.format(C.shape, C))
print('CC(shape {})=\n{}'.format(CC.shape, CC))
print('CCC(shape {})=\n{}'.format(CCC.shape, CCC))


# compare the CPU and GPU results
# the opencv and scipy "convolution" functions give results that differ from my GPU kernel in many places,
# and they also differ from each other; at least part of the reason is that scipy.signal.convolve2d computes
# a true convolution (it flips the kernel) while cv2.filter2D and my kernel compute a correlation (no flip),
# and the default border handling of the two library functions is not the same as the padding done in my_conv
# my own kernel has been checked by hand on some simple matrices and the results there are correct
print(f'(1)Is the host computation 1 close to the GPU computation? : {np.allclose(C, CC)}')
print(f'(2)Is the host computation 2 close to the GPU computation? : {np.allclose(C, CCC)}')
print(f'(3)Are the two host computations close? : {np.allclose(CC, CCC)}')
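
# If the explanation above is right, flipping the kernel and asking scipy for symmetric (edge-replicating)
# borders should reproduce the GPU result for this 3x3 kernel with stride=1 and padtype='same'.
# This extra check is only a sketch added for illustration; it was not part of the original experiment.
CC_corr = signal.convolve2d(A, np.flip(B), mode='same', boundary='symm')
print(f'(4)Does the flipped-kernel scipy result match the GPU result? : {np.allclose(C, CC_corr)}')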
