图中的M向量为卷积核(mask),N向量为输入,P向量为输出。其中P[2] = N[0] * M[0] + ... + N[4] * M[4]。卷积计算需要考虑边界问题:如图,卷积核长度为5,在计算前2个元素和后2个元素时,所需的输入数据位置会越过边界。
这时我们需要手动地添加被称为Ghost Cell的元素,该元素的值视情况而定,这里为了简便,取0值即可。在有的应用里面也会把这个值设定为边界处输入元素的值(例如N[0])。
一维卷积kernel 代码:
/**
 * Basic (untiled) 1D convolution: each thread computes one element of P.
 *
 * P[i] is the sum over j in [0, Mask_Width) of N[i - Mask_Width/2 + j] * M[j].
 * Input positions that fall outside [0, Width) (the "ghost cells") are
 * treated as 0 by simply skipping them.
 *
 * Launch layout: 1D grid of 1D blocks; any configuration with at least
 * Width threads in total is valid, surplus threads exit without touching memory.
 *
 * @param N          input array of Width elements
 * @param M          convolution mask of Mask_Width elements
 * @param P          output array of Width elements
 * @param Mask_Width number of mask elements
 * @param Width      number of input/output elements
 */
__global__ void convolution_1D_basic_kernel(float* N, float* M, float* P, int Mask_Width, int Width) {
    int i = blockIdx.x * blockDim.x + threadIdx.x;

    // The grid is normally rounded up to a whole number of blocks, so the
    // trailing threads have no output element; writing P[i] there would be
    // an out-of-bounds store.
    if (i >= Width) {
        return;
    }

    float Pvalue = 0.0f;
    int N_start_point = i - (Mask_Width / 2);
    for (int j = 0; j < Mask_Width; j++) {
        int idx = N_start_point + j;
        // Skip ghost cells: they contribute 0 to the sum.
        if (idx >= 0 && idx < Width) {
            Pvalue += N[idx] * M[j];
        }
    }
    P[i] = Pvalue;
}
注意由此产生的Ghost Cell也是二维的。
如果计算P4,需要使用到N2,N3,N4,N5,N6.计算P5,需要使用到N3,N4,N5,N6,N7. 计算P7,需要使用到N5,N6,N7,N8,N9.
// NOTE(review): the lines below appear to be an excerpt of a shared-memory
// (tiled) 1D convolution rather than a complete listing. The kernel
// signature, the `else` of the bounds check, the closing braces, and the
// definitions of BLOCK_WIDTH, O_TILE_WIDTH, Width, Mask_Width, index_i,
// index_o, tx, j and Ns are not shown here.
#define TILE_WIDTH 4
#define MASK_WIDTH 5
// NOTE(review): this expands textually to `MASK_WIDTH/2` without parentheses,
// so an expression such as `2*n` would evaluate as `2*5/2`.
#define n MASK_WIDTH/2
// Host-side launch configuration: one block dimension of BLOCK_WIDTH threads.
dim3 dimBlock(BLOCK_WIDTH,1, 1);
// Number of blocks is Width divided by O_TILE_WIDTH, rounded up.
// NOTE(review): the statement ends with '.' instead of ';' as written.
dim3 dimGrid((Width-1)/O_TILE_WIDTH+1, 1, 1).
float output = 0.0f;
// Load one input element into Ns when index_i lies inside [0, Width).
if((index_i >= 0) && (index_i < Width) ) {
Ns[tx] = N[index_i];
// NOTE(review): presumably this assignment belongs to an `else` branch that
// zero-fills out-of-range (ghost) cells; the `} else {` is missing here.
Ns[tx] = 0.0f;
// Only the first O_TILE_WIDTH threads of the block accumulate an output.
// NOTE(review): no __syncthreads() is visible between the stores to Ns above
// and the reads of Ns below — confirm against the full listing.
if (threadIdx.x < O_TILE_WIDTH){
output = 0.0f;
for(j = 0; j < Mask_Width; j++) {
output += M[j] * Ns[j+threadIdx.x]; // read the value from shared memory.
P[index_o] = output;
二维卷积基于一维卷积。由于M在kernel执行期间是常量,可以利用constant memory来提高访问速度:当一个warp中的线程访问同一地址时,constant缓存会以广播的形式把数据发送给每个thread。
Image Matrix Type in HPP Course
// Image Matrix Structure declaration
// wbImage_t is a POINTER to an anonymous struct describing one image.
typedef struct {
// Image width — presumably in pixels (TODO confirm).
int width;
// Image height — presumably in pixels (TODO confirm).
int height;
// NOTE(review): presumably the row stride of `data`; units (elements vs.
// bytes) are not visible here — confirm against the library.
int pitch;
// Number of values stored per point (e.g. 3 for an RGB image).
int channels;
// Float image data. NOTE(review): the kernels in this document index it as
// (row * width + col) * channels + channel — confirm against the library.
float* data;
} * wbImage_t;
图像中的每个点(像素)表示一份信息。由于很多图像是彩色的,一个数值不足以描述一个点,所以有了channels的概念:比如RGB图像的每个点分别有R channel、G channel、B channel三个值。
二维卷积的code1 :
#include <wb.h>

// Evaluates a CUDA runtime call; on failure logs the failing statement and
// the CUDA error string, then makes the ENCLOSING function return -1.
#define wbCheck(stmt) do {                                                 \
        cudaError_t err = stmt;                                            \
        if (err != cudaSuccess) {                                          \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                    \
            wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err));  \
            return -1;                                                     \
        }                                                                  \
    } while(0)

#define MASK_WIDTH 5
// Parenthesized so the macro is safe inside larger expressions.
#define MASK_RADIUS (MASK_WIDTH / 2)
#define O_TILE_WIDTH 16 // (original note: 12)
// Input tile edge = output tile edge plus the halo on both sides.
#define BLOCK_WIDTH (O_TILE_WIDTH + MASK_WIDTH - 1)
// Float literals keep the clamp in single precision on the device.
#define CLAMP(x) (min(max((x), 0.0f), 1.0f))

//@@ INSERT CODE HERE
/**
 * Tiled 2D convolution of an interleaved multi-channel image with a
 * MASK_WIDTH x MASK_WIDTH mask stored row-major in M.
 *
 * Launch layout (see main): blocks of BLOCK_WIDTH x BLOCK_WIDTH threads, grid
 * of ceil(width / O_TILE_WIDTH) x ceil(height / O_TILE_WIDTH) blocks. Every
 * thread loads one element of the input tile (including the halo) into shared
 * memory; only the first O_TILE_WIDTH x O_TILE_WIDTH threads produce output.
 * Static shared memory: BLOCK_WIDTH * BLOCK_WIDTH floats.
 *
 * @param I        input image, indexed (row * width + col) * channels + channel
 * @param M        mask, MASK_WIDTH * MASK_WIDTH floats, row-major
 * @param P        output image, same layout as I; values are clamped to [0, 1]
 * @param channels number of interleaved channels
 * @param width    image width
 * @param height   image height
 */
__global__ void convolution_2d_kernel(float *I, const float* __restrict__ M, float *P,
                                      int channels, int width, int height) {
    __shared__ float Ns[BLOCK_WIDTH][BLOCK_WIDTH];

    int tx = threadIdx.x;
    int ty = threadIdx.y;

    // Output coordinates this thread would produce, and the input coordinates
    // it loads (shifted up/left by the mask radius to cover the halo).
    int row_o = blockIdx.y * O_TILE_WIDTH + ty;
    int col_o = blockIdx.x * O_TILE_WIDTH + tx;
    int row_i = row_o - MASK_RADIUS;
    int col_i = col_o - MASK_RADIUS;

    bool insideInput = (row_i >= 0 && row_i < height) && (col_i >= 0 && col_i < width);
    bool producesOutput = (ty < O_TILE_WIDTH && tx < O_TILE_WIDTH);

    for (int k = 0; k < channels; k++) {
        // Ghost cells outside the image are treated as 0.
        if (insideInput) {
            Ns[ty][tx] = I[(row_i * width + col_i) * channels + k];
        } else {
            Ns[ty][tx] = 0.0f;
        }
        // The whole tile must be loaded before any thread reads it.
        __syncthreads();

        if (producesOutput) {
            float output = 0.0f;
            for (int i = 0; i < MASK_WIDTH; i++) {      // mask row
                for (int j = 0; j < MASK_WIDTH; j++) {  // mask column
                    // Row i of the mask pairs with tile row ty + i. The
                    // previous M[j * MASK_WIDTH + i] applied the mask
                    // transposed, which is wrong for non-symmetric masks.
                    output += M[i * MASK_WIDTH + j] * Ns[ty + i][tx + j];
                }
            }
            if (row_o < height && col_o < width) {
                P[(row_o * width + col_o) * channels + k] = CLAMP(output);
            }
        }
        // Nobody may overwrite the tile for the next channel while it is
        // still being read. Reached by all threads: the loop bound is uniform.
        __syncthreads();
    }
}

/**
 * Reads an image and a 5x5 mask named on the command line, convolves the
 * image on the GPU and hands the result to wbSolution.
 *
 * @return 0 on success, -1 if any CUDA call fails.
 */
int main(int argc, char* argv[]) {
    wbArg_t args;
    int maskRows;
    int maskColumns;
    int imageChannels;
    int imageWidth;
    int imageHeight;
    char * inputImageFile;
    char * inputMaskFile;
    wbImage_t inputImage;
    wbImage_t outputImage;
    float * hostInputImageData;
    float * hostOutputImageData;
    float * hostMaskData;
    float * deviceInputImageData;
    float * deviceOutputImageData;
    float * deviceMaskData;

    args = wbArg_read(argc, argv); /* parse the input arguments */

    inputImageFile = wbArg_getInputFile(args, 0);
    inputMaskFile = wbArg_getInputFile(args, 1);

    inputImage = wbImport(inputImageFile);
    hostMaskData = (float *) wbImport(inputMaskFile, &maskRows, &maskColumns);

    assert(maskRows == 5); /* mask height is fixed to 5 in this mp */
    assert(maskColumns == 5); /* mask width is fixed to 5 in this mp */

    imageWidth = wbImage_getWidth(inputImage);
    imageHeight = wbImage_getHeight(inputImage);
    imageChannels = wbImage_getChannels(inputImage);
    printf("imageChannels =%d\n", imageChannels);

    outputImage = wbImage_new(imageWidth, imageHeight, imageChannels);

    hostInputImageData = wbImage_getData(inputImage);
    hostOutputImageData = wbImage_getData(outputImage);

    // Byte sizes computed once, in size_t, so large images do not overflow int.
    size_t imageBytes = (size_t)imageWidth * imageHeight * imageChannels * sizeof(float);
    size_t maskBytes = (size_t)maskRows * maskColumns * sizeof(float);

    wbTime_start(GPU, "Doing GPU Computation (memory + compute)");

    wbTime_start(GPU, "Doing GPU memory allocation");
    wbCheck(cudaMalloc((void **) &deviceInputImageData, imageBytes));
    wbCheck(cudaMalloc((void **) &deviceOutputImageData, imageBytes));
    wbCheck(cudaMalloc((void **) &deviceMaskData, maskBytes));
    wbTime_stop(GPU, "Doing GPU memory allocation");

    wbTime_start(Copy, "Copying data to the GPU");
    wbCheck(cudaMemcpy(deviceInputImageData, hostInputImageData, imageBytes,
                       cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceMaskData, hostMaskData, maskBytes,
                       cudaMemcpyHostToDevice));
    wbTime_stop(Copy, "Copying data to the GPU");

    wbTime_start(Compute, "Doing the computation on the GPU");
    //@@ INSERT CODE HERE
    // One block per O_TILE_WIDTH x O_TILE_WIDTH output tile (integer ceil-div);
    // each block carries the extra halo threads needed to load its input tile.
    dim3 dimGrid((imageWidth + O_TILE_WIDTH - 1) / O_TILE_WIDTH,
                 (imageHeight + O_TILE_WIDTH - 1) / O_TILE_WIDTH, 1);
    dim3 dimBlock(BLOCK_WIDTH, BLOCK_WIDTH, 1);
    convolution_2d_kernel<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData,
                                                 deviceOutputImageData,
                                                 imageChannels, imageWidth, imageHeight);
    // A launch does not return an error itself: pick up configuration errors
    // here and execution errors at the synchronization below.
    wbCheck(cudaGetLastError());
    // The launch is asynchronous; wait so the timer covers the kernel itself.
    wbCheck(cudaDeviceSynchronize());
    wbTime_stop(Compute, "Doing the computation on the GPU");

    wbTime_start(Copy, "Copying data from the GPU");
    wbCheck(cudaMemcpy(hostOutputImageData, deviceOutputImageData, imageBytes,
                       cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying data from the GPU");

    wbTime_stop(GPU, "Doing GPU Computation (memory + compute)");

    wbSolution(args, outputImage);

    wbCheck(cudaFree(deviceInputImageData));
    wbCheck(cudaFree(deviceOutputImageData));
    wbCheck(cudaFree(deviceMaskData));

    free(hostMaskData);
    wbImage_delete(outputImage);
    wbImage_delete(inputImage);

    return 0;
}
二维卷积的code2。和code1的区别是block的设置不同:这里的block size等于tile size(TILE_WIDTH × TILE_WIDTH),线程数少于需要加载的input元素个数(包含halo部分),所以在kernel中把input的加载分成两次来做。
#include <wb.h>

// Check for more information
// Evaluates a CUDA runtime call; on failure logs the failing statement and
// the CUDA error string, then makes the ENCLOSING function return -1.
#define wbCheck(stmt) do {                                                 \
        cudaError_t err = stmt;                                            \
        if (err != cudaSuccess) {                                          \
            wbLog(ERROR, "Failed to run stmt ", #stmt);                    \
            wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err));  \
            return -1;                                                     \
        }                                                                  \
    } while(0)

#define Mask_width 5
// Parenthesized so the macro is safe inside larger expressions.
#define Mask_radius (Mask_width / 2)
#define TILE_WIDTH 16
// Edge of the shared-memory input tile: output tile plus halo on both sides.
#define w (TILE_WIDTH + Mask_width - 1)
// Float literals keep the clamp in single precision on the device.
#define clamp(x) (min(max((x), 0.0f), 1.0f))

//@@ INSERT CODE HERE
/**
 * Tiled 2D convolution of an interleaved multi-channel image with a
 * Mask_width x Mask_width mask stored row-major in M.
 *
 * Launch layout (see main): blocks of TILE_WIDTH x TILE_WIDTH threads, grid
 * of ceil(width / TILE_WIDTH) x ceil(height / TILE_WIDTH) blocks. The block
 * has fewer threads than the w x w input tile has elements, so each thread
 * loads up to two elements, addressed through a flattened index.
 * Static shared memory: w * w floats.
 *
 * @param I        input image, indexed (row * width + col) * channels + channel
 * @param M        mask, Mask_width * Mask_width floats, row-major
 * @param P        output image, same layout as I; values are clamped to [0, 1]
 * @param channels number of interleaved channels
 * @param width    image width
 * @param height   image height
 */
__global__ void convolution(float *I, const float* __restrict__ M, float *P,
                            int channels, int width, int height) {
    __shared__ float N_ds[w][w];

    // Flattened position of this thread inside the block.
    int flat = threadIdx.y * TILE_WIDTH + threadIdx.x;
    // Top-left corner of the input tile (may be negative at the image border).
    int tileY = (int)blockIdx.y * TILE_WIDTH - Mask_radius;
    int tileX = (int)blockIdx.x * TILE_WIDTH - Mask_radius;

    for (int k = 0; k < channels; k++) {
        // First batch loading: tile elements [0, TILE_WIDTH * TILE_WIDTH).
        int dest = flat;
        int destY = dest / w;
        int destX = dest % w;
        int srcY = tileY + destY;
        int srcX = tileX + destX;
        if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width) {
            N_ds[destY][destX] = I[(srcY * width + srcX) * channels + k];
        } else {
            N_ds[destY][destX] = 0.0f;  // ghost cell
        }

        // Second batch loading: the remaining elements of the tile.
        dest = flat + TILE_WIDTH * TILE_WIDTH;
        destY = dest / w;
        destX = dest % w;
        srcY = tileY + destY;
        srcX = tileX + destX;
        // Two batches address more slots than the tile holds; skip the excess.
        if (destY < w) {
            if (srcY >= 0 && srcY < height && srcX >= 0 && srcX < width) {
                N_ds[destY][destX] = I[(srcY * width + srcX) * channels + k];
            } else {
                N_ds[destY][destX] = 0.0f;  // ghost cell
            }
        }
        // The whole tile must be loaded before any thread reads it.
        __syncthreads();

        float accum = 0.0f;
        for (int y = 0; y < Mask_width; y++) {
            for (int x = 0; x < Mask_width; x++) {
                accum += N_ds[threadIdx.y + y][threadIdx.x + x] * M[y * Mask_width + x];
            }
        }

        int outY = blockIdx.y * TILE_WIDTH + threadIdx.y;
        int outX = blockIdx.x * TILE_WIDTH + threadIdx.x;
        if (outY < height && outX < width) {
            P[(outY * width + outX) * channels + k] = clamp(accum);
        }
        // Nobody may overwrite the tile for the next channel while it is
        // still being read. Reached by all threads: the loop bound is uniform.
        __syncthreads();
    }
}

/**
 * Reads an image and a 5x5 mask named on the command line, convolves the
 * image on the GPU and hands the result to wbSolution.
 *
 * @return 0 on success, -1 if any CUDA call fails.
 */
int main(int argc, char* argv[]) {
    wbArg_t arg;
    int maskRows;
    int maskColumns;
    int imageChannels;
    int imageWidth;
    int imageHeight;
    char * inputImageFile;
    char * inputMaskFile;
    wbImage_t inputImage;
    wbImage_t outputImage;
    float * hostInputImageData;
    float * hostOutputImageData;
    float * hostMaskData;
    float * deviceInputImageData;
    float * deviceOutputImageData;
    float * deviceMaskData;

    arg = wbArg_read(argc, argv); /* parse the input arguments */

    inputImageFile = wbArg_getInputFile(arg, 0);
    inputMaskFile = wbArg_getInputFile(arg, 1);

    inputImage = wbImport(inputImageFile);
    hostMaskData = (float *) wbImport(inputMaskFile, &maskRows, &maskColumns);

    assert(maskRows == 5); /* mask height is fixed to 5 in this mp */
    assert(maskColumns == 5); /* mask width is fixed to 5 in this mp */

    imageWidth = wbImage_getWidth(inputImage);
    imageHeight = wbImage_getHeight(inputImage);
    imageChannels = wbImage_getChannels(inputImage);

    outputImage = wbImage_new(imageWidth, imageHeight, imageChannels);

    hostInputImageData = wbImage_getData(inputImage);
    hostOutputImageData = wbImage_getData(outputImage);

    // Byte sizes computed once, in size_t, so large images do not overflow int.
    size_t imageBytes = (size_t)imageWidth * imageHeight * imageChannels * sizeof(float);
    size_t maskBytes = (size_t)maskRows * maskColumns * sizeof(float);

    wbTime_start(GPU, "Doing GPU Computation (memory + compute)");

    wbTime_start(GPU, "Doing GPU memory allocation");
    wbCheck(cudaMalloc((void **) &deviceInputImageData, imageBytes));
    wbCheck(cudaMalloc((void **) &deviceOutputImageData, imageBytes));
    wbCheck(cudaMalloc((void **) &deviceMaskData, maskBytes));
    wbTime_stop(GPU, "Doing GPU memory allocation");

    wbTime_start(Copy, "Copying data to the GPU");
    wbCheck(cudaMemcpy(deviceInputImageData, hostInputImageData, imageBytes,
                       cudaMemcpyHostToDevice));
    wbCheck(cudaMemcpy(deviceMaskData, hostMaskData, maskBytes,
                       cudaMemcpyHostToDevice));
    wbTime_stop(Copy, "Copying data to the GPU");

    wbTime_start(Compute, "Doing the computation on the GPU");
    //@@ INSERT CODE HERE
    // One TILE_WIDTH x TILE_WIDTH block per output tile (integer ceil-div).
    dim3 dimGrid((imageWidth + TILE_WIDTH - 1) / TILE_WIDTH,
                 (imageHeight + TILE_WIDTH - 1) / TILE_WIDTH, 1);
    dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1);
    convolution<<<dimGrid, dimBlock>>>(deviceInputImageData, deviceMaskData,
                                       deviceOutputImageData,
                                       imageChannels, imageWidth, imageHeight);
    // A launch does not return an error itself: pick up configuration errors
    // here and execution errors at the synchronization below.
    wbCheck(cudaGetLastError());
    // The launch is asynchronous; without this wait the "Compute" timer would
    // stop before the kernel has actually run.
    wbCheck(cudaDeviceSynchronize());
    wbTime_stop(Compute, "Doing the computation on the GPU");

    wbTime_start(Copy, "Copying data from the GPU");
    wbCheck(cudaMemcpy(hostOutputImageData, deviceOutputImageData, imageBytes,
                       cudaMemcpyDeviceToHost));
    wbTime_stop(Copy, "Copying data from the GPU");

    wbTime_stop(GPU, "Doing GPU Computation (memory + compute)");

    wbSolution(arg, outputImage);

    wbCheck(cudaFree(deviceInputImageData));
    wbCheck(cudaFree(deviceOutputImageData));
    wbCheck(cudaFree(deviceMaskData));

    free(hostMaskData);
    wbImage_delete(outputImage);
    wbImage_delete(inputImage);

    return 0;
}