This first code appendix lists the complete program, including the main function; for the variants that follow, only the kernels are shown.
#include <stdio.h>
#include <stdint.h>
#include <time.h> //for time()
#include <stdlib.h> //for srand()/rand()
#include <sys/time.h> //for gettimeofday()/struct timeval
#define KEN_CHECK(r) \
{\
    cudaError_t rr = r; \
    if (rr != cudaSuccess)\
    {\
        fprintf(stderr, "CUDA Error %s, function: %s, line: %d\n", \
                cudaGetErrorString(rr), __FUNCTION__, __LINE__); \
        exit(-1);\
    }\
}
#define M 3001 //three thousand and one nights
__managed__ int shark[M][M]; //input matrix
__managed__ int gpu_shark_T[M][M];//GPU result
__managed__ int cpu_shark_T[M][M];//CPU result
__global__ void _jvav_transpose(int A[M][M], int B[M][M])
{
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;
    if (x < M && y < M)
    {
        //you cannot do this if the transpose is in-place.
        B[y][x] = A[x][y]; //naive transpose on global memory.
    }
}
void _sparks_transpose_cpu(int A[M][M], int B[M][M])
{
    for (int j = 0; j < M; j++)
    {
        for (int i = 0; i < M; i++)
        {
            B[i][j] = A[j][i];
        }
    }
}
void DDBDDH_init(int A[M][M])
{
    uint32_t seed = (uint32_t)time(NULL); //make huan happy
    srand(seed); //reseeding the random generator
    //filling the matrix with random data
    for (int j = 0; j < M; j++)
    {
        for (int i = 0; i < M; i++)
        {
            A[j][i] = rand();
        }
    }
}
double get_time()
{
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return ((double)tv.tv_usec * 0.000001 + tv.tv_sec);
}
int main()
{
    //**********************************
    fprintf(stderr, "DDBDDH is filling the %dx%d matrix with random data\n",
            M, M);
    DDBDDH_init(shark);
    //**********************************
    //Now we are going to kick start your kernel.
    cudaDeviceSynchronize(); //steady! ready! go!
    //Good luck & have fun!
    fprintf(stderr, "Running on GPU...\n");
    double t0 = get_time();
    int n = (M + 15) / 16; //what the hell is this! (ceiling division: (3001 + 15) / 16 = 188 blocks cover all 3001 rows/columns)
    dim3 grid_shape(n, n);
    dim3 block_shape(16, 16);
    _jvav_transpose<<<grid_shape, block_shape>>>(shark, gpu_shark_T);
    KEN_CHECK(cudaGetLastError()); //checking for launch failures
    KEN_CHECK(cudaDeviceSynchronize()); //checking for run-time failures
    double t1 = get_time();
    //**********************************
    //Now we are going to exercise your CPU...
    fprintf(stderr, "Running on CPU...\n");
    double t2 = get_time();
    _sparks_transpose_cpu(shark, cpu_shark_T);
    double t3 = get_time();
    //******The last judgement**********
    for (int j = 0; j < M; j++)
    {
        for (int i = 0; i < M; i++)
        {
            if (gpu_shark_T[j][i] != cpu_shark_T[j][i])
            {
                fprintf(stderr, "Test failed!\n");
                exit(-1);
            }
        }
    }
    fprintf(stderr, "Test Passed!\n");
    //****and some timing details*******
    fprintf(stderr, "GPU time %.3f ms\n", (t1 - t0) * 1000.0);
    fprintf(stderr, "CPU time %.3f ms\n", (t3 - t2) * 1000.0);
    return 0;
}
The basic version of the matrix transpose has no particularly deep points to it: once each thread has worked out which matrix element it owns, the work is a one-to-one copy. The one point that does need attention is guarding against out-of-bounds access, since the 188x188 grid of 16x16 blocks covers slightly more than the 3001x3001 matrix:
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
if (x < M && y < M)
{
    B[y][x] = A[x][y]; //naive transpose on global memory.
}
In the naive kernel, adjacent threads (consecutive threadIdx.x) write B[y][x] to consecutive addresses, but read A[x][y] with a stride of M elements between threads, so the reads cannot be coalesced. The next version fixes this by staging each tile in shared memory, making both the global reads and the global writes coalesced (TILE_SIZE is a macro assumed to equal the 16x16 block dimensions):
__global__ void _ZHI_transpose(int A[M][M], int B[M][M])
{
    __shared__ int rafa[TILE_SIZE][TILE_SIZE + 1];
    int x = threadIdx.x + blockDim.x * blockIdx.x;
    int y = threadIdx.y + blockDim.y * blockIdx.y;
    if (x < M && y < M)
    {
        rafa[threadIdx.y][threadIdx.x] = A[y][x];
    }
    __syncthreads();
    int y2 = threadIdx.y + blockDim.x * blockIdx.x;
    int x2 = threadIdx.x + blockDim.y * blockIdx.y;
    if (x2 < M && y2 < M)
    {
        B[y2][x2] = rafa[threadIdx.x][threadIdx.y];
    }
}
The author thinks the solution breaks into two steps: each thread first reads its data into a tile, then writes it out transposed. The supporting trick is the shared-memory tile itself: the extra padding column (+1) staggers the rows across shared-memory banks, so the column-wise accesses later on cause no bank conflicts:
__shared__ int rafa[TILE_SIZE][TILE_SIZE + 1];
Then comes step one, in which each thread reads its element of the input into the tile (a coalesced read, since consecutive threadIdx.x runs along a row of A):
int x = threadIdx.x + blockDim.x * blockIdx.x;
int y = threadIdx.y + blockDim.y * blockIdx.y;
if (x < M && y < M)
{
    rafa[threadIdx.y][threadIdx.x] = A[y][x];
}
Step two is for each thread to write data to the result. Note that the block offsets are swapped (blockIdx.x now contributes to the row and blockIdx.y to the column), while threadIdx.x keeps running along the output row, so the write to B stays coalesced; the tile itself is read column-wise, which the padding made conflict-free:
int y2 = threadIdx.y + blockDim.x * blockIdx.x;
int x2 = threadIdx.x + blockDim.y * blockIdx.y;
if (x2 < M && y2 < M)
{
    B[y2][x2] = rafa[threadIdx.x][threadIdx.y];
}
For matrix transpose, the crucial part is really just using each thread's blockIdx and threadIdx to establish the correspondence between elements of the input matrix and the output matrix; once that mapping is found, the rest is mechanical assignment.
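As a usage sketch (my own addition, not from the original program: the TILE_SIZE definition and the grid math are assumed to mirror the 16x16 setup in main()):
#define TILE_SIZE 16 //assumed: must match blockDim.x and blockDim.y
int n = (M + TILE_SIZE - 1) / TILE_SIZE; //ceiling division, as before
dim3 grid_shape(n, n);
dim3 block_shape(TILE_SIZE, TILE_SIZE);
_ZHI_transpose<<<grid_shape, block_shape>>>(shark, gpu_shark_T);
KEN_CHECK(cudaGetLastError()); //checking for launch failures
KEN_CHECK(cudaDeviceSynchronize()); //checking for run-time failures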
The next variant performs the transpose in place, with input and output sharing one matrix (flattened here to a 1D array of n*n ints):
__global__ void ip_transpose(int* data, int n)
{
    int x = blockIdx.x * blockDim.x + threadIdx.x;
    int y = blockIdx.y * blockDim.y + threadIdx.y;
    int tmp;
    if (x < n && y < n){
        if (x < y){
            tmp = data[y*n+x];
            data[y*n+x] = data[x*n+y];
            data[x*n+y] = tmp;
        }
    }
}
When the input and output are the same matrix, there is one point that demands real care: you must not let every thread perform a swap (threads on the diagonal aside; all matrices in this article are square). Once the threads for the lower-left triangle have each executed their swap, the matrix is already fully transposed; if the threads for the upper-right triangle then swap as well, the matrix is transposed twice, i.e. back where it started. (In practice it may not even come back: since all the threads run concurrently, the paired swaps race with each other, and the final matrix may be neither the transpose nor the original; some elements end up as duplicated mirror copies of their partners, while the partners themselves are lost.)
Therefore, when performing the transpose in place, preventing "overreach" becomes essential. This does not mean touching unallocated memory; it means letting only the threads of one triangle perform the swap, either the lower-left or the upper-right, never both:
if (x < y){
    tmp = data[y*n+x];
    data[y*n+x] = data[x*n+y];
    data[x*n+y] = tmp;
}
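A minimal launch sketch for this kernel (my own addition; d_data is a hypothetical device or managed pointer holding the n*n matrix, and the 16x16 block shape matches the earlier versions):
int blocks = (n + 15) / 16; //ceiling division over 16-wide tiles
dim3 grid_shape(blocks, blocks);
dim3 block_shape(16, 16);
ip_transpose<<<grid_shape, block_shape>>>(d_data, n);
KEN_CHECK(cudaGetLastError());
KEN_CHECK(cudaDeviceSynchronize());
Because only threads with x < y perform the swap, every off-diagonal pair is exchanged exactly once and the diagonal is left untouched.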
The shared-memory version of the in-place transpose pairs each tile below the diagonal with its mirror tile above the diagonal, loads both into shared memory, and writes each back transposed to the other's position (N and TILE_DIM are assumed to be defined elsewhere, e.g. as the matrix size and 16):
__global__ void ip_transpose(int* data)
{
    __shared__ int tile_s[TILE_DIM][TILE_DIM+1];
    __shared__ int tile_d[TILE_DIM][TILE_DIM+1];
    int x = blockIdx.x * TILE_DIM + threadIdx.x;
    int y = blockIdx.y * TILE_DIM + threadIdx.y;
    //Threads in the triangle below the diagonal.
    //blockIdx is uniform within a block, so this branch never splits
    //the threads of one block and the __syncthreads() below is safe.
    if (blockIdx.y > blockIdx.x) {
        //(dx, dy): this element's mirror position in the tile above the diagonal
        int dx = blockIdx.y * TILE_DIM + threadIdx.x;
        int dy = blockIdx.x * TILE_DIM + threadIdx.y;
        if (x < N && y < N)
        {
            tile_s[threadIdx.y][threadIdx.x] = data[(y)*N + x];
        }
        if (dx < N && dy < N)
        {
            tile_d[threadIdx.y][threadIdx.x] = data[(dy)*N + dx];
        }
        __syncthreads();
        if (dx < N && dy < N)
        {
            data[(dy)*N + dx] = tile_s[threadIdx.x][threadIdx.y];
        }
        if (x < N && y < N)
        {
            data[(y)*N + x] = tile_d[threadIdx.x][threadIdx.y];
        }
    }
    else if (blockIdx.y == blockIdx.x) //Threads on the diagonal
    {
        if (x < N && y < N)
        {
            tile_s[threadIdx.y][threadIdx.x] = data[(y)*N + x];
        }
        __syncthreads();
        if (x < N && y < N)
        {
            data[(y)*N + x] = tile_s[threadIdx.x][threadIdx.y];
        }
    }
}
The ideas and tricks behind this algorithm have already been covered in the earlier versions, so the author will not repeat them here.
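For completeness, a minimal launch sketch for this final kernel (again my own addition, under assumed definitions: TILE_DIM as 16, N as the matrix size, d_data as a device or managed pointer to the N*N matrix). The full grid is launched; blocks strictly above the diagonal (blockIdx.y < blockIdx.x) fall through both branches and do nothing, because their tiles are already handled by the mirrored blocks below the diagonal:
int blocks = (N + TILE_DIM - 1) / TILE_DIM; //ceiling division
dim3 grid_shape(blocks, blocks);
dim3 block_shape(TILE_DIM, TILE_DIM);
ip_transpose<<<grid_shape, block_shape>>>(d_data);
KEN_CHECK(cudaGetLastError());
KEN_CHECK(cudaDeviceSynchronize());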