使用CUDA判断一个数是否为素数的程序:每个线程判断待测数能否被某个候选除数整除,并将各线程的判断结果写入 shared memory,然后统计结果;如果所有候选除数都不能整除待测数,那么它就是素数。代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define TEST 999961          /* number whose primality is tested */
#define THREAD_NUM 256       /* threads per block */
#define BLOCK_NUM 32         /* blocks in the grid */
#define DIVISOR_COUNT (TEST - 2)  /* candidate divisors 2 .. TEST-1 */

#if TEST < 3
#error "TEST must be at least 3 so that at least one candidate divisor exists"
#endif

/* Abort with a readable message if a CUDA runtime call fails. */
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

/* Host buffer of candidate divisors; only the first DIVISOR_COUNT entries are used. */
long data[TEST];

/*
 * Host code: select the first device whose compute capability major
 * version is at least 1 and make it the current device.
 *
 * Returns true on success, false if no suitable device is available.
 */
bool InitCUDA()
{
    int count = 0;

    /* count stays 0 if the query itself fails, so both cases report "no device". */
    if (cudaGetDeviceCount(&count) != cudaSuccess || count == 0) {
        fprintf(stderr, "There is no device.\n");
        return false;
    }

    int i;
    for (i = 0; i < count; i++) {
        cudaDeviceProp prop;
        if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
            if (prop.major >= 1) {
                break;
            }
        }
    }

    if (i == count) {
        fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
        return false;
    }

    if (cudaSetDevice(i) != cudaSuccess) {
        fprintf(stderr, "Failed to select CUDA device %d.\n", i);
        return false;
    }
    return true;
}

/*
 * Host code: fill number[0 .. size-3] with the values 2 .. size-1.
 * These are the candidate divisors for testing whether `size` is prime.
 * Entries from index size-2 onward are left untouched.
 */
void GenerateNumbers(long *number, int size)
{
    for (int i = 0; i < size - 2; i++) {
        number[i] = i + 2;
    }
}

/*
 * Device code: despite its name, this kernel reports whether a divisor of
 * TEST was found, not whether TEST is prime.
 *
 * Each thread walks the divisor array with a stride of the whole grid and
 * remembers whether any of its divisors divides TEST. The per-thread flags
 * are stored in shared memory and thread 0 of each block combines them.
 *
 *   num    - DIVISOR_COUNT candidate divisors (all non-zero)
 *   result - one flag per block; true means this block found a divisor
 *   time   - 2 * gridDim.x entries: [bid] holds the block's start cycle
 *            count, [bid + gridDim.x] its end cycle count
 *
 * Launch requirements: 1D grid and 1D block, with blockDim.x * sizeof(bool)
 * bytes of dynamic shared memory.
 */
__global__ static void IsPrime(const long *num, bool *result, long long *time)
{
    extern __shared__ bool shared[];

    const int tid = threadIdx.x;                 /* thread index inside the block */
    const int bid = blockIdx.x;                  /* block index inside the grid */
    const int threads = (int)blockDim.x;
    const int stride = (int)gridDim.x * threads;

    if (tid == 0) {
        time[bid] = clock64();
    }

    /* Accumulate over ALL divisors handled by this thread, not just the last one. */
    bool divides = false;
    for (int i = bid * threads + tid; i < DIVISOR_COUNT; i += stride) {
        if (TEST % num[i] == 0) {
            divides = true;
        }
    }
    shared[tid] = divides;

    /* Every thread reaches this barrier, so all flags are visible to thread 0. */
    __syncthreads();

    if (tid == 0) {
        bool found = false;
        for (int j = 0; j < threads; j++) {
            if (shared[j]) {
                found = true;
            }
        }
        /* Only thread 0 writes the block result and end timestamp, so no races. */
        result[bid] = found;
        time[bid + gridDim.x] = clock64();
    }
}

/*
 * Host code: test whether TEST is prime on the GPU and print the verdict
 * together with the longest per-block execution time.
 */
int main()
{
    if (!InitCUDA()) {
        return EXIT_FAILURE;
    }

    GenerateNumbers(data, TEST);

    long *gpudata = NULL;
    bool *result = NULL;
    long long *gpu_time = NULL;

    CUDA_CHECK(cudaMalloc((void **)&gpudata, sizeof(long) * DIVISOR_COUNT));
    CUDA_CHECK(cudaMalloc((void **)&result, sizeof(bool) * BLOCK_NUM));
    CUDA_CHECK(cudaMalloc((void **)&gpu_time, sizeof(long long) * BLOCK_NUM * 2));

    /* Copy only the initialised divisors; the tail of data[] is zero. */
    CUDA_CHECK(cudaMemcpy(gpudata, data, sizeof(long) * DIVISOR_COUNT,
                          cudaMemcpyHostToDevice));

    IsPrime<<<BLOCK_NUM, THREAD_NUM, THREAD_NUM * sizeof(bool)>>>(gpudata, result, gpu_time);
    CUDA_CHECK(cudaGetLastError());

    bool sum[BLOCK_NUM];
    long long time_used[BLOCK_NUM * 2];

    /* These blocking copies also surface any error raised while the kernel ran. */
    CUDA_CHECK(cudaMemcpy(sum, result, sizeof(bool) * BLOCK_NUM,
                          cudaMemcpyDeviceToHost));
    CUDA_CHECK(cudaMemcpy(time_used, gpu_time, sizeof(long long) * BLOCK_NUM * 2,
                          cudaMemcpyDeviceToHost));

    CUDA_CHECK(cudaFree(gpudata));
    CUDA_CHECK(cudaFree(result));
    CUDA_CHECK(cudaFree(gpu_time));

    /*
     * The cycle counter is per multiprocessor, so only the start and end of
     * the SAME block are subtracted; the slowest block is reported.
     */
    long long max_cycles = 0;
    for (int b = 0; b < BLOCK_NUM; b++) {
        long long elapsed = time_used[b + BLOCK_NUM] - time_used[b];
        if (elapsed > max_cycles) {
            max_cycles = elapsed;
        }
    }

    bool isprime = true;
    for (int b = 0; b < BLOCK_NUM; b++) {
        if (sum[b]) {
            isprime = false;
            break;
        }
    }

    /*
     * Convert cycles to seconds with the device's peak clock rate (kHz).
     * This is an estimate: the clock actually running may be lower.
     */
    int device = 0;
    int clock_khz = 0;
    CUDA_CHECK(cudaGetDevice(&device));
    CUDA_CHECK(cudaDeviceGetAttribute(&clock_khz, cudaDevAttrClockRate, device));
    double PTime = (clock_khz > 0) ? (double)max_cycles / ((double)clock_khz * 1000.0) : 0.0;

    if (isprime) {
        printf("%d is a prime\n time: %lld cycles, %.6f seconds\n", TEST, max_cycles, PTime);
    } else {
        printf("%d is not a prime\n time: %lld cycles, %.6f seconds\n", TEST, max_cycles, PTime);
    }

#ifdef _WIN32
    system("pause");  /* keep the console window open on Windows only */
#endif
    return 0;
}