使用CUDA判断是否为素数
本程序使用CUDA判断一个数是否为素数:每个线程判断待测数能否被分配给它的候选除数整除,并将自己的判断结果写入shared memory,然后对所有线程的结果进行归约统计;如果待测数不能被任何候选除数整除,那么它就是素数。代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <cuda_runtime.h>
#include <vector_types.h>
// Number under test: both the kernel and the CPU loop check whether this value
// has a divisor. It also sizes the host and device divisor tables.
#define TEST_DATA 9999941
// Not referenced by any code visible in this file.
#define DATA_SIZE 1048576
// Microseconds per second; converts gettimeofday() deltas to seconds.
#define MILLION 1000000
// Divisor applied to the device clock() delta when printing the GPU time.
// Presumably the GPU clock rate in kHz -- TODO confirm for the target device.
#define GPU_FRQNC_K 918000
// Threads per block: used for the launch, the dynamic shared-memory size and
// the in-kernel reduction.
#define THREAD_NUM 256
// Number of blocks in the launch; also the length of the per-block result array.
#define BLOCK_NUM 32
// Host-side table of candidate divisors, filled by GenerateNumbers() in main().
long data[TEST_DATA];
// Fills the first (size - 2) slots of `number` with the consecutive values
// 2, 3, ..., size - 1. The last two slots are left untouched, and nothing is
// written when size <= 2.
//
//   number - destination array with room for at least (size - 2) elements
//   size   - exclusive upper bound of the generated values
void GenerateNumbers(long *number, int size) {
    const int count = size - 2;
    int slot = 0;
    while (slot < count) {
        number[slot] = slot + 2;
        ++slot;
    }
}
// Kernel: every block checks whether TEST_DATA is divisible by any of the
// candidate divisors it visits and publishes a single verdict for the block.
//
//   num    - candidate divisors; elements [0, TEST_DATA - 2) are read
//   result - one flag per block: true when none of the divisors examined by
//            that block divides TEST_DATA
//   time   - 2 * BLOCK_NUM entries: [blockIdx.x] receives clock() on entry and
//            [blockIdx.x + BLOCK_NUM] receives clock() once the verdict is stored
//
// Written for a 1-D launch of BLOCK_NUM blocks of THREAD_NUM threads with at
// least THREAD_NUM * sizeof(bool) bytes of dynamic shared memory. The halving
// reduction only folds in every slot when THREAD_NUM is a power of two.
__global__ static void isPrime(long *num, bool *result, clock_t *time) {
    extern __shared__ bool shareMem[];
    const int tid = threadIdx.x;
    const int bid = blockIdx.x;
    const int stride = BLOCK_NUM * THREAD_NUM;

    // Thread 0 records the block's start timestamp.
    if (tid == 0) {
        time[bid] = clock();
    }

    // Each thread walks its share of the divisor table and remembers whether
    // any visited divisor divides TEST_DATA evenly.
    bool noDivisorFound = true;
    for (int idx = bid * THREAD_NUM + tid; idx < TEST_DATA - 2; idx += stride) {
        if (TEST_DATA % num[idx] == 0) {
            noDivisorFound = false;
        }
    }
    shareMem[tid] = noDivisorFound;
    __syncthreads();

    // Logical-AND reduction over the per-thread flags: each pass folds the
    // upper half of the active range into the lower half.
    for (int half = THREAD_NUM / 2; half > 0; half >>= 1) {
        if (tid < half && !shareMem[tid + half]) {
            shareMem[tid] = false;
        }
        // Barrier sits outside the branch so every thread of the block reaches it.
        __syncthreads();
    }

    // Thread 0 publishes the block verdict and the end timestamp.
    if (tid == 0) {
        result[bid] = shareMem[0];
        time[bid + BLOCK_NUM] = clock();
    }
}
// Terminates the program with a diagnostic when a CUDA runtime call fails.
static void checkCuda(cudaError_t status, const char *what) {
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Returns the wall-clock seconds elapsed between two gettimeofday() samples.
static double elapsedSeconds(const struct timeval *begin, const struct timeval *end) {
    return ((end->tv_sec - begin->tv_sec) * MILLION
            + (end->tv_usec - begin->tv_usec)) / (double) MILLION;
}

// Tests whether TEST_DATA is prime twice -- once on the GPU with the isPrime
// kernel and once with a plain CPU loop -- and prints both verdicts together
// with their timings. Returns 0 on success; exits with EXIT_FAILURE if any
// CUDA runtime call reports an error.
int main() {
    struct timeval tBegin;
    struct timeval tEnd;
    double ftimeused = 0.0;
    bool sum[BLOCK_NUM];
    bool final_sum = true;
    long *gpudata = NULL;
    bool *result = NULL;
    clock_t *time = NULL;
    clock_t time_used[BLOCK_NUM * 2];
    clock_t time_begin_min, time_end_max;
    clock_t time_gpu_used;
    int i, j;

    // Candidate divisors 2 .. TEST_DATA - 1 occupy the first TEST_DATA - 2 slots.
    GenerateNumbers(data, TEST_DATA);

    checkCuda(cudaMalloc((void**) &gpudata, sizeof(long) * TEST_DATA),
              "cudaMalloc(gpudata)");
    checkCuda(cudaMalloc((void**) &result, sizeof(bool) * BLOCK_NUM),
              "cudaMalloc(result)");
    // Size by the element type: sizeof(time) would be the size of the pointer,
    // which only matches sizeof(clock_t) by coincidence on some platforms.
    checkCuda(cudaMalloc((void**) &time, sizeof(clock_t) * BLOCK_NUM * 2),
              "cudaMalloc(time)");
    checkCuda(cudaMemcpy(gpudata, data, sizeof(long) * TEST_DATA,
                         cudaMemcpyHostToDevice),
              "cudaMemcpy(gpudata)");

    // Host-side timing covers the launch, both blocking copies back and the
    // final fold over the per-block flags.
    gettimeofday(&tBegin, NULL);
    isPrime<<<BLOCK_NUM, THREAD_NUM, sizeof(bool) * THREAD_NUM>>>(gpudata, result, time);
    // A kernel launch returns nothing; configuration errors surface here.
    checkCuda(cudaGetLastError(), "isPrime launch");
    // The blocking copies also report any fault raised while the kernel ran.
    checkCuda(cudaMemcpy(sum, result, sizeof(bool) * BLOCK_NUM,
                         cudaMemcpyDeviceToHost),
              "cudaMemcpy(result)");
    checkCuda(cudaMemcpy(time_used, time, sizeof(clock_t) * BLOCK_NUM * 2,
                         cudaMemcpyDeviceToHost),
              "cudaMemcpy(time)");
    for (i = 0; i < BLOCK_NUM; i++) {
        final_sum &= sum[i];
    }
    gettimeofday(&tEnd, NULL);
    ftimeused = elapsedSeconds(&tBegin, &tEnd);

    checkCuda(cudaFree(gpudata), "cudaFree(gpudata)");
    checkCuda(cudaFree(result), "cudaFree(result)");
    checkCuda(cudaFree(time), "cudaFree(time)");

    // time_used[0 .. BLOCK_NUM) holds per-block start stamps and
    // time_used[BLOCK_NUM .. 2 * BLOCK_NUM) the matching end stamps.
    // NOTE(review): device clock() is documented as a per-multiprocessor
    // counter, so min/max across blocks is an approximation -- confirm this is
    // acceptable for the reported figure.
    time_begin_min = time_used[0];
    time_end_max = time_used[BLOCK_NUM];
    for (i = 0, j = BLOCK_NUM; i < BLOCK_NUM; i++, j++) {
        if (time_begin_min > time_used[i]) {
            time_begin_min = time_used[i];
        }
        if (time_end_max < time_used[j]) {
            time_end_max = time_used[j];
        }
    }
    time_gpu_used = time_end_max - time_begin_min;

    const char *str_result_gpu = final_sum ? "TRUE" : "FALSE";
    // Printed once; the original emitted this exact line twice.
    printf("return(GPU): %s timeused=%f s GPU count\n", str_result_gpu,
           (time_gpu_used / (double) GPU_FRQNC_K) / 1000);
    printf("return(GPU): %s timeused=%f s CPU count\n", str_result_gpu, ftimeused);

    // CPU reference over the same divisor range the GPU examined
    // (2 .. TEST_DATA - 1). There is deliberately no early exit, so both
    // implementations do the same amount of work for the timing comparison.
    final_sum = true;
    gettimeofday(&tBegin, NULL);
    for (int divisor = 2; divisor < TEST_DATA; divisor++) {
        if ((TEST_DATA % divisor) == 0) {
            final_sum = false;
        }
    }
    gettimeofday(&tEnd, NULL);
    const char *str_result_cpu = final_sum ? "TRUE" : "FALSE";
    ftimeused = elapsedSeconds(&tBegin, &tEnd);
    printf("return(CPU): %s timeused=%f s CPU count\n", str_result_cpu, ftimeused);
    return 0;
}