最近在研究用 CUDA 实现排序算法,这是其中的一次尝试。下面是用 CUDA 实现 QuickSort(快速排序)的核心代码。完整代码仍在完善中,欢迎指正。
#include "gpuqsort.h"
// Drop any earlier THREADS definition (presumably one coming from gpuqsort.h --
// TODO confirm) and make THREADS expand to the x-dimension of the running block.
// Because it expands to blockDim.x it is only meaningful inside device code.
#undef THREADS
#define THREADS blockDim.x
// Scratch buffer in shared memory used by the routines in this file. It is
// declared unsized, so its byte size is whatever the kernel launch supplies.
extern __shared__ unsigned int sarray[];
#ifdef HASATOMICS
// Device-global counter starting at zero; only compiled in when HASATOMICS is
// defined. The code that uses it is not visible in this part of the file.
__device__ unsigned int ohtotal = 0;
#endif
// Exchange the contents of two unsigned integers in place.
//
// a, b : references to the two values; after the call each holds the value the
//        other held before. Passing the same object for both leaves it unchanged.
__device__ inline void swap(unsigned int& a, unsigned int& b)
{
    // Remember the right-hand value, overwrite it with the left-hand one,
    // then hand the remembered value to the left-hand side.
    const unsigned int previousB = b;
    b = a;
    a = previousB;
}
// Sort a short run of keys in ascending order with a block-wide bitonic
// network that works in the shared buffer `sarray`.
//
// NOTE(review): in the reviewed copy the text between a '<' and the following
// '>' had been stripped on three lines: the staging loops together with the
// headers of the `k` and `j` loops, the head of the `tid` loop including the
// partner index, and the condition of the write-back loop. Those parts are
// reconstructed here from the surviving tokens and the brace structure; please
// diff against the upstream source before relying on them.
//
// Must be called by every thread of the block, with identical arguments in all
// threads, because it contains __syncthreads().
//
// fromvalues : buffer the unsorted run is read from
// tovalues   : buffer the sorted run is written to (same indices as the input)
// from       : index of the first element of the run
// size       : number of elements in the run
//
// `sarray` must provide room for `sb` words, where `sb` is the power of two
// computed below from the run length plus the alignment slack.
__device__ inline
void bitonicSort(unsigned int* fromvalues, unsigned int* tovalues, unsigned int from, unsigned int size)
{
    // Nothing to sort. This also keeps __log2f() away from 0. The exit is taken
    // by the whole block because all threads receive the same `size`.
    if (size == 0)
        return;

    unsigned int* shared = (unsigned int*)sarray;

    // Round `from` down to a multiple of 16 elements and grow the window by the
    // same amount (the name suggests this is done for coalesced accesses).
    unsigned int coal = (from&0xf);
    size = size + coal;
    from = from - coal;

    // Length of the sorting network: a power of two that is at least `size`.
    int sb = 2 << (int)(__log2f(size));

    // Stage the run in shared memory. The `coal` slots in front of the run are
    // filled with 0 so they stay at the front after sorting and never mix with
    // the run's own keys.
    for (int i = threadIdx.x; i < size; i += THREADS)
    {
        if (i < coal)
            shared[i] = 0;
        else
            shared[i] = fromvalues[i + from];
    }

    // Pad the tail of the network with the largest key so the padding ends up
    // behind every real element.
    for (int i = threadIdx.x + size; i < sb; i += THREADS)
    {
        shared[i] = 0xffffffff;
    }
    __syncthreads();

    // k: size of the bitonic sequences being merged; j: compare distance.
    for (int k = 2; k <= sb; k *= 2)
    {
        for (int j = k / 2; j > 0; j /= 2)
        {
            for (int tid = threadIdx.x; tid < sb; tid += THREADS)
            {
                unsigned int ixj = tid ^ j;

                // Each pair is handled once, by its lower index.
                if (ixj > tid)
                {
                    if ((tid & k) == 0)
                    {
                        // Ascending half: smaller key goes to the lower index.
                        if (shared[tid] > shared[ixj])
                        {
                            swap(shared[tid], shared[ixj]);
                        }
                    }
                    else
                    {
                        // Descending half: larger key goes to the lower index.
                        if (shared[tid] < shared[ixj])
                        {
                            swap(shared[tid], shared[ixj]);
                        }
                    }
                }
            }
            // All exchanges of this step must be visible before the next one.
            __syncthreads();
        }
    }
    __syncthreads();

    // Write the sorted keys back, skipping the alignment slots in front.
    for (int i = threadIdx.x; i < size; i += THREADS)
        if (i >= coal)
            tovalues[i + from] = shared[i];
    __syncthreads();
}
// Block-wide prefix sum over two counter arrays at once (work-efficient
// up-sweep / down-sweep scheme).
//
// On entry  lblock[0..THREADS-1] and rblock[0..THREADS-1] hold one counter per
//           thread.
// On return index i (0 <= i < THREADS) holds the sum of the counters before i,
//           and index THREADS holds the sum of all counters, so both arrays
//           need THREADS+1 entries.
//
// Must be called by every thread of the block. The tree indexing presumably
// requires THREADS to be a power of two -- TODO confirm with the launch code.
// There is no barrier after the last down-sweep step, so callers have to
// synchronise before reading slots written by other threads.
__device__ inline void cumcount(unsigned int *lblock, unsigned int *rblock)
{
    const int lane = threadIdx.x;
    int stride = 1;

    __syncthreads();

    // Up-sweep: combine pairs of partial sums, halving the number of active
    // threads and doubling the distance between partners at every level.
    int active = THREADS >> 1;
    while (active > 0)
    {
        __syncthreads();
        if (lane < active)
        {
            const int lower = stride * (2 * lane + 1) - 1;
            const int upper = lower + stride;
            lblock[upper] += lblock[lower];
            rblock[upper] += rblock[lower];
        }
        stride *= 2;
        active >>= 1;
    }

    __syncthreads();

    // The last slot now holds the grand total: move it to the extra slot and
    // clear the root so the down-sweep produces sums that exclude the own counter.
    if (lane == 0)
    {
        const unsigned int root = THREADS - 1;
        lblock[THREADS] = lblock[root];
        rblock[THREADS] = rblock[root];
        lblock[root] = 0;
        rblock[root] = 0;
    }
    __syncthreads();

    // Down-sweep: walk back down the tree; at each node the left child takes
    // the parent's value and the right child gets parent + old left child.
    for (int width = 1; width < THREADS; width *= 2)
    {
        stride >>= 1;
        __syncthreads();
        if (lane < width)
        {
            const int lower = stride * (2 * lane + 1) - 1;
            const int upper = lower + stride;

            const unsigned int oldLeft = lblock[lower];
            lblock[lower] = lblock[upper];
            lblock[upper] += oldLeft;

            const unsigned int oldRight = rblock[lower];
            rblock[lower] = rblock[upper];
            rblock[upper] += oldRight;
        }
    }
}
// First pass of a partitioning step: every block counts how many keys of its
// range are smaller / larger than the pivot and records the smallest and
// largest key it has seen.
//
// NOTE(review): in the reviewed copy the text between a '<' and the following
// '>' had been stripped on three lines (the guarded first read with its
// counting branches, the header of the strided loop, and the min/max reduction
// up to the `hist->left` store). Those parts are reconstructed from the
// surviving tokens and brace structure; please diff against upstream.
//
// Launch layout: one block per entry of `params` (indexed by blockIdx.x).
// Dynamic shared memory: (4 * blockDim.x + 2) * sizeof(unsigned int) bytes,
// see the carving of `sarray` below.
//
// data    : keys to partition
// params  : per-block work description; the fields from, end and pivot are read
// hist    : receives, per thread, the running totals of smaller (left) and
//           larger (right) keys up to and including that thread; needs
//           gridDim.x * blockDim.x entries in each array
// lengths : receives, per block, the total smaller / larger counts and the
//           minimum / maximum key
__global__ void part1(unsigned int* data, struct Params* params, struct Hist* hist, Length* lengths)
{
    const int tx = threadIdx.x;

    // Carve the shared buffer: two counter arrays of blockDim.x+1 entries
    // (the extra slot receives the total in cumcount) and two blockDim.x
    // arrays for the per-thread minimum and maximum.
    unsigned int* lblock = (unsigned int*)sarray;
    unsigned int* rblock = (unsigned int*)(&lblock[(blockDim.x+1)]);
    unsigned int* minpiv = (unsigned int*)(&rblock[(blockDim.x+1)]);
    unsigned int* maxpiv = (unsigned int*)(&minpiv[blockDim.x]);

    unsigned int start = params[blockIdx.x].from;
    unsigned int end = params[blockIdx.x].end;
    unsigned int pivot = params[blockIdx.x].pivot;

    // Seed the per-thread min/max. Threads whose first element lies beyond the
    // range get neutral seeds instead of reading past `end`, which would pull
    // foreign keys into the result or touch memory outside the buffer.
    if (start + tx < end)
    {
        const unsigned int seed = data[start + tx];
        minpiv[tx] = seed;
        maxpiv[tx] = seed;
    }
    else
    {
        minpiv[tx] = 0xffffffffu;
        maxpiv[tx] = 0u;
    }
    __syncthreads();

    int ll=0;   // keys smaller than the pivot seen by this thread
    int lr=0;   // keys larger than the pivot seen by this thread

    // Round the start down to a multiple of 16 elements; the first `coal`
    // threads then read an element in front of the range and must ignore it.
    int coal = (start&0xf);
    start = start-coal;

    if (tx + start < end)
    {
        unsigned int d = data[tx + start];

        if (!(tx < coal))
        {
            if (d < pivot)
                ll++;
            else if (d > pivot)
                lr++;
            minpiv[tx] = min(minpiv[tx],d);
            maxpiv[tx] = max(maxpiv[tx],d);
        }
    }

    // Remaining elements, one block-width apart.
    for (unsigned int i = tx + start + THREADS; i < end; i += THREADS)
    {
        unsigned int d = data[i];

        if (d < pivot)
            ll++;
        else if (d > pivot)
            lr++;
        minpiv[tx] = min(minpiv[tx],d);
        maxpiv[tx] = max(maxpiv[tx],d);
    }

    lblock[tx]=ll;
    rblock[tx]=lr;
    __syncthreads();

    // Turn the per-thread counters into running totals.
    cumcount((unsigned int*)lblock,(unsigned int*)rblock);

    if (tx==0)
    {
        // Fold all per-thread minima / maxima into slot 0.
        for (int i = 0; i < THREADS; i++)
        {
            minpiv[0] = min(minpiv[0], minpiv[i]);
            maxpiv[0] = max(maxpiv[0], maxpiv[i]);
        }
    }
    // Makes both the scan results and the folded min/max visible to all threads.
    __syncthreads();

    // Slot tx+1 holds the total up to and including this thread.
    hist->left[blockIdx.x*(THREADS)+threadIdx.x] = lblock[threadIdx.x+1];
    hist->right[blockIdx.x*(THREADS)+threadIdx.x] = rblock[threadIdx.x+1];

    // Every thread stores the same block-wide values.
    lengths->left[blockIdx.x] = lblock[THREADS];
    lengths->right[blockIdx.x] = rblock[THREADS];
    lengths->minpiv[blockIdx.x] = minpiv[0];
    lengths->maxpiv[blockIdx.x] = maxpiv[0];
}
// Second pass of a partitioning step: copy every key that differs from the
// pivot from `data` into its side of the partition in `data2`. Keys equal to
// the pivot are not copied here.
//
// NOTE(review): in the reviewed copy the text between a '<' and the following
// '>' had been stripped on two lines (the guarded first read and the header of
// the strided loop, each including the `d < pivot` branch). The reconstruction
// writes smaller keys downwards from `x`, which is inferred from `x` starting
// at "inclusive count - 1"; please diff against upstream.
//
// Launch layout: the same grid and block size as the counting pass, since
// `hist` is indexed with bx*THREADS+tx.
//
// data    : source keys
// data2   : destination buffer
// params  : per-block work description; the fields from, end and pivot are read
// hist    : per-thread running totals written by the counting pass
// lengths : per-block values; presumably turned into output offsets by the
//           host between the two passes -- TODO confirm
__global__ void part2(unsigned int* data, unsigned int* data2, struct Params* params, struct Hist* hist, Length* lengths)
{
    const int tx = threadIdx.x;
    const int bx = blockIdx.x;

    // x: last slot reserved for this thread's smaller keys (filled downwards).
    // y: first slot reserved for this thread's larger keys (filled upwards).
    unsigned int x = lengths->left[bx] + hist->left[bx*(THREADS)+tx]-1;
    unsigned int y = lengths->right[bx] - hist->right[bx*(THREADS)+tx];

    unsigned int start = params[bx].from;
    unsigned int end = params[bx].end;
    unsigned int pivot = params[bx].pivot;
    __syncthreads();

    // Same start alignment as in the counting pass, so each thread visits
    // exactly the elements it counted there.
    int coal = (start&0xf);
    start = start-coal;

    if (tx + start < end)
    {
        unsigned int d = data[tx + start];

        // The first `coal` threads read an element in front of the range.
        if (!(tx < coal))
        {
            if (d < pivot)
                data2[x--]=d;
            else if (d > pivot)
                data2[y++]=d;
        }
    }
    __syncthreads();

    // Remaining elements, one block-width apart.
    for (unsigned int i = start + tx + THREADS; i < end; i += THREADS)
    {
        unsigned int d = data[i];

        if (d < pivot)
            data2[x--]=d;
        else if (d > pivot)
            data2[y++]=d;
    }
    return;
}
// Third pass of a partitioning step: blocks flagged `last` in `params` fill the
// range between the smaller-key side and the larger-key side with the pivot.
//
// NOTE(review): in the reviewed copy the fill loop was cut off right after
// `for(;x` and the closing braces of this kernel were lost. The loop below is
// reconstructed from the variables that survive (x, y, pivot, data); please
// diff against upstream.
//
// data    : buffer that receives the pivot copies
// params  : per-block work description; the fields last and pivot are read
// hist    : per-thread running totals; only the entry of the block's last
//           thread is read
// lengths : per-block values; presumably output offsets prepared by the host
//           -- TODO confirm
__global__ void part3(unsigned int* data, struct Params* params, struct Hist* hist, Length* lengths)
{
    const int tx = threadIdx.x;
    const int bx = blockIdx.x;

    if (params[bx].last)
    {
        // [x0, y) is the range to fill; each thread starts at its own offset
        // and advances one block-width at a time.
        unsigned int x = lengths->left[bx] + hist->left[bx*THREADS+THREADS-1] + tx;
        unsigned int y = lengths->right[bx] - hist->right[bx*THREADS+THREADS-1];
        unsigned int pivot = params[bx].pivot;

        for (; x < y; x += THREADS)
            data[x] = pivot;
    }
}
// NOTE(review): the signature and prologue of the next kernel were lost at this
// point in the reviewed copy. The only surviving fragment was `=0)`, the tail of
// a loop condition whose body starts on the following line. Restore them from
// the upstream source.
{
__syncthreads();
if(tx==0)
{
from = beg[bi];
to = end[bi];
if(!flip[bi])
{
data = adata2;
data2 = adata;
}
else
{
data = adata;
data2 = adata2;
}
}
__syncthreads();
if((to-from)<(sbsize-16))
{
if((to-from>=1)&&(lphase!=2))
bitonicSort(data,adata,from,to-from);
__syncthreads();
if(tx==0)
bi--;
__syncthreads();
continue;
}
if(tx==0)
{
unsigned int mip = min(min(data[from],data[to-1]),data[(from+to)/2]);
unsigned int map = max(max(data[from],data[to-1]),data[(from+to)/2]);
pivot = min(max(mip/2+map/2,mip),map);
}
unsigned int ll=0;
unsigned int lr=0;
__syncthreads();
unsigned int coal = (from)&0xf;
if(tx+from-coalpivot)
lr++;
}
}
for(int i=from+tx+THREADS-coal;ipivot)
lr++;
}
lblock[tx]=ll;
rblock[tx]=lr;
__syncthreads();
cumcount((unsigned int*)lblock,(unsigned int*)rblock);
__syncthreads();
if(tx==0)
{
flip[bi+1] = !flip[bi];
flip[bi] = !flip[bi];
if(lblock[THREADS]pivot)
data2[y++] = d;
}
}
for(unsigned int i=from+tx+THREADS-coal;ipivot)
data2[y++] = d;
}
__syncthreads();
for(unsigned int i=from+lblock[THREADS]+tx;i