CUDA实现QuickSort排序算法(一)

 

最近在研究用CUDA实现排序算法，这是其中的一次尝试。下面是用CUDA实现QuickSort排序算法的核心代码。完整代码仍在完善中，欢迎指正。

#include "gpuqsort.h"

// Drop any earlier definition of THREADS and make it expand to the runtime
// block width, so every THREADS in the device code below reads blockDim.x.
#undef THREADS
#define THREADS blockDim.x

// Unsized extern shared array: its byte size is not fixed here and has to be
// supplied when a kernel is launched. The device code below carves all of its
// per-block scratch buffers out of this one array.
extern __shared__ unsigned int sarray[];

#ifdef HASATOMICS
// Device-global counter, compiled in only when HASATOMICS is defined.
// It is not referenced anywhere in the visible part of this file.
__device__ unsigned int ohtotal = 0;
#endif


// Exchanges the two referenced values in place.
// Safe when both references name the same object: the value is left unchanged.
__device__ inline void swap(unsigned int& a, unsigned int& b)
{
    const unsigned int previous_b = b;  // keep b's value before it is overwritten
    b = a;
    a = previous_b;
}

// Sorts a segment inside the shared scratch array `sarray` using compare-exchange
// passes and then copies values from shared memory into `tovalues`.
//
//   fromvalues - not referenced in the surviving text of this function
//                (NOTE(review): presumably the array the segment is loaded from — confirm)
//   tovalues   - destination array; written at index i+from in the final loop
//   from       - index used to address the segment; rounded down to a multiple of 16 below
//   size       - element count; enlarged below by the amount `from` was rounded down
//
// NOTE(review): three lines of this function (the two `for` headers inside the body
// and the final `for` header) are not valid C++ as they stand. Text between a '<'
// and a later '>' appears to have been lost, which removed the loads into shared
// memory, the declarations of `k` and `ixj`, and several loop headers. Restore
// those lines from the original source before compiling.
__device__ inline
void bitonicSort(unsigned int* fromvalues, unsigned int* tovalues, unsigned int from, unsigned int size)
{
 // All work happens in the block's shared scratch array.
 unsigned int* shared = (unsigned int*)sarray;

 // coal = distance of `from` above the previous multiple of 16; the segment is
 // widened to start on that boundary.
 unsigned int coal = (from&0xf);
 size = size + coal;
 from = from - coal;

 // Power of two derived from size: 2 shifted left by the truncated log2 of size.
 // It is not used in the surviving text (NOTE(review): presumably the padded sort length).
 int sb = 2 << (int)(__log2f(size));


 // NOTE(review): damaged line — the rest of this loop header and everything up to
 // the header of the inner `j` loop is missing.
 for(int i=threadIdx.x;i0; j /= 2)
        {
   // NOTE(review): damaged line — the loop bound, the declaration of `ixj` and the
   // start of the enclosing `if` are missing; only the tail `tid)` survives.
   for(int tid=threadIdx.x;tid tid)
    {
     // The bit of tid selected by k picks the direction of this compare-exchange:
     // here the smaller value ends up at tid ...
     if ((tid & k) == 0)
     {
      if (shared[tid] > shared[ixj])
      {
       swap(shared[tid], shared[ixj]);
      }
     }
     // ... and here the larger value ends up at tid.
     else
     {
      if (shared[tid] < shared[ixj])
      {
       swap(shared[tid], shared[ixj]);
      }
     }
    }
            }
           
            // Barrier after each pass so the next pass sees every exchange.
            __syncthreads();
        }
    }
 __syncthreads();

 // NOTE(review): damaged line — only the tail `=coal)` of a condition survives,
 // which suggests the copy-out skips the first `coal` slots; confirm.
 for(int i=threadIdx.x;i=coal)
      tovalues[i+from] = shared[i];
 __syncthreads();
}


/**
 * Block-wide exclusive prefix sum over two per-thread counter arrays at once.
 *
 * On entry, lblock[t] and rblock[t] hold the counters of thread t for
 * t in [0, THREADS). On return, lblock[t] / rblock[t] hold the sum of the
 * counters of threads 0..t-1, and lblock[THREADS] / rblock[THREADS] hold the
 * totals over all threads.
 *
 * Preconditions:
 *  - must be called by every thread of the block, because it contains
 *    __syncthreads() barriers;
 *  - both arrays must be shared by the block and have THREADS + 1 elements;
 *  - the tree indexing only reaches every element when THREADS is a power of two.
 *
 * The function ends with a barrier, so callers may read any slot of either
 * array immediately after it returns.
 *
 * @param lblock first counter array, scanned in place
 * @param rblock second counter array, scanned in place
 */
__device__ inline void cumcount(unsigned int *lblock, unsigned int *rblock)
{
    const int tx = threadIdx.x;
    int offset = 1;

    // Make the caller's per-thread counter writes visible to the whole block.
    __syncthreads();

    // Up-sweep: build partial sums in place, halving the active threads each level.
    for (int d = THREADS >> 1; d > 0; d >>= 1)
    {
        __syncthreads();

        if (tx < d)
        {
            const int ai = offset * (2 * tx + 1) - 1;
            const int bi = offset * (2 * tx + 2) - 1;
            lblock[bi] += lblock[ai];
            rblock[bi] += rblock[ai];
        }
        offset *= 2;
    }
    __syncthreads();

    // The last slot now holds the grand total: save it in the extra element and
    // clear the root so the down-sweep produces an exclusive scan.
    if (tx == 0)
    {
        lblock[THREADS] = lblock[THREADS - 1];
        rblock[THREADS] = rblock[THREADS - 1];
        lblock[THREADS - 1] = 0;
        rblock[THREADS - 1] = 0;
    }
    __syncthreads();

    // Down-sweep: walk back down the tree, pushing prefix sums to the leaves.
    for (int d = 1; d < THREADS; d *= 2)
    {
        offset >>= 1;
        __syncthreads();

        if (tx < d)
        {
            const int ai = offset * (2 * tx + 1) - 1;
            const int bi = offset * (2 * tx + 2) - 1;

            // Temporaries use the element type, so counts are never routed
            // through a signed int.
            unsigned int t = lblock[ai];
            lblock[ai] = lblock[bi];
            lblock[bi] += t;

            t = rblock[ai];
            rblock[ai] = rblock[bi];
            rblock[bi] += t;
        }
    }

    // The last down-sweep level is written by half of the threads; without this
    // barrier a caller reading another thread's slot would race with those writes.
    __syncthreads();
}


// Kernel: per-block counting pass.
// Each block takes its work item from params[blockIdx.x] (from, end, pivot). Every
// thread keeps two tallies (ll, lr) and a running minimum/maximum of the values it
// sees; the per-thread tallies are scanned with cumcount and the results are stored
// in `hist` (one entry per thread) and `lengths` (one entry per block).
//
// Shared memory carved out of `sarray`, in unsigned ints:
//   lblock: blockDim.x + 1, rblock: blockDim.x + 1, minpiv: blockDim.x,
//   maxpiv: indexed by threadIdx.x.
//
// NOTE(review): several lines of this kernel are not valid C++ as they stand
// (each is marked below). Text between a '<' and a later '>' appears to have been
// lost, which removed the loads of `d`, the comparisons that increment `ll`, and
// the loop bounds. Restore them from the original source before compiling.
__global__ void part1(unsigned int* data, struct Params* params, struct Hist* hist, Length* lengths)
{
 const int tx = threadIdx.x;

 // Partition the shared scratch array into the four per-block buffers.
 unsigned int* lblock = (unsigned int*)sarray;
 unsigned int* rblock = (unsigned int*)(&lblock[(blockDim.x+1)]);
 unsigned int* minpiv = (unsigned int*)(&rblock[(blockDim.x+1)]);
 unsigned int* maxpiv = (unsigned int*)(&minpiv[blockDim.x]);



 // This block's work item.
 unsigned int start = params[blockIdx.x].from;
 unsigned int end = params[blockIdx.x].end;
 unsigned int pivot = params[blockIdx.x].pivot;

 
 // Seed the per-thread min/max with an element of the range.
 // NOTE(review): start+tx is not checked against `end` here — confirm the caller
 // guarantees at least blockDim.x elements from `start`.
 minpiv[tx] = data[start+tx];
 maxpiv[tx] = data[start+tx];

 __syncthreads();
 // Per-thread tallies; presumably elements below / above the pivot (the
 // comparisons themselves are in the damaged lines).
 int ll=0;
 int lr=0;

 __syncthreads();


 // Round the start index down to a multiple of 16.
 int coal = (start&0xf);
 start = start-coal;


 // NOTE(review): damaged line — the bound check, the load of `d` and the first
 // comparison are missing; only the tail `pivot)` of a condition survives.
 if(tx+startpivot)
   lr++;

 
  minpiv[tx] = min(minpiv[tx],d);
  maxpiv[tx] = max(maxpiv[tx],d);
  }
 }



 // Remaining elements, starting one block width further on.
 // NOTE(review): damaged line — loop bound, stride, load of `d` and first
 // comparison are missing.
 for(unsigned int i=tx+start+THREADS;ipivot)
   lr++;


  minpiv[tx] = min(minpiv[tx],d);
  maxpiv[tx] = max(maxpiv[tx],d);
 }

 // Publish this thread's tallies for the block-wide scan.
 lblock[tx]=ll;
 rblock[tx]=lr;

 __syncthreads();


    // Scan both tally arrays; slot THREADS of each is read below as the block total.
    cumcount((unsigned int*)lblock,(unsigned int*)rblock);

    if(tx==0)
    {

  // NOTE(review): damaged line — the body of the `for` over i, the closing brace of
  // the `if(tx==0)` block and the start of the store to hist->left are missing.
  for(int i=0;ileft[blockIdx.x*(THREADS)+threadIdx.x]  = lblock[threadIdx.x+1];
 hist->right[blockIdx.x*(THREADS)+threadIdx.x] = rblock[threadIdx.x+1];

 // Block totals.
 lengths->left[blockIdx.x]  = lblock[THREADS];
 lengths->right[blockIdx.x] = rblock[THREADS];


 // Slot 0 of the min/max buffers is reported for the block.
 lengths->minpiv[blockIdx.x] = minpiv[0];
 lengths->maxpiv[blockIdx.x] = maxpiv[0];

}



// Kernel: scatter pass.
// Each thread derives two cursors, x and y, from the per-block values in `lengths`
// and the per-thread values in `hist`, then writes elements into `data2`.
// The work item (from, end, pivot) comes from params[blockIdx.x].
//
// NOTE(review): two lines of this kernel are not valid C++ as they stand (marked
// below). Text between a '<' and a later '>' appears to have been lost, which
// removed the loads of `d`, the loop bounds and every use of `x`. Restore them
// from the original source before compiling.
__global__ void part2(unsigned int* data, unsigned int* data2, struct Params* params, struct Hist* hist, Length* lengths)
{
 const int tx = threadIdx.x;
 const int bx = blockIdx.x;


 // Write cursors for this thread. Only `data2[y++]` is visible below, so y moves
 // upward; NOTE(review): x starts one below its computed position, presumably
 // because it is moved before use — confirm.
 unsigned int x = lengths->left[bx] + hist->left[bx*(THREADS)+tx]-1;// - 1;
 unsigned int y = lengths->right[bx] - hist->right[bx*(THREADS)+tx];


 // This block's work item.
 unsigned int start = params[bx].from;
 unsigned int end = params[bx].end;
 unsigned int pivot = params[bx].pivot;

 __syncthreads();

 // Round the start index down to a multiple of 16.
 int coal = (start&0xf);
 start = start-coal;


 // NOTE(review): damaged line — the bound check, the load of `d` and the first
 // comparison (with its write) are missing; only the tail `pivot)` survives.
 if(tx+startpivot)
   data2[y++]=d;
  }
 }

 __syncthreads();


 // Remaining elements, starting one block width further on.
 // NOTE(review): damaged line — loop bound, stride, load of `d` and first
 // comparison are missing.
 for(unsigned int i=start+tx+THREADS;ipivot)
   data2[y++]=d;
 }

 return;
}



__global__ void part3(unsigned int* data, struct Params* params, struct Hist* hist, Length* lengths)
{
 const int tx = threadIdx.x;
 const int bx = blockIdx.x;


 if(params[bx].last)
 {

  unsigned int x = lengths->left[bx] + hist->left[bx*THREADS+THREADS-1] + tx;
  unsigned int y = lengths->right[bx] - hist->right[bx*THREADS+THREADS-1];
  unsigned int pivot = params[bx].pivot;


  for(;x=0)
 {
  __syncthreads();

  if(tx==0)
  {
   from = beg[bi];
   to = end[bi];


   if(!flip[bi])
   {
    data = adata2;
    data2 = adata;
   }
   else
   {
    data = adata;
    data2 = adata2;
   }

  }


  __syncthreads();


  if((to-from)<(sbsize-16))
  {
   if((to-from>=1)&&(lphase!=2))
    bitonicSort(data,adata,from,to-from);
   __syncthreads();


   if(tx==0)
    bi--;
   __syncthreads();

   continue;
  }

 
  if(tx==0)
  {
   unsigned int mip = min(min(data[from],data[to-1]),data[(from+to)/2]);
   unsigned int map = max(max(data[from],data[to-1]),data[(from+to)/2]);
   pivot = min(max(mip/2+map/2,mip),map);
  }


  unsigned int ll=0;
  unsigned int lr=0;

  __syncthreads();
  
  unsigned int coal = (from)&0xf;

  if(tx+from-coalpivot)
     lr++;
   }
  }



  for(int i=from+tx+THREADS-coal;ipivot)
    lr++;
  }


  lblock[tx]=ll;
  rblock[tx]=lr;
  
  __syncthreads();


  cumcount((unsigned int*)lblock,(unsigned int*)rblock);

  __syncthreads();


  if(tx==0)
  {

   flip[bi+1] = !flip[bi];
   flip[bi] = !flip[bi];


   if(lblock[THREADS]pivot)
     data2[y++] = d;
   }
  }


  for(unsigned int i=from+tx+THREADS-coal;ipivot)
    data2[y++] = d;
   
  }

  __syncthreads();


  for(unsigned int i=from+lblock[THREADS]+tx;i

 

你可能感兴趣的:(CUDA实现QuickSort排序算法(一))