CUDA实现多batch基数排序

CUDA实现多batch基数排序

基数排序是具有固定迭代次数的排序算法, 其通过对最低位到最高位的一一比较,对数值排序。GPU版的基数排序将数据分为N个部分并行进行基数排序,随后并行规约得到排序后的数组。
这里实现了一版多batch的基数排序实例,并同时输出原数组的序号,基本实现了argsort的功能,代码如下所示:

#include
#include


__device__ void preprocess_float(float* const data, int batch, int numData, int tidx,int tidy)
{
    if(tidy>numData) return;
    if(tidx>batch) return;    
    for(int i = tidy;i<numData;i+=blockDim.y)
    {
        unsigned int *data_temp = (unsigned int *)(&data[i + tidx*numData]);    
        *data_temp = (*data_temp >> 31 & 0x1)? ~(*data_temp): (*data_temp) | 0x80000000; 
    }
}

__device__ void Aeprocess_float(float* const data, int batch, int numData, int tidx,int tidy)
{
    for(int i = tidy;i<numData;i+=blockDim.y)
    {
        unsigned int* data_temp = (unsigned int *)(&data[i + tidx*numData]);
        *data_temp = (*data_temp >> 31 & 0x1)? (*data_temp) & 0x7fffffff: ~(*data_temp);
    }
}


__device__ void radixKernel(float* data0,float* data1,int tidx,int tidy,int batch,int numData){
    for(int bit=0;bit<32;bit++){
        unsigned int mask = 1 << bit;
        unsigned int cnt0 = 0,cnt1 = 0;   
        for(int i=tidy;i<numData;i+=blockDim.y){
            unsigned int *temp =(unsigned int *) &data0[i + tidx*numData];
            if(*temp&mask){
                data1[tidy+cnt1 + tidx*numData] = data0[i + tidx*numData];
                cnt1 += blockDim.y;
            }
            else{
                data0[tidy+cnt0+ tidx*numData] = data0[i+ tidx*numData];
                cnt0 += blockDim.y;
            }
        }    
        for(int j=0;j<cnt1;j+=blockDim.y){
            data0[j+cnt0+tidy+ tidx*numData] = data1[j+tidy+ tidx*numData];
        } 
    }
    return;
}

__device__ void mergeKernel(float* data0,float* data1,int* index,int tidx,int tidy,int batch,int numData){
    int numPerList = ceil((float)numData / blockDim.y);
    extern __shared__ int listIndexrecordValrecordTid[]; 
    int* listIndex = (int*)listIndexrecordValrecordTid; //记录线程上指针位置
    float* recordVal = (float*)listIndexrecordValrecordTid + batch*blockDim.y;  //得到应比较的数
    int * recordTid = (int*)listIndexrecordValrecordTid + 2*batch*blockDim.y;  // 记录当前线程
    int* recordSrcIndex = (int*)listIndexrecordValrecordTid + 3*batch*blockDim.y;  // 记录原index
    listIndex[tidy + tidx * blockDim.y] = 0;
    recordVal[tidy + tidx * blockDim.y] = 0;
    recordTid[tidy + tidx * blockDim.y] = tidy + tidx * blockDim.y;
    recordSrcIndex[tidy + tidx * blockDim.y] = 0;
    __syncthreads();

    for(int i=0;i<numData;i++){
        recordVal[tidy + tidx * blockDim.y] = 0;
        recordTid[tidy + tidx * blockDim.y] = tidy + tidx * blockDim.y;
        recordSrcIndex[tidy + tidx * blockDim.y] = 0; 
        if(listIndex[tidy + tidx * blockDim.y] < numPerList)
        {
            int src_index = tidy + tidx * numData + listIndex[tidy + tidx * blockDim.y]*blockDim.y;
            int batch_index = tidy + listIndex[tidy + tidx * blockDim.y]*blockDim.y;
            if(batch_index < numData)
            {
                recordVal[tidy + tidx * blockDim.y] = data0[src_index];
                recordSrcIndex[tidy + tidx * blockDim.y] = src_index;
            }
            else{
                unsigned int *temp = (unsigned int *)&recordVal[tidy + tidx * blockDim.y];
                *temp = 0xffffffff;
            }
        }else{
                unsigned int *temp = (unsigned int *)&recordVal[tidy + tidx * blockDim.y];
                *temp = 0xffffffff;
        }

        __syncthreads();
        int tidMax = blockDim.y >> 1;
        while (tidMax!=0)
        {
            if(tidy < tidMax)
            {
                unsigned int* temp1 = (unsigned int*)&recordVal[tidy + tidx * blockDim.y];
                unsigned int *temp2 = (unsigned int*)&recordVal[tidy + tidx * blockDim.y + tidMax];
                if(*temp2 < *temp1)
                {
                    recordVal[tidy + tidx * blockDim.y] = recordVal[tidy + tidx * blockDim.y + tidMax];
                    recordTid[tidy + tidx * blockDim.y] = recordTid[tidy + tidx * blockDim.y + tidMax];
                    recordSrcIndex[tidy + tidx * blockDim.y] = recordSrcIndex[tidy + tidx * blockDim.y + tidMax];
                }
            }
            tidMax = tidMax >> 1;
            __syncthreads();

        }
        if(tidy==0){
            listIndex[recordTid[tidx * blockDim.y]]++;
            data1[i + tidx * numData] = recordVal[tidx * blockDim.y];
            index[i + tidx * numData] = recordSrcIndex[tidx * blockDim.y]%numData;
        }
        __syncthreads();
        
    }
    return;
}

__global__ void radixSortGpu(float* src_data, float* dst_data, int* index,int batch, int dataLen){
    int tidx = threadIdx.x;
    int tidy = threadIdx.y;
    preprocess_float(src_data, batch,dataLen, tidx,tidy); 
    __syncthreads();
    radixKernel(src_data,dst_data,tidx,tidy,batch,dataLen);
    __syncthreads();
    mergeKernel(src_data,dst_data,index,tidx,tidy,batch,dataLen);
    __syncthreads();
    Aeprocess_float(dst_data, batch,dataLen, tidx,tidy);
    return;
}


int main(){
    using namespace std;
    int batch=4;
    int inputsLen = 40;
    int numBolcks=128;
    float* inputs;
    float* outputs;
    int* index;
    srand(100);    
    cudaMallocManaged(&inputs,sizeof(float)*inputsLen*batch);
    cudaMallocManaged(&outputs,sizeof(float)*inputsLen*batch);
    cudaMallocManaged(&index,sizeof(int)*inputsLen*batch);

    cout<<"input rand :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            inputs[i+j*inputsLen] = (float)rand()/(float)RAND_MAX;
            index[i+j*inputsLen] = i;
            cout<<inputs[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;

    const dim3 blockSize(batch,numBolcks);
    const dim3 girdSize(1,1);
    radixSortGpu<<<girdSize,blockSize,4*batch*numBolcks*sizeof(float)>>>(inputs,outputs,index,batch,inputsLen);
    cudaDeviceSynchronize();

    cout<<"output :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            cout<<outputs[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;   

    cout<<"index :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            cout<<index[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;  

    return 0;
}

编译执行代码,可以得到结果:

input rand :
0.315598, 0.284943, 0.240601, 0.484127, 0.375793, 0.0537027, 0.570274, 0.970005, 0.515422, 0.429529, 0.408115, 0.150135, 0.586551, 0.631635, 0.61386, 0.411339, 0.107092, 0.871626, 0.264386, 0.621543, 0.670743, 0.358033, 0.208356, 0.534175, 0.384512, 0.844556, 0.883552, 0.461531, 0.650512, 0.772418, 0.496347, 0.96611, 0.0573612, 0.736949, 0.450236, 0.433154, 0.790652, 0.0205103, 0.403159, 0.306074,  
0.450039, 0.811274, 0.456208, 0.0365907, 0.442909, 0.0700681, 0.44793, 0.550001, 0.941694, 0.712316, 0.171544, 0.612436, 0.0703487, 0.3799, 0.146612, 0.45486, 0.224456, 0.0301636, 0.916391, 0.874968, 0.802581, 0.412738, 0.841078, 0.859943, 0.149687, 0.291314, 0.293097, 0.940339, 0.311825, 0.696256, 0.246413, 0.761864, 0.50753, 0.702621, 0.798455, 0.950439, 0.772689, 0.246385, 0.50044, 0.714383,  
0.9587, 0.671984, 0.326819, 0.0290491, 0.0518843, 0.473431, 0.483909, 0.27634, 0.503595, 0.4003, 0.151308, 0.306176, 0.813039, 0.992386, 0.166119, 0.962726, 0.2837, 0.459215, 0.903065, 0.595525, 0.155472, 0.149477, 0.357389, 0.663002, 0.852098, 0.155843, 0.613441, 0.624787, 0.402228, 0.113881, 0.33917, 0.360928, 0.785866, 0.665989, 0.389977, 0.83775, 0.13942, 0.873886, 0.11409, 0.643015,  
0.274187, 0.265398, 0.949191, 0.0872253, 0.257784, 0.115309, 0.0499512, 0.541484, 0.574525, 0.953016, 0.137009, 0.729996, 0.102493, 0.494398, 0.392998, 0.954591, 0.650241, 0.00643936, 0.579378, 0.0524684, 0.120321, 0.918549, 0.413396, 0.906187, 0.584538, 0.803373, 0.743937, 0.723958, 0.67726, 0.858027, 0.366973, 0.951447, 0.123425, 0.316164, 0.0386718, 0.38121, 0.431473, 0.0886231, 0.922694, 0.00599772,  
 
output :
0.0205103, 0.0537027, 0.0573612, 0.107092, 0.150135, 0.208356, 0.240601, 0.264386, 0.284943, 0.306074, 0.315598, 0.358033, 0.375793, 0.384512, 0.403159, 0.408115, 0.411339, 0.429529, 0.433154, 0.450236, 0.461531, 0.484127, 0.496347, 0.515422, 0.534175, 0.570274, 0.586551, 0.61386, 0.621543, 0.631635, 0.650512, 0.670743, 0.736949, 0.772418, 0.790652, 0.844556, 0.871626, 0.883552, 0.96611, 0.970005,  
0.0301636, 0.0365907, 0.0700681, 0.0703487, 0.146612, 0.149687, 0.171544, 0.224456, 0.246385, 0.246413, 0.291314, 0.293097, 0.311825, 0.3799, 0.412738, 0.442909, 0.44793, 0.450039, 0.45486, 0.456208, 0.50044, 0.50753, 0.550001, 0.612436, 0.696256, 0.702621, 0.712316, 0.714383, 0.761864, 0.772689, 0.798455, 0.802581, 0.811274, 0.841078, 0.859943, 0.874968, 0.916391, 0.940339, 0.941694, 0.950439,  
0.0290491, 0.0518843, 0.113881, 0.11409, 0.13942, 0.149477, 0.151308, 0.155472, 0.155843, 0.166119, 0.27634, 0.2837, 0.306176, 0.326819, 0.33917, 0.357389, 0.360928, 0.389977, 0.4003, 0.402228, 0.459215, 0.473431, 0.483909, 0.503595, 0.595525, 0.613441, 0.624787, 0.643015, 0.663002, 0.665989, 0.671984, 0.785866, 0.813039, 0.83775, 0.852098, 0.873886, 0.903065, 0.9587, 0.962726, 0.992386,  
0.00599772, 0.00643936, 0.0386718, 0.0499512, 0.0524684, 0.0872253, 0.0886231, 0.102493, 0.115309, 0.120321, 0.123425, 0.137009, 0.257784, 0.265398, 0.274187, 0.316164, 0.366973, 0.38121, 0.392998, 0.413396, 0.431473, 0.494398, 0.541484, 0.574525, 0.579378, 0.584538, 0.650241, 0.67726, 0.723958, 0.729996, 0.743937, 0.803373, 0.858027, 0.906187, 0.918549, 0.922694, 0.949191, 0.951447, 0.953016, 0.954591,  
 
index :
37, 5, 32, 16, 11, 22, 2, 18, 1, 39, 0, 21, 4, 24, 38, 10, 15, 9, 35, 34, 27, 3, 30, 8, 23, 6, 12, 14, 19, 13, 28, 20, 33, 29, 36, 25, 17, 26, 31, 7,  
17, 3, 5, 12, 14, 24, 10, 16, 37, 30, 25, 26, 28, 13, 21, 4, 6, 0, 15, 2, 38, 32, 7, 11, 29, 33, 9, 39, 31, 36, 34, 20, 1, 22, 23, 19, 18, 27, 8, 35,  
3, 4, 29, 38, 36, 21, 10, 20, 25, 14, 7, 16, 11, 2, 30, 22, 31, 34, 9, 28, 17, 5, 6, 8, 19, 26, 27, 39, 23, 33, 1, 32, 12, 35, 24, 37, 18, 0, 15, 13,  
39, 17, 34, 6, 19, 3, 37, 12, 5, 20, 32, 10, 4, 1, 0, 33, 30, 35, 14, 22, 36, 13, 7, 8, 18, 24, 16, 28, 27, 11, 26, 25, 29, 23, 21, 38, 2, 31, 9, 15,  

对比numpy中的结果,确信结果无误。

或者使用nvidia官方的cub库也可以实现同样的效果,具体代码如下:

#include
#include
#include


int main(){
    using namespace std;
    int batch=4;
    int inputsLen = 40;
    int* d_offset;
    float* inputs;
    float* outputs;
    int* index;
    int* outIndex;
    srand(100);    
    cudaMallocManaged(&inputs,sizeof(float)*inputsLen*batch);
    cudaMallocManaged(&outputs,sizeof(float)*inputsLen*batch);
    cudaMallocManaged(&index,sizeof(int)*inputsLen*batch);
    cudaMallocManaged(&outIndex,sizeof(int)*inputsLen*batch);
    cudaMallocManaged(&d_offset,sizeof(int)*(batch+1));

    cout<<"input rand :"<<endl;
    d_offset[0] = 0;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            inputs[i+j*inputsLen] = (float)rand()/(float)RAND_MAX;
            index[i+j*inputsLen] = i;
            cout<<inputs[i+j*inputsLen]<<", ";
        }
        d_offset[j+1] = inputsLen*(j+1); // 1 2 3 4 --> 0 3 
        cout<<" "<<endl;
    }
    cout<<" "<<endl;
    size_t  temp_storage_bytes  = 0;
    void    *d_temp_storage     = NULL;
    cub::DeviceSegmentedRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes,
        inputs, outputs,
        index, outIndex,
        batch * inputsLen, batch,
        d_offset, d_offset + 1);
    cudaMalloc(&d_temp_storage, temp_storage_bytes);
    cub::DeviceSegmentedRadixSort::SortPairs(
        d_temp_storage, temp_storage_bytes,
        inputs, outputs,
        index, outIndex,
        batch * inputsLen, batch,
        d_offset,d_offset + 1);
    cudaDeviceSynchronize();
    
    cout<<"output :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            cout<<outputs[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;   

    cout<<"index :"<<endl;
    for(int j=0;j<batch;j++){
        for(int i=0;i<inputsLen;i++){
            cout<<outIndex[i+j*inputsLen]<<", ";
        }
        cout<<" "<<endl;
    }
    cout<<" "<<endl;  

    return 0;
}

更新:

实现了一个基于cuda加速多batch的双线性插值resize,稍加改造可以用于视频流模型输入时的resize方法,具体代码如下:

#include
#include
#include
#include
#include
#include
#include

using namespace std;
using namespace cv;

__device__ uchar3 getValues(uchar3* input,int x,int y,int b, int H,int W){
    if (x<0 || x>W || y<0 || y>H) return make_uchar3(0,0,0);

    return input[b*H*W + y*W + x];
}

__global__ void bilinearKernel( uchar3*input, uchar3*output, int oriH, int oriW, int outH, int outW, int batch, float scaleX, float scaleY, int shiftX,int shiftY ){

    int b = threadIdx.x + blockDim.x * blockIdx.x;
    int outXY = threadIdx.y + blockDim.y * blockIdx.y;
    int outX = outXY % outW;
    int outY = outXY / outW;
    if (outX>=outW || outY>=outH || b>=batch) return;

    float srcX = (outX - shiftX + 0.5) * scaleX -0.5;
    float srcY = (outY - shiftY + 0.5) * scaleY -0.5;

    int minSrcX = (int)srcX;
    int minSrcY = (int)srcY;

    int maxSrcX = (int)srcX + 1;
    int maxSrcY = (int)srcY + 1;

    float w1 = (srcX-minSrcX) * ( srcY-minSrcY);
    float w2 = (maxSrcX-srcX) * ( srcY-minSrcY);
    float w3 = (maxSrcX-srcX) * ( maxSrcY-srcY);
    float w4 = (srcX-minSrcX) * ( maxSrcY-srcY);

    uchar3 v1 = getValues(input,minSrcX,minSrcY,b,oriH,oriW);
    uchar3 v2 = getValues(input,maxSrcX,minSrcY,b,oriH,oriW);
    uchar3 v3 = getValues(input,maxSrcX,maxSrcY,b,oriH,oriW);
    uchar3 v4 = getValues(input,minSrcX,maxSrcY,b,oriH,oriW);

    output[b*outW*outH + outY*outW + outX].x = (uchar)(w1 * (float)v1.x + w2 * (float)v2.x + w3 * (float)v3.x + w4 * (float)v4.x);
    output[b*outW*outH + outY*outW + outX].y = (uchar)(w1 * (float)v1.y + w2 * (float)v2.y + w3 * (float)v3.y + w4 * (float)v4.y);
    output[b*outW*outH + outY*outW + outX].z = (uchar)(w1 * (float)v1.z + w2 * (float)v2.z + w3 * (float)v3.z + w4 * (float)v4.z);

    return;
}

void stringSplit(string str, const const char split,vector<string>& res)
{
	istringstream iss(str);	// 输入流
	string token;			// 接收缓冲区
	while (getline(iss, token, split))	// 以split为分隔符
	{
		res.push_back(token);
	}
}

int main(int argc,char**argv){

    int outH;
    int outW;
    int keepRatio = 0;
    int keepCenter = 0;
    string imgPath;
    string outPath;

    int opt=0,option_index = 0;
    static struct option opts[]=
    {
        {"outH",required_argument,nullptr,'h'},// 长选项名,required_argument 表明要跟参数,返回值是什么,返回值
        {"outW",required_argument,nullptr,'w'},
        {"keepRatio",no_argument,nullptr,'r'},
        {"keepCenter",no_argument,nullptr,'c'},
        {"imgPath",required_argument,nullptr,'i'},
        {"outPath",required_argument,nullptr,'o'},
        {0,0,0,0}
    };

    while((opt=getopt_long_only(argc,argv,"h:w:i:o:rc",opts,&option_index))!=-1)
    {
        switch (opt)
        {
        case 'h':outH = atoi(optarg);break;
        case 'w':outW = atoi(optarg);break;
        case 'i':imgPath = string(optarg);break;
        case 'o':outPath = string(optarg);break;
        case 'r':keepRatio = 1;break;
        case 'c':keepCenter =1;break;
        
        default:
            break;
        }
    }

    if(imgPath.find(".jpg") != string::npos || imgPath.find(".png") != string::npos){

        Mat img = imread(imgPath);
        int oriWidth = img.size().width;
        int oriHeight = img.size().height;

        uchar3* inputs;
        uchar3* outputs;  
        cudaMallocManaged(&inputs,sizeof(uchar3)*oriWidth*oriHeight);
        cudaMallocManaged(&outputs,sizeof(uchar3)*outH*outW);

        float scaleX = (oriWidth*1.0f / outW);
        float scaleY = (oriHeight*1.0f / outH);
        float shiftX = 0.f ,shiftY = 0.f;
        if(keepRatio)scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;
        if(keepRatio && keepCenter){shiftX = (outW - oriWidth/scaleX)/2.f;shiftY = (outH - oriHeight/scaleY)/2.f;}

        cudaMemcpy(inputs,img.data,sizeof(uchar3)*oriHeight*oriWidth,cudaMemcpyHostToDevice);

        dim3 blockSize(1,512);
        dim3 gridSize(1,(outH*outW+512-1)/512);
        bilinearKernel<<<gridSize,blockSize>>>(inputs,outputs,oriHeight,oriWidth,outH,outW,1,scaleX,scaleY,shiftX,shiftY);

        Mat outImg(outH,outW,CV_8UC3,Scalar(0,0,0));
        cudaMemcpy(outImg.data,outputs,sizeof(uchar3)*outH*outW,cudaMemcpyDeviceToHost);

        imwrite(outPath,outImg);
    }
    else if (imgPath.find(".txt") != string::npos)
    {
        cout<<"read image list "<<imgPath<<endl;
        ifstream inputImageNameList(imgPath);
        vector<string> fileNames;
        vector<Mat> imgs;
        auto dataptr = imgs.data();
        int oriWidth = 0;
        int oriHeight = 0;
        if(!inputImageNameList.is_open()){
            cout<<"can not read image list "<<imgPath<<endl;
            return 1;
        }
        string strLine;
        while (getline(inputImageNameList,strLine)){
            Mat img = imread(strLine);
            oriWidth = img.size().width;
            oriHeight = img.size().height;  
            imgs.push_back(img);

            vector<string> strList;
            string str2("This-is-a-test");
            stringSplit(strLine, '/', strList);	// 将子串存放到strList中
            int lenStrList = strList.size();
            fileNames.push_back(strList[lenStrList-1]);

        }
        inputImageNameList.close();

        int batch = fileNames.size();

        uchar3* inputs;
        uchar3* outputs;  
        cudaMallocManaged(&inputs,sizeof(uchar3)*oriWidth*oriHeight*batch);
        cudaMallocManaged(&outputs,sizeof(uchar3)*outH*outW*batch);

        float scaleX = (oriWidth*1.0f / outW);
        float scaleY = (oriHeight*1.0f / outH);
        float shiftX = 0.f ,shiftY = 0.f;
        if(keepRatio)scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;
        if(keepRatio && keepCenter){shiftX = (outW - oriWidth/scaleX)/2.f;shiftY = (outH - oriHeight/scaleY)/2.f;}

        Mat outImg_x(oriHeight,oriWidth,CV_8UC3,Scalar(0,0,0));
        for(int b=0;b<batch;++b){
            cudaMemcpy(inputs+oriHeight*oriWidth*b,imgs[b].data,sizeof(uchar3)*oriHeight*oriWidth,cudaMemcpyHostToDevice);
        }
        dim3 blockSize(1,512);
        dim3 gridSize(batch,(outH*outW+512-1)/512);
        bilinearKernel<<<gridSize,blockSize>>>(inputs,outputs,oriHeight,oriWidth,outH,outW,batch,scaleX,scaleY,shiftX,shiftY);

        Mat outImg(outH,outW,CV_8UC3,Scalar(0,0,0));
        for(int b=0;b<batch;++b){
            cudaMemcpy(outImg.data,outputs+b*outH*outW,sizeof(uchar3)*outH*outW,cudaMemcpyDeviceToHost); 
            imwrite(outPath+"result_"+fileNames[b],outImg);       
        }
        

    }
    return;

}

你可能感兴趣的:(学习所得,c++,算法)