基数排序是具有固定迭代次数的排序算法, 其通过对最低位到最高位的一一比较,对数值排序。GPU版的基数排序将数据分为N个部分并行进行基数排序,随后并行规约得到排序后的数组。
这里实现了一版多batch的基数排序实例,并同时输出原数组的序号,基本实现了argsort的功能,代码如下所示:
#include
#include
__device__ void preprocess_float(float* const data, int batch, int numData, int tidx,int tidy)
{
if(tidy>numData) return;
if(tidx>batch) return;
for(int i = tidy;i<numData;i+=blockDim.y)
{
unsigned int *data_temp = (unsigned int *)(&data[i + tidx*numData]);
*data_temp = (*data_temp >> 31 & 0x1)? ~(*data_temp): (*data_temp) | 0x80000000;
}
}
__device__ void Aeprocess_float(float* const data, int batch, int numData, int tidx,int tidy)
{
for(int i = tidy;i<numData;i+=blockDim.y)
{
unsigned int* data_temp = (unsigned int *)(&data[i + tidx*numData]);
*data_temp = (*data_temp >> 31 & 0x1)? (*data_temp) & 0x7fffffff: ~(*data_temp);
}
}
__device__ void radixKernel(float* data0,float* data1,int tidx,int tidy,int batch,int numData){
for(int bit=0;bit<32;bit++){
unsigned int mask = 1 << bit;
unsigned int cnt0 = 0,cnt1 = 0;
for(int i=tidy;i<numData;i+=blockDim.y){
unsigned int *temp =(unsigned int *) &data0[i + tidx*numData];
if(*temp&mask){
data1[tidy+cnt1 + tidx*numData] = data0[i + tidx*numData];
cnt1 += blockDim.y;
}
else{
data0[tidy+cnt0+ tidx*numData] = data0[i+ tidx*numData];
cnt0 += blockDim.y;
}
}
for(int j=0;j<cnt1;j+=blockDim.y){
data0[j+cnt0+tidy+ tidx*numData] = data1[j+tidy+ tidx*numData];
}
}
return;
}
__device__ void mergeKernel(float* data0,float* data1,int* index,int tidx,int tidy,int batch,int numData){
int numPerList = ceil((float)numData / blockDim.y);
extern __shared__ int listIndexrecordValrecordTid[];
int* listIndex = (int*)listIndexrecordValrecordTid; //记录线程上指针位置
float* recordVal = (float*)listIndexrecordValrecordTid + batch*blockDim.y; //得到应比较的数
int * recordTid = (int*)listIndexrecordValrecordTid + 2*batch*blockDim.y; // 记录当前线程
int* recordSrcIndex = (int*)listIndexrecordValrecordTid + 3*batch*blockDim.y; // 记录原index
listIndex[tidy + tidx * blockDim.y] = 0;
recordVal[tidy + tidx * blockDim.y] = 0;
recordTid[tidy + tidx * blockDim.y] = tidy + tidx * blockDim.y;
recordSrcIndex[tidy + tidx * blockDim.y] = 0;
__syncthreads();
for(int i=0;i<numData;i++){
recordVal[tidy + tidx * blockDim.y] = 0;
recordTid[tidy + tidx * blockDim.y] = tidy + tidx * blockDim.y;
recordSrcIndex[tidy + tidx * blockDim.y] = 0;
if(listIndex[tidy + tidx * blockDim.y] < numPerList)
{
int src_index = tidy + tidx * numData + listIndex[tidy + tidx * blockDim.y]*blockDim.y;
int batch_index = tidy + listIndex[tidy + tidx * blockDim.y]*blockDim.y;
if(batch_index < numData)
{
recordVal[tidy + tidx * blockDim.y] = data0[src_index];
recordSrcIndex[tidy + tidx * blockDim.y] = src_index;
}
else{
unsigned int *temp = (unsigned int *)&recordVal[tidy + tidx * blockDim.y];
*temp = 0xffffffff;
}
}else{
unsigned int *temp = (unsigned int *)&recordVal[tidy + tidx * blockDim.y];
*temp = 0xffffffff;
}
__syncthreads();
int tidMax = blockDim.y >> 1;
while (tidMax!=0)
{
if(tidy < tidMax)
{
unsigned int* temp1 = (unsigned int*)&recordVal[tidy + tidx * blockDim.y];
unsigned int *temp2 = (unsigned int*)&recordVal[tidy + tidx * blockDim.y + tidMax];
if(*temp2 < *temp1)
{
recordVal[tidy + tidx * blockDim.y] = recordVal[tidy + tidx * blockDim.y + tidMax];
recordTid[tidy + tidx * blockDim.y] = recordTid[tidy + tidx * blockDim.y + tidMax];
recordSrcIndex[tidy + tidx * blockDim.y] = recordSrcIndex[tidy + tidx * blockDim.y + tidMax];
}
}
tidMax = tidMax >> 1;
__syncthreads();
}
if(tidy==0){
listIndex[recordTid[tidx * blockDim.y]]++;
data1[i + tidx * numData] = recordVal[tidx * blockDim.y];
index[i + tidx * numData] = recordSrcIndex[tidx * blockDim.y]%numData;
}
__syncthreads();
}
return;
}
__global__ void radixSortGpu(float* src_data, float* dst_data, int* index,int batch, int dataLen){
int tidx = threadIdx.x;
int tidy = threadIdx.y;
preprocess_float(src_data, batch,dataLen, tidx,tidy);
__syncthreads();
radixKernel(src_data,dst_data,tidx,tidy,batch,dataLen);
__syncthreads();
mergeKernel(src_data,dst_data,index,tidx,tidy,batch,dataLen);
__syncthreads();
Aeprocess_float(dst_data, batch,dataLen, tidx,tidy);
return;
}
int main(){
using namespace std;
int batch=4;
int inputsLen = 40;
int numBolcks=128;
float* inputs;
float* outputs;
int* index;
srand(100);
cudaMallocManaged(&inputs,sizeof(float)*inputsLen*batch);
cudaMallocManaged(&outputs,sizeof(float)*inputsLen*batch);
cudaMallocManaged(&index,sizeof(int)*inputsLen*batch);
cout<<"input rand :"<<endl;
for(int j=0;j<batch;j++){
for(int i=0;i<inputsLen;i++){
inputs[i+j*inputsLen] = (float)rand()/(float)RAND_MAX;
index[i+j*inputsLen] = i;
cout<<inputs[i+j*inputsLen]<<", ";
}
cout<<" "<<endl;
}
cout<<" "<<endl;
const dim3 blockSize(batch,numBolcks);
const dim3 girdSize(1,1);
radixSortGpu<<<girdSize,blockSize,4*batch*numBolcks*sizeof(float)>>>(inputs,outputs,index,batch,inputsLen);
cudaDeviceSynchronize();
cout<<"output :"<<endl;
for(int j=0;j<batch;j++){
for(int i=0;i<inputsLen;i++){
cout<<outputs[i+j*inputsLen]<<", ";
}
cout<<" "<<endl;
}
cout<<" "<<endl;
cout<<"index :"<<endl;
for(int j=0;j<batch;j++){
for(int i=0;i<inputsLen;i++){
cout<<index[i+j*inputsLen]<<", ";
}
cout<<" "<<endl;
}
cout<<" "<<endl;
return 0;
}
编译执行代码,可以得到结果:
input rand :
0.315598, 0.284943, 0.240601, 0.484127, 0.375793, 0.0537027, 0.570274, 0.970005, 0.515422, 0.429529, 0.408115, 0.150135, 0.586551, 0.631635, 0.61386, 0.411339, 0.107092, 0.871626, 0.264386, 0.621543, 0.670743, 0.358033, 0.208356, 0.534175, 0.384512, 0.844556, 0.883552, 0.461531, 0.650512, 0.772418, 0.496347, 0.96611, 0.0573612, 0.736949, 0.450236, 0.433154, 0.790652, 0.0205103, 0.403159, 0.306074,
0.450039, 0.811274, 0.456208, 0.0365907, 0.442909, 0.0700681, 0.44793, 0.550001, 0.941694, 0.712316, 0.171544, 0.612436, 0.0703487, 0.3799, 0.146612, 0.45486, 0.224456, 0.0301636, 0.916391, 0.874968, 0.802581, 0.412738, 0.841078, 0.859943, 0.149687, 0.291314, 0.293097, 0.940339, 0.311825, 0.696256, 0.246413, 0.761864, 0.50753, 0.702621, 0.798455, 0.950439, 0.772689, 0.246385, 0.50044, 0.714383,
0.9587, 0.671984, 0.326819, 0.0290491, 0.0518843, 0.473431, 0.483909, 0.27634, 0.503595, 0.4003, 0.151308, 0.306176, 0.813039, 0.992386, 0.166119, 0.962726, 0.2837, 0.459215, 0.903065, 0.595525, 0.155472, 0.149477, 0.357389, 0.663002, 0.852098, 0.155843, 0.613441, 0.624787, 0.402228, 0.113881, 0.33917, 0.360928, 0.785866, 0.665989, 0.389977, 0.83775, 0.13942, 0.873886, 0.11409, 0.643015,
0.274187, 0.265398, 0.949191, 0.0872253, 0.257784, 0.115309, 0.0499512, 0.541484, 0.574525, 0.953016, 0.137009, 0.729996, 0.102493, 0.494398, 0.392998, 0.954591, 0.650241, 0.00643936, 0.579378, 0.0524684, 0.120321, 0.918549, 0.413396, 0.906187, 0.584538, 0.803373, 0.743937, 0.723958, 0.67726, 0.858027, 0.366973, 0.951447, 0.123425, 0.316164, 0.0386718, 0.38121, 0.431473, 0.0886231, 0.922694, 0.00599772,
output :
0.0205103, 0.0537027, 0.0573612, 0.107092, 0.150135, 0.208356, 0.240601, 0.264386, 0.284943, 0.306074, 0.315598, 0.358033, 0.375793, 0.384512, 0.403159, 0.408115, 0.411339, 0.429529, 0.433154, 0.450236, 0.461531, 0.484127, 0.496347, 0.515422, 0.534175, 0.570274, 0.586551, 0.61386, 0.621543, 0.631635, 0.650512, 0.670743, 0.736949, 0.772418, 0.790652, 0.844556, 0.871626, 0.883552, 0.96611, 0.970005,
0.0301636, 0.0365907, 0.0700681, 0.0703487, 0.146612, 0.149687, 0.171544, 0.224456, 0.246385, 0.246413, 0.291314, 0.293097, 0.311825, 0.3799, 0.412738, 0.442909, 0.44793, 0.450039, 0.45486, 0.456208, 0.50044, 0.50753, 0.550001, 0.612436, 0.696256, 0.702621, 0.712316, 0.714383, 0.761864, 0.772689, 0.798455, 0.802581, 0.811274, 0.841078, 0.859943, 0.874968, 0.916391, 0.940339, 0.941694, 0.950439,
0.0290491, 0.0518843, 0.113881, 0.11409, 0.13942, 0.149477, 0.151308, 0.155472, 0.155843, 0.166119, 0.27634, 0.2837, 0.306176, 0.326819, 0.33917, 0.357389, 0.360928, 0.389977, 0.4003, 0.402228, 0.459215, 0.473431, 0.483909, 0.503595, 0.595525, 0.613441, 0.624787, 0.643015, 0.663002, 0.665989, 0.671984, 0.785866, 0.813039, 0.83775, 0.852098, 0.873886, 0.903065, 0.9587, 0.962726, 0.992386,
0.00599772, 0.00643936, 0.0386718, 0.0499512, 0.0524684, 0.0872253, 0.0886231, 0.102493, 0.115309, 0.120321, 0.123425, 0.137009, 0.257784, 0.265398, 0.274187, 0.316164, 0.366973, 0.38121, 0.392998, 0.413396, 0.431473, 0.494398, 0.541484, 0.574525, 0.579378, 0.584538, 0.650241, 0.67726, 0.723958, 0.729996, 0.743937, 0.803373, 0.858027, 0.906187, 0.918549, 0.922694, 0.949191, 0.951447, 0.953016, 0.954591,
index :
37, 5, 32, 16, 11, 22, 2, 18, 1, 39, 0, 21, 4, 24, 38, 10, 15, 9, 35, 34, 27, 3, 30, 8, 23, 6, 12, 14, 19, 13, 28, 20, 33, 29, 36, 25, 17, 26, 31, 7,
17, 3, 5, 12, 14, 24, 10, 16, 37, 30, 25, 26, 28, 13, 21, 4, 6, 0, 15, 2, 38, 32, 7, 11, 29, 33, 9, 39, 31, 36, 34, 20, 1, 22, 23, 19, 18, 27, 8, 35,
3, 4, 29, 38, 36, 21, 10, 20, 25, 14, 7, 16, 11, 2, 30, 22, 31, 34, 9, 28, 17, 5, 6, 8, 19, 26, 27, 39, 23, 33, 1, 32, 12, 35, 24, 37, 18, 0, 15, 13,
39, 17, 34, 6, 19, 3, 37, 12, 5, 20, 32, 10, 4, 1, 0, 33, 30, 35, 14, 22, 36, 13, 7, 8, 18, 24, 16, 28, 27, 11, 26, 25, 29, 23, 21, 38, 2, 31, 9, 15,
对比numpy中的结果,确信结果无误。
或者使用nvidia官方的cub库也可以实现同样的效果,具体代码如下:
#include
#include
#include
int main(){
using namespace std;
int batch=4;
int inputsLen = 40;
int* d_offset;
float* inputs;
float* outputs;
int* index;
int* outIndex;
srand(100);
cudaMallocManaged(&inputs,sizeof(float)*inputsLen*batch);
cudaMallocManaged(&outputs,sizeof(float)*inputsLen*batch);
cudaMallocManaged(&index,sizeof(int)*inputsLen*batch);
cudaMallocManaged(&outIndex,sizeof(int)*inputsLen*batch);
cudaMallocManaged(&d_offset,sizeof(int)*(batch+1));
cout<<"input rand :"<<endl;
d_offset[0] = 0;
for(int j=0;j<batch;j++){
for(int i=0;i<inputsLen;i++){
inputs[i+j*inputsLen] = (float)rand()/(float)RAND_MAX;
index[i+j*inputsLen] = i;
cout<<inputs[i+j*inputsLen]<<", ";
}
d_offset[j+1] = inputsLen*(j+1); // 1 2 3 4 --> 0 3
cout<<" "<<endl;
}
cout<<" "<<endl;
size_t temp_storage_bytes = 0;
void *d_temp_storage = NULL;
cub::DeviceSegmentedRadixSort::SortPairs(
d_temp_storage, temp_storage_bytes,
inputs, outputs,
index, outIndex,
batch * inputsLen, batch,
d_offset, d_offset + 1);
cudaMalloc(&d_temp_storage, temp_storage_bytes);
cub::DeviceSegmentedRadixSort::SortPairs(
d_temp_storage, temp_storage_bytes,
inputs, outputs,
index, outIndex,
batch * inputsLen, batch,
d_offset,d_offset + 1);
cudaDeviceSynchronize();
cout<<"output :"<<endl;
for(int j=0;j<batch;j++){
for(int i=0;i<inputsLen;i++){
cout<<outputs[i+j*inputsLen]<<", ";
}
cout<<" "<<endl;
}
cout<<" "<<endl;
cout<<"index :"<<endl;
for(int j=0;j<batch;j++){
for(int i=0;i<inputsLen;i++){
cout<<outIndex[i+j*inputsLen]<<", ";
}
cout<<" "<<endl;
}
cout<<" "<<endl;
return 0;
}
实现了一个基于cuda加速多batch的双线性插值resize,稍加改造可以用于视频流模型输入时的resize方法,具体代码如下:
#include
#include
#include
#include
#include
#include
#include
using namespace std;
using namespace cv;
__device__ uchar3 getValues(uchar3* input,int x,int y,int b, int H,int W){
if (x<0 || x>W || y<0 || y>H) return make_uchar3(0,0,0);
return input[b*H*W + y*W + x];
}
__global__ void bilinearKernel( uchar3*input, uchar3*output, int oriH, int oriW, int outH, int outW, int batch, float scaleX, float scaleY, int shiftX,int shiftY ){
int b = threadIdx.x + blockDim.x * blockIdx.x;
int outXY = threadIdx.y + blockDim.y * blockIdx.y;
int outX = outXY % outW;
int outY = outXY / outW;
if (outX>=outW || outY>=outH || b>=batch) return;
float srcX = (outX - shiftX + 0.5) * scaleX -0.5;
float srcY = (outY - shiftY + 0.5) * scaleY -0.5;
int minSrcX = (int)srcX;
int minSrcY = (int)srcY;
int maxSrcX = (int)srcX + 1;
int maxSrcY = (int)srcY + 1;
float w1 = (srcX-minSrcX) * ( srcY-minSrcY);
float w2 = (maxSrcX-srcX) * ( srcY-minSrcY);
float w3 = (maxSrcX-srcX) * ( maxSrcY-srcY);
float w4 = (srcX-minSrcX) * ( maxSrcY-srcY);
uchar3 v1 = getValues(input,minSrcX,minSrcY,b,oriH,oriW);
uchar3 v2 = getValues(input,maxSrcX,minSrcY,b,oriH,oriW);
uchar3 v3 = getValues(input,maxSrcX,maxSrcY,b,oriH,oriW);
uchar3 v4 = getValues(input,minSrcX,maxSrcY,b,oriH,oriW);
output[b*outW*outH + outY*outW + outX].x = (uchar)(w1 * (float)v1.x + w2 * (float)v2.x + w3 * (float)v3.x + w4 * (float)v4.x);
output[b*outW*outH + outY*outW + outX].y = (uchar)(w1 * (float)v1.y + w2 * (float)v2.y + w3 * (float)v3.y + w4 * (float)v4.y);
output[b*outW*outH + outY*outW + outX].z = (uchar)(w1 * (float)v1.z + w2 * (float)v2.z + w3 * (float)v3.z + w4 * (float)v4.z);
return;
}
void stringSplit(string str, const const char split,vector<string>& res)
{
istringstream iss(str); // 输入流
string token; // 接收缓冲区
while (getline(iss, token, split)) // 以split为分隔符
{
res.push_back(token);
}
}
int main(int argc,char**argv){
int outH;
int outW;
int keepRatio = 0;
int keepCenter = 0;
string imgPath;
string outPath;
int opt=0,option_index = 0;
static struct option opts[]=
{
{"outH",required_argument,nullptr,'h'},// 长选项名,required_argument 表明要跟参数,返回值是什么,返回值
{"outW",required_argument,nullptr,'w'},
{"keepRatio",no_argument,nullptr,'r'},
{"keepCenter",no_argument,nullptr,'c'},
{"imgPath",required_argument,nullptr,'i'},
{"outPath",required_argument,nullptr,'o'},
{0,0,0,0}
};
while((opt=getopt_long_only(argc,argv,"h:w:i:o:rc",opts,&option_index))!=-1)
{
switch (opt)
{
case 'h':outH = atoi(optarg);break;
case 'w':outW = atoi(optarg);break;
case 'i':imgPath = string(optarg);break;
case 'o':outPath = string(optarg);break;
case 'r':keepRatio = 1;break;
case 'c':keepCenter =1;break;
default:
break;
}
}
if(imgPath.find(".jpg") != string::npos || imgPath.find(".png") != string::npos){
Mat img = imread(imgPath);
int oriWidth = img.size().width;
int oriHeight = img.size().height;
uchar3* inputs;
uchar3* outputs;
cudaMallocManaged(&inputs,sizeof(uchar3)*oriWidth*oriHeight);
cudaMallocManaged(&outputs,sizeof(uchar3)*outH*outW);
float scaleX = (oriWidth*1.0f / outW);
float scaleY = (oriHeight*1.0f / outH);
float shiftX = 0.f ,shiftY = 0.f;
if(keepRatio)scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;
if(keepRatio && keepCenter){shiftX = (outW - oriWidth/scaleX)/2.f;shiftY = (outH - oriHeight/scaleY)/2.f;}
cudaMemcpy(inputs,img.data,sizeof(uchar3)*oriHeight*oriWidth,cudaMemcpyHostToDevice);
dim3 blockSize(1,512);
dim3 gridSize(1,(outH*outW+512-1)/512);
bilinearKernel<<<gridSize,blockSize>>>(inputs,outputs,oriHeight,oriWidth,outH,outW,1,scaleX,scaleY,shiftX,shiftY);
Mat outImg(outH,outW,CV_8UC3,Scalar(0,0,0));
cudaMemcpy(outImg.data,outputs,sizeof(uchar3)*outH*outW,cudaMemcpyDeviceToHost);
imwrite(outPath,outImg);
}
else if (imgPath.find(".txt") != string::npos)
{
cout<<"read image list "<<imgPath<<endl;
ifstream inputImageNameList(imgPath);
vector<string> fileNames;
vector<Mat> imgs;
auto dataptr = imgs.data();
int oriWidth = 0;
int oriHeight = 0;
if(!inputImageNameList.is_open()){
cout<<"can not read image list "<<imgPath<<endl;
return 1;
}
string strLine;
while (getline(inputImageNameList,strLine)){
Mat img = imread(strLine);
oriWidth = img.size().width;
oriHeight = img.size().height;
imgs.push_back(img);
vector<string> strList;
string str2("This-is-a-test");
stringSplit(strLine, '/', strList); // 将子串存放到strList中
int lenStrList = strList.size();
fileNames.push_back(strList[lenStrList-1]);
}
inputImageNameList.close();
int batch = fileNames.size();
uchar3* inputs;
uchar3* outputs;
cudaMallocManaged(&inputs,sizeof(uchar3)*oriWidth*oriHeight*batch);
cudaMallocManaged(&outputs,sizeof(uchar3)*outH*outW*batch);
float scaleX = (oriWidth*1.0f / outW);
float scaleY = (oriHeight*1.0f / outH);
float shiftX = 0.f ,shiftY = 0.f;
if(keepRatio)scaleX = scaleY = scaleX > scaleY ? scaleX : scaleY;
if(keepRatio && keepCenter){shiftX = (outW - oriWidth/scaleX)/2.f;shiftY = (outH - oriHeight/scaleY)/2.f;}
Mat outImg_x(oriHeight,oriWidth,CV_8UC3,Scalar(0,0,0));
for(int b=0;b<batch;++b){
cudaMemcpy(inputs+oriHeight*oriWidth*b,imgs[b].data,sizeof(uchar3)*oriHeight*oriWidth,cudaMemcpyHostToDevice);
}
dim3 blockSize(1,512);
dim3 gridSize(batch,(outH*outW+512-1)/512);
bilinearKernel<<<gridSize,blockSize>>>(inputs,outputs,oriHeight,oriWidth,outH,outW,batch,scaleX,scaleY,shiftX,shiftY);
Mat outImg(outH,outW,CV_8UC3,Scalar(0,0,0));
for(int b=0;b<batch;++b){
cudaMemcpy(outImg.data,outputs+b*outH*outW,sizeof(uchar3)*outH*outW,cudaMemcpyDeviceToHost);
imwrite(outPath+"result_"+fileNames[b],outImg);
}
}
return;
}