最近遇到一个 OpenCV 中去除小孔洞以及小连通域的算法。因为要优化并集成到项目中,所以使用 CUDA 对其进行加速。不过中间遇到了一些问题,亟待解决;而且这只是初版,其中关于最佳线程数量的分配等都还没有优化。先贴上来,等有更好的方式再做修改。
/**
 * Removes small connected regions, or fills small holes, in a binarized
 * single-channel 8-bit image using a queue-based region-growing pass.
 *
 * Every pixel gets a label in an auxiliary image:
 *   0 = not yet examined, 1 = queued in the region currently being grown,
 *   2 = belongs to a region that is too small (its colour is flipped),
 *   3 = keep as is (background, or a region larger than AreaLimit).
 *
 * @param Src         Input image, must be CV_8UC1.
 * @param Dst         Output image; (re)allocated to Src.size() / CV_8UC1 when
 *                    it does not already have that size and type.
 * @param AreaLimit   Regions with more than AreaLimit pixels are kept,
 *                    all others are flipped.
 * @param CheckMode   1: pixels < 10 are background and small remaining
 *                    regions are set to 0; any other value: pixels > 10 are
 *                    background and small remaining regions are set to 255.
 * @param NeihborMode 1: 8-neighbourhood; any other value: 4-neighbourhood.
 *
 * Progress and per-region statistics are printed to stdout.
 */
void Qimage2MatInteroperateGpu::removeSmallRegion(Mat & Src, Mat & Dst, int AreaLimit, int CheckMode, int NeihborMode)
{
    // The pixel accessors below are only valid for 8-bit single-channel data.
    CV_Assert(Src.type() == CV_8UC1);
    // No-op when Dst already matches; otherwise allocates a fresh buffer.
    Dst.create(Src.size(), CV_8UC1);

    int RemoveCount = 0;

    // Label image, same size as the source, initialised to 0 (= unexamined).
    Mat PointLabel = Mat::zeros(Src.size(), CV_8UC1);

    if (CheckMode == 1) // remove small white regions
    {
        cout << "去除小连通域.";
        for (int i = 0; i < Src.rows; i++)
        {
            for (int j = 0; j < Src.cols; j++)
            {
                if (Src.at<uchar>(i, j) < 10)
                {
                    PointLabel.at<uchar>(i, j) = 3; // black background: nothing to check
                }
            }
        }
    }
    else // fill small black holes
    {
        cout << "去除孔洞";
        for (int i = 0; i < Src.rows; i++)
        {
            for (int j = 0; j < Src.cols; j++)
            {
                if (Src.at<uchar>(i, j) > 10)
                {
                    PointLabel.at<uchar>(i, j) = 3; // white area: nothing to check
                }
            }
        }
    }
    showMat(PointLabel,"remove dong");

    // Neighbour offsets: the 4-neighbourhood first, diagonals appended for mode 1.
    vector<Point2i> NeihborPos;
    NeihborPos.push_back(Point2i(-1, 0));
    NeihborPos.push_back(Point2i(1, 0));
    NeihborPos.push_back(Point2i(0, -1));
    NeihborPos.push_back(Point2i(0, 1));
    if (NeihborMode == 1)
    {
        cout << "Neighbor mode: 8邻域." << endl;
        NeihborPos.push_back(Point2i(-1, -1));
        NeihborPos.push_back(Point2i(-1, 1));
        NeihborPos.push_back(Point2i(1, -1));
        NeihborPos.push_back(Point2i(1, 1));
    }
    else cout << "Neighbor mode: 4邻域." << endl;
    // Take the count from the table itself so an unexpected NeihborMode value
    // can never index past the offsets that were actually pushed.
    const int NeihborCount = static_cast<int>(NeihborPos.size());

    // Queue of the pixels that make up the region being grown; reused per seed.
    vector<Point2i> GrowBuffer;

    for (int i = 0; i < Src.rows; i++)
    {
        for (int j = 0; j < Src.cols; j++)
        {
            if (PointLabel.at<uchar>(i, j) != 0)
            {
                continue; // already classified
            }

            std::cout << "开始记录不合格的点: " << i << " " << j << std::endl;
            GrowBuffer.clear();
            GrowBuffer.push_back(Point2i(j, i)); // Point2i is (x, y) = (col, row)
            PointLabel.at<uchar>(i, j) = 1;      // mark as queued

            // The queue grows while it is being walked: every newly found
            // neighbour is appended and later visited itself.
            for (size_t z = 0; z < GrowBuffer.size(); z++)
            {
                for (int q = 0; q < NeihborCount; q++)
                {
                    const int CurrX = GrowBuffer[z].x + NeihborPos[q].x;
                    const int CurrY = GrowBuffer[z].y + NeihborPos[q].y;
                    if (CurrX >= 0 && CurrX < Src.cols && CurrY >= 0 && CurrY < Src.rows) // stay inside the image
                    {
                        if (PointLabel.at<uchar>(CurrY, CurrX) == 0)
                        {
                            GrowBuffer.push_back(Point2i(CurrX, CurrY));
                            PointLabel.at<uchar>(CurrY, CurrX) = 1; // prevents queuing the pixel twice
                        }
                    }
                }
            }
            std::cout << "计算获取到的区域对象的像素点: " << GrowBuffer.size() << " -- (j i) " << i << " " << j << std::endl;

            // Added on top of the in-progress label 1:
            // +2 -> 3 (region is large enough, keep), +1 -> 2 (too small, flip).
            int CheckResult = 0;
            if (static_cast<long long>(GrowBuffer.size()) > static_cast<long long>(AreaLimit))
            {
                CheckResult = 2;
            }
            else
            {
                CheckResult = 1;
                RemoveCount++; // one more region removed
            }
            for (size_t z = 0; z < GrowBuffer.size(); z++)
            {
                PointLabel.at<uchar>(GrowBuffer[z].y, GrowBuffer[z].x) += CheckResult;
            }
        }
    }

    // Colour written into flipped regions: 0 when removing white regions,
    // 255 when filling holes (every CheckMode other than 1 selects hole filling).
    const uchar FillValue = (CheckMode == 1) ? 0 : 255;

    for (int i = 0; i < Src.rows; ++i)
    {
        for (int j = 0; j < Src.cols; ++j)
        {
            const uchar label = PointLabel.at<uchar>(i, j);
            if (label == 2)
            {
                Dst.at<uchar>(i, j) = FillValue;
            }
            else if (label == 3)
            {
                Dst.at<uchar>(i, j) = Src.at<uchar>(i, j);
            }
        }
    }
    cout << RemoveCount << " objects removed." << endl;
}
定义
// Host-side entry point of the GPU implementation. It takes the same parameter
// list as the CPU removeSmallRegion above and reports success through its
// bool return value.
__host__ bool removeSmallRegionGpu(Mat & Src, Mat & Dst, int AreaLimit, int CheckMode, int NeihborMode);
/******************************************************/
// Function    : removeSmallRegionKernelGpu
// Description : kernel for small-connected-region removal.
// Parameters  : source             source image data (note: this must be a
//                                  binarized image obtained after grayscale
//                                  conversion).
//             : Auxiliary          per-pixel check-state flags.
//             : outImg             output image pixels.
//             : outCalculateNumber per-pixel int buffer; the kernel body stores
//                                  the grown region size at the seed pixel
//                                  (described by the author as a self-test aid).
//             : neiBorModeBuffer   neighbour offsets, neightborSize entries.
//             : GrowBuffer         scratch list of region pixels,
//                                  GrowBufferSize entries.
//             : width, height      image dimensions in pixels.
//             : AreaLimit          regions larger than this are kept.
//             : checkMode          1 = remove small white regions,
//                                  otherwise fill small holes.
//             : neiborMode         1 = use 8 neighbour offsets, otherwise 4.
// Returns     : nothing
/******************************************************/
__global__ void removeSmallRegionKernelGpu
(uchar* source, uchar* Auxiliary, uchar* outImg,
int *outCalculateNumber,
int2* neiBorModeBuffer, int neightborSize, int2 *GrowBuffer, int GrowBufferSize,
int width, int height,
int AreaLimit, int checkMode, int neiborMode
);
//实现的部分。这里说明一下:因为在核函数里实在做不到 vector::push_back() 那样的自增、追加,所以我就使用了一个全局的一维结构体数组 + 两个变量来模拟……(其实大家对比上面 OpenCV 的写法,就明白 CUDA 这部分为什么这么写了,包括上面的参数列表部分。当然,里面也有一些自测用的部分,比如 int *outCalculateNumber 这个参数,希望不会造成误解。)
/**
 * Small-region / small-hole removal kernel. Each thread handles the pixel at
 * (tidx, tidy) of a 2D launch and addresses it as tidx + tidy * width.
 *
 * Steps performed by a thread:
 *   1. Mark its pixel in Auxiliary with ckNumbser when it is background
 *      (checkMode == 1: source < 10, otherwise: source > 10).
 *   2. If its pixel is still 0 in Auxiliary, grow a region from it through the
 *      neighbour offsets, collecting the pixels in GrowBuffer and marking
 *      them 1, then add 1 (region size <= AreaLimit) or 2 (region larger) to
 *      every collected pixel's flag.
 *   3. Write its output pixel: flag 2 -> flipped colour, flag 3 -> source.
 *
 * Preconditions: GrowBufferSize >= 1; neiBorModeBuffer holds neightborSize
 * offsets; all image buffers hold width * height elements.
 *
 * NOTE(review): ckNumbser is not defined in the part of the file visible
 * here; the original comment says the background flag is 3 - confirm.
 *
 * NOTE(review): unresolved design problems - this kernel is NOT yet a correct
 * parallel implementation:
 *   - GrowBuffer is one buffer shared by every thread of the grid, and each
 *     growing thread starts writing at GrowBuffer[0], so concurrent threads
 *     overwrite each other's region lists.
 *   - Auxiliary is read and written by many threads without atomics, so
 *     several threads can claim pixels of the same region.
 *   - __syncthreads() only orders threads of one block; there is no
 *     grid-wide barrier between steps 1, 2 and 3. A thread may therefore
 *     reach step 3 while its pixel still carries the in-progress flag 1 and
 *     leave outImg untouched.
 * Fixing this needs a different algorithm (e.g. label propagation or
 * union-find labelling followed by a per-label area count) and separate
 * kernel launches per phase.
 */
__global__ void removeSmallRegionKernelGpu
(uchar* source, uchar* Auxiliary, uchar* outImg,
int *outCalculateNumber,
int2* neiBorModeBuffer, int neightborSize, int2 *GrowBuffer, int GrowBufferSize,
int width, int height,
int AreaLimit, int checkMode, int neiborMode
)
{
    const int tidx = threadIdx.x + blockIdx.x * blockDim.x;
    const int tidy = threadIdx.y + blockIdx.y * blockDim.y;

    // Valid coordinates are 0..width-1 / 0..height-1. Threads of the padded
    // grid tail skip all work but do not return early, so every thread of the
    // block still reaches the barrier below.
    const bool inImage = (tidx < width) && (tidy < height);
    const int offsetIndex = tidx + tidy * width;

    if (inImage)
    {
        if (checkMode == 1) // remove small white regions: black is background
        {
            if (source[offsetIndex] < 10)
            {
                Auxiliary[offsetIndex] = ckNumbser;
            }
        }
        else // fill holes: white is background
        {
            if (source[offsetIndex] > 10)
            {
                Auxiliary[offsetIndex] = ckNumbser;
            }
        }
    }

    // Block-wide barrier only; it does not order this block against others.
    __syncthreads();

    int neihborCount = 4;
    if (inImage && neightborSize < neihborCount)
    {
        printf("neightborSize less 4 \n");
    }
    if (neiborMode == 1)
    {
        neihborCount = 4 + 4 * neiborMode;
    }
    // Never read more offsets than the caller actually supplied.
    if (neihborCount > neightborSize)
    {
        neihborCount = neightborSize;
    }

    if (inImage && Auxiliary[offsetIndex] == 0)
    {
        const int offSetLength = width * height;

        GrowBuffer[0] = int2{ tidx, tidy };
        Auxiliary[offsetIndex] = 1; // in progress
        int GrowBUfferValied = 1;   // number of valid entries in GrowBuffer
        bool bufferFull = false;

        // The list grows while it is walked: newly found neighbours are
        // appended and visited in a later iteration.
        for (int z = 0; z < GrowBUfferValied && !bufferFull; z++)
        {
            const int2 temp = GrowBuffer[z];
            for (int q = 0; q < neihborCount; q++)
            {
                const int2 neigborTemp = neiBorModeBuffer[q];
                const int currX = temp.x + neigborTemp.x;
                const int currY = temp.y + neigborTemp.y;
                // The coordinate test alone guarantees a valid linear index,
                // including index 0 for pixel (0, 0).
                if (currX >= 0 && currX < width && currY >= 0 && currY < height)
                {
                    const int currxyOffset = currX + currY * width;
                    if (Auxiliary[currxyOffset] == 0)
                    {
                        // Check capacity before writing, not after.
                        if (GrowBUfferValied >= GrowBufferSize)
                        {
                            bufferFull = true;
                            break;
                        }
                        GrowBuffer[GrowBUfferValied] = int2{ currX, currY };
                        Auxiliary[currxyOffset] = 1; // prevents adding the pixel twice
                        GrowBUfferValied++;
                    }
                }
            }
            printf("GrowBUfferValied++ %d \n", GrowBUfferValied);
            if (bufferFull)
            {
                printf("GrowBUfferValied size number is over \n");
            }
        }
        if (GrowBUfferValied > 20)
        {
            printf("get recRange is %d %d -> %d imgsize: %d\n", tidx, tidy, GrowBUfferValied, offSetLength);
        }

        // Diagnostic output: region size found from this seed pixel.
        outCalculateNumber[offsetIndex] = GrowBUfferValied;

        // Added on top of the in-progress flag 1:
        // +2 -> 3 (large enough, keep), +1 -> 2 (too small, flip).
        const int checkResult = (GrowBUfferValied > AreaLimit) ? 2 : 1;

        for (int z = 0; z < GrowBUfferValied; z++)
        {
            const int2 temp = GrowBuffer[z];
            if (temp.x >= 0 && temp.x < width && temp.y >= 0 && temp.y < height)
            {
                Auxiliary[temp.x + temp.y * width] += checkResult;
            }
        }
    }

    // Colour for flipped regions: 0 when removing white regions, 255 when
    // filling holes (every checkMode other than 1 selects hole filling).
    const uchar fillValue = (checkMode == 1) ? 0 : 255;
    if (inImage)
    {
        if (Auxiliary[offsetIndex] == 2)
        {
            outImg[offsetIndex] = fillValue;
        }
        else if (Auxiliary[offsetIndex] == 3)
        {
            outImg[offsetIndex] = source[offsetIndex];
        }
    }
}
/// 本地函数实现部分.
/**
 * Host wrapper of the GPU small-region removal: uploads Src, runs
 * prereatmentAuxiliary and removeSmallRegionKernelGpu with one thread per
 * pixel, and downloads the result into Dst.
 *
 * @param Src         Input image; must be non-empty, 8-bit, single channel.
 * @param Dst         Output image; receives a CV_8UC1 image of Src's size
 *                    (reallocated if its current size/type differ).
 * @param AreaLimit   Forwarded to the kernel.
 * @param CheckMode   Forwarded to both kernels.
 * @param NeihborMode Forwarded to the kernel.
 * @return false when Src is rejected, true after the pipeline has run.
 *
 * Two debug windows ("auxiliaryTemp", "OUTauxiliary") are shown and the sum
 * of the kernel's per-pixel record buffer is printed.
 * CUDA failures are reported through HANDLE_ERROR.
 */
__host__ bool removeSmallRegionGpu(Mat & Src, Mat & Dst, int AreaLimit, int CheckMode, int NeihborMode)
{
    if (Src.data == nullptr)
    {
        std::cout << "src is nullptr" << std::endl;
        return false;
    }

    const int imgWidth = Src.cols;
    const int imgHeight = Src.rows;
    const int channels = Src.channels();
    // The kernels index exactly width * height bytes, so anything other than
    // one channel (not only three) has to be rejected.
    if (channels != 1)
    {
        std::cout << "src` channels is 3 or more,please convert 1 channel" << std::endl;
        return false;
    }
    if (Src.depth() != CV_8U)
    {
        std::cout << "src depth is not 8-bit unsigned" << std::endl;
        return false;
    }
    std::cout << "Src` channel is " << Src.channels() << std::endl;

    if (Dst.data == nullptr)
    {
        // An empty output image is created automatically as single channel.
        std::cout << "dst data is empty, the process will creat it default" << std::endl;
        Dst = cv::Mat::zeros(cv::Size(imgWidth, imgHeight), CV_8UC1);
    }

    // A raw memcpy needs densely packed rows; an ROI view is copied first.
    const cv::Mat srcHost = Src.isContinuous() ? Src : Src.clone();

    const size_t pixelCount = static_cast<size_t>(imgWidth) * static_cast<size_t>(imgHeight);
    const size_t imageBytes = pixelCount * sizeof(uchar);
    const size_t recordBytes = pixelCount * sizeof(int);

    // Device copy of the source image.
    uchar* srcGpu = nullptr;
    HANDLE_ERROR(cudaMalloc((void**)&srcGpu, imageBytes));
    HANDLE_ERROR(cudaMemcpy(srcGpu, srcHost.data, imageBytes, cudaMemcpyKind::cudaMemcpyHostToDevice));

    // Device output image, zero-initialised.
    uchar* DstGpu = nullptr;
    HANDLE_ERROR(cudaMalloc((void**)&DstGpu, imageBytes));
    HANDLE_ERROR(cudaMemset(DstGpu, 0, imageBytes));

    // Device flag image, zero-initialised (0 = unexamined).
    uchar* AuxiliaryGpu = nullptr;
    HANDLE_ERROR(cudaMalloc((void**)&AuxiliaryGpu, imageBytes));
    HANDLE_ERROR(cudaMemset(AuxiliaryGpu, 0, imageBytes));

    // 32 x 32 = 1024 threads per block; the grid is rounded up to cover the image.
    const int blockEdge = 32;
    dim3 threadsPerBlock(blockEdge, blockEdge);
    dim3 blocksPerGrid((imgWidth + threadsPerBlock.x - 1) / threadsPerBlock.x,
                       (imgHeight + threadsPerBlock.y - 1) / threadsPerBlock.y);

    // Neighbour offsets: 4-neighbourhood first, then the diagonals. Built on
    // the host and uploaded in one transfer.
    const int2 hostNeighbors[8] = {
        { -1, 0 }, { 1, 0 }, { 0, -1 }, { 0, 1 },
        { -1, -1 }, { -1, 1 }, { 1, -1 }, { 1, 1 }
    };
    thrust::device_vector<int2> neightborBuffer(hostNeighbors, hostNeighbors + 8);
    int2* neightBorBufferPtr = thrust::raw_pointer_cast(neightborBuffer.data());
    const int neightborSize = static_cast<int>(neightborBuffer.size());

    // Scratch list for region growing, one entry per pixel.
    thrust::device_vector<int2> grawBuffer(pixelCount);
    int2* grawBufferPtr = thrust::raw_pointer_cast(grawBuffer.data());
    const int grawBufferSize = static_cast<int>(grawBuffer.size());

    std::cout << "blocksPerGrid size: " << blocksPerGrid.x << " " << blocksPerGrid.y << std::endl;
    std::cout << "threadsPerBlock size: " << threadsPerBlock.x << " " << threadsPerBlock.y << std::endl;

    // Pre-mark the background pixels in the flag image.
    prereatmentAuxiliary<<<blocksPerGrid, threadsPerBlock>>>(srcGpu, AuxiliaryGpu, imgWidth, imgHeight, CheckMode);
    HANDLE_ERROR(cudaGetLastError()); // launch-configuration errors

    cv::Mat auxiliaryTemp = cv::Mat::zeros(cv::Size(imgWidth, imgHeight), CV_8UC1);
    HANDLE_ERROR(cudaMemcpy(auxiliaryTemp.data, AuxiliaryGpu, imageBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost));
    cv::imshow("auxiliaryTemp", auxiliaryTemp);

    // Diagnostic buffer: one int per pixel, filled by the kernel.
    int* recordBuffer = nullptr;
    HANDLE_ERROR(cudaMalloc((void**)&recordBuffer, recordBytes));
    HANDLE_ERROR(cudaMemset(recordBuffer, 0, recordBytes));

    // Region growing and output generation.
    removeSmallRegionKernelGpu<<<blocksPerGrid, threadsPerBlock>>>(
        srcGpu, AuxiliaryGpu, DstGpu, recordBuffer,
        neightBorBufferPtr, neightborSize,
        grawBufferPtr, grawBufferSize,
        imgWidth, imgHeight,
        AreaLimit, CheckMode, NeihborMode);
    HANDLE_ERROR(cudaGetLastError());      // launch-configuration errors
    HANDLE_ERROR(cudaDeviceSynchronize()); // errors raised while the kernel ran

    // Sum the diagnostic buffer; a vector releases the host copy automatically
    // and a 64-bit accumulator cannot overflow on large images.
    std::vector<int> cpuBuffer(pixelCount, 0);
    HANDLE_ERROR(cudaMemcpy(cpuBuffer.data(), recordBuffer, recordBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost));
    long long controlNumber = 0;
    for (size_t i = 0; i < cpuBuffer.size(); i++)
    {
        controlNumber += cpuBuffer[i];
    }
    std::cout << "最后总数: " << controlNumber << std::endl;

    // Download into a packed temporary, then let copyTo adapt Dst: it reuses
    // Dst's buffer when size and type match and reallocates otherwise.
    cv::Mat result(imgHeight, imgWidth, CV_8UC1);
    HANDLE_ERROR(cudaMemcpy(result.data, DstGpu, imageBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost));
    result.copyTo(Dst);

    cv::Mat OUTauxiliary = cv::Mat::zeros(cv::Size(imgWidth, imgHeight), CV_8UC1);
    HANDLE_ERROR(cudaMemcpy(OUTauxiliary.data, AuxiliaryGpu, imageBytes, cudaMemcpyKind::cudaMemcpyDeviceToHost));
    cv::imshow("OUTauxiliary", OUTauxiliary);

    // Release device memory.
    HANDLE_ERROR(cudaFree(srcGpu));
    HANDLE_ERROR(cudaFree(DstGpu));
    HANDLE_ERROR(cudaFree(AuxiliaryGpu));
    HANDLE_ERROR(cudaFree(recordBuffer));
    return true;
}
使用方式
/**
 * Demo driver: loads the sample bitmap, reduces it to a single 8-bit channel
 * and runs both the CPU and the GPU implementation on it with
 * AreaLimit = 100, CheckMode = 1, NeihborMode = 1.
 */
void test()
{
    cv::Mat Source = cv::imread(R"(..\\MatLabCuda\\img\\source\\remove_B.bmp)");
    if (Source.empty())
    {
        // imread returns an empty Mat when the file is missing or unreadable.
        std::cout << "failed to load test image" << std::endl;
        return;
    }

    // Both implementations require a single-channel image. For multi-channel
    // input, channel 0 is used (the same channel the per-pixel copy took).
    cv::Mat SourceSignel;
    if (Source.channels() == 1)
    {
        SourceSignel = Source;
    }
    else
    {
        cv::extractChannel(Source, SourceSignel, 0);
    }

    // Both outputs are single-channel, matching what the functions write.
    cv::Mat outImg = cv::Mat::zeros(cv::Size(Source.cols, Source.rows), CV_8UC1);
    cv::Mat outImgGpu = cv::Mat::zeros(cv::Size(Source.cols, Source.rows), CV_8UC1);
    removeSmallRegion(SourceSignel, outImg, 100, 1, 1);
    removeSmallRegionGpu(SourceSignel, outImgGpu, 100, 1, 1);
}
以上就是整个过程,其实不需要我再介绍什么了,上面的注释以及一些过程的,都写了,只要顺着顺序看,就基本明白我要做的事情了.