PatchMatch Stereo 算法的GPU版本,提供了源码,从代码的角度来进行解读。
源码地址: https://github.com/kysucix/gipuma.git
int main(int argc, char **argv)
{
if ( argc < 3 )
{
print_help (argv);
return 0;
}
InputFiles inputFiles;
OutputFiles outputFiles;
AlgorithmParameters* algParams = new AlgorithmParameters;
GTcheckParameters gtParameters;
int ret = getParametersFromCommandLine ( argc, argv, inputFiles, outputFiles, *algParams, gtParameters);
if ( ret != 0 )
return ret;
selectCudaDevice();
Results results;
ret = runGipuma ( inputFiles, outputFiles, *algParams, gtParameters, results);
return 0;
}
InputFiles
,OutputFiles
,AlgorithmParameters
,GTcheckParameters
, Results
分别是一些输入文件、输出文件、算法参数、GroundTruth、结果相关的结构体,比较简单,略过。
需要说明的有几点
1. AlgorithmParameters
AlgorithmParameters
是继承自一个Managed
类
struct AlgorithmParameters : public Managed
其中
class Managed {
public:
void *operator new(size_t len) {
void *ptr;
checkCudaErrors(cudaMallocManaged(&ptr, len));
return ptr;
}
void operator delete(void *ptr) {
cudaFree(ptr);
}
};
继承自Managed类说明,在每次new的时候,开辟的是GPU的内存。
AlgorithmParameters* algParams = new AlgorithmParameters;
因此, algParams 中的数据都是GPU中开辟的。
2. getParametersFromCommandLine
读取函数参数的代码,写的很粗糙,略过。
3. selectCudaDevice
查询Cuda设备代码,可留作自用。
static void selectCudaDevice ()
{
int deviceCount = 0;
checkCudaErrors(cudaGetDeviceCount(&deviceCount));
if (deviceCount == 0) {
fprintf(stderr, "There is no cuda capable device!\n");
exit(EXIT_FAILURE);
}
cout << "Detected " << deviceCount << " devices!" << endl;
std::vector<int> usableDevices;
std::vector<std::string> usableDeviceNames;
for (int i = 0; i < deviceCount; i++) {
cudaDeviceProp prop;
if (cudaGetDeviceProperties(&prop, i) == cudaSuccess) {
if (prop.major >= 3 && prop.minor >= 0) {
usableDevices.push_back(i);
usableDeviceNames.push_back(string(prop.name));
} else {
cout << "CUDA capable device " << string(prop.name)
<< " is only compute cabability " << prop.major << '.'
<< prop.minor << endl;
}
} else {
cout << "Could not check device properties for one of the cuda "
"devices!" << endl;
}
}
if(usableDevices.empty()) {
fprintf(stderr, "There is no cuda device supporting gipuma!\n");
exit(EXIT_FAILURE);
}
cout << "Detected gipuma compatible device: " << usableDeviceNames[0] << endl;;
checkCudaErrors(cudaSetDevice(usableDevices[0]));
cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 1024*128);
}
很长,就记录几个小知识点。
1. time
time_t timeObj;
time ( &timeObj );
tm *pTime = localtime ( &timeObj );
利用时间来给文件命名
sprintf ( outputFolder, "%s/%04d%02d%02d_%02d%02d%02d_%s", outputFiles.parentFolder, pTime->tm_year + 1900, pTime->tm_mon + 1, pTime->tm_mday, pTime->tm_hour, pTime->tm_min, pTime->tm_sec, ref_name.c_str () );
2. mkdir
Windows 下和 Linux 下
//新建文件夹,windows下函数_mkdir
#if defined(_WIN32)
_mkdir ( outputFolder );
#else
mkdir ( outputFolder, 0777 );
#endif
3. GPU内存
//查询GPU可用内存
size_t avail;
size_t used;
size_t total;
cudaMemGetInfo( &avail, &total );
used = total - avail;
4. 测时间
int64_t t = getTickCount ();
...
fun();
...
t = getTickCount () - t;
double rt = ( double ) t / getTickFrequency (); //单位second
5. 添加alpha通道,转float型
if(algParams.color_processing) {
vectorfloat > > rgbChannels ( 3 );
img_color_float_alpha[i] = Mat::zeros ( img_grayscale[0].rows, img_grayscale[0].cols, CV_32FC4 );
img_color[i].convertTo (img_color_float[i], CV_32FC3); // or CV_32F works (too)
Mat alpha( img_grayscale[0].rows, img_grayscale[0].cols, CV_32FC1 );
split (img_color_float[i], rgbChannels);
rgbChannels.push_back( alpha);
merge (rgbChannels, img_color_float_alpha[i]);
}
6. getCameraParameters
readKRtFileMiddlebury
函数Vec3f vt;
...
Mat t(vt, false); //浅拷贝
/*Mat t(vt);*/
hconcat(R, t, Rt); //将R和t串起来
cameras[i].P = K * Rt;
P是projection矩阵。
注意 hconcat(R, t, Rt)
函数的使用。
Mat img = Mat::eye(5, 5, CV_32F);
cout << img << endl;
cout << img(Range(0, 3), Range(0, 1)) << endl;
打印
[1, 0, 0, 0, 0;
0, 1, 0, 0, 0;
0, 0, 1, 0, 0;
0, 0, 0, 1, 0;
0, 0, 0, 0, 1]
[1;
0;
0]
对行和列的还有单独的函数
P.colRange ( 0,3 );
decomposeProjectionMatrix
ProjectionMatrix
来分解K,R,T。Mat_<float> K;
Mat_<float> R;
Mat_<float> T;
decomposeProjectionMatrix ( P, K, R, T);
如果要计算T,例如
Mat Rt_leftIR, P;
hconcat(R_leftIR, T_leftIR, Rt_leftIR);
cout << "Rt_leftIR: " << Rt_leftIR << endl;
cout << "cameraMatrix_leftIR: " << cameraMatrix_leftIR << endl;
P = cameraMatrix_leftIR * Rt_leftIR;
Mat_ K, R, T, C, t;
decomposeProjectionMatrix(P, K, R, T);
C = T(Range(0, 3), Range(0, 1)) / T(3, 0);
t = -R * C;
cout << "P: " << P << endl;
cout << "K: " << K << endl;
cout << "R: " << R << endl;
cout << "T: " << T << endl;
cout << "C: " << C << endl;
cout << "t: " << t << endl;
输出
Rt_leftIR: [-0.034142, 0.998565, -0.041267, -75.201183;
0.990604, 0.028341, -0.133792, -167.158638;
-0.13243, -0.045448, -0.99015, 408.359915]
cameraMatrix_leftIR: [580.49762, 0, 318.49469;
0, 580.49762, 264.3284;
0, 0, 1]
P: [-61.99760153874, 565.18965924418, -339.31291258804, 86406.35678366688;
540.0382543504801, 4.438685925220001, -339.39070283504, 10905.93143464444;
-0.13243, -0.045448, -0.99015, 408.359915]
K: [580.4978233706864, 0.0001877105137459978, 318.4942436276482;
0, 580.4975608939687, 264.3286762000524;
0, 0, 1.000000124051992]
R: [-0.03414241920693253, 0.9985646029595258, -0.04126777104451897;
0.9906041563923866, 0.02834102194255837, -0.1337915984409323;
-0.1324299835717967, -0.04544799436208576, -0.9901498771699351]
T: [0.4850183888674806;
0.2198098541623313;
0.84642530642572;
0.002234080494034573]
C: [217.0997822874214;
98.38940662579803;
378.8696551828993]
t: [-75.20076080031868;
-167.158826250214;
408.3598643421455]
注意T,C和t。
T是相机的世界坐标系下的四维坐标,通过归一化可以得到三维坐标C,也就是相机的中心坐标。t是相机坐标系下将三维点投影到平面上的位移矩阵,具体运算公式,可以见《Multiple View Geometry in Computer Vision》的《Camera Models》这一章的计算公式。
P.inv ( DECOMP_SVD );
Mat_<float> getTransformationMatrix ( Mat_<float> R, Mat_<float> t ) {
Mat_<float> transMat = Mat::eye ( 4,4, CV_32F );
//Mat_ Rt = - R * t;
R.copyTo ( transMat ( Range ( 0,3 ),Range ( 0,3 ) ) );
t.copyTo ( transMat ( Range ( 0,3 ),Range ( 3,4 ) ) );
return transMat;
}
7. selectViews
static float getAngle ( Vec3f v1, Vec3f v2 ) {
float angle = acosf ( v1.dot ( v2 ) );
//if angle is not a number the dot product was 1 and thus the two vectors should be identical --> return 0
if ( angle != angle )
return 0.0f;
//if ( acosf ( v1.dot ( v2 ) ) != acosf ( v1.dot ( v2 ) ) )
//cout << acosf ( v1.dot ( v2 ) ) << " / " << v1.dot ( v2 )<< " / " << v1<< " / " << v2 << endl;
return angle;
}
其中if ( angle != angle )
这一句话主要的作用在于,如果v1, v2没有做归一化,则acosf 函数的输入可能超过[-1, 1]的范围,则函数会返回异常值,那么angle就不是 一个float值,而是一个类似于NAN的符号,无法使用!=
来进行判断,会认为angle != angle
,从而进入if语句,强行返回0。
这里同样需要注意,对于float数,使用 ==
或者!=
进行判断,是可以的。
getViewVector
static inline Vec3f getViewVector ( Camera &cam, int x, int y) {
//get some point on the line (the other point on the line is the camera center)
Vec3f ptX = get3Dpoint ( cam,x,y,1.0f );
//get vector between camera center and other point on the line
Vec3f v = ptX - cam.C;
return normalize ( v );
}
其中
inline Vec3f get3Dpoint ( Camera &cam, float x, float y, float depth ) {
// in case camera matrix is not normalized: see page 162,
//then depth might not be the real depth but w and depth needs to be computed from that first
Mat_ pt = Mat::ones ( 3,1,CV_32F );
pt ( 0,0 ) = x;
pt ( 1,0 ) = y;
//formula taken from page 162 (alternative expression)
Mat_ ptX = cam.M_inv * ( depth*pt - cam.P.col ( 3 ) );
return Vec3f ( ptX ( 0 ),ptX ( 1 ),ptX ( 2 ) );
}
按照《Multiple View Geometry in Computer Vision》书中的公式
static void get_directory_entries(
const char *dirname,
vector<string> &directory_entries)
{
DIR *dir;
struct dirent *ent;
// Open directory stream
dir = opendir (dirname);
if (dir != NULL) {
//cout << "Dirname is " << dirname << endl;
//cout << "Dirname type is " << ent->d_type << endl;
//cout << "Dirname type DT_DIR " << DT_DIR << endl;
// Print all files and directories within the directory
while ((ent = readdir (dir)) != NULL) {
//cout << "INSIDE" << endl;
//if(ent->d_type == DT_DIR)
{
char* name = ent->d_name;
if(strcmp(name,".") == 0 || strcmp(ent->d_name,"..") == 0)
continue;
//printf ("dir %s/\n", name);
directory_entries.push_back(string(name));
}
}
closedir (dir);
} else {
// Could not open directory
printf ("Cannot open directory %s\n", dirname);
exit (EXIT_FAILURE);
}
sort ( directory_entries.begin (), directory_entries.end () );
}
纹理内存可以硬件自己计算插值。
举个示例
// 简单纹理变换函数
__global__ void transformKernel(float* output,
cudaTextureObject_t texObj,
int width, int height,
float theta)
{
// 计算纹理坐标
unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
float u = x / (float)width;
float v = y / (float)height;
// 坐标转换
u -= 0.5f;
v -= 0.5f;
float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;
// 从纹理中读取并写入全局存储
output[y * width + x] = tex2D<float>(texObj, tu, tv);
}
int main() {
// 定义CUDA array
cudaChannelFormatDesc channelDesc =
cudaCreateChannelDesc(32, 0, 0, 0,
cudaChannelFormatKindFloat);
cudaArray* cuArray;
cudaMallocArray(&cuArray, &channelDesc, width, height);
// 拷贝数据到CUDA array
cudaMemcpyToArray(cuArray, 0, 0, h_data, size,
cudaMemcpyHostToDevice);
// 定义资源描述符
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
// 定义纹理对象参数
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeWrap;
texDesc.addressMode[1] = cudaAddressModeWrap;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 1;
// 生产纹理对象
cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
// 分配用于保持结果的内存
float* output;
cudaMalloc(&output, width * height * sizeof(float));
// 调用Kernel
dim3 dimBlock(16, 16);
dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x, (height + dimBlock.y - 1) / dimBlock.y);
transformKernel<<>> (output, texObj, width, height, angle);
// 销毁纹理对象
cudaDestroyTextureObject(texObj);
// 释放内存
cudaFreeArray(cuArray);
}
再去代码中的addImageToTextureFloatGray
函数,作用就是将图像存入了纹理内存中。
init函数入口在
dim3 grid_size_initrand;
//强制变成16的倍数,grid_size 是线程块的数量,block_size 是 一个线程块上的线程数量
grid_size_initrand.x=(cols+16-1)/16;
grid_size_initrand.y=(rows+16-1)/16;
dim3 block_size_initrand;
block_size_initrand.x=16;
block_size_initrand.y=16;
...
gipuma_init_cu2<T><<< grid_size_initrand, block_size_initrand>>>(gs);
其中
template< typename T >
__global__ void gipuma_init_cu2(GlobalState &gs)
{
//当前线程坐标
const int2 p = make_int2 ( blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y );
const int rows = gs.cameras->rows;
const int cols = gs.cameras->cols;
if (p.x >= cols)
return;
if (p.y >= rows)
return;
// Temporary variables
Camera_cu &camera = gs.cameras->cameras[REFERENCE];
const int center = p.y*cols+p.x;
int box_hrad = gs.params->box_hsize / 2;
int box_vrad = gs.params->box_vsize / 2;
float disp_now;
float4 norm_now;
curandState localState = gs.cs[p.y*cols+p.x];
curand_init ( clock64(), p.y, p.x, &localState );
// Compute random normal on half hemisphere of fronto view vector
float mind = gs.params->min_disparity;
float maxd = gs.params->max_disparity;
float4 viewVector;
//getViewVector的GPU版本
getViewVector_cu ( &viewVector, camera, p);
//printf("Random number is %f\n", random_number);
//return;
//均匀分布,在固定范围内随机分配一个disp值
disp_now = curand_between(&localState, mind, maxd);
//随机初始化normal值,参照论文
rndUnitVectorOnHemisphere_cu ( &norm_now, viewVector, &localState );
//根据disparity值计算depth值
disp_now= disparityDepthConversion_cu ( camera.f, camera.baseline, disp_now);
// Save values
//计算
norm_now.w = getD_cu ( norm_now, p, disp_now, camera);
//disp[x] = disp_now;
gs.lines->norm4[center] = norm_now;
__shared__ T tile_leftt[1] ;
const int2 tmp =make_int2(0,0);
//计算这一点的cost值
gs.lines->c[center] = pmCostMultiview_cu ( gs.imgs,
tile_leftt,
tmp,
p,
norm_now,
box_vrad, box_hrad,
*(gs.params),
*(gs.cameras),
gs.lines->norm4,
0);
return;
}
下面结合论文来看每一个子步骤。
rndUnitVectorOnHemisphere_cu
按照论文《PatchMatch Stereo - Stereo Matching with
》,对每个点,随机初始化一个disparity和normal值。
Slanted Support Windows
假设每个点都有一个倾斜面,用
对应代码
__device__ FORCEINLINE static void rndUnitVectorOnHemisphere_cu ( float4 *v, const float4 &viewVector, curandState *cs ) {
rndUnitVectorSphereMarsaglia_cu (v, cs);
vecOnHemisphere_cu ( v,viewVector );
};
利用 q1,q2 来初始化一个normal
__device__ FORCEINLINE static void rndUnitVectorSphereMarsaglia_cu (float4 *v, curandState *cs) {
float x = 1.0f;
float y = 1.0f;
float sum = 2.0f;
while ( sum>=1.0f ) {
x = curand_between (cs, -1.0f, 1.0f);
y = curand_between (cs, -1.0f, 1.0f);
sum = get_pow2_norm(x,y);
}
const float sq = sqrtf ( 1.0f-sum );
v->x = 2.0f*x*sq;
v->y = 2.0f*y*sq;
v->z = 1.0f-2.0f*sum;
//v->x = 0;
//v->y = 0;
//v->z = -1;
}
如果ray是正向的,就将整个normal反向。
__device__ FORCEINLINE static void vecOnHemisphere_cu ( float4 * __restrict__ v, const float4 &viewVector ) {
const float dp = dot4 ( (*v), viewVector );
if ( dp > 0.0f ) {
negate4(v);
}
return;
}
/* cost computation for multiple images
* combines cost of all ref-to-img correspondences
*/
template< typename T >
__device__ FORCEINLINE static float pmCostMultiview_cu (
const cudaTextureObject_t *images,
const T * __restrict__ tile_left,
const int2 tile_offset,
const int2 p,
const float4 &normal,
const int &vRad,
const int &hRad,
const AlgorithmParameters &algParam,
const CameraParameters_cu &camParams,
const float4 * __restrict__ state,
const int point)
{
// iterate over all other images and compute cost
//const int numImages = camParams.viewSelectionSubsetNumber; // CACHE
float costVector[32];
float cost = 0.0f;
int numValidViews = 0;
int cost_count=0;
for ( int i = 0; i < camParams.viewSelectionSubsetNumber; i++ ) {
int idxCurr = camParams.viewSelectionSubset[i];
/*if ( idxCurr != REFERENCE ) */
{
float c = 0;
#ifdef SHARED
if (tile_offset.x!= 0 )
c = pmCost_shared ( images[REFERENCE],
tile_left,
tile_offset,
images[idxCurr],
p,
normal,
vRad, hRad,
algParam, camParams,
idxCurr );
else
#endif
c = pmCost ( images[REFERENCE],
tile_left,
tile_offset,
images[idxCurr],
p.x, p.y,
normal,
vRad, hRad,
algParam, camParams,
idxCurr );
// only add to cost vector if viewable
if ( c < MAXCOST )
numValidViews++;
else
c = MAXCOST; // in order to not get an overflow when accumulating
costVector[i] = c;
cost_count++;
}
}
sort_small(costVector,cost_count);
//for some robustness only consider best n cost values (n dependent on number of images)
int numBest = numValidViews; //numImages-1;
if ( algParam.cost_comb == COMB_BEST_N )
numBest = min ( numBest, algParam.n_best );
if ( algParam.cost_comb == COMB_GOOD )
numBest = camParams.viewSelectionSubsetNumber ;
float costThresh = costVector[0] * algParam.good_factor;
int numConsidered = 0;
for ( int i = 0; i < numBest; i++ ) {
numConsidered++;
float c = costVector[i];
if ( algParam.cost_comb == COMB_GOOD ) {
c = fminf ( c, costThresh );
}
cost = cost + c;
}
cost = cost / ( ( float ) numConsidered);
if ( numConsidered < 1 )
cost = MAXCOST;
if ( cost != cost || cost > MAXCOST || cost < 0 )
cost = MAXCOST;
return cost;
}
主要针对multiview的做cost计算,关键是其中的pmCost函数来计算cost。
计算公式如下,在一个小窗口中,对一个窗口内的点进行计算总的cost,作为中心点的cost,这样比较鲁棒,每个点的权重与点到中心点的颜色相似度有关。
pmCost函数为
template< typename T >
__device__ FORCEINLINE static float pmCost (
const cudaTextureObject_t &l,
const T * __restrict__ tile_left,
const int2 tile_offset,
const cudaTextureObject_t &r,
const int &x,
const int &y,
const float4 &normal,
const int &vRad,
const int &hRad,
const AlgorithmParameters &algParam,
const CameraParameters_cu &camParams,
const int &camTo )
{
const int cols = camParams.cols;
const int rows = camParams.rows;
const float alpha = algParam.alpha;
const float tau_color = algParam.tau_color;
const float tau_gradient = algParam.tau_gradient;
const float gamma = algParam.gamma;
float4 pt_c;
float H[16];
//计算Homography矩阵,用于获得patch在另一个view下的位置。
getHomography_cu ( camParams.cameras[REFERENCE], camParams.cameras[camTo], camParams.cameras[REFERENCE].K_inv, camParams.cameras[camTo].K, normal, normal.w, H );
//根据Homography矩阵,获得这个点对应的位置。
getCorrespondingPoint_cu ( make_int2(x, y), H, &pt_c );
{
float cost = 0;
//float weightSum = 0.0f;
for ( int i = -hRad; i < hRad + 1; i+=WIN_INCREMENT ) {
for ( int j = -vRad; j < vRad + 1; j+=WIN_INCREMENT ) {
const int xTemp = x + i;
const int yTemp = y + j;
float4 pt_l;
pt_l.x = __int2float_rn(xTemp);
pt_l.y = __int2float_rn(yTemp);
int2 pt_li = make_int2(xTemp, yTemp);
float w;
w = weight_cu ( tex2D(l, pt_l.x + 0.5f, pt_l.y + 0.5f), tex2D(l,x + 0.5f,y + 0.5f), gamma);
float4 pt;
getCorrespondingPoint_cu ( make_int2(xTemp, yTemp),
H,
&pt );
cost = cost + pmCostComputation ( l, tile_left, r, pt_l, pt, rows, cols, tau_color, tau_gradient, alpha, w );
//weightSum = weightSum + w;
}
}
return cost;
}
}
其中, getHomography_cu
和 getCorrespondingPoint_cu
的算法来自
Gipuma论文中
按照论文的说法,直接利用Homograph来寻找在另一个view上的对应点,可以不用rectify。其实就是假设一个patch对应的3D物体在一个平面上。
然后,在函数
weight_cu
和 pmCostComputation
里基本实现了前面提到的cost计算公式,只是需要注意
const T gx1 = tex2D (l, pt_l.x+1 + 0.5f, pt_l.y + 0.5f) - tex2D (l, pt_l.x-1 + 0.5f, pt_l.y + 0.5f);
const T gy1 = tex2D (l, pt_l.x + 0.5f, pt_l.y+1 + 0.5f) - tex2D (l, pt_l.x + 0.5f, pt_l.y-1 + 0.5f);
const T gx2 = tex2D (r, pt_r.x+1 + 0.5f, pt_r.y + 0.5f) - tex2D (r, pt_r.x-1 + 0.5f, pt_r.y + 0.5f);
const T gy2 = tex2D (r, pt_r.x + 0.5f, pt_r.y+1 + 0.5f) - tex2D (r, pt_r.x + 0.5f, pt_r.y-1 + 0.5f);
//计算梯度时,是间隔了一个元素进行计算的,注意。
const T gradX = (gx1 - gx2);
const T gradY = (gy1 - gy2);
//gradient dissimilarity (L1) in x and y direction (multiplication by 0.5 to use tauGrad from PatchMatch stereo paper)
//为什么乘以0.0625f?
const float gradDis = fminf ( ( l1_norm ( gradX ) + l1_norm ( gradY ) ) * 0.0625f, tau_gradient );
然后就开始大循环迭代了,
for (int it = 0;it"%d ", it+1);
#ifdef SMALLKERNEL
//spatial propagation of 4 closest neighbors (1px up/down/left/right)
gipuma_black_spatialPropClose_cu<<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
cudaDeviceSynchronize();
#ifdef EXTRAPOINTFAR
//spatial propagation of 4 far away neighbors (5px up/down/left/right)
gipuma_black_spatialPropFar_cu<<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
cudaDeviceSynchronize();
#endif
//plane refinement
gipuma_black_planeRefine_cu<<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
cudaDeviceSynchronize();
//spatial propagation of 4 closest neighbors (1px up/down/left/right)
gipuma_red_spatialPropClose_cu<<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
cudaDeviceSynchronize();
#ifdef EXTRAPOINTFAR
//spatial propagation of 4 far away neighbors (5px up/down/left/right)
gipuma_red_spatialPropFar_cu<<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
cudaDeviceSynchronize();
#endif
//plane refinement
gipuma_red_planeRefine_cu<<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
cudaDeviceSynchronize();
#else
gipuma_black_cu<<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
gipuma_red_cu<<< grid_size, block_size, shared_size_host * sizeof(T)>>>(gs, it);
#endif
}
原始的算法是从第一个元素开始遍历,试用邻居节点的平面,保留cost值小的参数。而GPU版本利用红黑块,来做并行化。
具体实现的代码,几个函数大同小异,暂时简单说明,以后有空再开一篇详细介绍。简单的说,就是根据上下左右的点来更新,将让自己cost值更小的normal值更新为自己的。
if (p.y>0) {
SPATIALPROPAGATION(up);
}
if (p.y1) {
SPATIALPROPAGATION(down);
}
if (p.x>0) {
SPATIALPROPAGATION(left);
}
if (p.x1) {
SPATIALPROPAGATION(right);
}
// Save to global memory
c [center] = cost_now;
//disp [center] = disp_now;
norm [center] = norm_now;