关于CUDA实现最值问题

          一转眼一周就过去了,算来入职已经半个月了,项目也进行了十来天,感觉这一周最让我纠结的莫过于求最值问题了。

           听起来求最值应该是个很简单的问题,当时的思路是这样的:用并行归约求最值,同时记录最值的下标,于是有了第一版的程序:

           __global__ void Max_Reduce(int *d_array, int array_len, int *max_value, int *max_idx)

            {

                           __share__ int temp_value_share[warp_size];

                           __share__ int temp_idx_share[warp_size];

                          int tid=thread.x+blockDim.x*blockIdx.x;

                          int i,temp_value,temp_value1,temp_idx,temp_idx1;

                          int warpid=thread.x/warp_size,laneid=thread.x%warp_size;

                          if(tid

                          {

                                 temp_value=d_array[tid];

                                temp_idx=thread.x;

                                 for(i=warp_size/2;i>=1;i/=2)

                                 {

                                         temp_value1=shft_xor(temp_value,i,warp_size);

                                         temp_idx1=shft_xor(temp_idx,i,warp_size);

                                        if(temp_value

                                        {

                                                temp_value=temp_value1;

                                                temp_idx=temp_idx1;

                                         }

                                  }

                               if(!laneid)

                               {

                                      temp_value_share[warpid]=temp_value;

                                      temp_idx_share[warpid]=temp_idx;

                              }

                            __sychthreads();

                           if(thread.x

                          {

                                   temp_value=temp_value_share[thread.x];

                                   temp_idx=temp_idx_share[thread.x];

                                  for(i=warp_size/2;i>=1;i/=2)

                                 {

                                         temp_value1=shft_xor(temp_value,i,warp_size);

                                         temp_idx1=shft_xor(temp_idx,i,warp_size);

                                        if(temp_value

                                        {

                                                temp_value=temp_value1;

                                                temp_idx=temp_idx1;

                                         }

                                  }

                           }

                             if(!thread.x)

                            {

                                    max_value[blockIdx.x]=temp_value;

                                    max_idx[block.x]=temp_idx;

                             }

                           }

             }

            用例测试发现:当线程数超过数组长度n时,可能出现找到的数据超出n范围(即取到无效数据)的情况。在师兄的提示下,有了第二版:

    __global__ void Max_Reduce(int *d_array, int array_len, int *max_value, int *max_idx)

            {

                           __share__ int temp_value_share[warp_size];

                           __share__ int temp_idx_share[warp_size];

                          int tid=thread.x+blockDim.x*blockIdx.x;

                          int i,temp_value,temp_value1,temp_idx,temp_idx1;

                          int warpid=thread.x/warp_size,laneid=thread.x%warp_size;

                            temp_value=-1e30;

                           temp_idx=thread.x;

                          if(tid

                          {

                                    temp_value=d_array[tid];

                           }

                                 for(i=warp_size/2;i>=1;i/=2)

                                 {

                                         temp_value1=shft_xor(temp_value,i,warp_size);

                                         temp_idx1=shft_xor(temp_idx,i,warp_size);

                                        if(temp_value

                                        {

                                                temp_value=temp_value1;

                                                temp_idx=temp_idx1;

                                         }

                                  }

                               if(!laneid)

                               {

                                      temp_value_share[warpid]=temp_value;

                                      temp_idx_share[warpid]=temp_idx;

                              }

                           __sychthreads();

                           if(thread.x

                          {

                                   temp_value=temp_value_share[thread.x];

                                   temp_idx=temp_idx_share[thread.x];

                                  for(i=warp_size/2;i>=1;i/=2)

                                 {

                                         temp_value1=shft_xor(temp_value,i,warp_size);

                                         temp_idx1=shft_xor(temp_idx,i,warp_size);

                                        if(temp_value

                                        {

                                                temp_value=temp_value1;

                                                temp_idx=temp_idx1;

                                         }

                                  }

                           }

                             if(!thread.x)

                            {

                                    max_value[blockIdx.x]=temp_value;

                                    max_idx[block.x]=temp_idx;

                             }

                     

             }

          这个感觉应该没问题了吧,可事实令我发狂:项目中刚开始结果是对的,可后面就错了。为此调试了很久,结果发现最大值是对的,而下标是错的。很显然,数组中存在相等的最大值,而程序没有规定此时取哪一个下标。这个问题让我吃尽了苦头,于是有了第三版:

/*
 * Max_Reduce (third revision): per-block maximum of d_array and the
 * block-local position (threadIdx.x) of its first occurrence in the block.
 *
 *   d_array   : input values, array_len elements
 *   array_len : number of valid elements in d_array
 *   max_value : output, one entry per block (indexed by blockIdx.x)
 *   max_idx   : output, one entry per block; holds threadIdx.x of the
 *               winning thread, NOT a global array index
 *
 * Every thread takes part in the reduction; threads with tid >= array_len
 * contribute the smallest int so they can never beat a real element.  On
 * equal values the candidate with the smaller index wins, so the result
 * does not depend on the order in which lanes are combined.  A block that
 * contains no valid element reports the sentinel value with index 0.
 *
 * Launch assumptions (not checked here): 1-D grid and block, blockDim.x a
 * multiple of warp_size and at most warp_size * warp_size, because the
 * shared arrays below have warp_size slots.  warp_size is defined outside
 * this listing; the full-warp mask assumes it is 32 -- TODO confirm.
 */
__global__ void Max_Reduce(int *d_array, int array_len, int *max_value, int *max_idx)
{
    __shared__ int temp_value_share[warp_size];
    __shared__ int temp_idx_share[warp_size];

    const unsigned full_mask = 0xffffffffu;
    // Smallest / largest int, spelled out so no extra header is needed.  The
    // earlier -1e30 does not fit in an int.
    const int lowest = -2147483647 - 1;
    const int no_index = 2147483647;

    int tid = threadIdx.x + blockDim.x * blockIdx.x;
    int i, temp_value, temp_value1, temp_idx, temp_idx1;
    int warpid = threadIdx.x / warp_size, laneid = threadIdx.x % warp_size;
    int num_warps = blockDim.x / warp_size;

    temp_value = lowest;
    temp_idx = threadIdx.x;
    if (tid < array_len)
    {
        temp_value = d_array[tid];
    }

    // Stage 1: butterfly reduction inside each warp (all lanes active).
    for (i = warp_size / 2; i >= 1; i /= 2)
    {
        temp_value1 = __shfl_xor_sync(full_mask, temp_value, i, warp_size);
        temp_idx1 = __shfl_xor_sync(full_mask, temp_idx, i, warp_size);

        // Take the partner's candidate when it is larger, or equal with a
        // smaller index.  The comparison must be `==`: an assignment here
        // would overwrite the running maximum with the smaller value.
        if (temp_value < temp_value1 ||
            (temp_value == temp_value1 && temp_idx1 < temp_idx))
        {
            temp_value = temp_value1;
            temp_idx = temp_idx1;
        }
    }

    // Lane 0 publishes the warp's candidate.
    if (!laneid)
    {
        temp_value_share[warpid] = temp_value;
        temp_idx_share[warpid] = temp_idx;
    }

    __syncthreads();

    // Stage 2: the whole of warp 0 reduces the per-warp candidates.  Lanes
    // beyond the number of warps are padded with the sentinel and the largest
    // index, so they lose every comparison, ties included.
    if (warpid == 0)
    {
        bool has_slot = laneid < num_warps;
        temp_value = has_slot ? temp_value_share[laneid] : lowest;
        temp_idx = has_slot ? temp_idx_share[laneid] : no_index;

        for (i = warp_size / 2; i >= 1; i /= 2)
        {
            temp_value1 = __shfl_xor_sync(full_mask, temp_value, i, warp_size);
            temp_idx1 = __shfl_xor_sync(full_mask, temp_idx, i, warp_size);
            if (temp_value < temp_value1 ||
                (temp_value == temp_value1 && temp_idx1 < temp_idx))
            {
                temp_value = temp_value1;
                temp_idx = temp_idx1;
            }
        }
    }

    // Thread 0 of the block writes the block's result.
    if (!threadIdx.x)
    {
        max_value[blockIdx.x] = temp_value;
        max_idx[blockIdx.x] = temp_idx;
    }
}



你可能感兴趣的:(CUDA)