CUDA Study Notes 3

A CUDA parallel implementation of the HOG pedestrian detection algorithm. This version is not yet heavily encapsulated, so most of it can still be read and understood directly.

This example shows how demanding it is to write parallel code: you first need a very clear understanding of how the algorithm is implemented. It also demonstrates many CUDA optimization techniques, including the use of shared memory and loop unrolling.
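
Before the full source, here is a minimal sketch of the two patterns the file relies on most: a tree reduction in shared memory whose final warp is fully unrolled. This snippet is an illustration of mine, not part of the OpenCV file; the kernel name block_sum and the BLOCK_SIZE parameter are placeholders, and it assumes BLOCK_SIZE is a power of two of at least 64 and that each thread block reduces exactly BLOCK_SIZE input elements.

// Illustrative only: warp-synchronous style matching the era of the code below.
template <int BLOCK_SIZE>
__global__ void block_sum(const float* in, float* out)
{
    __shared__ float smem[BLOCK_SIZE];
    const unsigned int tid = threadIdx.x;

    // Each thread loads one element of its block's slice
    smem[tid] = in[blockIdx.x * BLOCK_SIZE + tid];
    __syncthreads();

    // Tree reduction in shared memory down to the last warp
    for (unsigned int s = BLOCK_SIZE / 2; s > 32; s >>= 1)
    {
        if (tid < s)
            smem[tid] += smem[tid + s];
        __syncthreads();
    }

    // Last warp fully unrolled; 'volatile' keeps every partial sum in shared memory
    if (tid < 32)
    {
        volatile float* v = smem;
        v[tid] += v[tid + 32];
        v[tid] += v[tid + 16];
        v[tid] += v[tid + 8];
        v[tid] += v[tid + 4];
        v[tid] += v[tid + 2];
        v[tid] += v[tid + 1];
    }

    if (tid == 0)
        out[blockIdx.x] = smem[0];
}

// Hypothetical launch: block_sum<256><<<num_blocks, 256>>>(d_in, d_partial_sums);

The same shape appears below in reduce_smem (histogram normalization) and at the end of the SVM classification kernel.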

#include "internal_shared.hpp"

#ifndef CV_PI_F
  #ifndef CV_PI
    #define CV_PI_F 3.14159265f
  #else
    #define CV_PI_F ((float)CV_PI)
  #endif
#endif

// Other values are not supported
#define CELL_WIDTH 8
#define CELL_HEIGHT 8
#define CELLS_PER_BLOCK_X 2
#define CELLS_PER_BLOCK_Y 2

namespace cv { namespace gpu { namespace hog {

__constant__ int cnbins;
__constant__ int cblock_stride_x;
__constant__ int cblock_stride_y;
__constant__ int cnblocks_win_x;
__constant__ int cnblocks_win_y;
__constant__ int cblock_hist_size;
__constant__ int cblock_hist_size_2up;
__constant__ int cdescr_size;
__constant__ int cdescr_width;


/* Returns the nearest upper power of two; works only for 
the typical GPU thread count (per block) values */
int power_2up(unsigned int n)
{
    if (n < 1) return 1;
    else if (n < 2) return 2;
    else if (n < 4) return 4;
    else if (n < 8) return 8;
    else if (n < 16) return 16;
    else if (n < 32) return 32;
    else if (n < 64) return 64;
    else if (n < 128) return 128;
    else if (n < 256) return 256;
    else if (n < 512) return 512;
    else if (n < 1024) return 1024;
    return -1; // Input is too big
}


void set_up_constants(int nbins, int block_stride_x, int block_stride_y, 
                      int nblocks_win_x, int nblocks_win_y)
{
    uploadConstant("cv::gpu::hog::cnbins", nbins);
    uploadConstant("cv::gpu::hog::cblock_stride_x", block_stride_x);
    uploadConstant("cv::gpu::hog::cblock_stride_y", block_stride_y);
    uploadConstant("cv::gpu::hog::cnblocks_win_x", nblocks_win_x);
    uploadConstant("cv::gpu::hog::cnblocks_win_y", nblocks_win_y);

    int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
    uploadConstant("cv::gpu::hog::cblock_hist_size", block_hist_size);

    int block_hist_size_2up = power_2up(block_hist_size);    
    uploadConstant("cv::gpu::hog::cblock_hist_size_2up", block_hist_size_2up);

    int descr_width = nblocks_win_x * block_hist_size;
    uploadConstant("cv::gpu::hog::cdescr_width", descr_width);

    int descr_size = descr_width * nblocks_win_y;
    uploadConstant("cv::gpu::hog::cdescr_size", descr_size);
}


//----------------------------------------------------------------------------
// Histogram computation


template <int nblocks> // Number of histogram blocks processed by single GPU thread block
__global__ void compute_hists_kernel_many_blocks(const int img_block_width, const PtrElemStepf grad, 
                                                 const PtrElemStep qangle, float scale, float* block_hists)
{
    const int block_x = threadIdx.z;
    const int cell_x = threadIdx.x / 16;
    const int cell_y = threadIdx.y;
    const int cell_thread_x = threadIdx.x & 0xF;

    if (blockIdx.x * blockDim.z + block_x >= img_block_width)
        return;

    extern __shared__ float smem[];
    float* hists = smem;
    float* final_hist = smem + cnbins * 48 * nblocks;
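
    // Shared memory layout: 'hists' holds 12 partial sums for every (bin, cell, block)
    // so the 12 pixel columns contributing to a cell can accumulate without conflicts;
    // 'final_hist' receives the reduced per-cell histograms.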

    const int offset_x = (blockIdx.x * blockDim.z + block_x) * cblock_stride_x + 
                         4 * cell_x + cell_thread_x;
    const int offset_y = blockIdx.y * cblock_stride_y + 4 * cell_y;

    const float* grad_ptr = grad.ptr(offset_y) + offset_x * 2;
    const unsigned char* qangle_ptr = qangle.ptr(offset_y) + offset_x * 2;

    // 12 means that 12 pixels (in one row) contribute to a block's cell
    if (cell_thread_x < 12)
    {
        float* hist = hists + 12 * (cell_y * blockDim.z * CELLS_PER_BLOCK_Y + 
                                    cell_x + block_x * CELLS_PER_BLOCK_X) + 
                                   cell_thread_x;
        for (int bin_id = 0; bin_id < cnbins; ++bin_id)
            hist[bin_id * 48 * nblocks] = 0.f;

        const int dist_x = -4 + (int)cell_thread_x - 4 * cell_x;

        const int dist_y_begin = -4 - 4 * (int)threadIdx.y;
        for (int dist_y = dist_y_begin; dist_y < dist_y_begin + 12; ++dist_y)
        {
            float2 vote = *(const float2*)grad_ptr;
            uchar2 bin = *(const uchar2*)qangle_ptr;

            grad_ptr += grad.step;
            qangle_ptr += qangle.step;

            int dist_center_y = dist_y - 4 * (1 - 2 * cell_y);
            int dist_center_x = dist_x - 4 * (1 - 2 * cell_x);

            float gaussian = expf(-(dist_center_y * dist_center_y + 
                                    dist_center_x * dist_center_x) * scale);
            float interp_weight = (8.f - fabs(dist_y + 0.5f)) * 
                                  (8.f - fabs(dist_x + 0.5f)) / 64.f;
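
            // Trilinear voting: 'gaussian' is the block's Gaussian spatial window,
            // 'interp_weight' the bilinear weight of this pixel relative to the cell,
            // and vote.x/vote.y are the magnitudes already split between the two
            // nearest orientation bins (bin.x, bin.y) by the gradient kernel.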

            hist[bin.x * 48 * nblocks] += gaussian * interp_weight * vote.x;
            hist[bin.y * 48 * nblocks] += gaussian * interp_weight * vote.y;
        }

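        // Reduce the 12 partial sums of each (cell, bin) pair: 12 -> 6 -> 3 -> 1.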
        volatile float* hist_ = hist;
        for (int bin_id = 0; bin_id < cnbins; ++bin_id, hist_ += 48 * nblocks)
        {
            if (cell_thread_x < 6) hist_[0] += hist_[6];
            if (cell_thread_x < 3) hist_[0] += hist_[3];
            if (cell_thread_x == 0) 
                final_hist[((cell_x + block_x * 2) * 2 + cell_y) * cnbins + bin_id] 
                    = hist_[0] + hist_[1] + hist_[2];
        }
    }

    __syncthreads();

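    // Copy this block's final histogram from shared memory to its slot in the
    // global block_hists array, one element per thread.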
    float* block_hist = block_hists + (blockIdx.y * img_block_width + 
                                       blockIdx.x * blockDim.z + block_x) * 
                                      cblock_hist_size;        

    int tid = (cell_y * CELLS_PER_BLOCK_Y + cell_x) * 16 + cell_thread_x;
    if (tid < cblock_hist_size)
        block_hist[tid] = final_hist[block_x * cblock_hist_size + tid];     
}


void compute_hists(int nbins, int block_stride_x, int block_stride_y, 
                   int height, int width, const DevMem2Df& grad, 
                   const DevMem2D& qangle, float sigma, float* block_hists)                             
{
    const int nblocks = 1;

    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / 
                          block_stride_x;
    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / 
                           block_stride_y;

    dim3 grid(divUp(img_block_width, nblocks), img_block_height);
    dim3 threads(32, 2, nblocks);

    cudaSafeCall(cudaFuncSetCacheConfig(compute_hists_kernel_many_blocks<nblocks>, 
                                        cudaFuncCachePreferL1));
 
    // Precompute gaussian spatial window parameter
    float scale = 1.f / (2.f * sigma * sigma);

    int hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * 12 * nblocks) * sizeof(float);
    int final_hists_size = (nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y * nblocks) * sizeof(float);
    int smem = hists_size + final_hists_size;
    compute_hists_kernel_many_blocks<nblocks><<<grid, threads, smem>>>(
        img_block_width, grad, qangle, scale, block_hists);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}


//-------------------------------------------------------------
//  Normalization of histograms via L2Hys_norm
//


template <int size>
__device__ float reduce_smem(volatile float* smem)
{        
    unsigned int tid = threadIdx.x;
    float sum = smem[tid];

    if (size >= 512) { if (tid < 256) smem[tid] = sum = sum + smem[tid + 256]; __syncthreads(); }
    if (size >= 256) { if (tid < 128) smem[tid] = sum = sum + smem[tid + 128]; __syncthreads(); }
    if (size >= 128) { if (tid < 64) smem[tid] = sum = sum + smem[tid + 64]; __syncthreads(); }
    
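    // Final warp: fully unrolled, warp-synchronous reduction (no __syncthreads()
    // needed within a warp on the GPUs this code targets); 'volatile' forces every
    // intermediate value back to shared memory.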
    if (tid < 32)
    {        
        if (size >= 64) smem[tid] = sum = sum + smem[tid + 32];
        if (size >= 32) smem[tid] = sum = sum + smem[tid + 16];
        if (size >= 16) smem[tid] = sum = sum + smem[tid + 8];
        if (size >= 8) smem[tid] = sum = sum + smem[tid + 4];
        if (size >= 4) smem[tid] = sum = sum + smem[tid + 2];
        if (size >= 2) smem[tid] = sum = sum + smem[tid + 1];
    }

    __syncthreads();
    sum = smem[0];
    
    return sum;
}


template <int nthreads, int nblocks> // Number of block histograms processed by one GPU thread block
__global__ void normalize_hists_kernel_many_blocks(const int block_hist_size,
                                                   const int img_block_width, 
                                                   float* block_hists, float threshold)
{
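    // L2-Hys normalization: L2-normalize the block histogram, clip each element
    // at 'threshold', then L2-normalize the clipped result again.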
    if (blockIdx.x * blockDim.z + threadIdx.z >= img_block_width)
        return;

    float* hist = block_hists + (blockIdx.y * img_block_width + 
                                 blockIdx.x * blockDim.z + threadIdx.z) * 
                                block_hist_size + threadIdx.x;
    
    __shared__ float sh_squares[nthreads * nblocks];
    float* squares = sh_squares + threadIdx.z * nthreads;
    
    float elem = 0.f;
    if (threadIdx.x < block_hist_size)
        elem = hist[0];
    
    squares[threadIdx.x] = elem * elem;        

    __syncthreads();
    float sum = reduce_smem<nthreads>(squares);
    
    float scale = 1.0f / (sqrtf(sum) + 0.1f * block_hist_size);        
    elem = min(elem * scale, threshold);
    
    __syncthreads();
    squares[threadIdx.x] = elem * elem;

    __syncthreads();
    sum = reduce_smem<nthreads>(squares);
    scale = 1.0f / (sqrtf(sum) + 1e-3f);
    
    if (threadIdx.x < block_hist_size)
        hist[0] = elem * scale;
}


void normalize_hists(int nbins, int block_stride_x, int block_stride_y, 
                     int height, int width, float* block_hists, float threshold)
{   
    const int nblocks = 1;

    int block_hist_size = nbins * CELLS_PER_BLOCK_X * CELLS_PER_BLOCK_Y;
    int nthreads = power_2up(block_hist_size);
    dim3 threads(nthreads, 1, nblocks);

    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
    int img_block_height = (height - CELLS_PER_BLOCK_Y * CELL_HEIGHT + block_stride_y) / block_stride_y;
    dim3 grid(divUp(img_block_width, nblocks), img_block_height);

    if (nthreads == 32)
        normalize_hists_kernel_many_blocks<32, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
    else if (nthreads == 64)
        normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
    else if (nthreads == 128)
        normalize_hists_kernel_many_blocks<64, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
    else if (nthreads == 256)
        normalize_hists_kernel_many_blocks<256, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
    else if (nthreads == 512)
        normalize_hists_kernel_many_blocks<512, nblocks><<<grid, threads>>>(block_hist_size, img_block_width, block_hists, threshold);
    else
        cv::gpu::error("normalize_hists: histogram's size is too big, try to decrease number of bins", __FILE__, __LINE__);

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}


//---------------------------------------------------------------------
//  Linear SVM based classification
//


template <int nthreads, int nblocks> // Number of histogram blocks processed by a single GPU thread block
__global__ void classify_hists_kernel_many_blocks(const int img_win_width, const int img_block_width, 
                                                  const int win_block_stride_x, const int win_block_stride_y,
                                                  const float* block_hists, const float* coefs,
                                                  float free_coef, float threshold, unsigned char* labels)
{            
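    // Each GPU thread block scores up to 'nblocks' detection windows: its threads
    // accumulate partial dot products between the window's HOG descriptor (gathered
    // from block_hists) and the linear SVM weights 'coefs', reduce them in shared
    // memory, and thread 0 writes the thresholded decision to 'labels'.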
    const int win_x = threadIdx.z;
    if (blockIdx.x * blockDim.z + win_x >= img_win_width)
        return;

    const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + 
                                       blockIdx.x * win_block_stride_x * blockDim.z + win_x) * 
                                      cblock_hist_size;

    float product = 0.f;
    for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
    {
        int offset_y = i / cdescr_width;
        int offset_x = i - offset_y * cdescr_width;
        product += coefs[i] * hist[offset_y * img_block_width * cblock_hist_size + offset_x];
    }

    __shared__ float products[nthreads * nblocks];

    const int tid = threadIdx.z * nthreads + threadIdx.x;
    products[tid] = product;

    __syncthreads();

    if (nthreads >= 512) 
    { 
        if (threadIdx.x < 256) products[tid] = product = product + products[tid + 256];
        __syncthreads(); 
    }
    if (nthreads >= 256) 
    { 
        if (threadIdx.x < 128) products[tid] = product = product + products[tid + 128]; 
        __syncthreads(); 
    }
    if (nthreads >= 128) 
    { 
        if (threadIdx.x < 64) products[tid] = product = product + products[tid + 64]; 
        __syncthreads(); 
    }
    
    if (threadIdx.x < 32)
    {        
        volatile float* smem = products;
        if (nthreads >= 64) smem[tid] = product = product + smem[tid + 32];
        if (nthreads >= 32) smem[tid] = product = product + smem[tid + 16];
        if (nthreads >= 16) smem[tid] = product = product + smem[tid + 8];
        if (nthreads >= 8) smem[tid] = product = product + smem[tid + 4];
        if (nthreads >= 4) smem[tid] = product = product + smem[tid + 2];
        if (nthreads >= 2) smem[tid] = product = product + smem[tid + 1];
    }

    if (threadIdx.x == 0)
        labels[blockIdx.y * img_win_width + blockIdx.x * blockDim.z + win_x] = (product + free_coef >= threshold);
}


void classify_hists(int win_height, int win_width, int block_stride_y, int block_stride_x, 
                    int win_stride_y, int win_stride_x, int height, int width, float* block_hists, 
                    float* coefs, float free_coef, float threshold, unsigned char* labels)
{   
    const int nthreads = 256;
    const int nblocks = 1;

    int win_block_stride_x = win_stride_x / block_stride_x;
    int win_block_stride_y = win_stride_y / block_stride_y;
    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;

    dim3 threads(nthreads, 1, nblocks);
    dim3 grid(divUp(img_win_width, nblocks), img_win_height);

    cudaSafeCall(cudaFuncSetCacheConfig(classify_hists_kernel_many_blocks<nthreads, nblocks>, cudaFuncCachePreferL1));

    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
    classify_hists_kernel_many_blocks<nthreads, nblocks><<<grid, threads>>>(
        img_win_width, img_block_width, win_block_stride_x, win_block_stride_y, 
        block_hists, coefs, free_coef, threshold, labels);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}

//----------------------------------------------------------------------------
// Extract descriptors

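// Two layouts are provided: extract_descrs_by_rows copies a window's block
// histograms in row-major block order, while extract_descrs_by_cols reorders them
// so that blocks are stored column by column within the window.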

template <int nthreads>
__global__ void extract_descrs_by_rows_kernel(const int img_block_width, const int win_block_stride_x, const int win_block_stride_y, 
											  const float* block_hists, PtrElemStepf descriptors)
{
    // Get left top corner of the window in src
    const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + 
                                       blockIdx.x * win_block_stride_x) * cblock_hist_size;

    // Get left top corner of the window in dst
    float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);

    // Copy elements from src to dst
    for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
    {
        int offset_y = i / cdescr_width;
        int offset_x = i - offset_y * cdescr_width;
        descriptor[i] = hist[offset_y * img_block_width * cblock_hist_size + offset_x];
    }
}


void extract_descrs_by_rows(int win_height, int win_width, int block_stride_y, int block_stride_x, int win_stride_y, int win_stride_x, 
							int height, int width, float* block_hists, DevMem2Df descriptors)
{
    const int nthreads = 256;

    int win_block_stride_x = win_stride_x / block_stride_x;
    int win_block_stride_y = win_stride_y / block_stride_y;
    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
    dim3 threads(nthreads, 1);
    dim3 grid(img_win_width, img_win_height);

    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
    extract_descrs_by_rows_kernel<nthreads><<<grid, threads>>>(
        img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}


template <int nthreads>
__global__ void extract_descrs_by_cols_kernel(const int img_block_width, const int win_block_stride_x, 
                                              const int win_block_stride_y, const float* block_hists, 
                                              PtrElemStepf descriptors)
{
    // Get left top corner of the window in src
    const float* hist = block_hists + (blockIdx.y * win_block_stride_y * img_block_width + 
                                       blockIdx.x * win_block_stride_x) * cblock_hist_size;

    // Get left top corner of the window in dst
    float* descriptor = descriptors.ptr(blockIdx.y * gridDim.x + blockIdx.x);

    // Copy elements from src to dst
    for (int i = threadIdx.x; i < cdescr_size; i += nthreads)
    {
        int block_idx = i / cblock_hist_size;
        int idx_in_block = i - block_idx * cblock_hist_size;

        int y = block_idx / cnblocks_win_x;
        int x = block_idx - y * cnblocks_win_x;

        descriptor[(x * cnblocks_win_y + y) * cblock_hist_size + idx_in_block] 
            = hist[(y * img_block_width  + x) * cblock_hist_size + idx_in_block];
    }
}


void extract_descrs_by_cols(int win_height, int win_width, int block_stride_y, int block_stride_x, 
                            int win_stride_y, int win_stride_x, int height, int width, float* block_hists, 
                            DevMem2Df descriptors)
{
    const int nthreads = 256;

    int win_block_stride_x = win_stride_x / block_stride_x;
    int win_block_stride_y = win_stride_y / block_stride_y;
    int img_win_width = (width - win_width + win_stride_x) / win_stride_x;
    int img_win_height = (height - win_height + win_stride_y) / win_stride_y;
    dim3 threads(nthreads, 1);
    dim3 grid(img_win_width, img_win_height);

    int img_block_width = (width - CELLS_PER_BLOCK_X * CELL_WIDTH + block_stride_x) / block_stride_x;
    extract_descrs_by_cols_kernel<nthreads><<<grid, threads>>>(
        img_block_width, win_block_stride_x, win_block_stride_y, block_hists, descriptors);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}

//----------------------------------------------------------------------------
// Gradients computation


template <int nthreads, int correct_gamma>
__global__ void compute_gradients_8UC4_kernel(int height, int width, const PtrElemStep img, 
                                              float angle_scale, PtrElemStepf grad, PtrElemStep qangle)
{
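    // One thread per pixel of a single image row (blockIdx.y selects the row). The
    // row is staged in shared memory with a one-pixel halo so horizontal differences
    // need no extra global loads; for color input the channel with the largest
    // gradient magnitude is kept, as in the original HOG formulation.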
    const int x = blockIdx.x * blockDim.x + threadIdx.x;

    const uchar4* row = (const uchar4*)img.ptr(blockIdx.y);

    __shared__ float sh_row[(nthreads + 2) * 3];

    uchar4 val;
    if (x < width) 
        val = row[x]; 
    else 
        val = row[width - 2];

    sh_row[threadIdx.x + 1] = val.x;
    sh_row[threadIdx.x + 1 + (nthreads + 2)] = val.y;
    sh_row[threadIdx.x + 1 + 2 * (nthreads + 2)] = val.z;

    if (threadIdx.x == 0)
    {
        val = row[max(x - 1, 1)];
        sh_row[0] = val.x;
        sh_row[(nthreads + 2)] = val.y;
        sh_row[2 * (nthreads + 2)] = val.z;
    }

    if (threadIdx.x == blockDim.x - 1)
    {
        val = row[min(x + 1, width - 2)];
        sh_row[blockDim.x + 1] = val.x;
        sh_row[blockDim.x + 1 + (nthreads + 2)] = val.y;
        sh_row[blockDim.x + 1 + 2 * (nthreads + 2)] = val.z;
    }

    __syncthreads();
    if (x < width)
    {
        float3 a, b;

        b.x = sh_row[threadIdx.x + 2];
        b.y = sh_row[threadIdx.x + 2 + (nthreads + 2)];
        b.z = sh_row[threadIdx.x + 2 + 2 * (nthreads + 2)];
        a.x = sh_row[threadIdx.x];
        a.y = sh_row[threadIdx.x + (nthreads + 2)];
        a.z = sh_row[threadIdx.x + 2 * (nthreads + 2)];

        float3 dx;
        if (correct_gamma)
            dx = make_float3(sqrtf(b.x) - sqrtf(a.x), sqrtf(b.y) - sqrtf(a.y), sqrtf(b.z) - sqrtf(a.z));    
        else
            dx = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);    

        float3 dy = make_float3(0.f, 0.f, 0.f);

        if (blockIdx.y > 0 && blockIdx.y < height - 1)
        {
            val = ((const uchar4*)img.ptr(blockIdx.y - 1))[x];
            a = make_float3(val.x, val.y, val.z);

            val = ((const uchar4*)img.ptr(blockIdx.y + 1))[x];
            b = make_float3(val.x, val.y, val.z);

            if (correct_gamma)
                dy = make_float3(sqrtf(b.x) - sqrtf(a.x), sqrtf(b.y) - sqrtf(a.y), sqrtf(b.z) - sqrtf(a.z));
            else
                dy = make_float3(b.x - a.x, b.y - a.y, b.z - a.z);
        }

        float best_dx = dx.x;
        float best_dy = dy.x;

        float mag0 = dx.x * dx.x + dy.x * dy.x;
        float mag1 = dx.y * dx.y + dy.y * dy.y;
        if (mag0 < mag1) 
        {
            best_dx = dx.y;
            best_dy = dy.y;
            mag0 = mag1;
        }

        mag1 = dx.z * dx.z + dy.z * dy.z;
        if (mag0 < mag1)
        {
            best_dx = dx.z;
            best_dy = dy.z;
            mag0 = mag1;
        }

        mag0 = sqrtf(mag0);

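        // Soft orientation binning: the magnitude is split linearly between the two
        // nearest of the cnbins orientation bins; qangle stores the bin pair and
        // grad stores the two weighted magnitudes.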
        float ang = (atan2f(best_dy, best_dx) + CV_PI_F) * angle_scale - 0.5f;
        int hidx = (int)floorf(ang);
        ang -= hidx;
        hidx = (hidx + cnbins) % cnbins;

        ((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);
        ((float2*)grad.ptr(blockIdx.y))[x] = make_float2(mag0 * (1.f - ang), mag0 * ang);
    }
}


void compute_gradients_8UC4(int nbins, int height, int width, const DevMem2D& img, 
                            float angle_scale, DevMem2Df grad, DevMem2D qangle, bool correct_gamma)
{
    const int nthreads = 256;

    dim3 bdim(nthreads, 1);
    dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));

    if (correct_gamma)
        compute_gradients_8UC4_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
    else
        compute_gradients_8UC4_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}

template <int nthreads, int correct_gamma>
__global__ void compute_gradients_8UC1_kernel(int height, int width, const PtrElemStep img, 
                                              float angle_scale, PtrElemStepf grad, PtrElemStep qangle)
{
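    // Grayscale variant: same row staging with a one-pixel halo and the same soft
    // orientation binning as the 8UC4 kernel, but with a single channel there is
    // no per-channel maximum to take.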
    const int x = blockIdx.x * blockDim.x + threadIdx.x;

    const unsigned char* row = (const unsigned char*)img.ptr(blockIdx.y);

    __shared__ float sh_row[nthreads + 2];

    if (x < width) 
        sh_row[threadIdx.x + 1] = row[x]; 
    else 
        sh_row[threadIdx.x + 1] = row[width - 2];

    if (threadIdx.x == 0)
        sh_row[0] = row[max(x - 1, 1)];

    if (threadIdx.x == blockDim.x - 1)
        sh_row[blockDim.x + 1] = row[min(x + 1, width - 2)];

    __syncthreads();
    if (x < width)
    {
        float dx;

        if (correct_gamma)
            dx = sqrtf(sh_row[threadIdx.x + 2]) - sqrtf(sh_row[threadIdx.x]);
        else
            dx = sh_row[threadIdx.x + 2] - sh_row[threadIdx.x];

        float dy = 0.f;
        if (blockIdx.y > 0 && blockIdx.y < height - 1)
        {
            float a = ((const unsigned char*)img.ptr(blockIdx.y + 1))[x];
            float b = ((const unsigned char*)img.ptr(blockIdx.y - 1))[x];
            if (correct_gamma)
                dy = sqrtf(a) - sqrtf(b);
            else
                dy = a - b;
        }
        float mag = sqrtf(dx * dx + dy * dy);

        float ang = (atan2f(dy, dx) + CV_PI_F) * angle_scale - 0.5f;
        int hidx = (int)floorf(ang);
        ang -= hidx;
        hidx = (hidx + cnbins) % cnbins;

        ((uchar2*)qangle.ptr(blockIdx.y))[x] = make_uchar2(hidx, (hidx + 1) % cnbins);
        ((float2*)  grad.ptr(blockIdx.y))[x] = make_float2(mag * (1.f - ang), mag * ang);
    }
}


void compute_gradients_8UC1(int nbins, int height, int width, const DevMem2D& img, 
                            float angle_scale, DevMem2Df grad, DevMem2D qangle, bool correct_gamma)
{
    const int nthreads = 256;

    dim3 bdim(nthreads, 1);
    dim3 gdim(divUp(width, bdim.x), divUp(height, bdim.y));

    if (correct_gamma)
        compute_gradients_8UC1_kernel<nthreads, 1><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);
    else
        compute_gradients_8UC1_kernel<nthreads, 0><<<gdim, bdim>>>(height, width, img, angle_scale, grad, qangle);

    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );
}



//-------------------------------------------------------------------
// Resize

texture<uchar4, 2, cudaReadModeNormalizedFloat> resize8UC4_tex;
texture<uchar,  2, cudaReadModeNormalizedFloat> resize8UC1_tex;
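// The resize kernels read the source through a texture: cudaFilterModeLinear gives
// hardware bilinear interpolation, and cudaReadModeNormalizedFloat returns values
// in [0, 1], which is why the results are scaled back by 255.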

__global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar> dst, int colOfs)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < dst.cols && y < dst.rows)
        dst.ptr(y)[x] = tex2D(resize8UC1_tex, x * sx + colOfs, y * sy) * 255;
}

__global__ void resize_for_hog_kernel(float sx, float sy, DevMem2D_<uchar4> dst, int colOfs)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    if (x < dst.cols && y < dst.rows)
    {
        float4 val = tex2D(resize8UC4_tex, x * sx + colOfs, y * sy);
        dst.ptr(y)[x] = make_uchar4(val.x * 255, val.y * 255, val.z * 255, val.w * 255);
    }
}

template <class T, class TEX>
static void resize_for_hog(const DevMem2D& src, DevMem2D dst, TEX& tex)
{
    tex.filterMode = cudaFilterModeLinear;

    size_t texOfs = 0;
    int colOfs = 0;

    cudaChannelFormatDesc desc = cudaCreateChannelDesc<T>();    
    cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );

    if (texOfs != 0) 
    {
        colOfs = static_cast<int>( texOfs/sizeof(T) );
        cudaSafeCall( cudaUnbindTexture(tex) );
        cudaSafeCall( cudaBindTexture2D(&texOfs, tex, src.data, desc, src.cols, src.rows, src.step) );
    }    

    dim3 threads(32, 8);
    dim3 grid(divUp(dst.cols, threads.x), divUp(dst.rows, threads.y));
    
    float sx = static_cast<float>(src.cols) / dst.cols;
    float sy = static_cast<float>(src.rows) / dst.rows;

    resize_for_hog_kernel<<<grid, threads>>>(sx, sy, (DevMem2D_<T>)dst, colOfs);
    cudaSafeCall( cudaGetLastError() );

    cudaSafeCall( cudaDeviceSynchronize() );

    cudaSafeCall( cudaUnbindTexture(tex) );
}

void resize_8UC1(const DevMem2D& src, DevMem2D dst) { resize_for_hog<uchar> (src, dst, resize8UC1_tex); }
void resize_8UC4(const DevMem2D& src, DevMem2D dst) { resize_for_hog<uchar4>(src, dst, resize8UC4_tex); }

}}}



