全图像素值相加
#include
#include
#include
#include
#include
#include
#include
#include
// Baseline whole-array sum: every thread accumulates a strided slice of pData
// in double precision and then adds its partial result to *pSum with a single
// atomic operation.
//
// Parameters:
//   pData - pointer to N floats to be summed (read by the kernel)
//   pSum  - pointer to one float accumulator; the kernel only ever adds to it,
//           so the caller has to zero it before the launch
//   N     - number of elements in pData
//
// The loop stride is the total number of launched threads, so any 1-D
// grid/block configuration produces the full sum.
__global__ void sumOfCuda1(float* pData, float* pSum, int N)
{
    // 64-bit indices: with a 32-bit int, `i += stride` can wrap around when N
    // is close to INT_MAX and the loop would then read out of bounds.
    const long long llFirst = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;
    const long long llStride = static_cast<long long>(blockDim.x) * gridDim.x;

    // A thread that owns no element would only add 0 to the accumulator;
    // leaving early spares the contended atomic.
    if (llFirst >= N)
    {
        return;
    }

    double dSum = 0.0;
    for (long long i = llFirst; i < N; i += llStride)
    {
        dSum += pData[i];
    }

    // The accumulator is a float, so the narrowing is made explicit here.
    atomicAdd(pSum, static_cast<float>(dSum));
}
// Number of pixels in the 5120 x 5120 test image.
const int N = 5120 * 5120;
// Threads per block for the reduction kernel. CUDA allows at most 1024 threads
// per block (a larger value makes every launch fail with an invalid
// configuration error); a power of two keeps the tree reduction balanced.
const int THREAD_NUM = 256;
// Number of blocks in the launch grid.
const int BLOCK_NUM = 2048;
// Block-reduced whole-array sum. Each participating thread accumulates a
// strided slice of pfData in double precision, the block folds those partial
// sums together in shared memory, and thread 0 adds the block total to *pSum
// with one atomic.
//
// Parameters:
//   pfData - pointer to N floats to be summed (read by the kernel)
//   pSum   - pointer to one float accumulator; the kernel only ever adds to
//            it, so the caller has to zero it before the launch
//   N      - number of elements in pfData
//
// Launch requirements: 1-D grid and 1-D block. Any block size is accepted:
// at most THREAD_NUM threads per block take part (the shared buffer has that
// many slots); surplus threads only join the barriers.
__global__ void sumOfCuda2(float* pfData, float* pSum, int N)
{
    __shared__ double share_dTemp[THREAD_NUM];

    const int nLocal = static_cast<int>(threadIdx.x);
    const int nBlockThreads = static_cast<int>(blockDim.x);
    // Only threads that own a shared-memory slot may produce a partial sum.
    const int nWorkers = (nBlockThreads < THREAD_NUM) ? nBlockThreads : THREAD_NUM;

    if (nLocal < nWorkers)
    {
        // 64-bit indices so that `i += stride` cannot wrap for N near INT_MAX.
        const long long llStride = static_cast<long long>(gridDim.x) * nWorkers;
        double dTempSum = 0.0;
        for (long long i = static_cast<long long>(blockIdx.x) * nWorkers + nLocal; i < N; i += llStride)
        {
            dTempSum += pfData[i];
        }
        share_dTemp[nLocal] = dTempSum;
    }
    __syncthreads();

    // Fold the upper half of the live range onto the lower half until one
    // value is left. Rounding the half up keeps the middle element when the
    // live range is odd, so the worker count need not be a power of two.
    // nWorkers is identical for all threads of the block, hence every thread
    // executes the same number of barriers.
    for (int nActive = nWorkers; nActive > 1;)
    {
        const int nHalf = (nActive + 1) / 2;
        if (nLocal + nHalf < nActive)
        {
            share_dTemp[nLocal] += share_dTemp[nLocal + nHalf];
        }
        __syncthreads();
        nActive = nHalf;
    }

    if (0 == nLocal)
    {
        // The accumulator is a float, so the narrowing is made explicit here.
        atomicAdd(pSum, static_cast<float>(share_dTemp[0]));
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool sumCudaOk(cudaError_t err, const char* pszWhat)
{
    if (cudaSuccess == err)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", pszWhat, cudaGetErrorString(err));
    return false;
}

// Benchmark: sums all pixels of a 5120 x 5120 normalised grayscale image
// LOOPS times with cv::sum on the host and LOOPS times with sumOfCuda2 on the
// device, then prints both timings and both sums.
// Returns 0 on success and -1 when the image cannot be read or a CUDA call fails.
int main()
{
    cv::Mat matBgrImg = cv::imread("1.jpg");
    if (matBgrImg.empty())
    {
        fprintf(stderr, "failed to read 1.jpg\n");
        return -1;
    }
    cv::resize(matBgrImg, matBgrImg, cv::Size(5120, 5120));
    cv::Mat matGrayIMg;
    cv::cvtColor(matBgrImg, matGrayIMg, cv::COLOR_BGR2GRAY);
    cv::Mat matF32;
    matGrayIMg.convertTo(matF32, CV_32FC1);
    matF32 = matF32 / 255.;
    // The upload below copies the pixels as one flat range.
    if (!matF32.isContinuous())
    {
        matF32 = matF32.clone();
    }
    const int nStep = static_cast<int>(matF32.step);
    printf("matF32 h = %d, w = %d, channel = %d, step = %d \n", matF32.rows, matF32.cols, matF32.channels(), nStep);
    if (matF32.total() != static_cast<size_t>(N))
    {
        fprintf(stderr, "unexpected pixel count\n");
        return -1;
    }

    const int LOOPS = 10000;
    // clock() counts ticks, not milliseconds; convert before printing.
    const double dMsPerTick = 1000.0 / CLOCKS_PER_SEC;

    // ---- host reference ----
    double dCpuSum = 0.0;
    const clock_t t0 = clock();
    for (int i = 0; i < LOOPS; ++i)
    {
        dCpuSum = cv::sum(matF32)[0];
    }
    const clock_t t1 = clock();
    std::cout << "cpu costime is " << (t1 - t0) * dMsPerTick << "ms" << std::endl;

    // ---- device version ----
    const size_t nDataBytes = static_cast<size_t>(N) * sizeof(float);
    float* pfData_dev = NULL;
    float* pfSum_dev = NULL;
    float fSum = 0.0f;
    int nRet = -1;
    do
    {
        if (!sumCudaOk(cudaMalloc((void**)&pfData_dev, nDataBytes), "cudaMalloc(data)"))
        {
            break;
        }
        if (!sumCudaOk(cudaMemcpy(pfData_dev, matF32.ptr<float>(), nDataBytes, cudaMemcpyHostToDevice), "cudaMemcpy(host to device)"))
        {
            break;
        }
        // The kernel accumulates into a single float.
        if (!sumCudaOk(cudaMalloc((void**)&pfSum_dev, sizeof(float)), "cudaMalloc(sum)"))
        {
            break;
        }

        bool bOk = true;
        // Start the timer here so that allocation and upload are not counted.
        const clock_t start = clock();
        for (int i = 0; bOk && i < LOOPS; ++i)
        {
            // The kernel only adds to the accumulator, so reset it every round.
            bOk = sumCudaOk(cudaMemset(pfSum_dev, 0, sizeof(float)), "cudaMemset");
            if (!bOk)
            {
                break;
            }
            sumOfCuda2<<<BLOCK_NUM, THREAD_NUM>>>(pfData_dev, pfSum_dev, N);
            // Launch-configuration errors are only visible through cudaGetLastError;
            // the blocking copy afterwards also waits for the kernel to finish.
            bOk = sumCudaOk(cudaGetLastError(), "sumOfCuda2 launch")
                && sumCudaOk(cudaMemcpy(&fSum, pfSum_dev, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(device to host)");
        }
        const clock_t t2 = clock();
        if (!bOk)
        {
            break;
        }
        std::cout << "costime is " << (t2 - start) * dMsPerTick << "ms" << std::endl;
        std::cout << "cpu sum = " << dCpuSum << ", gpu sum = " << fSum << std::endl;
        nRet = 0;
    } while (false);

    // cudaFree accepts a null pointer, so this is safe on every exit path.
    cudaFree(pfSum_dev);
    cudaFree(pfData_dev);
    return nRet;
}
YUV转BGR
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "npp.h"
#include
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool yuvCudaOk(cudaError_t err, const char* pszWhat)
{
    if (cudaSuccess == err)
    {
        return true;
    }
    fprintf(stderr, "%s failed: %s\n", pszWhat, cudaGetErrorString(err));
    return false;
}

// Round trip: reads "1.jpg", converts it to YUV with OpenCV on the host,
// converts it back to BGR on the device with NPP and writes the result to
// "YUV2BGR.jpg".
// Returns 0 on success and -1 when any step fails.
int main()
{
    cv::Mat matBrgImg = cv::imread("1.jpg");
    if (matBrgImg.empty())
    {
        fprintf(stderr, "failed to read 1.jpg\n");
        return -1;
    }
    const int nWidth = matBrgImg.cols;
    const int nHeight = matBrgImg.rows;
    cv::Mat matYuvImg;
    cv::cvtColor(matBrgImg, matYuvImg, cv::COLOR_BGR2YUV);

    // Payload bytes per image row (3 interleaved 8-bit channels). Computed in
    // size_t so that large images cannot overflow an int.
    const size_t nRowBytes = static_cast<size_t>(nWidth) * 3 * sizeof(Npp8u);
    // The source buffer on the device is tightly packed.
    const int nSrcStep = static_cast<int>(nRowBytes);

    Npp8u* pu8YUV_dev = NULL;
    Npp8u* pu8BGR_dev = NULL;
    int nRet = -1;
    do
    {
        if (!yuvCudaOk(cudaMalloc((void**)&pu8YUV_dev, nRowBytes * nHeight), "cudaMalloc(YUV)"))
        {
            break;
        }
        // Row-wise copy: honours the host row stride whatever it is.
        if (!yuvCudaOk(cudaMemcpy2D(pu8YUV_dev, nRowBytes, matYuvImg.data, matYuvImg.step, nRowBytes, nHeight, cudaMemcpyHostToDevice), "cudaMemcpy2D(host to device)"))
        {
            break;
        }

        int nLineStep_npp = 0;
        pu8BGR_dev = nppiMalloc_8u_C3(nWidth, nHeight, &nLineStep_npp);
        if (NULL == pu8BGR_dev)
        {
            fprintf(stderr, "nppiMalloc_8u_C3 failed\n");
            break;
        }
        printf("nLineStep_npp = %d \n", nLineStep_npp);

        // The destination was allocated by nppiMalloc, so its row stride is the
        // pitch NPP returned, not the packed width of the image.
        const NppiSize nppSize = { nWidth, nHeight };
        const NppStatus nppRet = nppiYUVToBGR_8u_C3R(pu8YUV_dev, nSrcStep, pu8BGR_dev, nLineStep_npp, nppSize);
        printf("nppRet = %d \n", static_cast<int>(nppRet));
        // NPP reports errors as negative status codes (positive ones are warnings).
        if (nppRet < NPP_NO_ERROR)
        {
            break;
        }

        // Copy straight into the output image, skipping the device row padding.
        cv::Mat newimage(nHeight, nWidth, CV_8UC3);
        if (!yuvCudaOk(cudaMemcpy2D(newimage.data, newimage.step, pu8BGR_dev, nLineStep_npp, nRowBytes, nHeight, cudaMemcpyDeviceToHost), "cudaMemcpy2D(device to host)"))
        {
            break;
        }
        if (!cv::imwrite("YUV2BGR.jpg", newimage))
        {
            fprintf(stderr, "failed to write YUV2BGR.jpg\n");
            break;
        }
        nRet = 0;
    } while (false);

    if (NULL != pu8BGR_dev)
    {
        nppiFree(pu8BGR_dev);
        pu8BGR_dev = NULL;
    }
    if (NULL != pu8YUV_dev)
    {
        cudaFree(pu8YUV_dev);
        pu8YUV_dev = NULL;
    }
    return nRet;
}
向量点积
#include
#include
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
// Length of the two test vectors built in main().
constexpr int N = 2048;
// Number of slots in the shared scratch array of the dot_gpu_* kernels.
constexpr int threadnum = 32;
// Host reference dot product.
//
// Multiplies a[i] by b[i] for the first n elements, adds the products up in a
// double and stores the total (converted back to T) in *c.
// A non-positive n stores 0.
template <typename T>
void dot_cpu(T* a, T* b, T* c, int n)
{
    double acc = 0;
    const T* lhs = a;
    const T* rhs = b;
    // Walk both arrays in lock-step, counting the remaining elements down.
    for (int remaining = n; remaining > 0; --remaining, ++lhs, ++rhs)
    {
        acc += (*lhs) * (*rhs);
    }
    *c = acc;
}
// Single-block dot product, version 1: neighbour-pair (interleaved) reduction.
//
// Each participating thread accumulates a strided share of a[i] * b[i] in
// double precision, stores it in a shared slot, and the slots are then summed
// by repeatedly adding the neighbour `gap` positions to the right.
//
// Parameters:
//   a, b - input vectors with n elements each
//   c    - output; c[0] receives the dot product
//   n    - number of elements
//
// Launch requirements: intended for one block (blockIdx is not used, so every
// block would compute the same value and write it to c[0]). Any block size
// is accepted; at most `threadnum` threads take part because the shared
// buffer has that many slots.
template <typename T>
__global__ void dot_gpu_1(T* a, T* b, T* c, int n)
{
    __shared__ T tmp[threadnum];
    const int tid = static_cast<int>(threadIdx.x);
    const int nBlockThreads = static_cast<int>(blockDim.x);
    // Threads without a shared slot must not accumulate: their share would be lost.
    const int nWorkers = (nBlockThreads < threadnum) ? nBlockThreads : threadnum;

    double dPartial = 0.0;
    if (tid < nWorkers)
    {
        // 64-bit index so that the stride addition cannot wrap for n near INT_MAX.
        for (long long idx = tid; idx < n; idx += nWorkers)
        {
            dPartial += a[idx] * b[idx];
        }
    }

    // Every slot gets a defined value: a thread writes its own partial sum
    // into slot `tid` and zero-fills the slots nobody owns when the block is
    // smaller than the buffer. Threads with tid >= threadnum write nothing.
    for (int slot = tid; slot < threadnum; slot += nBlockThreads)
    {
        tmp[slot] = (slot == tid) ? static_cast<T>(dPartial) : T(0);
    }
    __syncthreads();

    // span doubles each round; the thread at the start of each span adds the
    // value `gap` slots to its right. The bounds test keeps the access inside
    // the buffer for every thread index and every buffer length.
    for (int span = 2, gap = 1; gap < threadnum; span *= 2, gap *= 2)
    {
        if (tid % span == 0 && tid + gap < threadnum)
        {
            tmp[tid] += tmp[tid + gap];
        }
        __syncthreads();
    }

    if (0 == tid)
    {
        c[0] = tmp[0];
    }
}
// Single-block dot product, version 2: half-folding (sequential addressing)
// reduction.
//
// Each participating thread accumulates a strided share of a[i] * b[i] in
// double precision, stores it in a shared slot, and the upper half of the
// live slots is then repeatedly added onto the lower half.
//
// Parameters:
//   a, b - input vectors with n elements each
//   c    - output; c[0] receives the dot product
//   n    - number of elements
//
// Launch requirements: intended for one block (blockIdx is not used, so every
// block would compute the same value and write it to c[0]). Any block size
// is accepted; at most `threadnum` threads take part because the shared
// buffer has that many slots.
template <typename T>
__global__ void dot_gpu_2(T* a, T* b, T* c, int n)
{
    __shared__ T tmp[threadnum];
    const int nThreadIdX = static_cast<int>(threadIdx.x);
    const int nBlockDimX = static_cast<int>(blockDim.x);
    // Threads without a shared slot must not accumulate: their share would be lost.
    const int nWorkers = (nBlockDimX < threadnum) ? nBlockDimX : threadnum;

    double dPartial = 0.0;
    if (nThreadIdX < nWorkers)
    {
        // 64-bit index so that the stride addition cannot wrap for n near INT_MAX.
        for (long long idx = nThreadIdX; idx < n; idx += nWorkers)
        {
            dPartial += a[idx] * b[idx];
        }
    }

    // Every slot gets a defined value: a thread writes its own partial sum
    // into its slot and zero-fills the slots nobody owns when the block is
    // smaller than the buffer. Threads beyond the buffer write nothing.
    for (int slot = nThreadIdX; slot < threadnum; slot += nBlockDimX)
    {
        tmp[slot] = (slot == nThreadIdX) ? static_cast<T>(dPartial) : T(0);
    }
    __syncthreads();

    // Rounding the half up keeps the middle slot when the live range is odd,
    // so the buffer length does not have to be a power of two.
    for (int nActive = threadnum; nActive > 1;)
    {
        const int nHalf = (nActive + 1) / 2;
        if (nThreadIdX + nHalf < nActive)
        {
            tmp[nThreadIdX] += tmp[nThreadIdX + nHalf];
        }
        __syncthreads();
        nActive = nHalf;
    }

    if (0 == nThreadIdX)
    {
        c[0] = tmp[0];
    }
}
// Multi-block dot product, version 3: every block reduces its share of the
// products in shared memory and writes one partial result.
//
// Parameters:
//   a, b - input vectors with n elements each
//   c    - output array with at least gridDim.x elements; c[blockIdx.x]
//          receives the partial dot product of that block. The caller adds
//          the partial results together.
//   n    - number of elements
//
// Launch requirements: 1-D grid and 1-D block. Any block size is accepted;
// at most `threadnum` threads per block take part because the shared buffer
// has that many slots.
template <typename T>
__global__ void dot_gpu_3(T* a, T* b, T* c, int n)
{
    __shared__ T aTmp[threadnum];
    const int nThreadIdX = static_cast<int>(threadIdx.x);
    const int nBlockDimX = static_cast<int>(blockDim.x);
    // Threads without a shared slot must not accumulate: their share would be lost.
    const int nWorkers = (nBlockDimX < threadnum) ? nBlockDimX : threadnum;

    double dPartial = 0.0;
    if (nThreadIdX < nWorkers)
    {
        // The stride spans all participating threads of the grid. 64-bit
        // arithmetic so that neither the start index nor the stride addition
        // can wrap.
        const long long llStride = static_cast<long long>(gridDim.x) * nWorkers;
        for (long long idx = static_cast<long long>(blockIdx.x) * nWorkers + nThreadIdX; idx < n; idx += llStride)
        {
            dPartial += a[idx] * b[idx];
        }
    }

    // Every slot gets a defined value: a thread writes its own partial sum
    // into its slot and zero-fills the slots nobody owns when the block is
    // smaller than the buffer. Threads beyond the buffer write nothing.
    for (int slot = nThreadIdX; slot < threadnum; slot += nBlockDimX)
    {
        aTmp[slot] = (slot == nThreadIdX) ? static_cast<T>(dPartial) : T(0);
    }
    __syncthreads();

    // Rounding the half up keeps the middle slot when the live range is odd,
    // so the buffer length does not have to be a power of two.
    for (int nActive = threadnum; nActive > 1;)
    {
        const int nHalf = (nActive + 1) / 2;
        if (nThreadIdX + nHalf < nActive)
        {
            aTmp[nThreadIdX] += aTmp[nThreadIdX + nHalf];
        }
        __syncthreads();
        nActive = nHalf;
    }

    if (0 == nThreadIdX)
    {
        c[blockIdx.x] = aTmp[0];
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `err` is cudaSuccess, false otherwise.
static bool dotCudaOk(cudaError_t err, const char* pszWhat)
{
    if (cudaSuccess == err)
    {
        return true;
    }
    std::cerr << pszWhat << " failed: " << cudaGetErrorString(err) << std::endl;
    return false;
}

// Dot-product demo: builds a = {0, 1, ..., N-1} and b = {1, ..., 1}, prints the
// host result, then runs the three device kernels on the same data and prints
// their results for comparison.
// Returns 0 on success and 1 when a CUDA call fails.
int main()
{
    float a[N], b[N];
    float c = 0;
    for (int i = 0; i < N; ++i)
    {
        a[i] = static_cast<float>(i);
        b[i] = 1.0f;
    }

    // Host reference.
    dot_cpu(a, b, &c, N);
    std::cout << c << std::endl;

    // dot_gpu_3 writes one partial result per block, so the output buffer is
    // sized for its grid; the single-block kernels only use element 0.
    const int nBlocks3 = 4;
    float h_c[nBlocks3] = { 0 };
    float* d_a = 0, * d_b = 0, * d_c = 0;
    int nRet = 1;
    do
    {
        if (!dotCudaOk(cudaMalloc(&d_a, N * sizeof(float)), "cudaMalloc(d_a)"))
        {
            break;
        }
        if (!dotCudaOk(cudaMemcpy(d_a, a, N * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(d_a)"))
        {
            break;
        }
        if (!dotCudaOk(cudaMalloc(&d_b, N * sizeof(float)), "cudaMalloc(d_b)"))
        {
            break;
        }
        if (!dotCudaOk(cudaMemcpy(d_b, b, N * sizeof(float), cudaMemcpyHostToDevice), "cudaMemcpy(d_b)"))
        {
            break;
        }
        if (!dotCudaOk(cudaMalloc(&d_c, nBlocks3 * sizeof(float)), "cudaMalloc(d_c)"))
        {
            break;
        }

        // Launch errors only show up through cudaGetLastError; the blocking
        // copy that follows each launch waits for the kernel to finish.
        dot_gpu_1<<<1, threadnum>>>(d_a, d_b, d_c, N);
        if (!dotCudaOk(cudaGetLastError(), "dot_gpu_1 launch")
            || !dotCudaOk(cudaMemcpy(h_c, d_c, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(dot_gpu_1 result)"))
        {
            break;
        }
        std::cout << "dot_gpu_1: " << h_c[0] << std::endl;

        dot_gpu_2<<<1, threadnum>>>(d_a, d_b, d_c, N);
        if (!dotCudaOk(cudaGetLastError(), "dot_gpu_2 launch")
            || !dotCudaOk(cudaMemcpy(h_c, d_c, sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(dot_gpu_2 result)"))
        {
            break;
        }
        std::cout << "dot_gpu_2: " << h_c[0] << std::endl;

        dot_gpu_3<<<nBlocks3, threadnum>>>(d_a, d_b, d_c, N);
        if (!dotCudaOk(cudaGetLastError(), "dot_gpu_3 launch")
            || !dotCudaOk(cudaMemcpy(h_c, d_c, nBlocks3 * sizeof(float), cudaMemcpyDeviceToHost), "cudaMemcpy(dot_gpu_3 result)"))
        {
            break;
        }
        // Combine the per-block partial results on the host.
        float fTotal = 0.0f;
        for (int i = 0; i < nBlocks3; ++i)
        {
            fTotal += h_c[i];
        }
        std::cout << "dot_gpu_3: " << fTotal << std::endl;

        nRet = 0;
    } while (false);

    // cudaFree accepts a null pointer, so this is safe on every exit path.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    return nRet;
}