Texture Fetching
Texture Reference
Texture Functions
Surface Functions
texture<Type, Dimension, ReadMode> Name;
texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;
纹理类型 | 内建函数 |
设备内存 | tex1Dfetch(int) |
1D | tex1D(float x) |
2D | tex2D(float x, float y) |
3D | tex3D(float x, float y, float z) |
立方图纹理 | texCubemap(float x, float y, float z) |
分层纹理(1D) | tex1DLayered(float x, int layer) |
分层纹理(2D) | tex2DLayered(float x, float y, int layer) |
分层纹理(立方图) | texCubemapLayered(float x, float y, float z, int layer) |
texture<float, 1, cudaReadModeElementType> tex1;
tex1.filterMode = cudaFilterModeLinear; // enable linear filtering
tex1.normalized = true; // texture coordinates will be normalized
内存 | 函数 |
1D设备内存 | cudaBindTexture() |
2D设备内存 | cudaBindTexture2D() |
CUDA数组 | cudaBindTextureToArray() |
解除绑定 | cudaUnbindTexture() |
内存 | 函数 |
1D设备内存 | cuTexRefSetAddress() |
2D设备内存 | cuTexRefSetAddress2D() |
CUDA数组 | cuTexRefSetArray() |
// Low-level runtime API: bind a 2D texture reference to pitched linear
// device memory. devPtr, width, height and pitch are not declared in this
// snippet and must be supplied by the surrounding code.
texture<float, cudaTextureType2D,
        cudaReadModeElementType> texRef;
// cudaGetTextureReference() writes a pointer-to-const, so the handle must be
// declared const for the call to compile.
const textureReference* texRefPtr;
cudaGetTextureReference(&texRefPtr, &texRef);
// One 32-bit float channel, matching the texture<float, ...> element type.
cudaChannelFormatDesc channelDesc =
    cudaCreateChannelDesc<float>();
size_t offset;
cudaBindTexture2D(&offset, texRefPtr, devPtr, &channelDesc,
                  width, height, pitch);
// High-level (C++ template) runtime API: bind a 2D texture reference to
// pitched linear device memory. devPtr, width, height and pitch are not
// declared in this snippet and must be supplied by the surrounding code.
texture<float, cudaTextureType2D,
        cudaReadModeElementType> texRef;
// One 32-bit float channel, matching the texture<float, ...> element type.
cudaChannelFormatDesc channelDesc =
    cudaCreateChannelDesc<float>();
size_t offset;
cudaBindTexture2D(&offset, texRef, devPtr, channelDesc,
                  width, height, pitch);
// Allocate memory for the CUDA array (one float channel, width x height elements)
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray* cuArray;
int width = 10, height = 10;
cudaMallocArray(&cuArray, &channelDesc, width, height);
// Copy from host memory into the CUDA array.
// NOTE(review): h_data and size are not declared in this snippet; presumably
// a host buffer and its size in bytes — confirm against the caller.
// NOTE: cudaMemcpyToArray is deprecated and triggers -Wdeprecated-declarations;
// cudaMemcpy2DToArray is the replacement.
cudaMemcpyToArray(cuArray, 0, 0, h_data, size, cudaMemcpyHostToDevice);
warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated [-Wdeprecated-declarations]
cudaMemcpy2DToArray(dst, 0, 0, src, w*sizeof(src[0]) , w*sizeof(src[0]), h, cudaMemcpyHostToDevice);
// Low-level runtime API: bind a 2D texture reference to a CUDA array.
// cuArray is not declared in this snippet; it must be an already allocated
// CUDA array.
texture<float, cudaTextureType2D,
        cudaReadModeElementType> texRef;
// cudaGetTextureReference() writes a pointer-to-const, so the handle must be
// declared const for the call to compile.
const textureReference* texRefPtr;
cudaGetTextureReference(&texRefPtr, &texRef);
// Query the channel format from the array itself instead of restating it.
cudaChannelFormatDesc channelDesc;
cudaGetChannelDesc(&channelDesc, cuArray);
// The overload that takes the descriptor by pointer also takes the texture
// reference by pointer, so pass texRefPtr rather than the texture<> object.
cudaBindTextureToArray(texRefPtr, cuArray, &channelDesc);
texture<float, cudaTextureType2D,
cudaReadModeElementType> texRef;
cudaBindTextureToArray(texRef, cuArray);
// Texture references have to be declared at file scope; they cannot be
// function-local variables. Note that the texture reference API is deprecated
// in favour of texture objects.
texture<float, cudaTextureType2D, cudaReadModeElementType> test4TexRef;

// Binds a 10x10 single-channel float CUDA array to a 2D texture reference
// through the high-level (C++ template) runtime API, then releases every
// resource it created. Takes no arguments and returns nothing.
void test_4(){
    const int width = 10, height = 10;

    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaArray* cuArray = NULL;
    if (cudaMallocArray(&cuArray, &channelDesc, width, height) != cudaSuccess)
        return;  // nothing was allocated, so there is nothing to clean up

    // Host image sized from width/height and zero-initialised, so the copy
    // never reads indeterminate values.
    float *hData = new float[width * height]();
    cudaMemcpy2DToArray(cuArray, 0, 0, hData, width*sizeof(float),
                        width*sizeof(float), height, cudaMemcpyHostToDevice);

    // Set texture reference parameters
    test4TexRef.addressMode[0] = cudaAddressModeWrap;
    test4TexRef.addressMode[1] = cudaAddressModeWrap;
    test4TexRef.filterMode = cudaFilterModeLinear;
    test4TexRef.normalized = true;

    cudaBindTextureToArray(test4TexRef, cuArray, channelDesc);
    //cudaBindTextureToArray(test4TexRef, cuArray);

    // Unbind before freeing so the reference never points at a freed array.
    cudaUnbindTexture(test4TexRef);
    cudaFreeArray(cuArray);
    delete[] hData;
}
// Simple transformation kernel: samples the texture at coordinates rotated by
// theta about the texture centre (0.5, 0.5) and stores one float per pixel.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel. The grid
// may overhang the image; threads outside width x height return immediately.
//   output : device buffer holding at least width * height floats, row-major
//   texObj : 2D float texture object; the kernel feeds it coordinates obtained
//            by dividing by width/height, so it presumably has to be created
//            with normalized coordinates enabled
//   theta  : rotation angle in radians
__global__ void transformKernel(float* output,
                                cudaTextureObject_t texObj,
                                int width, int height,
                                float theta)
{
    // Calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    // The grid is rounded up to whole blocks, so guard the image edge to
    // avoid writing past the end of output.
    if (x >= (unsigned int)width || y >= (unsigned int)height)
        return;

    float u = x / (float)width;
    float v = y / (float)height;

    // Transform coordinates
    u -= 0.5f;
    v -= 0.5f;
    const float c = cosf(theta);
    const float s = sinf(theta);
    float tu = u * c - v * s + 0.5f;
    float tv = v * c + u * s + 0.5f;

    // Read from texture and write to global memory
    output[y * width + x] = tex2D<float>(texObj, tu, tv);
}

// Host-side driver statements for transformKernel (meant to live inside a
// host function).

// Allocate CUDA array in device memory
cudaChannelFormatDesc channelDesc =
    cudaCreateChannelDesc(32, 0, 0, 0,
                          cudaChannelFormatKindFloat);
cudaArray* cuArray;
int width = 10, height = 10;
cudaMallocArray(&cuArray, &channelDesc, width, height);

// Copy to device memory some data located at address hData
// in host memory (zero-filled placeholder input)
float *hData = new float[width * height]();
cudaMemcpy2DToArray(cuArray, 0, 0, hData, width*sizeof(float),
                    width*sizeof(float), height, cudaMemcpyHostToDevice);

// Specify texture
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;

// Specify texture object parameters
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeWrap;
texDesc.addressMode[1] = cudaAddressModeWrap;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 1;

// Create texture object
cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);

// Allocate result of transformation in device memory
float* output;
cudaMalloc(&output, width * height * sizeof(float));

// Invoke kernel
const float angle = 0.5f;  // rotation in radians; example value
dim3 dimBlock(16, 16);
dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
             (height + dimBlock.y - 1) / dimBlock.y);
transformKernel<<<dimGrid, dimBlock>>>(output,
                                       texObj, width, height,
                                       angle);
// Wait for the kernel before tearing down the resources it reads and writes.
cudaDeviceSynchronize();

// Destroy texture object
cudaDestroyTextureObject(texObj);

// Free device memory
cudaFreeArray(cuArray);
cudaFree(output);
delete[] hData;
// Simple copy kernel: copies one uchar4 element per thread from the input
// surface to the same position in the output surface.
//
// Launch layout: 2D grid of 2D blocks, one thread per element. Threads that
// fall outside width x height do nothing.
//   inputSurfObj  : surface object read from
//   outputSurfObj : surface object written to
//   width, height : extent of the region to copy, in elements
__global__ void copyKernel(cudaSurfaceObject_t inputSurfObj,
                           cudaSurfaceObject_t outputSurfObj,
                           int width, int height)
{
    // Calculate surface coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        uchar4 data;
        // Surface functions take the x coordinate in bytes, hence the factor
        // of 4 (the size of one uchar4 element).
        // Read from input surface
        surf2Dread(&data, inputSurfObj, x * 4, y);
        // Write to output surface
        surf2Dwrite(data, outputSurfObj, x * 4, y);
    }
}
// Host code: allocates two surface-capable CUDA arrays of uchar4 elements,
// uploads a host image into the first one, copies it into the second one with
// copyKernel, and releases all resources. Returns 0.
int main()
{
    const int width = 10, height = 10;

    // Allocate CUDA arrays in device memory. Four unsigned 8-bit channels
    // match the uchar4 element type used by copyKernel; the
    // cudaArraySurfaceLoadStore flag is required to bind an array to a surface.
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(8, 8, 8, 8,
                              cudaChannelFormatKindUnsigned);
    cudaArray* cuInputArray;
    cudaMallocArray(&cuInputArray, &channelDesc, width, height,
                    cudaArraySurfaceLoadStore);
    cudaArray* cuOutputArray;
    cudaMallocArray(&cuOutputArray, &channelDesc, width, height,
                    cudaArraySurfaceLoadStore);

    // Copy to device memory some data located at address hData
    // in host memory (zero-filled placeholder input).
    // cudaMemcpyToArray is deprecated, so cudaMemcpy2DToArray is used.
    uchar4* hData = new uchar4[width * height]();
    cudaMemcpy2DToArray(cuInputArray, 0, 0, hData, width*sizeof(uchar4),
                        width*sizeof(uchar4), height, cudaMemcpyHostToDevice);

    // Specify surface
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;

    // Create the surface objects (the same descriptor is reused, only the
    // array it points at changes)
    resDesc.res.array.array = cuInputArray;
    cudaSurfaceObject_t inputSurfObj = 0;
    cudaCreateSurfaceObject(&inputSurfObj, &resDesc);
    resDesc.res.array.array = cuOutputArray;
    cudaSurfaceObject_t outputSurfObj = 0;
    cudaCreateSurfaceObject(&outputSurfObj, &resDesc);

    // Invoke kernel
    dim3 dimBlock(16, 16);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                 (height + dimBlock.y - 1) / dimBlock.y);
    copyKernel<<<dimGrid, dimBlock>>>(inputSurfObj,
                                      outputSurfObj,
                                      width, height);
    // Wait for the kernel before destroying the surfaces it uses.
    cudaDeviceSynchronize();

    // Destroy surface objects
    cudaDestroySurfaceObject(inputSurfObj);
    cudaDestroySurfaceObject(outputSurfObj);

    // Free device memory
    cudaFreeArray(cuInputArray);
    cudaFreeArray(cuOutputArray);
    delete[] hData;

    return 0;
}
// Channel format for a single 32-bit float component (x = 32 bits; y, z, w unused).
cudaChannelFormatDesc channelDesc =
    cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
// Channel format for four unsigned 8-bit components (a uchar4 element).
cudaChannelFormatDesc channelDesc =
    cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned);
// Illustrative excerpt of the CUDA runtime's channel-format types (the real
// definitions come from the CUDA headers; do not redefine them in a program).

// Data kind of each channel component.
enum cudaChannelFormatKind {
    cudaChannelFormatKindSigned = 0,
    cudaChannelFormatKindUnsigned = 1,
    cudaChannelFormatKindFloat = 2,
    cudaChannelFormatKindNone = 3
};

// Describes one texel/surface element: x, y, z and w are the number of bits
// of each of the four components, and f is the component data kind.
struct cudaChannelFormatDesc {
    int x, y, z, w;
    enum cudaChannelFormatKind f;
};
类型 | 耗时 |
纹理(读) | 966.11ms |
表面(读) | 384.11ms |
设备内存(读) | 370.07ms |
OpenGL缓存(读) | 412.27ms |
表面(写) | 1229.29ms |
设备内存(写) | 1266.11ms |
OpenGL缓存(写) | 1229.65ms |