Texture Fetching
Texture Reference
Texture Functions
Surface Functions
CUDA arrays are allocated from the same pool of physical memory as device memory, but their layout is optimized for 2D and 3D locality; the graphics driver uses this layout to store textures, letting the hardware operate on 2D or 3D blocks of elements instead of 1D addresses. Applications with sparse access patterns, especially ones with dimensional locality, are better served by CUDA arrays; applications with regular access patterns and little data reuse are better served by plain device memory pointers.
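To make the contrast concrete, here is a minimal sketch (not from the original text) of the two allocation paths; the 10x10 size is arbitrary and only illustrates how each allocation is described, a raw byte count versus a channel-format-aware 2D array.

int width = 10, height = 10;

// Linear device memory: a raw pointer, addressed in 1D.
float* devPtr;
cudaMalloc(&devPtr, width * height * sizeof(float));

// CUDA array: an opaque allocation whose layout is optimized for 2D locality,
// described by a channel format rather than a byte count.
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
cudaArray* cuArray;
cudaMallocArray(&cuArray, &desc, width, height);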
A texture reference is the object CUDA uses to tell the texture hardware how to interpret the underlying memory. Thanks to this level of indirection, multiple texture references with different attributes can refer to the same memory.
The CUDA runtime API and the CUDA driver API handle texture references differently, but both require the reference to be declared with the texture template:
texture<Type, Dimension, ReadMode> Name;
texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;
Here Type is the element type, Dimension is the dimensionality of the texture, and ReadMode specifies whether integer texture element types are converted to normalized floating point when read.
Before it can be used, a texture reference must be bound to actual memory, either a CUDA array or device memory.
Texture type | Intrinsic |
---|---|
Device memory | tex1Dfetch(int x) |
1D | tex1D(float x) |
2D | tex2D(float x, float y) |
3D | tex3D(float x, float y, float z) |
Cubemap | texCubemap(float x, float y, float z) |
Layered (1D) | tex1DLayered(float x, int layer) |
Layered (2D) | tex2DLayered(float x, float y, int layer) |
Layered (cubemap) | texCubemapLayered(float x, float y, float z, int layer) |
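As an illustration only (not from the original text), the sketch below shows how two of these intrinsics are called from a kernel through texture references; texLinear and tex2dRef are assumed to have been declared at file scope and bound on the host beforehand.

// File-scope texture references (assumed already bound on the host).
texture<float, cudaTextureType1D, cudaReadModeElementType> texLinear;
texture<float, cudaTextureType2D, cudaReadModeElementType> tex2dRef;

__global__ void fetchExample(float* out, int width, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i >= n) return;
    // Integer-indexed fetch from a texture bound to linear device memory.
    float a = tex1Dfetch(texLinear, i);
    // Coordinate-based fetch from a 2D texture (unnormalized coordinates here).
    float b = tex2D(tex2dRef, (float)(i % width), (float)(i / width));
    out[i] = a + b;
}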
The attributes of a texture can be set by modifying the members of the structure; the changes take effect immediately, without rebinding.
texture<float, 1, cudaReadModeElementType> tex1;
...
tex1.filterMode = cudaFilterModeLinear; // enable linear filtering
tex1.normalized = true; // texture coordinates will be normalized
Memory | Function |
---|---|
1D device memory | cudaBindTexture() |
2D device memory | cudaBindTexture2D() |
CUDA array | cudaBindTextureToArray() |
Unbind | cudaUnbindTexture() |
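The examples further down only demonstrate 2D binding, so here is a minimal host-side sketch of binding and unbinding 1D device memory with cudaBindTexture(); the file-scope reference texLinear and the allocation devPtr of N floats are assumptions for the example.

texture<float, cudaTextureType1D, cudaReadModeElementType> texLinear;

void bindLinear(float* devPtr, size_t N)
{
    cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
    size_t offset = 0;
    // Bind N floats of linear device memory to the texture reference.
    cudaBindTexture(&offset, texLinear, devPtr, desc, N * sizeof(float));
    // ... launch kernels that read via tex1Dfetch(texLinear, i) ...
    cudaUnbindTexture(texLinear);
}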
Because the driver API keeps CPU code and GPU code strictly separated, any texture reference declared in a CUDA module must be queried with cuModuleGetTexRef(), which returns a CUtexref. Unlike with the CUDA runtime, the texture reference must then be initialized with all the correct attributes, because the compiler does not encode the immutable attributes of a texture reference into the CUDA module (see the sketch after the table below).
Memory | Function |
---|---|
1D device memory | cuTexRefSetAddress() |
2D device memory | cuTexRefSetAddress2D() |
CUDA array | cuTexRefSetArray() |
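A minimal driver-API sketch, assuming a module file module.cubin that declares a 2D float texture reference named texRef, and an already-created CUarray; the module name, reference name, and attribute values are illustrative only.

#include <cuda.h>

void setupTexRef(CUarray cuArray)   // cuArray: an existing 2D float CUDA array
{
    // (assumes cuInit() and context creation have already been done)
    CUmodule module;
    CUtexref texRef;
    cuModuleLoad(&module, "module.cubin");
    // Look up the texture reference declared inside the module.
    cuModuleGetTexRef(&texRef, module, "texRef");
    // With the driver API every attribute must be set explicitly,
    // since the module does not carry them.
    cuTexRefSetFormat(texRef, CU_AD_FORMAT_FLOAT, 1);
    cuTexRefSetAddressMode(texRef, 0, CU_TR_ADDRESS_MODE_WRAP);
    cuTexRefSetAddressMode(texRef, 1, CU_TR_ADDRESS_MODE_WRAP);
    cuTexRefSetFilterMode(texRef, CU_TR_FILTER_MODE_LINEAR);
    cuTexRefSetFlags(texRef, CU_TRSF_NORMALIZED_COORDINATES);
    // Bind the CUDA array to the texture reference.
    cuTexRefSetArray(texRef, cuArray, CU_TRSA_OVERRIDE_FORMAT);
}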
// Low-level API: obtain the underlying textureReference and bind explicitly.
texture<float, cudaTextureType2D,
        cudaReadModeElementType> texRef;
textureReference* texRefPtr;
cudaGetTextureReference(&texRefPtr, &texRef);
cudaChannelFormatDesc channelDesc =
    cudaCreateChannelDesc<float>();
size_t offset;
cudaBindTexture2D(&offset, texRefPtr, devPtr, &channelDesc,
                  width, height, pitch);
// High-level API: bind through the texture<> object directly.
texture<float, cudaTextureType2D,
        cudaReadModeElementType> texRef;
cudaChannelFormatDesc channelDesc =
    cudaCreateChannelDesc<float>();
size_t offset;
cudaBindTexture2D(&offset, texRef, devPtr, channelDesc,
                  width, height, pitch);
// Allocate memory for the CUDA array
cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
cudaArray* cuArray;
int width = 10, height = 10;
cudaMallocArray(&cuArray, &channelDesc, width, height);
// Copy from host memory to the CUDA array
cudaMemcpyToArray(cuArray, 0, 0, h_data, size, cudaMemcpyHostToDevice);
However, the call above produces a warning indicating that the function is deprecated:
warning: ‘cudaError_t cudaMemcpyToArray(cudaArray_t, size_t, size_t, const void*, size_t, cudaMemcpyKind)’ is deprecated [-Wdeprecated-declarations]
So the newer function cudaMemcpy2DToArray() should be used instead:
cudaMemcpy2DToArray(dst, 0, 0, src, w * sizeof(src[0]), w * sizeof(src[0]), h, cudaMemcpyHostToDevice);
The CUDA array can then be bound to the texture:
// Low-level API: query the channel description of the array and bind explicitly.
texture<float, cudaTextureType2D,
        cudaReadModeElementType> texRef;
textureReference* texRefPtr;
cudaGetTextureReference(&texRefPtr, &texRef);
cudaChannelFormatDesc channelDesc;
cudaGetChannelDesc(&channelDesc, cuArray);
cudaBindTextureToArray(texRefPtr, cuArray, &channelDesc);
// High-level API: bind the texture<> object to the array directly.
texture<float, cudaTextureType2D,
        cudaReadModeElementType> texRef;
cudaBindTextureToArray(texRef, cuArray);
// Texture references must be declared at file scope.
texture<float, cudaTextureType2D, cudaReadModeElementType> texRef;

void test_4() {
    cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc<float>();
    cudaArray* cuArray;
    int width = 10, height = 10;
    cudaMallocArray(&cuArray, &channelDesc, width, height);
    float* hData = new float[width * height];
    cudaMemcpy2DToArray(cuArray, 0, 0, hData, width * sizeof(float),
                        width * sizeof(float), height, cudaMemcpyHostToDevice);
    // Set texture reference parameters
    texRef.addressMode[0] = cudaAddressModeWrap;
    texRef.addressMode[1] = cudaAddressModeWrap;
    texRef.filterMode = cudaFilterModeLinear;
    texRef.normalized = true;
    cudaBindTextureToArray(texRef, cuArray, channelDesc);
    //cudaBindTextureToArray(texRef, cuArray);
    delete[] hData;
    cudaFreeArray(cuArray);
}
// Simple transformation kernel
__global__ void transformKernel(float* output,
                                cudaTextureObject_t texObj,
                                int width, int height,
                                float theta)
{
    // Calculate normalized texture coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    float u = x / (float)width;
    float v = y / (float)height;
    // Transform coordinates
    u -= 0.5f;
    v -= 0.5f;
    float tu = u * cosf(theta) - v * sinf(theta) + 0.5f;
    float tv = v * cosf(theta) + u * sinf(theta) + 0.5f;
    // Read from texture and write to global memory
    output[y * width + x] = tex2D<float>(texObj, tu, tv);
}
// Allocate CUDA array in device memory
cudaChannelFormatDesc channelDesc =
cudaCreateChannelDesc(32, 0, 0, 0,
cudaChannelFormatKindFloat);
cudaArray* cuArray;
int width = 10, height = 10;
cudaMallocArray(&cuArray, &channelDesc, width, height);
// Copy to device memory some data located at address h_data
// in host memory
float *hData = new float[100];
cudaMemcpy2DToArray(cuArray, 0, 0, hData, width*sizeof(float),
width*sizeof(float), height, cudaMemcpyHostToDevice);
// Specify texture
struct cudaResourceDesc resDesc;
memset(&resDesc, 0, sizeof(resDesc));
resDesc.resType = cudaResourceTypeArray;
resDesc.res.array.array = cuArray;
// Specify texture object parameters
struct cudaTextureDesc texDesc;
memset(&texDesc, 0, sizeof(texDesc));
texDesc.addressMode[0] = cudaAddressModeWrap;
texDesc.addressMode[1] = cudaAddressModeWrap;
texDesc.filterMode = cudaFilterModeLinear;
texDesc.readMode = cudaReadModeElementType;
texDesc.normalizedCoords = 1;
// Create texture object
cudaTextureObject_t texObj = 0;
cudaCreateTextureObject(&texObj, &resDesc, &texDesc, NULL);
// Allocate result of transformation in device memory
float* output;
cudaMalloc(&output, width * height * sizeof(float));
// Invoke kernel
dim3 dimBlock(16, 16);
dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
             (height + dimBlock.y - 1) / dimBlock.y);
float angle = 0.5f;  // rotation angle in radians, assumed for the example
transformKernel<<<dimGrid, dimBlock>>>(output,
                                       texObj, width, height,
                                       angle);
// Destroy texture object
cudaDestroyTextureObject(texObj);
// Free device memory
cudaFreeArray(cuArray);
cudaFree(output);
Experimentally, the normalized texture coordinates had to be offset by 0.05 for the texture data to be fetched correctly. This matches the half-texel offset: with normalized coordinates, the center of texel i lies at (i + 0.5)/N, and for a width of 10 that is exactly 0.05.
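As a sketch only (the kernel name is illustrative), the offset can be written in terms of the texture size instead of hard-coding 0.05:

__global__ void sampleCentered(float* out, cudaTextureObject_t texObj,
                               int width, int height)
{
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x >= width || y >= height) return;
    // Sample at texel centers: (i + 0.5) / N in normalized coordinates.
    float u = (x + 0.5f) / (float)width;
    float v = (y + 0.5f) / (float)height;
    out[y * width + x] = tex2D<float>(texObj, u, v);
}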
Textures are read-only; surface references, by contrast, can also be written to.
// Simple copy kernel
__global__ void copyKernel(cudaSurfaceObject_t inputSurfObj,
                           cudaSurfaceObject_t outputSurfObj,
                           int width, int height)
{
    // Calculate surface coordinates
    unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if (x < width && y < height) {
        uchar4 data;
        // Read from input surface (the x coordinate is in bytes, hence x * 4)
        surf2Dread(&data, inputSurfObj, x * 4, y);
        // Write to output surface
        surf2Dwrite(data, outputSurfObj, x * 4, y);
    }
}
// Host code
int main()
{
    // Image dimensions and host data for this example (assumed 10x10 here)
    const int width = 10, height = 10;
    uchar4* h_data = new uchar4[width * height];
    // Allocate CUDA arrays in device memory
    cudaChannelFormatDesc channelDesc =
        cudaCreateChannelDesc(8, 8, 8, 8,
                              cudaChannelFormatKindUnsigned);
    cudaArray* cuInputArray;
    cudaMallocArray(&cuInputArray, &channelDesc, width, height,
                    cudaArraySurfaceLoadStore);
    cudaArray* cuOutputArray;
    cudaMallocArray(&cuOutputArray, &channelDesc, width, height,
                    cudaArraySurfaceLoadStore);
    // Copy to device memory some data located at address h_data
    // in host memory
    //cudaMemcpyToArray(cuInputArray, 0, 0, h_data, size,
    //                  cudaMemcpyHostToDevice);   // deprecated
    cudaMemcpy2DToArray(cuInputArray, 0, 0, h_data, width * sizeof(uchar4),
                        width * sizeof(uchar4), height, cudaMemcpyHostToDevice);
    // Specify surface
    struct cudaResourceDesc resDesc;
    memset(&resDesc, 0, sizeof(resDesc));
    resDesc.resType = cudaResourceTypeArray;
    // Create the surface objects
    resDesc.res.array.array = cuInputArray;
    cudaSurfaceObject_t inputSurfObj = 0;
    cudaCreateSurfaceObject(&inputSurfObj, &resDesc);
    resDesc.res.array.array = cuOutputArray;
    cudaSurfaceObject_t outputSurfObj = 0;
    cudaCreateSurfaceObject(&outputSurfObj, &resDesc);
    // Invoke kernel
    dim3 dimBlock(16, 16);
    dim3 dimGrid((width + dimBlock.x - 1) / dimBlock.x,
                 (height + dimBlock.y - 1) / dimBlock.y);
    copyKernel<<<dimGrid, dimBlock>>>(inputSurfObj,
                                      outputSurfObj,
                                      width, height);
    // Destroy surface objects
    cudaDestroySurfaceObject(inputSurfObj);
    cudaDestroySurfaceObject(outputSurfObj);
    // Free device memory
    cudaFreeArray(cuInputArray);
    cudaFreeArray(cuOutputArray);
    return 0;
}
cudaChannelFormatDesc channelDesc =
cudaCreateChannelDesc(32, 0, 0, 0,
cudaChannelFormatKindFloat);
cudaChannelFormatDesc channelDesc =
cudaCreateChannelDesc(8, 8, 8, 8,
cudaChannelFormatKindUnsigned);
struct cudaChannelFormatDesc {
    int x, y, z, w;
    enum cudaChannelFormatKind f;
};

enum cudaChannelFormatKind {
    cudaChannelFormatKindSigned   = 0,
    cudaChannelFormatKindUnsigned = 1,
    cudaChannelFormatKindFloat    = 2,
    cudaChannelFormatKindNone     = 3
};
The structure members x, y, z, and w specify the number of bits in each component of a texture element; for a texture whose elements are single floats, x is 32 and the other members are 0.
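For instance (not from the original text), the templated helper fills these fields automatically, so the two descriptors above can equivalently be written as:

// Equivalent to cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat)
cudaChannelFormatDesc floatDesc = cudaCreateChannelDesc<float>();
// Equivalent to cudaCreateChannelDesc(8, 8, 8, 8, cudaChannelFormatKindUnsigned)
cudaChannelFormatDesc uchar4Desc = cudaCreateChannelDesc<uchar4>();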
A simple experiment with 100 million floats and 1000 iterations of global reads/writes gave the following timings:
Type | Time |
---|---|
Texture (read) | 966.11 ms |
Surface (read) | 384.11 ms |
Device memory (read) | 370.07 ms |
OpenGL buffer (read) | 412.27 ms |
Surface (write) | 1229.29 ms |
Device memory (write) | 1266.11 ms |
OpenGL buffer (write) | 1229.65 ms |
As the table shows, for bulk global reads and writes, device memory is fastest, because such accesses can be coalesced into wide memory transactions across the threads of a warp. According to the official documentation, textures and surfaces are faster for sparse, random accesses with spatial locality, where device memory cannot benefit from coalescing.
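To make the distinction concrete, here is an illustrative sketch (not the benchmark used above): the first kernel performs coalesced reads, which favor plain device memory, while the second performs a data-dependent gather, the kind of pattern where the texture cache tends to help.

// Coalesced pattern: consecutive threads read consecutive addresses.
__global__ void coalescedRead(const float* in, float* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = in[i];
}

// Gather pattern: each thread reads a data-dependent location through a texture
// object, so the texture cache can absorb the irregular accesses.
__global__ void gatherRead(cudaTextureObject_t texObj, const int* indices,
                           float* out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) out[i] = tex1Dfetch<float>(texObj, indices[i]);
}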