从2010年起,基于GPGPU的通用目的计算随着OpenCL以及CUDA的大热而变得异常火热。而基于GPU的通用目的计算,其实从本质上来说就是通过GPU内部的Compute Shader来完成的。而OpenCL以及CUDA则是将主机端与GPU端的通信接口做了更为标准化的统一。而在最近这几年中,除了OpenCL与CUDA之外,还有像微软发布的C++ AMP,还有最近被融合到OpenMP的OpenACC等工具,这些都是利用GPU的大规模数据级并行计算来做数据密集型通用目的计算的。
而现在在高性能计算领域,用得比较多的仍然是CUDA与OpenCL。但是对应用开发者来说,如果我们要将一个应用上传到Windows Store,那么我们只能使用微软官方出的API;同理,我们如果要将应用上传到App Store,那么也只能使用Apple推出的Metal API。由于Metal API在使用上来说非常简便,并且Apple在编程指南上都有详细的描述以及demo提供,所以各位要参考基于Metal API的通用目的计算,可以直接上Apple开发者官网即可。而基于Direct3D的Compute shader构建起来比较繁琐,而且完整使用的例子也较少,这里将提供一份完整的,基于纯C语言的demo。
以下代码部分都用到了一些C99标准中所引入的语法特性以及库文件,所以各位应该至少在Visual Studio 2013上,最好是Visual Studio 2015上编写以下代码。笔者用的开发环境是Visual Studio 2015 Express Edition for Desktop,这是微软免费的IDE,尽管自带的工具不多,但够用。
我们首先创建一个名为SimpleCS的Windows Console Application,然后在Application Settings中将复选框里的钩子全都去掉,然后勾选上Empty Project。然后我们添加main.c文件。根据这篇博文设置项目选项: http://blog.csdn.net/zenny_chen/article/details/52938512
然后在链接库选项中,把所有的12改成11即可。因为我们这里要用的是Direct3D 11,而不是12。12用起来非常繁琐,而且有几个C API的实现还有bug,等它稳定了之后我会再介绍在Direct3D 12中使用Compute Shader的例子。然后,仍然选择x64进行构建。
以下是main.c的内容:
// compute shader简单示例
#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>
#include <stdlib.h>
#include <crtdbg.h>
#include <Windows.h>
#include <d3d11.h>
#include <d3dcompiler.h>
// Number of elements held by each host-side array below.
#define NUM_ELEMENTS 2048

// One buffer element: an integer field followed by a float field.
// The compute shader declares a struct with the same two members.
struct BufType
{
    int i;
    float f;
};

// Host-side arrays of NUM_ELEMENTS elements each (file-local, zero-initialized).
static struct BufType s_vBuf0[NUM_ELEMENTS];
static struct BufType s_vBuf1[NUM_ELEMENTS];
/**
 * Creates a hardware Direct3D 11 device and its immediate context.
 *
 * The first attempt asks for feature levels 11.1 and 11.0 with the debug layer
 * enabled. Two fallbacks are applied so the sample still starts on more machines:
 *  - E_INVALIDARG is what a runtime without 11.1 support returns when 11.1 is
 *    in the requested list, so the call is retried with the remaining levels;
 *  - any other failure while D3D11_CREATE_DEVICE_DEBUG is set is retried
 *    without the debug layer (it is an optional component).
 *
 * @param ppDeviceOut   receives the device, or NULL on failure
 * @param ppContextOut  receives the immediate context, or NULL on failure
 * @return true when a device was created
 */
static bool CreateComputeDevice(ID3D11Device** ppDeviceOut, ID3D11DeviceContext** ppContextOut)
{
    *ppDeviceOut = NULL;
    *ppContextOut = NULL;

    const D3D_FEATURE_LEVEL flvl[] = { D3D_FEATURE_LEVEL_11_1, D3D_FEATURE_LEVEL_11_0 };
    const D3D_FEATURE_LEVEL* pLevels = flvl;
    uint32_t nLevels = (uint32_t)(sizeof(flvl) / sizeof(flvl[0]));
    uint32_t uCreationFlags = D3D11_CREATE_DEVICE_SINGLETHREADED | D3D11_CREATE_DEVICE_DEBUG;
    D3D_FEATURE_LEVEL flOut = D3D_FEATURE_LEVEL_11_0;
    HRESULT hr;

    // Each retry strictly shrinks the request (fewer levels or fewer flags),
    // so the loop is guaranteed to terminate.
    for (;;)
    {
        hr = D3D11CreateDevice(NULL, D3D_DRIVER_TYPE_HARDWARE, NULL, uCreationFlags, pLevels,
            nLevels, D3D11_SDK_VERSION, ppDeviceOut, &flOut, ppContextOut);
        if (hr >= 0)
            break;

        if (hr == E_INVALIDARG && nLevels > 1)
        {
            // Drop the highest requested level and try again.
            ++pLevels;
            --nLevels;
            continue;
        }
        if ((uCreationFlags & D3D11_CREATE_DEVICE_DEBUG) != 0)
        {
            // Debug layer may be missing on this machine; try without it.
            uCreationFlags &= ~(uint32_t)D3D11_CREATE_DEVICE_DEBUG;
            continue;
        }
        break;
    }

    const bool result = hr >= 0;
    if (result)
    {
        // The feature level enum packs major/minor as 0xMm00 (e.g. 0xb100 for 11.1).
        printf("Currently use Direct3D level: %d.%d\n", (int)flOut >> 12, ((int)flOut >> 8) & 0xf);
    }
    return result;
}
/**
 * Creates a structured buffer that can be bound both as a shader resource
 * and as an unordered access view.
 *
 * @param pDevice      device used to create the buffer
 * @param elementSize  size in bytes of one element (also used as the structure stride)
 * @param uCount       number of elements
 * @param pInitData    optional initial contents; NULL creates the buffer without initial data
 * @param ppBufferOut  receives the new buffer; set to NULL before creation is attempted
 * @return true when CreateBuffer reports success
 */
static bool CreateStructureBuffer(ID3D11Device* pDevice, uint32_t elementSize, uint32_t uCount,
    void* pInitData, ID3D11Buffer** ppBufferOut)
{
    *ppBufferOut = NULL;

    D3D11_BUFFER_DESC bufDesc;
    ZeroMemory(&bufDesc, sizeof(bufDesc));
    bufDesc.StructureByteStride = elementSize;
    bufDesc.ByteWidth = elementSize*uCount;
    bufDesc.MiscFlags = D3D11_RESOURCE_MISC_BUFFER_STRUCTURED;
    bufDesc.BindFlags = D3D11_BIND_UNORDERED_ACCESS | D3D11_BIND_SHADER_RESOURCE;

    // Only hand an initial-data descriptor to the API when the caller supplied data.
    D3D11_SUBRESOURCE_DATA seed = { 0 };
    seed.pSysMem = pInitData;
    const D3D11_SUBRESOURCE_DATA* pSeed = (pInitData != NULL) ? &seed : NULL;

    return pDevice->lpVtbl->CreateBuffer(pDevice, &bufDesc, pSeed, ppBufferOut) >= 0;
}
/**
 * Creates a dynamic, CPU-writable constant buffer.
 *
 * @param pDevice      device used to create the buffer
 * @param nBytes       buffer size in bytes; per the D3D11_BUFFER_DESC documentation a
 *                     constant buffer's ByteWidth must be a multiple of 16, otherwise
 *                     creation fails and this function returns false
 * @param pInitData    optional initial contents (nBytes bytes are read); NULL creates the
 *                     buffer without initial data, matching CreateStructureBuffer
 * @param ppBufferOut  receives the new buffer; set to NULL before creation is attempted
 * @return true when CreateBuffer reports success
 */
static bool CreateConstantBuffer(ID3D11Device* pDevice, uint32_t nBytes, void* pInitData, ID3D11Buffer** ppBufferOut)
{
    *ppBufferOut = NULL;

    D3D11_BUFFER_DESC desc;
    ZeroMemory(&desc, sizeof(desc));
    desc.BindFlags = D3D11_BIND_CONSTANT_BUFFER;
    desc.ByteWidth = nBytes;
    desc.Usage = D3D11_USAGE_DYNAMIC;
    desc.CPUAccessFlags = D3D11_CPU_ACCESS_WRITE;

    D3D11_SUBRESOURCE_DATA initData;
    initData.pSysMem = pInitData;
    initData.SysMemPitch = 0;
    initData.SysMemSlicePitch = 0;

    // A descriptor whose pSysMem is NULL is not valid input; pass no descriptor
    // at all when the caller has no initial data.
    const D3D11_SUBRESOURCE_DATA* pInit = (pInitData != NULL) ? &initData : NULL;

    return pDevice->lpVtbl->CreateBuffer(pDevice, &desc, pInit, ppBufferOut) >= 0;
}
/**
 * Compiles a compute shader from an HLSL source file (profile cs_5_0) and
 * creates the corresponding shader object.
 *
 * @param pSrcFile       path of the HLSL source file
 * @param pFunctionName  name of the shader entry point
 * @param pDevice        device used to create the shader object
 * @param ppShaderOut    receives the shader; set to NULL on any failure
 * @return true when both compilation and shader creation succeed
 *
 * On a compilation failure the compiler's message is sent to the debugger
 * output and also written to stderr so it is visible in a console run.
 */
static bool CreateComputeShader(LPCWSTR pSrcFile, LPCSTR pFunctionName,
    ID3D11Device* pDevice, ID3D11ComputeShader** ppShaderOut)
{
    *ppShaderOut = NULL;

    // D3DCOMPILE_DEBUG embeds debug information in the shader. It improves the
    // shader debugging experience but still allows the shader to be optimized
    // and to run the way it will run in the release configuration.
    const uint32_t dwShaderFlags = D3DCOMPILE_ENABLE_STRICTNESS | D3DCOMPILE_DEBUG;

    const D3D_SHADER_MACRO defines[] =
    {
        { "USE_STRUCTURED_BUFFERS", "1" },
        { NULL, NULL }
    };

    ID3DBlob* pErrorBlob = NULL;
    ID3DBlob* pCodeBlob = NULL;
    bool result = false;

    if (D3DCompileFromFile(pSrcFile, defines, NULL, pFunctionName, "cs_5_0", dwShaderFlags, 0,
        &pCodeBlob, &pErrorBlob) < 0 || pCodeBlob == NULL)
    {
        if (pErrorBlob != NULL)
        {
            const char* pMessage = (const char*)pErrorBlob->lpVtbl->GetBufferPointer(pErrorBlob);
            OutputDebugStringA(pMessage);
            fputs(pMessage, stderr);
        }
    }
    else
    {
        // Same sign-based HRESULT check as every other helper in this file.
        result = pDevice->lpVtbl->CreateComputeShader(pDevice,
            pCodeBlob->lpVtbl->GetBufferPointer(pCodeBlob),
            pCodeBlob->lpVtbl->GetBufferSize(pCodeBlob), NULL, ppShaderOut) >= 0;
    }

    // Single cleanup path for both blobs.
    if (pErrorBlob != NULL)
        pErrorBlob->lpVtbl->Release(pErrorBlob);
    if (pCodeBlob != NULL)
        pCodeBlob->lpVtbl->Release(pCodeBlob);

    return result;
}
/**
 * Creates a shader resource view over a structured buffer by calling
 * ID3D11Device::CreateShaderResourceView().
 *
 * The view covers the whole buffer; the element count is derived from the
 * buffer's ByteWidth and StructureByteStride.
 *
 * @param pDevice   device used to create the view
 * @param pBuffer   buffer to view; must be a structured buffer (non-zero stride)
 * @param ppSRVOut  receives the view; set to NULL on failure
 * @return true on success; false when the buffer has no structure stride or
 *         the view cannot be created
 */
static bool CreateBufferSRV(ID3D11Device* pDevice, ID3D11Buffer* pBuffer, ID3D11ShaderResourceView** ppSRVOut)
{
    *ppSRVOut = NULL;

    D3D11_BUFFER_DESC descBuf;
    ZeroMemory(&descBuf, sizeof(descBuf));
    pBuffer->lpVtbl->GetDesc(pBuffer, &descBuf);

    // A zero stride means this is not a structured buffer; bail out instead of
    // dividing by zero below.
    if (descBuf.StructureByteStride == 0)
        return false;

    D3D11_SHADER_RESOURCE_VIEW_DESC desc;
    ZeroMemory(&desc, sizeof(desc));
    desc.ViewDimension = D3D11_SRV_DIMENSION_BUFFEREX;
    desc.BufferEx.FirstElement = 0;
    // Structured buffer views use DXGI_FORMAT_UNKNOWN.
    desc.Format = DXGI_FORMAT_UNKNOWN;
    desc.BufferEx.NumElements = descBuf.ByteWidth / descBuf.StructureByteStride;

    return pDevice->lpVtbl->CreateShaderResourceView(pDevice, (ID3D11Resource*)pBuffer, &desc, ppSRVOut) >= 0;
}
/**
 * Creates an unordered access view over a structured buffer.
 *
 * The view covers the whole buffer; the element count is derived from the
 * buffer's ByteWidth and StructureByteStride.
 *
 * @param pDevice   device used to create the view
 * @param pBuffer   buffer to view; must be a structured buffer (non-zero stride)
 * @param ppUAVOut  receives the view; set to NULL on failure
 * @return true on success; false when the buffer has no structure stride or
 *         the view cannot be created
 */
static bool CreateBufferUAV(ID3D11Device* pDevice, ID3D11Buffer* pBuffer, ID3D11UnorderedAccessView** ppUAVOut)
{
    *ppUAVOut = NULL;

    D3D11_BUFFER_DESC descBuf;
    ZeroMemory(&descBuf, sizeof(descBuf));
    pBuffer->lpVtbl->GetDesc(pBuffer, &descBuf);

    // A zero stride means this is not a structured buffer; bail out instead of
    // dividing by zero below.
    if (descBuf.StructureByteStride == 0)
        return false;

    D3D11_UNORDERED_ACCESS_VIEW_DESC desc;
    ZeroMemory(&desc, sizeof(desc));
    desc.ViewDimension = D3D11_UAV_DIMENSION_BUFFER;
    desc.Buffer.FirstElement = 0;
    // Structured buffer views use DXGI_FORMAT_UNKNOWN.
    desc.Format = DXGI_FORMAT_UNKNOWN;
    desc.Buffer.NumElements = descBuf.ByteWidth / descBuf.StructureByteStride;

    return pDevice->lpVtbl->CreateUnorderedAccessView(pDevice, (ID3D11Resource*)pBuffer, &desc, ppUAVOut) >= 0;
}
/**
 * Binds a compute shader with its views, dispatches it, then unbinds everything.
 *
 * @param pImmediateContext     context the commands are issued on
 * @param pComputeShader        shader to run
 * @param nSRVs                 number of entries in pShaderResourceViews (bound from slot t0)
 * @param nUAVs                 number of entries in pUnorderedViews (bound from slot u0)
 * @param pShaderResourceViews  shader resource views to bind
 * @param pUnorderedViews       unordered access views to bind
 * @param X, Y, Z               thread-group counts passed to Dispatch in each dimension
 *
 * After the dispatch the shader, every SRV/UAV slot bound here and constant
 * buffer slot 0 are reset to NULL.
 */
static void RunComputeShader(ID3D11DeviceContext* pImmediateContext, ID3D11ComputeShader* pComputeShader,
    uint32_t nSRVs, uint32_t nUAVs, ID3D11ShaderResourceView* pShaderResourceViews[],
    ID3D11UnorderedAccessView* pUnorderedViews[], uint32_t X, uint32_t Y, uint32_t Z)
{
    pImmediateContext->lpVtbl->CSSetShader(pImmediateContext, pComputeShader, NULL, 0);
    pImmediateContext->lpVtbl->CSSetShaderResources(pImmediateContext, 0, nSRVs, pShaderResourceViews);
    pImmediateContext->lpVtbl->CSSetUnorderedAccessViews(pImmediateContext, 0, nUAVs, pUnorderedViews, NULL);

    // Use the caller-supplied group counts rather than a fixed NUM_ELEMENTS x 1 x 1.
    pImmediateContext->lpVtbl->Dispatch(pImmediateContext, X, Y, Z);

    // Unbind the shader, then exactly the slots that were bound above
    // (one slot per call so no fixed-size NULL array is needed).
    pImmediateContext->lpVtbl->CSSetShader(pImmediateContext, NULL, NULL, 0);

    ID3D11UnorderedAccessView* pNullUAV = NULL;
    for (uint32_t slot = 0; slot < nUAVs; ++slot)
        pImmediateContext->lpVtbl->CSSetUnorderedAccessViews(pImmediateContext, slot, 1, &pNullUAV, NULL);

    ID3D11ShaderResourceView* pNullSRV = NULL;
    for (uint32_t slot = 0; slot < nSRVs; ++slot)
        pImmediateContext->lpVtbl->CSSetShaderResources(pImmediateContext, slot, 1, &pNullSRV);

    ID3D11Buffer* pNullCB = NULL;
    pImmediateContext->lpVtbl->CSSetConstantBuffers(pImmediateContext, 0, 1, &pNullCB);
}
/**
 * Creates a CPU-readable staging buffer with the same description as pBuffer
 * and copies pBuffer's contents into it.
 *
 * @param pDevice               device used to create the staging buffer
 * @param pd3dImmediateContext  context used to issue the copy
 * @param pBuffer               source buffer
 * @return the staging buffer (the caller releases it), or the NULL-initialized
 *         pointer as left by CreateBuffer when creation fails
 */
static ID3D11Buffer* CreateAndCopyToDebugBuf(ID3D11Device* pDevice, ID3D11DeviceContext* pd3dImmediateContext,
    ID3D11Buffer* pBuffer)
{
    // Start from the source description, then turn it into a staging description.
    D3D11_BUFFER_DESC stagingDesc;
    ZeroMemory(&stagingDesc, sizeof(stagingDesc));
    pBuffer->lpVtbl->GetDesc(pBuffer, &stagingDesc);
    stagingDesc.Usage = D3D11_USAGE_STAGING;
    stagingDesc.CPUAccessFlags = D3D11_CPU_ACCESS_READ;
    stagingDesc.BindFlags = 0;
    stagingDesc.MiscFlags = 0;

    ID3D11Buffer* pStaging = NULL;
    if (pDevice->lpVtbl->CreateBuffer(pDevice, &stagingDesc, NULL, &pStaging) < 0)
        return pStaging;

    pd3dImmediateContext->lpVtbl->CopyResource(pd3dImmediateContext, (ID3D11Resource*)pStaging,
        (ID3D11Resource*)pBuffer);
    return pStaging;
}
int main(void)
{
_CrtSetDbgFlag(_CRTDBG_ALLOC_MEM_DF | _CRTDBG_LEAK_CHECK_DF);
ID3D11Device *device = NULL;
ID3D11DeviceContext *context = NULL;
ID3D11ComputeShader *computeShader = NULL;
//各个Buffer指针变量
ID3D11Buffer *srcBuffer0 = NULL;
ID3D11Buffer *srcBuffer1 = NULL;
ID3D11Buffer *resultBuffer = NULL;
ID3D11Buffer *srcDstBuffer = NULL;
ID3D11Buffer *constBuffer = NULL;
//读写上面buffer的ID3D11ShaderResourceView和UnorderedAccessView接口
ID3D11ShaderResourceView *srcBuf0SRV = NULL;
ID3D11ShaderResourceView *srcBuf1SRV = NULL;
ID3D11UnorderedAccessView *resBufUAV = NULL;
ID3D11UnorderedAccessView *srcdstBufUAV = NULL;
int localBuffer[NUM_ELEMENTS];
for (int i = 0; i < NUM_ELEMENTS; i++)
localBuffer[i] = i + 1;
do
{
if (!CreateComputeDevice(&device, &context))
{
puts("CreateComputeDevice failed!");
break;
}
if (!CreateComputeShader(L"compute.hlsl", "CSMain", device, &computeShader))
{
puts("CreateComputeShader failed!");
break;
}
//初始化计算数据
for (int i = 0; ilpVtbl->CSSetConstantBuffers(context, 0, 1, &constBuffer);
//为buffer创建相应的shader resource view与unordered access view
if (!CreateBufferSRV(device, srcBuffer0, &srcBuf0SRV))
{
puts("create srcBuf0SRV failed");
break;
}
if (!CreateBufferSRV(device, srcBuffer1, &srcBuf1SRV))
{
puts("create srcBuf1SRV failed");
break;
}
if (!CreateBufferUAV(device, resultBuffer, &resBufUAV))
{
puts("create resBufUAV failed");
break;
}
if (!CreateBufferUAV(device, srcDstBuffer, &srcdstBufUAV))
{
puts("create srcdstBufUAV failed!");
break;
}
ID3D11ShaderResourceView* shaderResourceViews[] = { srcBuf0SRV, srcBuf1SRV };
ID3D11UnorderedAccessView* unorderedAccessViews[] = { resBufUAV, srcdstBufUAV };
//运行Shader Compute程序
RunComputeShader(context, computeShader, _countof(shaderResourceViews), _countof(unorderedAccessViews),
shaderResourceViews, unorderedAccessViews, NUM_ELEMENTS, 1, 1);
//将GPU计算的结果写回CPU
ID3D11Buffer* debugBuf = NULL;
// 先查看resultBuffer中的内容
debugBuf = CreateAndCopyToDebugBuf(device, context, resultBuffer);
if (debugBuf == NULL)
{
puts("debugBuf create failed!");
break;
}
D3D11_MAPPED_SUBRESOURCE mappedResource;
context->lpVtbl->Map(context, (ID3D11Resource*)debugBuf, 0, D3D11_MAP_READ, 0, &mappedResource);
struct BufType *p = mappedResource.pData;
puts("Output GPU resultBuffer results, first ten:");
for (int i = 0; i < 10; i++)
printf("i: %d, f: %.1f\n", p[i].i, p[i].f);
puts("last ten:");
for(int i = NUM_ELEMENTS - 10; i < NUM_ELEMENTS; i++)
printf("i: %d, f: %.1f\n", p[i].i, p[i].f);
context->lpVtbl->Unmap(context, (ID3D11Resource*)debugBuf, 0);
debugBuf->lpVtbl->Release(debugBuf);
// 再查看srcdstBuffer中的内容
debugBuf = CreateAndCopyToDebugBuf(device, context, srcDstBuffer);
if (debugBuf == NULL)
{
puts("debugBuf create failed!");
break;
}
context->lpVtbl->Map(context, (ID3D11Resource*)debugBuf, 0, D3D11_MAP_READ, 0, &mappedResource);
int *q = mappedResource.pData;
puts("Output GPU srcDstBuffer results, first ten:");
for (int i = 0; i < 10; i++)
printf("[%d] = %d\n", i, q[i]);
puts("last ten:");
for (int i = NUM_ELEMENTS - 10; i < NUM_ELEMENTS; i++)
printf("[%d] = %d\n", i, q[i]);
context->lpVtbl->Unmap(context, (ID3D11Resource*)debugBuf, 0);
debugBuf->lpVtbl->Release(debugBuf);
}
while (false);
//释放资源
if (srcBuf0SRV != NULL)
srcBuf0SRV->lpVtbl->Release(srcBuf0SRV);
if (srcBuf1SRV != NULL)
srcBuf1SRV->lpVtbl->Release(srcBuf1SRV);
if (resBufUAV != NULL)
resBufUAV->lpVtbl->Release(resBufUAV);
if (srcdstBufUAV != NULL)
srcdstBufUAV->lpVtbl->Release(srcdstBufUAV);
if (srcBuffer0 != NULL)
srcBuffer0->lpVtbl->Release(srcBuffer0);
if (srcBuffer1 != NULL)
srcBuffer1->lpVtbl->Release(srcBuffer1);
if (resultBuffer != NULL)
resultBuffer->lpVtbl->Release(resultBuffer);
if (srcDstBuffer != NULL)
srcDstBuffer->lpVtbl->Release(srcDstBuffer);
if (computeShader != NULL)
computeShader->lpVtbl->Release(computeShader);
if (context != NULL)
context->lpVtbl->Release(context);
if (device != NULL)
device->lpVtbl->Release(device);
puts("\nInput enter to exit...");
getchar();
}
完成之后,我们再创建一个名为compute.hlsl的shader文件,将它存放在与main.c相同的目录下。
// Compute shader program.

// Element layout of the structured buffers; mirrors struct BufType in main.c.
struct BufType
{
    int i;
    float f;
};

// Constant buffer bound by the host at slot b0.
cbuffer cbNeverChanges : register(b0)
{
    int cValue0;
    int cValue1;
};

// Read-only inputs, bound by the host as shader resource views (t0, t1).
StructuredBuffer<BufType> buffer0 : register(t0);
StructuredBuffer<BufType> buffer1 : register(t1);

// Read/write buffers, bound by the host as unordered access views (u0, u1).
// bufferOut holds BufType elements; srcdstBuffer holds plain ints.
RWStructuredBuffer<BufType> bufferOut : register(u0);
RWStructuredBuffer<int> srcdstBuffer : register(u1);
// Direct3D allows at most 1024 threads per thread group.
// Each thread handles the element at its dispatch-thread x index:
//   resValue        = (buffer0.i + buffer1.i) * (cValue1 / cValue0) - srcdstBuffer
//   bufferOut.i     = resValue
//   bufferOut.f     = (buffer0.f + buffer1.f) * (cValue1 / cValue0)
//   srcdstBuffer    = resValue
[numthreads(1024, 1, 1)]
void CSMain(uint3 groupID : SV_GroupID, uint3 tid : SV_DispatchThreadID,
            uint3 localTID : SV_GroupThreadID, uint gIdx : SV_GroupIndex)
{
    const int elem = tid.x;
    const int scale = cValue1 / cValue0;   // integer division

    const int sumI = buffer0[elem].i + buffer1[elem].i;
    const float sumF = buffer0[elem].f + buffer1[elem].f;

    // The previous srcdstBuffer value is read before it is overwritten below.
    const int resValue = sumI * scale - srcdstBuffer[elem];

    bufferOut[elem].i = resValue;
    bufferOut[elem].f = sumF * float(scale);
    srcdstBuffer[elem] = resValue;
}
我们在保存这两个文件的时候,可以在菜单栏File下面找到Advanced Save Options...,将Encoding改为Unicode(UTF-8 without Signature),这样我们就可以在所有操作系统以及语言环境上看到正常的中文汉字了。否则,如果系统不支持GBK或GB2312,就会导致汉字部分出现乱码。完成之后我们就可以编译运行了。