void MatrixMultiplyGPU(std::vector& vC,
const std::vector& vA,
const std::vector& vB, int M, int N, int W)
{
concurrency::array_view a(M, W, vA);
concurrency::array_view b(W, N, vB);
concurrency::array_view c(M, N, vC); c.discard_data();
concurrency::parallel_for_each(c.extent,
[=](concurrency::index<2> idx) restrict(amp) {
int row = idx[0]; int col = idx[1];
float sum = 0.0f;
for(int i = 0; i < W; i++)
sum += a(row, i) * b(i, col);
c[idx] = sum;
});
}
如果你已经装上了vs11的beta,那么ctrl+c/ctrl+v赶紧体验下吧:)好吧,在这个c++十分激进的年代(近期的c++11和这个AMP,喜欢研究的童鞋又可以虐待自己的脑细胞了),上面代码肯定让你有不少迷糊的地方,下面我根据个人经验跟大家分析下,分析的不好,不要扔鞋哦。
void MatrixMultiplyCPU(std::vector& vC,
const std::vector& vA,
const std::vector& vB, int M, int N, int W)
{
// 为和GPU预算的处理保持一致,采用这样的方式
concurrency::array_view a(M, W, vA);
concurrency::array_view b(W, N, vB);
concurrency::array_view c(M, N, vC); c.discard_data();
for (int i = 0; i < M; i++)
{
for (int j = 0; j < N; j++)
{
float sum = 0.0f;
for (int k = 0; k < W; k++)
{
sum += a(i,k) * b(k,j);
}
c(i,j) = sum;
}
}
}
测试主程序如下,从运行时间上进行了对比:
#include <amp.h>      // C++ AMP: concurrency::array_view, parallel_for_each
#include <windows.h>  // DWORD, GetTickCount
#include <cstdlib>    // rand, system
#include <iostream>
#include <vector>
// Forward declarations for the two multiply implementations defined above.
void MatrixMultiplyGPU(std::vector<float>& vC,
                       const std::vector<float>& vA,
                       const std::vector<float>& vB, int M, int N, int W);
void MatrixMultiplyCPU(std::vector<float>& vC,
                       const std::vector<float>& vA,
                       const std::vector<float>& vB, int M, int N, int W);
int main()
{
int M = 1000, N = 1000, W = 1000;
std::vector vec_rsltGPU(M * N);
std::vector vec_rsltCPU(M * N);
long ACount = M * W;
std::vector vec_A;
for (long i = 0; i < ACount; i++ )
{
vec_A.push_back((float)rand()/(float)(ACount*ACount));
}
int BCount = W * N;
std::vector vec_B;
for (long i = 0; i < BCount; i++ )
{
vec_B.push_back((float)rand() / (float)(ACount * ACount));
}
DWORD tStart1 = GetTickCount();
MatrixMultiplyGPU(vec_rsltGPU, vec_A, vec_B, M, N, W);
DWORD tEnd1 = GetTickCount();
DWORD tStart2 = GetTickCount();
MatrixMultiplyCPU(vec_rsltCPU, vec_A, vec_B, M, N, W);
DWORD tEnd2 = GetTickCount();
std::cout << "GPU time:\t" << tEnd1 - tStart1 << std::endl;
std::cout << "CPU time:\t" << tEnd2 - tStart2 << std::endl;
system("pause");
return 0;
}
再看line2,用api,一定要包含头文件,我们的AMP十分为大家着想,只需要这么简单一个头文件就行了。用过DX的童鞋一定还记得那include无尽的dxxxx.h和dxxxx.lib.
测试结果:
| 耗时 (ms) | M=1000, N=1000, W=1000 | M=1000, N=1000, W=100 | M=100, N=100, W=1000 | M=100, N=100, W=1000 | M=100, N=100, W=100 |
| --- | --- | --- | --- | --- | --- |
| GPU Time | 3276 | 484 | 500 | 293 | 234 |
| CPU Time | 358007 | 42401 | 38407 | 4026 | 452 |
以上测试大概取的几次作为平均,具有一定的统计意义。
CPU 的运算时间与处理的数据规模正相关;对于大规模数据处理,可以通过合理设计并行化方案来有效减少运算时间。
早上到公司试了下没有DX11支持的电脑,真是要了命,直接开了n个线程,导致出现这样的情况:
| 耗时 (ms) | M=100, N=100, W=1000(支持 DX11) | M=100, N=100, W=1000(不支持 DX11) |
| --- | --- | --- |
| GPU Time | 293 | 108888 |
| CPU Time | 4026 | 3167 |
通过本次测试,了解了如何运用 GPU 进行并行化计算,拟通过此方法实现一个应用:霍夫变换求平面参数的 GPU 实现【尚未实现】。
参考:http://www.cnblogs.com/baesky
http://www.infoq.com/cn/articles/cpp_amp_computing_on_GPU
http://www.parallellabs.com/2012/05/09/cplusplus-amp-programming/