1、C++Amp(GPU),
2、C++PPL(多线程16核),
3、SSE/AVX(单线程),
4、AVX-Db(单线程,双精度),
5、Serial(单线程串行)
对于不同阶数的矩阵乘法运算,运行时间统计如下(单位:秒),Rank表示矩阵阶数;Serial 在 Rank 大于 2548 时未做测试,表中记为 0:
Rank : | 16 | 32 | 64 | 128 | 256 | 512 | 1024 | 2048 | 2548 | 3048 | 3548 | 4048 |
---|---|---|---|---|---|---|---|---|---|---|---|---|
C++Amp: | 0.039954 | 0.000432 | 0.000460 | 0.000715 | 0.001594 | 0.005016 | 0.030988 | 0.426671 | 0.934444 | 2.24189 | 3.799203 | 3.069859 |
C++PPL : | 0.000215 | 0.000128 | 0.000076 | 0.000227 | 0.001685 | 0.014039 | 0.271222 | 4.000625 | 8.792314 | 15.768285 | 27.04492 | 42.068705 |
SSE/AVX: | 0.000005 | 0.000019 | 0.000087 | 0.000353 | 0.002055 | 0.01773 | 0.129564 | 2.449093 | 4.760192 | 7.474997 | 11.942498 | 17.434507 |
AVX-Db : | 0.000004 | 0.000022 | 0.000080 | 0.00051 | 0.00419 | 0.032692 | 0.547273 | 4.430325 | 8.523084 | 13.997932 | 22.1742 | 33.316449 |
Serial : | 0.000003 | 0.000025 | 0.000165 | 0.001467 | 0.013698 | 0.114402 | 0.939655 | 29.144466 | 70.799408 | 0 | 0 | 0 |
采用的部分模板类代码: “CLMatrix.h” 头文件.
测试代码:
//....其他头文件
#include
#include
#include
#include "CLMatrix.h"
using namespace concurrency;
// Benchmark driver: multiplies two rank x rank matrices with five different
// implementations and records the average time per multiplication.
//
// Per tested rank, one column is appended to `recd`:
//   { rank, C++ AMP, PPL parallel_for, matrixMul float (SSE on),
//     matrixMul double (SSE on), matrixMul float (SSE off) }
//
// Ranks tested: 16, 32, ..., 2048 (doubling), then 2548, 3048, 3548, 4048.
// Returns 0 on completion.
int main() {
    CLMatrixD recd;  // accumulated timing results, one column per rank
    int cyc = 5;     // repetitions per method; drops to 1 for large ranks

    for (int rank = 16; rank < 4100; rank = (rank < 2048) ? rank * 2 : rank + 500)
    {
        // Large matrices are expensive, so time a single multiplication only.
        // cyc is never reset; this is fine because rank only grows.
        if (rank > 1024) cyc = 1;

        // Storage uses the CLMatrixT class template; see "CLMatrix.h".
        // A/B/C are laid out as a single row of rank*rank elements so that
        // &X[0][0] can be handed to array_view as one rank x rank buffer.
        CLMatrixF A(1, rank * rank, CLMatrixF::initRand_F_0_1), B(1, rank * rank, CLMatrixF::initRand_F_0_1);
        CLMatrixF A1(rank, rank, CLMatrixF::initRand_F_0_1), B1(rank, rank, CLMatrixF::initRand_F_0_1);
        CLMatrixD A2(rank, rank, CLMatrixD::initRand_F_0_1), B2(rank, rank, CLMatrixD::initRand_F_0_1);
        CLMatrixF C(1, rank * rank), D(rank, rank), E(rank, rank), F(rank, rank);
        CLMatrixD G(rank, rank);

        CLTick tk12;  // high-precision timer
        // NOTE(review): getSpendTime() is presumably elapsed seconds since the
        // timer was created -- confirm against CLTick.
        auto tk0 = tk12.getSpendTime();

        // ---- 1. C++ AMP ----
        // NOTE(review): the very first AMP launch may include one-off runtime
        // start-up cost -- consider a warm-up run if that matters.
        for (int rep = 0; rep < cyc; rep++)
        {
            array_view<const float, 2> va(rank, rank, &A[0][0]);
            array_view<const float, 2> vb(rank, rank, &B[0][0]);
            array_view<float, 2> vc(rank, rank, &C[0][0]);
            vc.discard_data();  // output only
            parallel_for_each(vc.extent,
                [=](index<2> idx) restrict(amp) {
                    const unsigned int row = idx[0];
                    const unsigned int col = idx[1];
                    float r = 0;
                    for (int i = 0; i < rank; i++)
                    {
                        r += va[row][i] * vb[i][col];
                    }
                    vc[idx] = r;
                });
            vc.synchronize();  // include the copy back into C in the timing
        }
        auto tk1 = tk12.getSpendTime();

        // ---- 2. PPL: nested parallel_for over rows and columns ----
        for (int rep = 0; rep < cyc; rep++)
        {
            parallel_for(0, rank, [=, &A1, &B1, &D](int i) {
                // i is captured by value: it is a plain int and is not modified.
                parallel_for(0, rank, [=, &A1, &B1, &D](int j) {
                    float r = 0;
                    for (int t = 0; t < rank; t++) {
                        r += A1[i][t] * B1[t][j];
                    }
                    D[i][j] = r;
                });
            });
        }
        CLMatrix::setUseSSE(true);
        auto tk2 = tk12.getSpendTime();

        // ---- 3. matrixMul, float, SSE flag on ----
        for (int rep = 0; rep < cyc; rep++)
            ::matrixMul(A1, B1, E);
        auto tk3 = tk12.getSpendTime();

        // ---- 4. matrixMul, double, SSE flag on ----
        for (int rep = 0; rep < cyc; rep++)
            ::matrixMul(A2, B2, G);
        auto tk4 = tk12.getSpendTime();

        // ---- 5. matrixMul, float, SSE flag off ----
        // Skipped above rank 2548; the recorded time is then only the
        // interval between tk4 and tk5 with no multiplication in it.
        CLMatrix::setUseSSE(false);
        if (rank <= 2548)
            for (int rep = 0; rep < cyc; rep++)
                ::matrixMul(A1, B1, F);
        auto tk5 = tk12.getSpendTime();

        // Average time per multiplication for each method.
        recd.add_col(
            { static_cast<double>(rank),
              (tk1 - tk0) / cyc,
              (tk2 - tk1) / cyc,
              (tk3 - tk2) / cyc,
              (tk4 - tk3) / cyc,
              (tk5 - tk4) / cyc }
        );
        cout << "\nRank = " << rank << ", finish! ...";
    }

    recd.print("Time Spend Record")  // print the timing table
        .printMatrix("D:\\Documents\\Desktop\\TimeSpend.txt");  // save the timing table to disk
    return 0;  // report success to the OS (was 1, which signals failure)
}