使用分块方式,加速效果更为显著,以32*32=1024线程测试,比非Blocking方式(参考这篇文章)提升十几倍。
需要特别注意:线程数量过多也会消耗更多资源,并额外花费一些时间。下述 param 参数结构体与前例不同,可按需要修改。
// Multi-Thread Speedup:Blocking Method
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <pthread.h>
#include "MatrixLib.h"
#pragma comment(lib,"MatrixLib.lib")
#pragma warning(disable:4996)
// Terminate the program when a call reports failure through a non-zero code.
//
// str    Text identifying the failing step. It is printed verbatim after the
//        error code, so it should carry its own trailing newline. It is only
//        read, hence const (string literals may be passed directly).
// value  Return code to check; 0 means success and the function returns.
// pflog  Stream receiving the failure message. When NULL, the message goes to
//        stderr instead so the failure is never lost.
void checkResult(const char* str, int value, FILE* pflog)
{
    if (value == 0)
    {
        return;
    }
    FILE* out = (pflog != NULL) ? pflog : stderr;
    fprintf(out, "Failed with %d at %s", value, str);
    exit(1);
}
// Work item handed to one worker thread: the tile of R that the thread owns.
typedef struct
{
    FILE* pflog;      // trace log shared by all threads (not written by the worker)
    double** R;       // result matrix; the worker writes only R[i][j] inside its tile
    double** A;       // left operand, read as A[i][k]
    double** B;       // right operand, read as B[j][k] (row j is paired with row i of A)
    int start_row;    // first row of the tile, inclusive
    int end_row;      // one past the last row of the tile
    int start_col;    // first column of the tile, inclusive
    int end_col;      // one past the last column of the tile
    int dim;          // full matrix dimension N: the length of every dot product
} threadParm_t;

// Thread entry point: fills the tile [start_row, end_row) x [start_col, end_col) of R.
//
// Each element is the dot product of row i of A with row j of B taken over
// the full inner dimension [0, dim). Restricting k to the tile's column range
// would sum only a fraction of the terms and produce a wrong product.
//
// Every R[i][j] is written by exactly one thread and is assigned (not
// accumulated), so R does not have to be zero-initialised beforehand and no
// locking is required between workers.
//
// param  Pointer to a threadParm_t that stays valid until the thread is joined.
// Returns NULL always.
void *oneThread(void *param)
{
    const threadParm_t *p = (const threadParm_t *)param;
    double** R = p->R;
    double** A = p->A;
    double** B = p->B;
    const int dim = p->dim;

    for (int i = p->start_row; i < p->end_row; ++i)
    {
        const double* a_row = A[i];
        double* r_row = R[i];
        for (int j = p->start_col; j < p->end_col; ++j)
        {
            const double* b_row = B[j];
            double sum = 0;
            for (int k = 0; k < dim; ++k)
            {
                sum += a_row[k] * b_row[k];
            }
            r_row[j] = sum;
        }
    }
    return NULL;
}

// Run one timed N x N multiplication split into a C x C grid of tiles,
// one thread per tile, and write a trace to pflog.
//
// N      Matrix dimension; must be positive.
// C      Number of tiles per side (C*C threads are created); must be positive.
//        When N is not a multiple of C, the last tile in each direction
//        absorbs the remaining rows/columns.
// pflog  Open stream that receives the trace and the elapsed clock() time.
//
// The measured interval covers allocation, the multiplication and release of
// the matrices, as in the original benchmark.
void OneTry(const int N, const int C, FILE* pflog)
{
    if (N <= 0 || C <= 0)
    {
        // C == 0 would divide by zero below; nothing sensible can be run.
        fprintf(pflog, "!! Invalid arguments: N=%d, C=%d\n", N, C);
        return;
    }

    const int CC = C * C;
    fprintf(pflog, "== %4d * %4d Matrix Multiply, %d Threads. ==\n", N, N, CC);
    clock_t start = clock();

    double** X = NewSquareMatrix(N);   // result
    double** Y = NewSquareMatrix(N);   // left operand
    double** Z = NewSquareMatrix(N);   // right operand
    // Transposed once (per the original note) so the worker can walk rows of
    // both operands in its inner loop.
    TransformSquareMat(Z, N);

    const int tile = N / C;
    int rc;
    pthread_t* threads = new pthread_t[CC];
    threadParm_t* tparams = new threadParm_t[CC];

    for (int i = 0; i < C; ++i)
    {
        const int row_lo = i * tile;
        const int row_hi = (i == C - 1) ? N : row_lo + tile;
        for (int j = 0; j < C; ++j)
        {
            const int col_lo = j * tile;
            const int col_hi = (j == C - 1) ? N : col_lo + tile;
            const int k = i * C + j;

            tparams[k].pflog = pflog;
            tparams[k].R = X;
            tparams[k].A = Y;
            tparams[k].B = Z;
            tparams[k].start_row = row_lo;
            tparams[k].end_row = row_hi;
            tparams[k].start_col = col_lo;
            tparams[k].end_col = col_hi;
            tparams[k].dim = N;

            rc = pthread_create(&threads[k], NULL, oneThread, &tparams[k]);
            checkResult("!! pthread_create()\n", rc, pflog);
        }
    }

    fprintf(pflog, "@ Waiting for worker threads' end...\n");
    // pthread_join stores a void*; the slots must be pointer-sized, otherwise
    // the store overruns the array on platforms where pointers are wider than int.
    void** status = new void*[CC];
    for (int i = 0; i < CC; ++i)
    {
        rc = pthread_join(threads[i], &status[i]);
        checkResult("!! pthread_join()\n", rc, pflog);
    }

    fprintf(pflog, "@ Check all thread's results\n");
    for (int i = 0; i < CC; ++i)
    {
        if (status[i] != NULL)
        {
            fprintf(pflog, "!! Unexpected thread status\n");
        }
    }

    delete[] status;
    delete[] tparams;
    delete[] threads;
    SafeDeleteSquareMat(X, N);
    SafeDeleteSquareMat(Y, N);
    SafeDeleteSquareMat(Z, N);

    clock_t finish = clock();
    fprintf(pflog, "@ All finished. Total time:%.8f(sec).\n\n", (finish - start) / (1.0*CLOCKS_PER_SEC));
}
// Program entry: appends a timestamped header to trace_log.txt, runs one
// 4096 x 4096 benchmark on a 32 x 32 thread grid, then waits for a key press.
// Returns 0 on success, 1 when the log file cannot be opened.
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    FILE* pflog = fopen("trace_log.txt", "a");
    if (pflog == NULL)
    {
        // Every later step writes to the log; continuing would dereference NULL.
        perror("trace_log.txt");
        return 1;
    }

    const int N = 4096, C = 32;
    printf("Matrix N=%d,Thread C=%d, now running...", N, C*C);
    // The message has no newline; flush so it is visible during the long run.
    fflush(stdout);

    time_t rawtime;
    time(&rawtime);
    struct tm* tminfo = localtime(&rawtime);
    // localtime may return NULL; asctime must not be called with NULL.
    fprintf(pflog, "\nNEW LOG @%s", (tminfo != NULL) ? asctime(tminfo) : "(unknown time)\n");

    OneTry(N, C, pflog);

    fflush(pflog);
    fclose(pflog);
    printf("finshed!\n");
    // Keeps the console window open; "pause" is a Windows shell command.
    system("pause");
    return 0;
}
NEW LOG @Sun Apr 20 13:18:12 2014
== 4096 * 4096 Matrix Multiply, 1024 Threads. ==
********** 1 of 1024 threads created **********
********** 2 of 1024 threads created **********
********** 3 of 1024 threads created **********
# Thread '000033A8 00F91A80' is now running.
# Thread '000033A8 00F91B60' is now running.
(以下省略...)
@ Check all thread's results
@ All finished. Total time:2.57800000(sec).
NEW LOG @Sun Apr 20 13:18:42 2014
== 4096 * 4096 Matrix Multiply, 256 Threads. ==
********** 1 of 256 threads created **********
********** 2 of 256 threads created **********
********** 3 of 256 threads created **********
********** 4 of 256 threads created **********
# Thread '00003470 01001A80' is now running.
# Thread '00003470 01001B60' is now running.
# Thread '00003470 01003578' is now running.
# Thread '00003470 01003888' is now running.
********** 5 of 256 threads created **********
(以下省略...)
@ Check all thread's results
@ All finished. Total time:3.60900000(sec).
NEW LOG @Sun Apr 20 13:18:52 2014
== 4096 * 4096 Matrix Multiply, 64 Threads. ==
********** 1 of 64 threads created **********
********** 2 of 64 threads created **********
********** 3 of 64 threads created **********
# Thread '0000368C 009B1A80' is now running.
********** 4 of 64 threads created **********
# Thread '0000368C 009B1B60' is now running.
(以下省略...)
# Thread '0000368C 009B3888' is now running.
@ Check all thread's results
@ All finished. Total time:6.90600000(sec).
NEW LOG @Sun Apr 20 13:29:52 2014
== 4096 * 4096 Matrix Multiply, 1024 Threads. ==
@ Waiting for worker threads' end...
@ Check all thread's results
@ All finished. Total time:2.44600000(sec).