pthread多线程加速示例(大型矩阵乘法):Blocking,1024线程^_^

使用分块方式,加速效果更为显著,以32*32=1024线程测试,比非Blocking方式(参考这篇文章)提升十几倍。

特别注意:线程数量过多也会消耗更多资源,并带来额外的时间开销。下述 param 参数结构体与前例不同,可按需要修改。

// Multi-Thread Speedup:Blocking Method
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "MatrixLib.h"
#pragma comment(lib,"MatrixLib.lib")
#pragma warning(disable:4996)

// Abort the program when a pthread-style call reports failure.
//
// str   - label of the failing step; callers pass string literals, hence const.
// value - return code of the call being checked; 0 means success.
// pflog - stream that receives the failure message; stderr is used when NULL.
//
// Returns normally only when value == 0; otherwise logs and calls exit(1).
void  checkResult(const char* str, int value, FILE* pflog)
{
	if (value == 0)
	{
		return;
	}
	// Never dereference a missing log stream on the error path.
	FILE* out = (pflog != NULL) ? pflog : stderr;
	fprintf(out, "Failed with %d at %s", value, str);
	exit(1);
}

// Work descriptor handed to one worker thread: the tile of R it must compute.
// Row/column ranges are half-open: [start, end).
typedef struct
{
	FILE* pflog;    // shared log stream, available for per-thread tracing
	double** R;     // result matrix
	double** A;     // left operand
	double** B;     // right operand, indexed as B[j][k] (i.e. stored transposed)
	int start_row;  // first row of the tile
	int end_row;    // one past the last row of the tile
	int start_col;  // first column of the tile
	int end_col;    // one past the last column of the tile
	int inner_dim;  // length of each dot product (N for N*N matrices)
} threadParm_t;

// Thread entry point: computes one tile of R = A * B.
//
// param - pointer to a threadParm_t that stays valid until the thread is joined.
//
// Each R[i][j] in the tile is assigned the full dot product over
// k = 0 .. inner_dim-1 of A[i][k] * B[j][k]. Every tile is written by exactly
// one thread, so no locking is needed and R does not have to be zeroed first.
// Always returns NULL.
void *oneThread(void *param)
{
	const threadParm_t *p = (const threadParm_t *)param;
	double** R = p->R;
	double** A = p->A;
	double** B = p->B;
	const int inner_dim = p->inner_dim;

	for (int i = p->start_row; i < p->end_row; ++i)
	{
		const double* a_row = A[i];
		double* r_row = R[i];
		for (int j = p->start_col; j < p->end_col; ++j)
		{
			const double* b_row = B[j];
			double acc = 0;
			// The sum must span the whole inner dimension, not just this tile's columns.
			for (int k = 0; k < inner_dim; ++k)
			{
				acc += a_row[k] * b_row[k];
			}
			r_row[j] = acc;
		}
	}

	return NULL;
}

// Runs one timed N*N matrix multiplication split into a C*C grid of tiles,
// one thread per tile, and writes progress and total time to pflog.
//
// N     - matrix dimension.
// C     - tiles per side; C*C threads are created.
// pflog - log stream.
//
// The measured time includes matrix allocation and release, as before.
void OneTry(const int N, const int C, FILE* pflog)
{
	const int CC = C*C;
	fprintf(pflog, "== %4d * %4d Matrix Multiply, %d Threads. ==\n", N, N, CC);
	if (N <= 0 || C <= 0)
	{
		// Nothing sensible to split; also avoids joining never-created threads.
		fprintf(pflog, "!! Invalid matrix size or thread grid\n");
		return;
	}

	clock_t start = clock();
	double** X = NewSquareMatrix(N);
	double** Y = NewSquareMatrix(N);
	double** Z = NewSquareMatrix(N);
	TransformSquareMat(Z, N); // transpose once (per the original note) so workers index Z as B[j][k]

	pthread_t* threads = new pthread_t[CC];
	threadParm_t* tparams = new threadParm_t[CC];
	int rc;
	for (int bi = 0; bi < C; ++bi)
	{
		// b*N/C boundaries tile [0, N) exactly, spreading any remainder of N / C.
		const int row_lo = (int)((long long)bi * N / C);
		const int row_hi = (int)((long long)(bi + 1) * N / C);
		for (int bj = 0; bj < C; ++bj)
		{
			const int k = bi*C + bj;
			threadParm_t* tp = &tparams[k];
			tp->pflog = pflog;
			tp->R = X;
			tp->A = Y;
			tp->B = Z;
			tp->start_row = row_lo;
			tp->end_row = row_hi;
			tp->start_col = (int)((long long)bj * N / C);
			tp->end_col = (int)((long long)(bj + 1) * N / C);
			tp->inner_dim = N;
			rc = pthread_create(&threads[k], NULL, oneThread, tp);
			checkResult("!! pthread_create()\n", rc, pflog);
			//fprintf(pflog, "**********  %4d of %4d threads created  **********\n", k + 1, CC);
		}
	}

	fprintf(pflog, "@ Waiting for worker threads' end...\n");
	// pthread_join stores a void*; an int slot would be overrun on 64-bit targets.
	void** status = new void*[CC];
	for (int t = 0; t < CC; ++t)
	{
		rc = pthread_join(threads[t], &status[t]);
		checkResult("!! pthread_join()\n", rc, pflog);
	}

	fprintf(pflog, "@ Check all thread's results\n");
	for (int t = 0; t < CC; ++t)
	{
		if (status[t] != NULL)
		{
			fprintf(pflog, "!! Unexpected thread status\n");
		}
	}

	delete[] status;
	delete[] tparams;
	delete[] threads;
	// Z is released right away, so its transpose is not undone.
	SafeDeleteSquareMat(X, N);
	SafeDeleteSquareMat(Y, N);
	SafeDeleteSquareMat(Z, N);
	clock_t finish = clock();
	fprintf(pflog, "@ All finished. Total time:%.8f(sec).\n\n", (finish - start) / (1.0*CLOCKS_PER_SEC));
}

// Program entry: appends a timestamped header to trace_log.txt, runs one timed
// 4096*4096 multiplication on a 32*32 thread grid, then waits for a key press.
// Returns 0 on success, 1 when the log file cannot be opened.
int main(int argc, char **argv)
{
	(void)argc;
	(void)argv;

	FILE* pflog = fopen("trace_log.txt", "a");
	if (pflog == NULL)
	{
		// Everything below writes to the log; bail out instead of passing NULL to fprintf.
		perror("trace_log.txt");
		return 1;
	}

	const int N = 4096, C = 32;
	printf("Matrix N=%d,Thread C=%d, now running...", N, C*C);
	fflush(stdout); // no trailing newline, so push the progress text out before the long run

	time_t rawtime;
	time(&rawtime);
	struct tm* tminfo = localtime(&rawtime);
	if (tminfo != NULL)
	{
		fprintf(pflog, "\nNEW LOG @%s", asctime(tminfo));
	}
	else
	{
		fprintf(pflog, "\nNEW LOG @(unknown time)\n");
	}

	OneTry(N, C, pflog);
	fflush(pflog);
	fclose(pflog);
	printf("finshed!\n");
	system("pause"); // keeps the console window open (Windows shell command)
	return 0;
}

日志节选

NEW LOG @Sun Apr 20 13:18:12 2014
== 4096 * 4096 Matrix Multiply, 1024 Threads. ==
**********     1 of 1024 threads created  **********
**********     2 of 1024 threads created  **********
**********     3 of 1024 threads created  **********
# Thread  '000033A8 00F91A80'  is now running.
# Thread  '000033A8 00F91B60'  is now running.
(以下省略...)
@ Check all thread's results
@ All finished. Total time:2.57800000(sec).

NEW LOG @Sun Apr 20 13:18:42 2014
== 4096 * 4096 Matrix Multiply, 256 Threads. ==
**********     1 of  256 threads created  **********
**********     2 of  256 threads created  **********
**********     3 of  256 threads created  **********
**********     4 of  256 threads created  **********
# Thread  '00003470 01001A80'  is now running.
# Thread  '00003470 01001B60'  is now running.
# Thread  '00003470 01003578'  is now running.
# Thread  '00003470 01003888'  is now running.
**********     5 of  256 threads created  **********
(以下省略...)
@ Check all thread's results
@ All finished. Total time:3.60900000(sec).

NEW LOG @Sun Apr 20 13:18:52 2014
== 4096 * 4096 Matrix Multiply, 64 Threads. ==
**********     1 of   64 threads created  **********
**********     2 of   64 threads created  **********
**********     3 of   64 threads created  **********
# Thread  '0000368C 009B1A80'  is now running.
**********     4 of   64 threads created  **********
# Thread  '0000368C 009B1B60'  is now running.
(以下省略...)
# Thread  '0000368C 009B3888'  is now running.
@ Check all thread's results
@ All finished. Total time:6.90600000(sec).

NEW LOG @Sun Apr 20 13:29:52 2014
== 4096 * 4096 Matrix Multiply, 1024 Threads. ==
@ Waiting for worker threads' end...
@ Check all thread's results
@ All finished. Total time:2.44600000(sec).


你可能感兴趣的:(Parallel,CUDA,etc.)