基于SSE的多线程矩阵乘积算法

/*
 ============================================================================
 Name        : matrix_mul_block.c
 Author      : yin
 Version     :
 Copyright   : Copyright received yinhongliang
 Description : SSE-based multithreaded blocked matrix multiplication
 ============================================================================
 */

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <pthread.h>
#include <time.h>
#include <math.h>
#include <xmmintrin.h>

/*
 * Divisor that converts a clock() tick difference into seconds.
 * clock() counts in units of CLOCKS_PER_SEC (from <time.h>), so the divisor
 * has to be that macro rather than a hand-picked constant.  Some older
 * <time.h> headers already define CLK_TCK, hence the #undef.
 */
#undef CLK_TCK
#define CLK_TCK CLOCKS_PER_SEC
#define L1_SIZE 32768  /* size of the L1 cache, in bytes */
#define CACHE_LINE_SIZE 64 /* number of bytes in one cache line */
/* matrix_a and matrix_b are the inputs; matrix_c1 receives the serial
 * product and matrix_c2 the multi-threaded product. */
float *matrix_a, *matrix_b, *matrix_c1, *matrix_c2;
int n; /* matrix dimension: number of elements per row */
int thread_num; /* number of worker threads */
float L1_rate = 1; /* fraction of the L1 cache a block may occupy */
int global_index = 0; /* next block index handed to a worker; guarded by mutex1 */
pthread_mutex_t mutex1;
int local_index = 0; /* not referenced by the code visible in this file */
FILE *fp; /* log file, opened by init() and closed by uninit() */

// Allocates one matrix of `count` floats, or terminates the program.
// @param count size_t       number of float elements to allocate
// @param name  const char*  matrix name used in the error message
// @return float* the newly allocated (uninitialised) buffer
static float *init_alloc_matrix(size_t count, const char *name) {
	float *buffer = NULL;

	/* Reject sizes whose byte count would wrap around size_t. */
	if (count <= (size_t) -1 / sizeof(float)) {
		buffer = malloc(count * sizeof(float));
	}
	if (buffer == NULL) {
		printf("Not enough memory to allocate buffer for %s\n", name);
		exit(1);
	}
	return buffer;
}

// @param n int matrix dimension (all matrices are n x n)
// @return void
// Opens the log file "log.txt" in append mode, allocates the four global
// matrices, fills matrix_a and matrix_b with pseudo-random values taken from
// {0.0, 0.1, ..., 0.9} and zeroes matrix_c1 and matrix_c2.
// Terminates the process with a non-zero status on any failure.
void init(int n) {
	const char *filename = "log.txt";
	size_t count;
	size_t idx;

	if (n <= 0) {
		printf("Matrix size must be positive (got %d)\n", n);
		exit(1);
	}
	/* Make sure n * n itself fits into size_t before multiplying. */
	if ((size_t) n > (size_t) -1 / (size_t) n) {
		printf("Matrix size %d is too large\n", n);
		exit(1);
	}
	count = (size_t) n * (size_t) n;

	if ((fp = fopen(filename, "a+")) == NULL) {
		printf("cannot open this file %s in blocked_matrix_mul.\n ", filename);
		/* Failing to open the log is an error, so do not report success. */
		exit(1);
	}

	matrix_a = init_alloc_matrix(count, "matrix_a");
	matrix_b = init_alloc_matrix(count, "matrix_b");
	matrix_c1 = init_alloc_matrix(count, "matrix_c1");
	matrix_c2 = init_alloc_matrix(count, "matrix_c2");

	/* Row-major fill: all of matrix_a first, then all of matrix_b, so the
	 * rand() sequence is consumed in the same order as a nested i/j loop. */
	for (idx = 0; idx < count; idx++) {
		matrix_a[idx] = (rand() % 10) / 10.0;
	}
	for (idx = 0; idx < count; idx++) {
		matrix_b[idx] = (rand() % 10) / 10.0;
	}
	for (idx = 0; idx < count; idx++) {
		matrix_c1[idx] = 0;
		matrix_c2[idx] = 0;
	}
}

/* Forward declarations: these functions are defined after main(), and
 * calling them without a visible prototype is not valid since C99. */
void serial(int n);
void parallel(int thread_num, int n);
void compare(void);
void uninit(void);

// Command line: <thread_num> <n> <L1_rate>
// @param thread_num	int   number of worker threads
// @param n 			int   matrix dimension
// @param L1_rate	 	float fraction of L1 to use: 1 means all of it, 0.5 half
// @return int 0 on success, 1 when an argument is out of range
// When the argument count is not exactly three, the defaults
// thread_num=1, n=500, L1_rate=1.0 are used.
int main(int argc, char* args[]) {
	clock_t start, end; /* clock() readings before and after each run */
	float cost; /* elapsed time in seconds */

	if (argc == 4) {
		thread_num = atoi(args[1]);
		n = atoi(args[2]);
		L1_rate = (float) atof(args[3]);
	} else {
		thread_num = 1;
		n = 500;
		L1_rate = 1.0;
	}

	/* atoi/atof yield 0 for garbage input; reject anything unusable.
	 * !(x > 0) also rejects NaN. */
	if (thread_num <= 0 || n <= 0 || !(L1_rate > 0)) {
		fprintf(stderr,
				"Invalid arguments: thread_num, n and L1_rate must all be positive\n");
		return 1;
	}
	init(n);

	/* Serial run. */
	start = clock();
	serial(n);
	end = clock();
	cost = (float) (end - start) / CLK_TCK;
	printf("The serial program cost time:T=%fs\n", cost);

	/* Parallel run.  parallel() takes two parameters; the workers read
	 * L1_rate from the global variable.
	 * NOTE(review): clock() reports processor time, which on common
	 * platforms is summed over all threads, so this figure may not reflect
	 * the wall-clock speed-up - confirm which one is wanted. */
	start = clock();
	parallel(thread_num, n);
	end = clock();
	cost = (float) (end - start) / CLK_TCK;
	printf("The parallel program cost time:T=%fs\n", cost);
	compare();
	uninit();
	return 0;
}
//@param n matrix dimension
//@return void
//@describe Single-threaded product of the two n x n matrices matrix_a and
//          matrix_b; the result is stored in matrix_c1.
void serial(int n) {
	int row, col, inner;

	for (row = 0; row < n; ++row) {
		const float *a_row = matrix_a + row * n;
		float *c_row = matrix_c1 + row * n;

		for (col = 0; col < n; ++col) {
			/* Element (row, col) of the result is the sum over `inner` of
			 * a(row, inner) * b(inner, col), accumulated in index order. */
			float acc = 0;

			inner = 0;
			while (inner < n) {
				acc += a_row[inner] * matrix_b[inner * n + col];
				++inner;
			}
			c_row[col] = acc;
		}
	}
}
/* Defined later in this file; declared here so the call below has a
 * visible prototype. */
void ComputeArraySSE(float* pArray1, float* pArray2, float* pResult, int nSize);

//@param matrix_left  float* left-hand block, row-major, row_left x col_left
//@param row_left     int    number of rows of the left-hand block
//@param col_left     int    number of columns of the left-hand block
//@param matrix_right float* right-hand block, row-major, row_right x col_right
//@param row_right    int    number of rows of the right-hand block
//@param col_right    int    number of columns of the right-hand block
//@param matrix_res   float* result block, row-major, row_left x col_right
//@return void
//@describe Multiplies two matrix blocks.  Every result element is overwritten
//          (not accumulated into).  If col_left != row_right a message is
//          printed and matrix_res is left untouched.
void blocked_matrix_mul(float* matrix_left, int row_left, int col_left,
		float* matrix_right, int row_right, int col_right, float * matrix_res) {
	int i, j, k;
	int padded_len;
	float temp;
	float* array_left;
	float* array_right;
	float* array_res;

	/* The product requires the left block's column count to equal the right
	 * block's row count.  Checked before allocating so nothing can leak. */
	if (col_left != row_right) {
		printf("Illegal matrixes multiply in blocked_matrix_mul.\n");
		return;
	}

	/* Empty inner dimension: every dot product is an empty sum. */
	if (col_left <= 0) {
		for (i = 0; i < row_left; i++) {
			for (j = 0; j < col_right; j++) {
				matrix_res[i * col_right + j] = 0;
			}
		}
		return;
	}

	/* The scratch vectors are rounded up to a multiple of four floats and the
	 * extra slots are zero-filled, so the SSE kernel always sees whole groups
	 * of four and the padding adds exactly 0 to each dot product. */
	padded_len = (col_left + 3) / 4 * 4;

	if ((array_left = (float *) malloc(padded_len * sizeof(float))) == NULL) {
		printf("Not enough memory to allocate buffer for array_left\n");
		exit(1);
	}
	if ((array_right = (float *) malloc(padded_len * sizeof(float))) == NULL) {
		printf("Not enough memory to allocate buffer for array_right\n");
		exit(1);
	}
	if ((array_res = (float *) malloc(padded_len * sizeof(float))) == NULL) {
		printf("Not enough memory to allocate buffer for array_res\n");
		exit(1);
	}
	for (k = col_left; k < padded_len; k++) {
		array_left[k] = 0.0f;
		array_right[k] = 0.0f;
	}

	for (i = 0; i < row_left; i++) {
		for (j = 0; j < col_right; j++) {
			/* Gather row i of the left block and column j of the right block
			 * into contiguous vectors: matrix_left(i,k) lives at
			 * matrix_left[i*col_left+k], matrix_right(k,j) at
			 * matrix_right[k*col_right+j]. */
			for (k = 0; k < col_left; k++) {
				array_left[k] = matrix_left[i * col_left + k];
				array_right[k] = matrix_right[k * col_right + j];
			}
			ComputeArraySSE(array_left, array_right, array_res, padded_len);

			/* Reduce the element-wise products to the dot product. */
			temp = 0;
			for (k = 0; k < col_left; k++) {
				temp += array_res[k];
			}
			matrix_res[i * col_right + j] = temp;
		}
	}

	/* The scratch vectors are private to this call; release them. */
	free(array_left);
	free(array_right);
	free(array_res);
}

//@param ignored void* unused thread argument
//@return void* always NULL
//@describe Worker thread body.  Repeatedly takes the next unprocessed result
//          block index from global_index (under mutex1), computes that block
//          of matrix_a * matrix_b with blocked_matrix_mul and stores it in
//          matrix_c2.  Returns once every block index has been handed out.
void *slave(void *ignored) {
	int i = 0, j = 0, k = 0, l = 0, m = 0;
	int block_row, block_col; /* position of the result block in the block grid */
	int block_row_count; /* rows in a full-sized block */
	int block_col_count; /* columns in a full-sized block */
	int row_left, col_left; /* rows/columns of the left-hand block */
	int row_right, col_right; /* rows/columns of the right-hand block */
	int row_block_count; /* number of blocks along one column of the grid */
	int col_block_count; /* number of blocks along one row of the grid */
	int block_count; /* total number of result blocks */
	int block_size; /* elements stored in a full-sized block */
	int scale; /* block side, as a multiple of one cache line of floats */
	float l1_elems; /* number of floats that fit in the usable part of L1 */
	float *block_left, *block_right, *block_res;

	(void) ignored;

	if (n <= 0) {
		return NULL; /* nothing to compute */
	}

	/* One cache line holds CACHE_LINE_SIZE/sizeof(float) elements, so the
	 * block side is a whole multiple of that.  Blocks are square.  Pick the
	 * largest multiple whose square still fits into L1_SIZE * L1_rate bytes,
	 * but never less than one cache line (keeps the side positive for tiny,
	 * negative or NaN L1_rate) and stop growing once one block already spans
	 * the whole matrix (keeps the arithmetic bounded for a huge L1_rate). */
	block_col_count = CACHE_LINE_SIZE / sizeof(float);
	l1_elems = L1_SIZE * L1_rate / sizeof(float);
	scale = 1;
	while (block_col_count * scale < n) {
		float next_side = (float) (block_col_count * (scale + 1));
		if (!(next_side * next_side <= l1_elems)) {
			break;
		}
		scale++;
	}
	block_col_count *= scale;
	block_row_count = block_col_count;
	block_size = block_row_count * block_col_count;
	row_block_count = (n - 1) / block_col_count + 1;
	col_block_count = (n - 1) / block_row_count + 1;
	if ((block_left = (float *) malloc(block_size * sizeof(float))) == NULL) {
		printf("Not enough memory to allocate buffer for block_left\n");
		exit(1);
	}
	if ((block_right = (float *) malloc(block_size * sizeof(float))) == NULL) {
		printf("Not enough memory to allocate buffer for block_right\n");
		exit(1);
	}
	if ((block_res = (float *) malloc(block_size * sizeof(float))) == NULL) {
		printf("Not enough memory to allocate buffer for block_res\n");
		exit(1);
	}
	block_count = row_block_count * col_block_count;

	for (;;) {
		/* Claim the next block index.  The bounds test uses the value read
		 * under the lock, so a worker can never process an index that is
		 * past the end even if another worker took the last block
		 * concurrently. */
		pthread_mutex_lock(&mutex1);
		i = global_index;
		if (i < block_count) {
			global_index++;
		}
		pthread_mutex_unlock(&mutex1);
		if (i >= block_count) {
			break;
		}

		/* Block number i sits at grid position
		 * (i / col_block_count, i % col_block_count). */
		block_row = i / col_block_count;
		block_col = i % col_block_count;

		/* A result block in the last grid row has (n - 1) % block_row_count + 1
		 * rows; every other block has block_row_count rows. */
		if (block_row == row_block_count - 1) {
			row_left = (n - 1) % block_row_count + 1;
		} else {
			row_left = block_row_count;
		}

		/* A result block in the last grid column has
		 * (n - 1) % block_col_count + 1 columns; every other block has
		 * block_col_count columns. */
		if (block_col == col_block_count - 1) {
			col_right = (n - 1) % block_col_count + 1;
		} else {
			col_right = block_col_count;
		}

		/* Zero the (row_left x col_right) region of matrix_c2 that this
		 * result block covers.  m is the offset of the current block row's
		 * first element; the next row is n elements further on. */
		m = n * block_row * block_row_count + block_col_count * block_col;
		for (k = 0; k < row_left; k++) {
			for (l = 0; l < col_right; l++) {
				matrix_c2[m + l] = 0;
			}
			m += n;
		}

		/* Result block (block_row, block_col) is the sum over j of
		 * a(block_row, j) * b(j, block_col). */
		for (j = 0; j < col_block_count; j++) {
			/* The right-hand block in the last grid row has
			 * (n - 1) % block_row_count + 1 rows, otherwise block_row_count;
			 * the left-hand block's column count is the same number. */
			if (j == row_block_count - 1) {
				row_right = (n - 1) % block_row_count + 1;
				col_left = row_right;
			} else {
				row_right = block_row_count;
				col_left = row_right;
			}

			/* Copy the left-hand block (block_row, j), of size
			 * (row_left x col_left), out of matrix_a.  Its first element is at
			 * n*block_row*block_row_count + block_col_count*j; elements of one
			 * row are contiguous and rows are n elements apart. */
			m = n * block_row * block_row_count + block_col_count * j;
			for (k = 0; k < row_left; k++) {
				for (l = 0; l < col_left; l++) {
					block_left[k * col_left + l] = matrix_a[m + l];
				}
				m += n;
			}

			/* Copy the right-hand block (j, block_col), of size
			 * (row_right x col_right), out of matrix_b in the same way. */
			m = n * j * block_row_count + block_col_count * block_col;
			for (k = 0; k < row_right; k++) {
				for (l = 0; l < col_right; l++) {
					block_right[k * col_right + l] = matrix_b[m + l];
				}
				m += n;
			}
			blocked_matrix_mul(block_left, row_left, col_left, block_right,
					row_right, col_right, block_res);

			/* Accumulate the partial product into the
			 * (row_left x col_right) region of matrix_c2. */
			m = n * block_row * block_row_count + block_col_count * block_col;
			for (k = 0; k < row_left; k++) {
				for (l = 0; l < col_right; l++) {
					matrix_c2[m + l] += block_res[k * col_right + l];
				}
				m += n;
			}
		}
	}
	free(block_left);
	free(block_right);
	free(block_res);
	return NULL;
}

//@param int thread_num number of worker threads to start
//@param int n 			matrix dimension (unused here; the workers read the global n)
//@return void
//@describe Starts up to thread_num threads running slave() and waits for all
//          of them to finish.  Threads that fail to start are reported on
//          stderr and skipped; only threads that were actually created are
//          joined.
void parallel(int thread_num, int n) {
	int i;
	int rc;
	int created = 0;
	pthread_t *workers;

	(void) n;

	if (thread_num <= 0) {
		fprintf(stderr, "parallel: thread_num must be positive (got %d)\n",
				thread_num);
		return;
	}
	workers = malloc((size_t) thread_num * sizeof *workers);
	if (workers == NULL) {
		fprintf(stderr, "Not enough memory to allocate buffer for threads\n");
		exit(1);
	}

	/* pthread functions return the error number instead of setting errno,
	 * so the code is reported directly rather than through perror(). */
	rc = pthread_mutex_init(&mutex1, NULL);
	if (rc != 0) {
		fprintf(stderr, "pthread_mutex_init fails: error %d\n", rc);
		free(workers);
		return;
	}

	for (i = 0; i < thread_num; i++) {
		rc = pthread_create(&workers[created], NULL, slave, NULL);
		if (rc != 0) {
			fprintf(stderr, "pthread_create fails: error %d\n", rc);
			continue;
		}
		created++;
	}

	/* Join only the handles that pthread_create actually filled in. */
	for (i = 0; i < created; i++) {
		rc = pthread_join(workers[i], NULL);
		if (rc != 0) {
			fprintf(stderr, "Pthread_join fails: error %d\n", rc);
		}
	}

	pthread_mutex_destroy(&mutex1);
	free(workers);
}

// Compares the serial result (matrix_c1) with the parallel result
// (matrix_c2).  Prints the element count, the number of mismatching elements
// and the sum of absolute differences to stdout, and appends a timestamped
// list of the mismatching elements to the log file fp.
//
// Two elements count as equal when they differ by no more than a small
// relative tolerance: the blocked algorithm adds the partial products in a
// different order than the serial loop, so the float results can differ by
// rounding error even when both computations are correct.
void compare() {
	const float rel_tol = 1e-4f; /* allowed difference relative to magnitude */
	int i;
	int index = 0; /* number of mismatching elements */
	float total = 0.0; /* sum of absolute differences */
	time_t now;

	now = time(NULL);
	printf("The number of elements in the matrix is %d.\n", n * n);
	fprintf(fp, "\n************************************************\n");
	fprintf(fp, "\n************************************************\n");
	fprintf(fp, "The debug time is %s.\n", ctime(&now));
	for (i = 0; i < n * n; i++) {
		float serial_val = matrix_c1[i];
		float parallel_val = matrix_c2[i];
		float diff = serial_val > parallel_val ? (serial_val - parallel_val)
				: (parallel_val - serial_val);
		float mag_serial = serial_val < 0 ? -serial_val : serial_val;
		float mag_parallel = parallel_val < 0 ? -parallel_val : parallel_val;
		float scale = mag_serial > mag_parallel ? mag_serial : mag_parallel;

		/* Use an absolute tolerance of rel_tol for values smaller than 1. */
		if (scale < 1.0f) {
			scale = 1.0f;
		}
		total += diff;
		/* Written as !(<=) so that a NaN difference is reported as wrong. */
		if (!(diff <= rel_tol * scale)) {
			fprintf(fp,"matrix_c1[%d]=%f\tmatrix_c2[%d]=%f\n",i,matrix_c1[i],i,matrix_c2[i]);
			index++;
		}
	}
	printf("The number of wrong result is %d\n", index);
	printf("Check Result:%f\n", total);
}

// Releases everything init() acquired: closes the log file and frees the four
// matrices.  Each handle is reset to NULL afterwards, so calling uninit()
// again is harmless and stale pointers cannot be reused by accident.
void uninit() {
	if (fp != NULL) {
		/* fclose flushes buffered log output; report a failed flush. */
		if (fclose(fp) != 0) {
			perror("fclose");
		}
		fp = NULL;
	}
	free(matrix_a);
	matrix_a = NULL;
	free(matrix_b);
	matrix_b = NULL;
	free(matrix_c1);
	matrix_c1 = NULL;
	free(matrix_c2);
	matrix_c2 = NULL;
}

// Element-wise product of two float arrays: pResult[i] = pArray1[i] * pArray2[i]
// for every i in [0, nSize).
// @param pArray1 float* first input array, at least nSize elements
// @param pArray2 float* second input array, at least nSize elements
// @param pResult float* output array, at least nSize elements
// @param nSize   int    number of elements to process
// @return void
// Whole groups of four elements are multiplied with SSE; the remaining
// nSize % 4 elements are multiplied one at a time, so any length is handled.
// Does nothing when a pointer is NULL or nSize is not positive.
void ComputeArraySSE(float* pArray1, float* pArray2, float* pResult, int nSize) {
	int i;
	int vec_end;

	if (pArray1 == NULL || pArray2 == NULL || pResult == NULL || nSize <= 0) {
		return;
	}

	/* Unaligned loads/stores are used, so the caller's buffers need no
	 * particular alignment. */
	vec_end = nSize - nSize % 4;
	for (i = 0; i < vec_end; i += 4) {
		__m128 lhs = _mm_loadu_ps(pArray1 + i);
		__m128 rhs = _mm_loadu_ps(pArray2 + i);
		_mm_storeu_ps(pResult + i, _mm_mul_ps(lhs, rhs));
	}

	/* Scalar tail for the last nSize % 4 elements. */
	for (; i < nSize; i++) {
		pResult[i] = pArray1[i] * pArray2[i];
	}
}

 

你可能感兴趣的:(多线程,thread,算法,J#,FP)