/*
 ============================================================================
 Name        : matrix_mul_block.c
 Author      : yin
 Version     :
 Copyright   : Copyright received yinhongliang
 Description : Serial vs. pthread-parallel, cache-blocked, SSE-assisted
               multiplication of two square float matrices.
 ============================================================================
 */

#include <stdio.h>
#include <stdlib.h>
#include <malloc.h>
#include <pthread.h>
#include <time.h>
#include <math.h>
#include <xmmintrin.h>

#define CLK_TCK 150000      // clock ticks per second assumed by the timing code (depends on the CPU frequency)
#define L1_SIZE 32768       // size of the L1 cache, in bytes
#define CACHE_LINE_SIZE 64  // number of bytes in one cache line

float *matrix_a, *matrix_b, *matrix_c1, *matrix_c2;
int n;                      // matrix size: number of elements per row
int thread_num;             // number of worker threads
float L1_rate = 1;          // fraction of the L1 cache a block may occupy
int global_index = 0;
pthread_mutex_t mutex1;
int local_index = 0;
FILE *fp;

// @param count size_t      number of float elements to allocate
// @param name const char*  buffer name used in the error message
// @return float*           zero-filled buffer of count floats
// Allocates one matrix buffer; prints a message and terminates the process
// when the allocation fails. calloc is used because it rejects a
// count * sizeof(float) product that would overflow size_t.
static float *alloc_matrix(size_t count, const char *name) {
	float *buffer = calloc(count, sizeof *buffer);

	if (buffer == NULL) {
		printf("Not enough memory to allocate buffer for %s\n", name);
		exit(1);
	}
	return buffer;
}

// @param n int matrix size (elements per row); must be positive
// @return void
// Opens the log file "log.txt" in append mode (stored in fp), allocates the
// four n*n matrices, fills matrix_a and then matrix_b with values
// (rand() % 10) / 10.0, and leaves matrix_c1 / matrix_c2 zero-filled.
// Any failure prints a message and terminates the process with status 1.
void init(int n) {
	const char *filename = "log.txt";
	size_t count;
	size_t idx;

	if (n <= 0) {
		printf("Illegal matrix size %d in init.\n", n);
		exit(1);
	}

	if ((fp = fopen(filename, "a+")) == NULL) {
		printf("cannot open this file %s in blocked_matrix_mul.\n ", filename);
		exit(1);
	}

	// Widen before multiplying so that n * n cannot overflow int.
	count = (size_t) n * (size_t) n;

	matrix_a = alloc_matrix(count, "matrix_a");
	matrix_b = alloc_matrix(count, "matrix_b");
	// The result matrices rely on calloc's zero fill (all-bits-zero is 0.0f
	// on the IEEE-754 targets that provide <xmmintrin.h>).
	matrix_c1 = alloc_matrix(count, "matrix_c1");
	matrix_c2 = alloc_matrix(count, "matrix_c2");

	// matrix_a is filled completely before matrix_b, in row-major order, so
	// the sequence of rand() calls determines both inputs.
	for (idx = 0; idx < count; idx++) {
		matrix_a[idx] = (float) ((rand() % 10) / 10.0);
	}
	for (idx = 0; idx < count; idx++) {
		matrix_b[idx] = (float) ((rand() % 10) / 10.0);
	}
}
L1_rate float 占用L1比率,1则为全部占用,0.5则为占用一半 // @return int int main(int argc, char* args[]) { clock_t start, end;//分别记录函数执行前后的时钟数 float cost;//函数执行时间 if (argc != 4) { thread_num = 1; n = 500; L1_rate = 1.0; } else if (argc == 4) { thread_num = atoi(args[1]); n = atoi(args[2]); L1_rate = atof(args[3]); } init(n); // 串行程序 start = clock(); serial(n); end = clock(); cost = (float) (end - start) / CLK_TCK; printf("The serial program cost time:T=%fs\n", cost); // 并行程序 start = clock(); parallel(thread_num, n, L1_rate); end = clock(); cost = (float) (end - start) / CLK_TCK; printf("The parallel program cost time:T=%fs\n", cost); compare(); uninit(); return 0; } //@param n 矩阵长度 //@return void //@discribe 串行计算两个大小为n的矩阵的乘积 void serial(int n) { int i, j, k; float temp = 0.0; for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { temp = 0; for (k = 0; k < n; k++) { //在结果矩阵中,逻辑坐标为(i,j)的元素 //是逻辑坐标为(i,k)和(k,j)的元素乘积之和 temp += matrix_a[i * n + k] * matrix_b[k * n + j]; } matrix_c1[i * n + j] = temp; } } } //@param matrix_left int* 乘法公式左边矩阵分块 //@param row_left int 乘法公式左边矩阵分块行数 //@param col_left int 乘法公式左边矩阵分块列数 //@param matrix_right int* 乘法公式右边矩阵分块 //@param row_right int 乘法公式右边矩阵分块行数 //@param col_right int 乘法公式右边矩阵分块列数 //@param matrix_res int* 结果矩阵分块 //@return void //@describe 两个矩阵分块相乘 void blocked_matrix_mul(float* matrix_left, int row_left, int col_left, float* matrix_right, int row_right, int col_right, float * matrix_res) { int i, j, k; float temp; float* array_left; float* array_right; float* array_res; if ((array_left = (float *) malloc(col_left * sizeof(float))) == NULL) { printf("Not enough memory to allocate buffer for array_left\n"); exit(1); } if ((array_right = (float *) malloc(col_left * sizeof(float))) == NULL) { printf("Not enough memory to allocate buffer for array_right\n"); exit(1); } if ((array_res = (float *) malloc(col_left * sizeof(float))) == NULL) { printf("Not enough memory to allocate buffer for array_res\n"); exit(1); } //两个矩阵相乘,必须要求左边矩阵的列数与右边矩阵的行数相等 if (col_left 
!= row_right) { printf("Illegal matrixes multiply in blocked_matrix_mul.\n"); return; } //串行计算两个矩阵的乘积 for (i = 0; i < row_left; i++) { for (j = 0; j < col_right; j++) { // fprintf(fp,"Compute array_res[%d][%d].\n",i,j); for (k = 0; k < col_left; k++) { //对应相乘的元素分别是matrix_left(i,k),即matrix_left[i*col_left+k], //matrix_right(k,j),即matrix_right[k*col_right+j] array_left[k] = matrix_left[i * col_left + k]; array_right[k] = matrix_right[k * col_right + j]; } temp = 0; ComputeArraySSE(array_left, array_right, array_res, col_left); for (k = 0; k < col_left; k++) { temp += array_res[k]; } matrix_res[i * col_right + j] = temp; } } } void *slave(void *ignored) { int i = 0, j = 0, k = 0, l = 0, m = 0; int block_row, block_col; //矩阵分块在原矩阵中的位置 int block_row_count; //矩阵分块的行数 int block_col_count; //矩阵分块的列数 int row_left, col_left;//乘法公式左边矩阵分块的行数和列数 int row_right, col_right;//乘法公式右边矩阵分块的行数和列数 int row_block_count; //同一列可见的矩阵分块的个数 int col_block_count; //同一行可见的矩阵分块的个数 int block_count;//总的矩阵分块的个数 int block_size; //矩阵分块存储的元素个数 float *block_left, *block_right, *block_res; //每个cache行能存储CACHE_LINE_SIZE/sizeof(int)个数, //所以将矩阵分块的列数定为CACHE_LINE_SIZE/sizeof(int)的整数倍, //为简单起见,假设默认建立的是方阵 //考虑到整除的问题,实际上矩阵分块的大小为矩阵分块的行数与列数之积 block_col_count = CACHE_LINE_SIZE / sizeof(int); for (i = 0; (block_col_count * block_col_count * i * i) <= (L1_SIZE * L1_rate / sizeof(int)); i++) { } block_col_count *= (i - 1); block_row_count = block_col_count; block_size = block_row_count * block_col_count; row_block_count = (n - 1) / block_col_count + 1;//同一列可见的矩阵分块的个数 col_block_count = (n - 1) / block_row_count + 1;//同一行可见的矩阵分块的个数 if ((block_left = (float *) malloc(block_size * sizeof(float))) == NULL) { printf("Not enough memory to allocate buffer for block_left\n"); exit(1); } if ((block_right = (float *) malloc(block_size * sizeof(float))) == NULL) { printf("Not enough memory to allocate buffer for block_right\n"); exit(1); } if ((block_res = (float *) malloc(block_size * sizeof(float))) == NULL) { printf("Not enough 
memory to allocate buffer for block_res\n"); exit(1); } block_count = row_block_count * col_block_count; do { pthread_mutex_lock(&mutex1); i = global_index; global_index++; pthread_mutex_unlock(&mutex1); //编号为i的矩阵分块在矩阵中的逻辑坐标为(i/block_col_count,i%block_col_count) if (i >= block_count) printf("Out of matrix!\nExecuting %dth element.\n", i); block_row = i / col_block_count; block_col = i % col_block_count; //若结果矩阵分块位于最后一行, //则乘法公式左边矩阵分块的行数是(n - 1) % block_row_count + 1, //否则是block_row_count if (block_row == row_block_count - 1) { row_left = (n - 1) % block_row_count + 1; } else { row_left = block_row_count; } //若结果矩阵分块位于最后一列, //则乘法公式右边矩阵分块的列数是(n - 1) % block_col_count + 1, //否则是block_col_count if (block_col == col_block_count - 1) { col_right = (n - 1) % block_col_count + 1; } else { col_right = block_col_count; } //初始化结果矩阵分块对应的结果矩阵,结果矩阵分块坐标位置为(block_row,block_col), //矩阵分块的大小为(row_left,col_right) //原理见下文 m = n * block_row * block_row_count + block_col_count * block_col;//指向矩阵分块在矩阵中的位置 for (k = 0; k < row_left; k++) { for (l = 0; l < col_right; l++) { matrix_c2[m + l] = 0; } m += n; } //编号为i的结果矩阵分块为坐标位置分别为(block_row,j) //和(j,block_col)的矩阵分块的乘积之和 //+a(block_row,j)*b(j,block_col) for (j = 0; j < col_block_count; j++) { //若乘法公式右边矩阵分块位于最后一行, //则其行数是(n - 1) % block_row_count + 1, //否则是block_row_count if (j == row_block_count - 1) { row_right = (n - 1) % block_row_count + 1; col_left = row_right; } else { row_right = block_row_count; col_left = row_right; } //初始化乘法公式左边的矩阵分块,已知矩阵分块坐标位置为(block_row,j), //矩阵分块的大小为(row_left,col_left), //第一个元素在原矩阵中的位置是n*block_row*block_row_count+block_col_count*j //在矩阵分块中与其同一行的元素可依次赋值 //下一行的第一个元素相对位移偏移量是n m = n * block_row * block_row_count + block_col_count * j;//指向矩阵分块在矩阵中的位置 for (k = 0; k < row_left; k++) { for (l = 0; l < col_left; l++) { block_left[k * col_left + l] = matrix_a[m + l]; } m += n; } //初始化乘法公式右边的矩阵分块,已知矩阵分块坐标位置为(j,block_col), //矩阵分块的大小为(row_right,col_right), //原理同上 m = n * j * block_row_count + block_col_count * 
block_col;//指向矩阵分块在矩阵中的位置 for (k = 0; k < row_right; k++) { for (l = 0; l < col_right; l++) { block_right[k * col_right + l] = matrix_b[m + l]; } m += n; } blocked_matrix_mul(block_left, row_left, col_left, block_right, row_right, col_right, block_res); //将结果矩阵分块保存到结果矩阵中,结果矩阵分块坐标位置为(block_row,block_col), //矩阵分块的大小为(row_left,col_right) //原理同上 m = n * block_row * block_row_count + block_col_count * block_col;//指向矩阵分块在矩阵中的位置 for (k = 0; k < row_left; k++) { for (l = 0; l < col_right; l++) { matrix_c2[m + l] += block_res[k * col_right + l]; } m += n; } } } while (global_index < block_count); free(block_left); free(block_right); free(block_res); } //@param int thread_num 线程数目 //@param int n 向量长度 //@return void //@discribe 使用thread_num个线程并行计算两个长度为n的矩阵乘积 void parallel(int thread_num, int n) { int i; pthread_t thread[thread_num]; pthread_mutex_init(&mutex1, NULL); for (i = 0; i < thread_num; i++) if (pthread_create(&thread[i], NULL, slave, NULL) != 0) perror("pthread_create fails"); for (i = 0; i < thread_num; i++) if (pthread_join(thread[i], NULL) != 0) perror("Pthread_join fails"); } //比较串行程序和并行程序运行结果 void compare() { int i; int index = 0; float total = 0.0; time_t now; now = time(NULL); printf("The number of elements in the matrix is %d.\n", n * n); fprintf(fp, "\n************************************************\n"); fprintf(fp, "\n************************************************\n"); fprintf(fp, "The debug time is %s.\n", ctime(&now)); for (i = 0; i < n * n; i++) { total += (matrix_c1[i]>matrix_c2[i]?(matrix_c1[i]-matrix_c2[i]):(matrix_c2[i]-matrix_c1[i])); if (matrix_c1[i] != matrix_c2[i]) { fprintf(fp,"matrix_c1[%d]=%f\tmatrix_c2[%d]=%f\n",i,matrix_c1[i],i,matrix_c2[i]); index++; } } printf("The number of wrong result is %d\n", index); printf("Check Result:%f\n", total); } //释放占用的空间 void uninit() { fclose(fp); free(matrix_a); free(matrix_b); free(matrix_c1); free(matrix_c2); } void ComputeArraySSE(float* pArray1, float* pArray2, float* pResult, int nSize) { int 
nLoop; int i = 0; int j = 0; __m128 m1, m2, m3; float* p1; float* p2; float* pr; p1 = pArray1; p2 = pArray2; pr = pResult; nLoop = nSize / 4; for (i = 0; i < nLoop && pr != NULL; i++) { m1 = _mm_loadu_ps(p1); m2 = _mm_loadu_ps(p2); m3 = _mm_mul_ps(m1, m2); _mm_storeu_ps(pr, m3); p1 += 4; p2 += 4; pr += 4; } j = nSize % 4; if (j != 0) { printf("The SSE request array size of times of 4.\n"); } }