设两个矩阵A和B,大小分别为M * N 和 N * P, 如果C = A * B, 则C的大小为M * P。
相应的代码表示如下:
大家最直观的想法可能是使用二维数组 int A[M][N],但是二维数组作为函数参数传递比较麻烦。
如果改用 new/malloc 逐行分配二维数组,由于这样的数组每行之间不一定连续存储,所以可能不会有较好的cache命中率。
因此这里用一维数组按行优先的方式存储矩阵。
// int A[M * N], B[N * P], C[M * P];
// Serial matrix product C(M,P) = A(M,N) * B(N,P).
// All three matrices are flat, row-major arrays; C is overwritten.
void func(int *A, int *B, int *C, int M, int P, int N) {
    for (int row = 0; row < M; ++row) {
        const int *aRow = A + row * N;  // row `row` of A
        int *cRow = C + row * P;        // row `row` of C
        for (int col = 0; col < P; ++col) {
            int &cell = cRow[col];
            cell = 0;
            // Dot product of A's row with B's column `col`.
            for (int k = 0; k < N; ++k) {
                cell += aRow[k] * B[k * P + col];
            }
        }
    }
}
这里先给一个可以运行的代码:
#include <iostream>
using namespace std;
// Serial matrix product C(M,P) = A(M,N) * B(N,P).
// All three matrices are flat, row-major arrays; C is overwritten.
void func(int *A, int *B, int *C, int M, int P, int N) {
    for (int row = 0; row < M; ++row) {
        const int *aRow = A + row * N;  // row `row` of A
        int *cRow = C + row * P;        // row `row` of C
        for (int col = 0; col < P; ++col) {
            int &cell = cRow[col];
            cell = 0;
            // Dot product of A's row with B's column `col`.
            for (int k = 0; k < N; ++k) {
                cell += aRow[k] * B[k * P + col];
            }
        }
    }
}
// Prints the M x N row-major matrix A to stdout: one row per line, every
// element followed by a space, and a blank line after the matrix.
void printM(int *A, int M, int N) {
    for (int row = 0; row < M; ++row) {
        const int *line = A + row * N;
        for (int col = 0; col < N; ++col) {
            std::cout << line[col] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
// Matrix dimensions for the demo: A is M x N, B is N x P, C is M x P.
const int M = 3;
const int N = 3;
const int P = 3;
// Flat row-major storage for the three matrices.
int A[M * N];
int B[N * P];
int C[M * P];

// Fills A and B with 0, 1, 2, ..., multiplies them serially and prints
// A, B and the product C.
int main() {
    for (int idx = 0; idx != M * N; ++idx) {
        A[idx] = idx;
    }
    for (int idx = 0; idx != N * P; ++idx) {
        B[idx] = idx;
    }
    func(A, B, C, M, P, N);
    printM(A, M, N);
    printM(B, N, P);
    printM(C, M, P);
    return 0;
}
从上面的算法中可以看出该算法的时间复杂度为O(MNP),当M,N,P都非常大时该计算将非常耗时。那么如何将上面的串行算法转换成并行算法呢?
这里选择的多线程库是 Windows 自带的多线程 API(#include <windows.h>)。
首先是要考虑如何并行:
从上面的三层循环中可以看出最外层的两个循环是相互独立的,即对 C(i,j) 的计算不依赖于任何其他元素 C(i',j') 的计算,因此我们可以据此给每个线程分配任务。
假设有 m 个线程,矩阵 C 的元素有 M*P 个,那么平均到每个线程需要计算的 C 元素就是 (M*P)/m 个(不能整除时,余下的元素交给最后一个线程)。所以我们先写一下每一个线程要执行的函数:
// Work description handed (by pointer) to each worker thread.
struct MYDATA {
    int begin;  // first flat index of C this thread computes (inclusive)
    int end;    // one past the last flat index of C this thread computes
    int *A;     // left operand, row-major, N columns
    int *B;     // right operand, row-major, P columns
    int *C;     // result, row-major, P columns
    int P;      // number of columns of B and C
    int N;      // shared inner dimension (columns of A, rows of B)
};
// Worker thread entry point.
//
// IpParam points to a MYDATA describing the job; it must stay valid until the
// thread has finished. The thread computes the elements of C whose flat
// row-major indices lie in [begin, end), for C(M,P) = A(M,N) * B(N,P).
// Always returns 0.
//
// WINAPI is required: CreateThread expects an LPTHREAD_START_ROUTINE, i.e.
// DWORD (WINAPI *)(LPVOID). Without it, the cast at the call site hides a
// calling-convention mismatch on 32-bit x86.
DWORD WINAPI ThreadProc(LPVOID IpParam) {
    const MYDATA *task = static_cast<const MYDATA *>(IpParam);
    const int *A = task->A;
    const int *B = task->B;
    int *C = task->C;
    const int begin = task->begin, end = task->end;
    const int P = task->P, N = task->N;

    for (int index = begin; index < end; ++index) {
        // Recover row and column from the flat index (index == i * P + j).
        const int i = index / P;
        const int j = index % P;
        // Accumulate locally so C is written once per element.
        int sum = 0;
        for (int k = 0; k < N; ++k) {
            sum += A[i * N + k] * B[k * P + j];
        }
        C[index] = sum;
    }
    return 0;
}
接下来就是设计创建线程和传入参数了:
// Use m threads. Each thread has its own handle and its own MYDATA
// (static so the job descriptions are not tied to the enclosing function's
// stack frame).
const int m = 4;
HANDLE hThread[m];
static MYDATA mydt[m];
// temp is the number of C elements each thread computes on average.
int temp = (M * P) / m;
for (int i = 0; i < m; ++i) {
    mydt[i].A = A, mydt[i].B = B, mydt[i].C = C;
    mydt[i].begin = i * temp, mydt[i].end = i * temp + temp, mydt[i].P = P, mydt[i].N = N;
    if (i == m - 1)  // the last thread also takes the remainder
        mydt[i].end = M * P;
    hThread[i] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) ThreadProc, &mydt[i], 0, NULL);
}
// Block until every worker has finished.
WaitForMultipleObjects(m, hThread, TRUE, INFINITE);
// Thread handles are kernel objects and must be released explicitly.
for (int i = 0; i < m; ++i) {
    if (hThread[i] != NULL)
        CloseHandle(hThread[i]);
}
这里给出完整代码:
#include <iostream>
#include <windows.h>
#include <ctime>
using namespace std;
// Work description handed (by pointer) to each worker thread.
struct MYDATA {
    int begin;  // first flat index of C this thread computes (inclusive)
    int end;    // one past the last flat index of C this thread computes
    int *A;     // left operand, row-major, N columns
    int *B;     // right operand, row-major, P columns
    int *C;     // result, row-major, P columns
    int P;      // number of columns of B and C
    int N;      // shared inner dimension (columns of A, rows of B)
};
// Worker thread entry point.
//
// IpParam points to a MYDATA describing the job; it must stay valid until the
// thread has finished. The thread computes the elements of C whose flat
// row-major indices lie in [begin, end), for C(M,P) = A(M,N) * B(N,P).
// Always returns 0.
//
// WINAPI is required: CreateThread expects an LPTHREAD_START_ROUTINE, i.e.
// DWORD (WINAPI *)(LPVOID). Without it, the cast at the call site hides a
// calling-convention mismatch on 32-bit x86.
DWORD WINAPI ThreadProc(LPVOID IpParam) {
    const MYDATA *task = static_cast<const MYDATA *>(IpParam);
    const int *A = task->A;
    const int *B = task->B;
    int *C = task->C;
    const int begin = task->begin, end = task->end;
    const int P = task->P, N = task->N;

    for (int index = begin; index < end; ++index) {
        // Recover row and column from the flat index (index == i * P + j).
        const int i = index / P;
        const int j = index % P;
        // Accumulate locally so C is written once per element.
        int sum = 0;
        for (int k = 0; k < N; ++k) {
            sum += A[i * N + k] * B[k * P + j];
        }
        C[index] = sum;
    }
    return 0;
}
// Serial reference implementation: C(M,P) = A(M,N) * B(N,P).
// Matrices are flat row-major arrays; every element of C is overwritten.
void func(int *A, int *B, int *C, int M, int P, int N) {
    for (int row = 0; row < M; ++row) {
        const int *aRow = A + row * N;  // row `row` of A
        int *cRow = C + row * P;        // row `row` of C
        for (int col = 0; col < P; ++col) {
            int &cell = cRow[col];
            cell = 0;
            // Dot product of A's row with B's column `col`.
            for (int k = 0; k < N; ++k) {
                cell += aRow[k] * B[k * P + col];
            }
        }
    }
}
// Prints the M x N row-major matrix A to stdout: one row per line, every
// element followed by a space, and a blank line after the matrix.
void printM(int *A, int M, int N) {
    for (int row = 0; row < M; ++row) {
        const int *line = A + row * N;
        for (int col = 0; col < N; ++col) {
            std::cout << line[col] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
const int M = 1000, N = 1000, P = 1000;
int A[M * N], B[N * P], C[M * P];
int main() {
clock_t startTime, endTime;
startTime = clock();//计时开始
for (int i = 0; i < M * N; i++) A[i] = i;
for (int i = 0; i < N * P; i++) B[i] = i;
// ----------------------------------- 多线程
const int m = 4;
HANDLE hThread[m];
static MYDATA mydt[m];
int temp = (M * P) / m;
for (int i = 0; i < m; ++i) {
mydt[i].A = A, mydt[i].B = B, mydt[i].C = C;
mydt[i].begin = i * temp, mydt[i].end = i * temp + temp, mydt[i].P = P, mydt[i].N = N;
if (i == m - 1) // 最后一个线程计算剩余的
mydt[i].end = M * P;
hThread[i] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) ThreadProc, &mydt[i], 0, NULL);
}
WaitForMultipleObjects(m, hThread, TRUE, INFINITE);
// ----------------------------------- 串行
// func(A, B, C, M, P, N);
// printM(A, M, N);
// printM(B, N, P);
// printM(C, M, P);
endTime = clock();//计时结束
cout << "use time: " << (double) (endTime - startTime) / CLOCKS_PER_SEC << endl;
return 0;
}
4个线程的用时为1.819s,1个线程的用时为7.219s(我的电脑是4核CPU)
由于内存中二维数组是以行优先进行存储的(我们的一维数组在内存布局上和二维数组没有区别),内层循环中 B[k * P + j] 每次迭代都要跨过 P 个元素,存在严重的cache命中率问题。
解决这个问题的方法是先将 B 转置(沿主对角线翻转)成 P×N 的矩阵 revB,使得最里面的计算变成连续访问 revB[j * N + k](当 N = P 时即 revB[j * P + k])。
优化后的完整代码如下:
#include <ctime>
#include <iostream>
#include <vector>
#include <windows.h>
using namespace std;
// Work description handed (by pointer) to each worker thread.
struct MYDATA {
    int begin;  // first flat index of C this thread computes (inclusive)
    int end;    // one past the last flat index of C this thread computes
    int *A;     // left operand, row-major, N columns
    int *B;     // right operand, row-major, P columns
    int *C;     // result, row-major, P columns
    int P;      // number of columns of B and C
    int N;      // shared inner dimension (columns of A, rows of B)
};
// Worker thread entry point (cache-friendly variant).
//
// IpParam points to a MYDATA describing the job; it must stay valid until the
// thread has finished. The thread computes the elements of C whose flat
// row-major indices lie in [begin, end), for C(M,P) = A(M,N) * B(N,P).
// Always returns 0.
//
// B is first transposed into a private P x N copy so the inner loop reads both
// operands sequentially. Every thread builds its own copy; with many threads
// the transpose should be done once by the caller instead.
//
// WINAPI is required: CreateThread expects an LPTHREAD_START_ROUTINE, i.e.
// DWORD (WINAPI *)(LPVOID). Without it, the cast at the call site hides a
// calling-convention mismatch on 32-bit x86.
DWORD WINAPI ThreadProc(LPVOID IpParam) {
    const MYDATA *task = static_cast<const MYDATA *>(IpParam);
    const int *A = task->A;
    const int *B = task->B;
    int *C = task->C;
    const int begin = task->begin, end = task->end;
    const int P = task->P, N = task->N;

    // Nothing to compute: skip the (expensive) transpose.
    if (begin >= end)
        return 0;

    // revB is B transposed: revB[j * N + k] == B[k * P + j].
    // The row stride of the P x N transpose is N, not P; using P is only
    // correct for square B and reads out of bounds otherwise.
    std::vector<int> revB(static_cast<std::size_t>(N) * static_cast<std::size_t>(P));
    for (int k = 0; k < N; ++k) {
        for (int j = 0; j < P; ++j) {
            revB[j * N + k] = B[k * P + j];
        }
    }

    for (int index = begin; index < end; ++index) {
        // Recover row and column from the flat index (index == i * P + j).
        const int i = index / P;
        const int j = index % P;
        const int *aRow = A + i * N;            // row i of A
        const int *bCol = revB.data() + j * N;  // column j of B, now contiguous
        // Accumulate locally so C is written once per element.
        int sum = 0;
        for (int k = 0; k < N; ++k) {
            sum += aRow[k] * bCol[k];
        }
        C[index] = sum;
    }
    return 0;
}
// Serial reference implementation: C(M,P) = A(M,N) * B(N,P).
// Matrices are flat row-major arrays; every element of C is overwritten.
void func(int *A, int *B, int *C, int M, int P, int N) {
    for (int row = 0; row < M; ++row) {
        const int *aRow = A + row * N;  // row `row` of A
        int *cRow = C + row * P;        // row `row` of C
        for (int col = 0; col < P; ++col) {
            int &cell = cRow[col];
            cell = 0;
            // Dot product of A's row with B's column `col`.
            for (int k = 0; k < N; ++k) {
                cell += aRow[k] * B[k * P + col];
            }
        }
    }
}
// Prints the M x N row-major matrix A to stdout: one row per line, every
// element followed by a space, and a blank line after the matrix.
void printM(int *A, int M, int N) {
    for (int row = 0; row < M; ++row) {
        const int *line = A + row * N;
        for (int col = 0; col < N; ++col) {
            std::cout << line[col] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}
const int M = 1000, N = 1000, P = 1000;
int A[M * N], B[N * P], C[M * P];
int main() {
clock_t startTime, endTime;
startTime = clock();//计时开始
for (int i = 0; i < M * N; i++) A[i] = i;
for (int i = 0; i < N * P; i++) B[i] = i;
// ----------------------------------- 多线程
const int m = 4;
HANDLE hThread[m];
static MYDATA mydt[m];
int temp = (M * P) / m;
for (int i = 0; i < m; ++i) {
mydt[i].A = A, mydt[i].B = B, mydt[i].C = C;
mydt[i].begin = i * temp, mydt[i].end = i * temp + temp, mydt[i].P = P, mydt[i].N = N;
if (i == m - 1) // 最后一个线程计算剩余的
mydt[i].end = M * P;
hThread[i] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) ThreadProc, &mydt[i], 0, NULL);
}
WaitForMultipleObjects(m, hThread, TRUE, INFINITE);
// ----------------------------------- 串行
// func(A, B, C, M, P, N);
// printM(A, M, N);
// printM(B, N, P);
// printM(C, M, P);
endTime = clock();//计时结束
cout << "use time: " << (double) (endTime - startTime) / CLOCKS_PER_SEC << endl;
return 0;
}
4个线程的用时为1.199s,可以看到效果还是很显著的。
参考文章: https://blog.csdn.net/realxie/article/details/7260072