Windows下多线程编程 C/C++ —— 矩阵乘法的并行算法

一、串行算法

设两个矩阵A和B,大小分别为M * N 和 N * P, 如果C = A * B, 则C的大小为M * P。

相应的代码表示如下:

这里大家直观的想法可能是用 int A[M][N], 但是这样的话二维数组传参比较麻烦。

另外, 如果用 new/malloc 动态分配二维数组, 由于这样的数组每行之间不一定连续存储, cache 命中率可能较差。因此下面统一用一维数组按行优先存储矩阵, 即元素 (i, j) 对应下标 i * 列数 + j。

// int A[M * N], B[N * P], C[M * P];

// Serial matrix product over flat row-major arrays: C(M,P) = A(M,N) * B(N,P).
// A holds M*N ints, B holds N*P ints, and every one of the M*P elements of C
// is overwritten with the corresponding dot product.
void func(int *A, int *B, int *C, int M, int P, int N) {
    for (int row = 0; row < M; ++row) {
        int *aRow = A + row * N;  // first element of this row of A
        int *cRow = C + row * P;  // first element of this row of C
        for (int col = 0; col < P; ++col) {
            int &cell = cRow[col];
            cell = 0;
            // Dot product of row `row` of A with column `col` of B.
            for (int inner = 0; inner < N; ++inner) {
                cell += aRow[inner] * B[inner * P + col];
            }
        }
    }
}

这里先给一个可以运行的代码:

#include <iostream>

using namespace std;

// Serial matrix product over flat row-major arrays: C(M,P) = A(M,N) * B(N,P).
// A holds M*N ints, B holds N*P ints, and every one of the M*P elements of C
// is overwritten with the corresponding dot product.
void func(int *A, int *B, int *C, int M, int P, int N) {
    for (int row = 0; row < M; ++row) {
        int *aRow = A + row * N;  // first element of this row of A
        int *cRow = C + row * P;  // first element of this row of C
        for (int col = 0; col < P; ++col) {
            int &cell = cRow[col];
            cell = 0;
            // Dot product of row `row` of A with column `col` of B.
            for (int inner = 0; inner < N; ++inner) {
                cell += aRow[inner] * B[inner * P + col];
            }
        }
    }
}

// Prints the M-by-N matrix stored row-major in A to standard output: one line
// per row with a single space after every element, followed by one blank line.
void printM(int *A, int M, int N) {
    for (int row = 0; row < M; ++row) {
        const int *rowData = A + row * N;
        for (int col = 0; col < N; ++col) {
            std::cout << rowData[col] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

// Demo dimensions: C(M,P) = A(M,N) * B(N,P).
const int M = 3;
const int N = 3;
const int P = 3;

// Flat row-major storage for the two operands and the product.
int A[M * N];
int B[N * P];
int C[M * P];

// Fills A and B with 0, 1, 2, ... in storage order, multiplies them with func,
// and prints A, B and the product C.
int main() {
    for (int idx = 0; idx < M * N; ++idx) {
        A[idx] = idx;
    }
    for (int idx = 0; idx < N * P; ++idx) {
        B[idx] = idx;
    }

    func(A, B, C, M, P, N);

    printM(A, M, N);
    printM(B, N, P);
    printM(C, M, P);
    return 0;
}

运行结果如下:
Windows下多线程编程 C/C++ —— 矩阵乘法的并行算法_第1张图片

二、并行算法

从上面的算法中可以看出该算法的时间复杂度为O(MNP),当M,N,P都非常大时该计算将非常耗时。那么如何将上面的串行算法转换成并行算法呢?

这里选择的多线程库是 Windows 自带的多线程 API(#include <windows.h>)。作者刚学习 Windows 自带的多线程, 有任何问题欢迎指正。

首先是要考虑如何并行:

从上面的三层循环中可以看出最外层的两个循环是独立的, 即对C(i,j)的计算不依赖于任何其他元素C(i',j')的计算, 因此我们可以按C的元素来给每个线程分配任务。

假设有m个线程,矩阵C的元素有M*P个。那么平均到每个线程的计算C元素就是(M*P)/m个。所以我们先写一下每一个线程要执行的函数:

// Argument block handed to each worker thread (passed by pointer).
struct MYDATA {
    int begin;  // first flat index of C this worker computes (inclusive)
    int end;    // one past the last flat index of C this worker computes
    int *A;     // left operand, row-major
    int *B;     // right operand, row-major
    int *C;     // result matrix, row-major
    int P;      // number of columns of B and C
    int N;      // number of columns of A / rows of B
};

// Worker thread entry point.
//
// IpParam points to a MYDATA describing this worker's share of the product
// C = A * B (flat row-major arrays): the worker computes every element of C
// whose flat index lies in [begin, end). Always returns 0.
//
// The function is declared WINAPI so that its calling convention matches
// LPTHREAD_START_ROUTINE. Without it, the cast at the CreateThread call site
// hides a convention mismatch that corrupts the stack on 32-bit x86 builds.
DWORD WINAPI ThreadProc(LPVOID IpParam) {
    // Unpack the argument block.
    MYDATA *pmd = static_cast<MYDATA *>(IpParam);
    int *A = pmd->A, *B = pmd->B, *C = pmd->C;
    int begin = pmd->begin, end = pmd->end, P = pmd->P, N = pmd->N;

    // Compute the elements of C with flat indices in [begin, end).
    for (int index = begin; index < end; index++) {
        // Recover the row and column from the flat index.
        int i = index / P, j = index % P;
        C[i * P + j] = 0;
        for (int k = 0; k < N; ++k) {
            C[i * P + j] += A[i * N + k] * B[k * P + j];
        }
    }
    return 0;
}

接下来就是设计创建线程和传入参数了:

    // 假设m个线程,每个线程一个hThread和一个配套的mydt(static让变量的生命周期不局限于主函数内)
	const int m = 4;
    HANDLE hThread[m];
    static MYDATA mydt[m];
    
    // temp就是平均到每个线程的计算元素
    int temp = (M * P) / m;
    for (int i = 0; i < m; ++i) {
        mydt[i].A = A, mydt[i].B = B, mydt[i].C = C;
        mydt[i].begin = i * temp, mydt[i].end = i * temp + temp, mydt[i].P = P, mydt[i].N = N;
        if (i == m - 1) // 最后一个线程计算剩余的
            mydt[i].end = M * P;
        hThread[i] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) ThreadProc, &mydt[i], 0, NULL);
    }
    WaitForMultipleObjects(m, hThread, TRUE, INFINITE);

这里给出完整代码:

#include <iostream>
#include <ctime>
#include <windows.h>

using namespace std;

// Argument block handed to each worker thread (passed by pointer).
struct MYDATA {
    int begin;  // first flat index of C this worker computes (inclusive)
    int end;    // one past the last flat index of C this worker computes
    int *A;     // left operand, row-major
    int *B;     // right operand, row-major
    int *C;     // result matrix, row-major
    int P;      // number of columns of B and C
    int N;      // number of columns of A / rows of B
};

// Worker thread entry point.
//
// IpParam points to a MYDATA describing this worker's share of the product
// C = A * B (flat row-major arrays): the worker computes every element of C
// whose flat index lies in [begin, end). Always returns 0.
//
// The function is declared WINAPI so that its calling convention matches
// LPTHREAD_START_ROUTINE. Without it, the cast at the CreateThread call site
// hides a convention mismatch that corrupts the stack on 32-bit x86 builds.
DWORD WINAPI ThreadProc(LPVOID IpParam) {
    MYDATA *pmd = static_cast<MYDATA *>(IpParam);
    int *A = pmd->A, *B = pmd->B, *C = pmd->C;
    int begin = pmd->begin, end = pmd->end, P = pmd->P, N = pmd->N;
    for (int index = begin; index < end; index++) {
        // Recover the row and column from the flat index.
        int i = index / P, j = index % P;
        C[i * P + j] = 0;
        for (int k = 0; k < N; ++k) {
            C[i * P + j] += A[i * N + k] * B[k * P + j];
        }
    }
    return 0;
}

// Serial matrix product over flat row-major arrays: C(M,P) = A(M,N) * B(N,P).
// A holds M*N ints, B holds N*P ints, and every one of the M*P elements of C
// is overwritten with the corresponding dot product.
void func(int *A, int *B, int *C, int M, int P, int N) {
    for (int row = 0; row < M; ++row) {
        int *aRow = A + row * N;  // first element of this row of A
        int *cRow = C + row * P;  // first element of this row of C
        for (int col = 0; col < P; ++col) {
            int &cell = cRow[col];
            cell = 0;
            // Dot product of row `row` of A with column `col` of B.
            for (int inner = 0; inner < N; ++inner) {
                cell += aRow[inner] * B[inner * P + col];
            }
        }
    }
}

// Prints the M-by-N matrix stored row-major in A to standard output: one line
// per row with a single space after every element, followed by one blank line.
void printM(int *A, int M, int N) {
    for (int row = 0; row < M; ++row) {
        const int *rowData = A + row * N;
        for (int col = 0; col < N; ++col) {
            std::cout << rowData[col] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

const int M = 1000, N = 1000, P = 1000;
int A[M * N], B[N * P], C[M * P];

int main() {
    clock_t startTime, endTime;
    startTime = clock();//计时开始

    for (int i = 0; i < M * N; i++) A[i] = i;
    for (int i = 0; i < N * P; i++) B[i] = i;

    // ----------------------------------- 多线程
    const int m = 4;
    HANDLE hThread[m];
    static MYDATA mydt[m];
    int temp = (M * P) / m;
    for (int i = 0; i < m; ++i) {
        mydt[i].A = A, mydt[i].B = B, mydt[i].C = C;
        mydt[i].begin = i * temp, mydt[i].end = i * temp + temp, mydt[i].P = P, mydt[i].N = N;
        if (i == m - 1) // 最后一个线程计算剩余的
            mydt[i].end = M * P;
        hThread[i] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) ThreadProc, &mydt[i], 0, NULL);
    }
    WaitForMultipleObjects(m, hThread, TRUE, INFINITE);

    // ----------------------------------- 串行
//    func(A, B, C, M, P, N);
//    printM(A, M, N);
//    printM(B, N, P);
//    printM(C, M, P);

    endTime = clock();//计时结束
    cout << "use time: " << (double) (endTime - startTime) / CLOCKS_PER_SEC << endl;
    return 0;
}

4个线程的用时为1.819s,1个线程的用时为7.219s(我的电脑是4核CPU)

优化

由于内存中二维数组是以行优先进行存储的(我们的一维数组在内存布局上和二维数组没有区别), 最内层循环中的 B[k * P + j] 每次迭代都要跨过一整行, cache 命中率很差。解决这个问题的方法是先将B沿对角线翻转(即转置)得到 revB, 使最内层循环按行连续访问, 在本例 N = P 的情况下即 revB[j * P + k]。

优化后的完整代码如下:

#include <iostream>
#include <ctime>
#include <windows.h>

using namespace std;

// Argument block handed to each worker thread (passed by pointer).
struct MYDATA {
    int begin;  // first flat index of C this worker computes (inclusive)
    int end;    // one past the last flat index of C this worker computes
    int *A;     // left operand, row-major
    int *B;     // right operand, row-major
    int *C;     // result matrix, row-major
    int P;      // number of columns of B and C
    int N;      // number of columns of A / rows of B
};

// Worker thread entry point (cache-friendly variant).
//
// IpParam points to a MYDATA describing this worker's share of the product
// C = A * B (flat row-major arrays): the worker computes every element of C
// whose flat index lies in [begin, end). Always returns 0.
//
// B is first copied into a private transposed buffer so that the innermost
// loop reads both operands contiguously.
//
// The function is declared WINAPI so that its calling convention matches
// LPTHREAD_START_ROUTINE. Without it, the cast at the CreateThread call site
// hides a convention mismatch that corrupts the stack on 32-bit x86 builds.
DWORD WINAPI ThreadProc(LPVOID IpParam) {
    MYDATA *pmd = static_cast<MYDATA *>(IpParam);
    int *A = pmd->A, *B = pmd->B, *C = pmd->C;
    int begin = pmd->begin, end = pmd->end, P = pmd->P, N = pmd->N;

    // Every worker builds its own transpose; with many threads this should be
    // hoisted out and shared instead.
    // B is N x P, so its transpose revB is P x N with row stride N. Indexing
    // the transpose with stride P is only correct when N == P and goes out of
    // bounds otherwise.
    int sizeB = N * P;
    int *revB = new int[sizeB];
    for (int k = 0; k < N; ++k) {
        for (int j = 0; j < P; ++j) {
            revB[j * N + k] = B[k * P + j];
        }
    }

    for (int index = begin; index < end; index++) {
        // Recover the row and column from the flat index.
        int i = index / P, j = index % P;
        C[i * P + j] = 0;
        for (int k = 0; k < N; ++k) {
            C[i * P + j] += A[i * N + k] * revB[j * N + k];
        }
    }
    delete[] revB;
    return 0;
}

// Serial matrix product over flat row-major arrays: C(M,P) = A(M,N) * B(N,P).
// A holds M*N ints, B holds N*P ints, and every one of the M*P elements of C
// is overwritten with the corresponding dot product.
void func(int *A, int *B, int *C, int M, int P, int N) {
    for (int row = 0; row < M; ++row) {
        int *aRow = A + row * N;  // first element of this row of A
        int *cRow = C + row * P;  // first element of this row of C
        for (int col = 0; col < P; ++col) {
            int &cell = cRow[col];
            cell = 0;
            // Dot product of row `row` of A with column `col` of B.
            for (int inner = 0; inner < N; ++inner) {
                cell += aRow[inner] * B[inner * P + col];
            }
        }
    }
}

// Prints the M-by-N matrix stored row-major in A to standard output: one line
// per row with a single space after every element, followed by one blank line.
void printM(int *A, int M, int N) {
    for (int row = 0; row < M; ++row) {
        const int *rowData = A + row * N;
        for (int col = 0; col < N; ++col) {
            std::cout << rowData[col] << " ";
        }
        std::cout << std::endl;
    }
    std::cout << std::endl;
}

const int M = 1000, N = 1000, P = 1000;
int A[M * N], B[N * P], C[M * P];

int main() {
    clock_t startTime, endTime;
    startTime = clock();//计时开始

    for (int i = 0; i < M * N; i++) A[i] = i;
    for (int i = 0; i < N * P; i++) B[i] = i;

    // ----------------------------------- 多线程
    const int m = 4;
    HANDLE hThread[m];
    static MYDATA mydt[m];
    int temp = (M * P) / m;
    for (int i = 0; i < m; ++i) {
        mydt[i].A = A, mydt[i].B = B, mydt[i].C = C;
        mydt[i].begin = i * temp, mydt[i].end = i * temp + temp, mydt[i].P = P, mydt[i].N = N;
        if (i == m - 1) // 最后一个线程计算剩余的
            mydt[i].end = M * P;
        hThread[i] = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE) ThreadProc, &mydt[i], 0, NULL);
    }
    WaitForMultipleObjects(m, hThread, TRUE, INFINITE);

    // ----------------------------------- 串行
//    func(A, B, C, M, P, N);
//    printM(A, M, N);
//    printM(B, N, P);
//    printM(C, M, P);

    endTime = clock();//计时结束
    cout << "use time: " << (double) (endTime - startTime) / CLOCKS_PER_SEC << endl;
    return 0;
}

4个线程的用时为1.199s,可以看到效果还是很显著的。

参考文章: https://blog.csdn.net/realxie/article/details/7260072

你可能感兴趣的:(计算机基础)