1. 有两个矩阵:A和B(矩阵实际上就是二维数组)
A矩阵和B矩阵可以做乘法运算必须满足A矩阵的列的数量等于B矩阵的行的数量
运算规则:A的每一行中的数字对应乘以B的每一列的数字把结果相加起来
矩阵乘法的结果为行与列的关系为:行数量为A的行数量, 列数量为B的列数量
2.stressen加速矩阵乘法原理:
通俗的说,斯特拉森算法把原先普通二阶矩阵相乘需要的8次乘法压缩到7次乘法,而在计算机,乘法运算的耗时远远高于加减运算,所以斯特拉森算法可以将O(d^ 3) 压缩到O(d^2.8)。
需要知道的是,斯特拉森算法只是对矩阵分治的算法而不是单独的乘法算法,分治完成时最后使用的还是普通矩阵乘法,在阶数小于等于32(或者64?看过不同的实验结果)时普通的矩阵乘法会有更快的速度,而随着矩阵的阶不断增加,斯特拉森可以提供更快的速度。
在这里,提供另外一种形式的斯特拉森:
同样是通过代数的分解与合并,我们构造出另外一种看起来更复杂的算法。这次我们分解出的项数更多,意味着拓展性更高,于是我们在求矩阵平方时有了新的改进。
我们只需要沿用之前图中的2.s。然后7次乘法使用上图方法,便可以减少一部分的预运算,而上图中,我们依然有P1P2P3可以递归入经过平方优化的斯特拉森算法,获得更快的速度。
利用openMP实现并行加速:
OpenMP是一种用于共享内存并行系统的多线程程序设计方案,支持的编程语言包括C、C++和Fortran。OpenMP提供了对并行算法的高层抽象描述,特别适合在多核CPU机器上的并行程序设计。编译器根据程序中添加的pragma指令,自动将程序并行处理,使用OpenMP降低了并行编程的难度和复杂度。当编译器不支持OpenMP时,程序会退化成普通(串行)程序。程序中已有的OpenMP指令不会影响程序的正常编译运行。
在VS中启用OpenMP很简单,很多主流的编译环境都内置了OpenMP。在项目上右键->属性->配置属性->C/C++->语言->OpenMP支持,选择“是”即可。
OpenMP采用fork-join的执行模式。开始的时候只存在一个主线程,当需要进行并行计算的时候,派生出若干个分支线程来执行并行任务。当并行代码执行完成之后,分支线程会合,并把控制流程交给单独的主线程。
一个典型的fork-join执行模型的示意图如下:
OpenMP编程模型以线程为基础,通过编译制导指令制导并行化,有三种编程要素可以实现并行化控制,他们分别是编译制导、API函数集和环境变量。
编译制导指令以#pragma omp 开始,后边跟具体的功能指令,格式如:#pragma omp 指令[子句[,子句] …]。常用的功能指令如下:
相应的OpenMP子句为:
利用omp_set_num_threads()来设置线程数,
利用#pragma omp parallel sections 声明下面大括号中的语句要并行多线程执行;
利用#pragma omp section 分配线程。
看懂下列helloworld的代码对openmp并行就会有一定了解。
代码实现:
#include
#include
#include
#include
#include
#include
using namespace std;
/*
可以传递二维数组作为参数,有两种方法,
1.change(int **a)直接传递一个指针进去
2.change(int a[][10])数组的第二维维度一定要显式指定
*/
int N=1024;//N必须为2的幂次方
int** MatrixA = new int *[N];
int** MatrixB = new int *[N];
int** MatrixC = new int *[N];//测试并行Strassen矩阵
int** MatrixC2 = new int *[N];//测试串行Strassen矩阵
int** MatrixC1 = new int*[N];//测试串行朴素相乘矩阵
int** MatrixC3 = new int*[N];//测试并行朴素相乘矩阵
//打印矩阵
void PrintMatrix(int **MatrixA, int N){
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
cout << MatrixA[i][j] << " ";
}
cout << endl;
}
}
//矩阵加法
void Matrix_Sum(int N, int** MatrixA, int** MatrixB, int** Sum_Matrix){
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
Sum_Matrix[i][j] = MatrixA[i][j] + MatrixB[i][j];
}
}
}
//矩阵减法
void Matrix_Sub(int N, int** MatrixA, int** MatrixB, int** Sub_Matrix){
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
Sub_Matrix[i][j] = MatrixA[i][j] -MatrixB[i][j];
}
}
}
//矩阵乘法
void Matrix_Mul(int N, int** MatrixA, int** MatrixB, int** Mul_Matrix){
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
Mul_Matrix[i][j] = 0;
for (int k = 0; k < N; k++){
Mul_Matrix[i][j] += MatrixA[i][k] * MatrixB[k][j];
}
}
}
}
//矩阵乘法
void Matrix_Mul1(int N, int** MatrixA, int** MatrixB, int** Mul_Matrix){
#pragma omp parallel for
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
Mul_Matrix[i][j] = 0;
for (int k = 0; k < N; k++){
Mul_Matrix[i][j] += MatrixA[i][k] * MatrixB[k][j];
}
}
}
}
//Strassen串行
void Strassen1(int N, int** MatrixA, int** MatrixB, int** MatrixC2){
//新建矩阵
int** MatrixA11;
int** MatrixA12;
int** MatrixA21;
int** MatrixA22;
int** MatrixB11;
int** MatrixB12;
int** MatrixB21;
int** MatrixB22;
int** MatrixC11;
int** MatrixC12;
int** MatrixC21;
int** MatrixC22;
//初始化每个小矩阵的大小
MatrixA11 = new int*[N/2];//数组的第二维一定要显示指定
MatrixA12 = new int*[N/2];
MatrixA21 = new int*[N/2];
MatrixA22 = new int*[N/2];
MatrixB11 = new int*[N/2];
MatrixB12 = new int*[N/2];
MatrixB21 = new int*[N/2];
MatrixB22 = new int*[N/2];
MatrixC11 = new int*[N/2];
MatrixC12 = new int*[N/2];
MatrixC21 = new int*[N/2];
MatrixC22 = new int*[N/2];
for (int i = 0; i < N/2; i++)//分配连续内存
{
MatrixA11[i] = new int[N/2];
MatrixA12[i] = new int[N / 2];
MatrixA21[i] = new int[N / 2];
MatrixA22[i] = new int[N / 2];
MatrixB11[i] = new int[N / 2];
MatrixB12[i] = new int[N / 2];
MatrixB21[i] = new int[N / 2];
MatrixB22[i] = new int[N / 2];
MatrixC11[i] = new int[N / 2];
MatrixC12[i] = new int[N / 2];
MatrixC21[i] = new int[N / 2];
MatrixC22[i] = new int[N / 2];
}
//为每个小矩阵赋值,将大矩阵分割为4个小矩阵
for (int i = 0; i < N / 2; i++){
for (int j = 0; j < N / 2; j++){
MatrixA11[i][j] = MatrixA[i][j];
MatrixA12[i][j] = MatrixA[i][j + N / 2];
MatrixA21[i][j] = MatrixA[i + N / 2][j];
MatrixA22[i][j] = MatrixA[i + N / 2][j + N / 2];
MatrixB11[i][j] = MatrixB[i][j];
MatrixB12[i][j] = MatrixB[i][j + N / 2];
MatrixB21[i][j] = MatrixB[i + N / 2][j];
MatrixB22[i][j] = MatrixB[i + N / 2][j + N / 2];
}
}
//做10个辅助矩阵S,计算加法
int** MatrixS1=new int*[N/2];
int** MatrixS2 = new int*[N/2];
int** MatrixS3 = new int*[N/2];
int** MatrixS4 = new int*[N / 2];
int** MatrixS5 = new int*[N / 2];
int** MatrixS6 = new int*[N / 2];
int** MatrixS7 = new int*[N / 2];
int** MatrixS8 = new int*[N / 2];
int** MatrixS9 = new int*[N / 2];
int** MatrixS10 = new int*[N / 2];
for (int i = 0; i < N / 2; i++)//分配连续内存
{
MatrixS1[i] = new int[N / 2];
MatrixS2[i] = new int[N / 2];
MatrixS3[i] = new int[N / 2];
MatrixS4[i] = new int[N / 2];
MatrixS5[i] = new int[N / 2];
MatrixS6[i] = new int[N / 2];
MatrixS7[i] = new int[N / 2];
MatrixS8[i] = new int[N / 2];
MatrixS9[i] = new int[N / 2];
MatrixS10[i] = new int[N / 2];
}
Matrix_Sub(N/2, MatrixB12, MatrixB22, MatrixS1);//S1 = B12 - B22
Matrix_Sum(N / 2, MatrixA11, MatrixA12, MatrixS2);//S2 = A11 + A12
Matrix_Sum(N / 2, MatrixA21, MatrixA22, MatrixS3);//S3 = A21 + A22
Matrix_Sub(N / 2, MatrixB21, MatrixB11, MatrixS4);//S4 = B21 - B11
Matrix_Sum(N / 2, MatrixA11, MatrixA22, MatrixS5);//S5 = A11 + A22
Matrix_Sum(N / 2, MatrixB11, MatrixB22, MatrixS6);//S6 = B11 + B22
Matrix_Sub(N / 2, MatrixA12, MatrixA22, MatrixS7);//S7 = A12 - A22
Matrix_Sum(N / 2, MatrixB21, MatrixB22, MatrixS8);//S8 = B21 + B22
Matrix_Sub(N / 2, MatrixA11, MatrixA21, MatrixS9);//S9 = A11 - A21
Matrix_Sum(N / 2, MatrixB11, MatrixB12, MatrixS10);//S10 = B11 + B12
//做7个辅助矩阵P,计算乘法
int** MatrixP1 = new int*[N / 2];
int** MatrixP2 = new int*[N / 2];
int** MatrixP3 = new int*[N / 2];
int** MatrixP4 = new int*[N / 2];
int** MatrixP5 = new int*[N / 2];
int** MatrixP6 = new int*[N / 2];
int** MatrixP7 = new int*[N / 2];
for (int i = 0; i < N / 2; i++)//分配连续内存
{
MatrixP1[i] = new int[N / 2];
MatrixP2[i] = new int[N / 2];
MatrixP3[i] = new int[N / 2];
MatrixP4[i] = new int[N / 2];
MatrixP5[i] = new int[N / 2];
MatrixP6[i] = new int[N / 2];
MatrixP7[i] = new int[N / 2];
}
Matrix_Mul(N / 2, MatrixA11, MatrixS1, MatrixP1);//P1 = A11 ? S1
Matrix_Mul(N / 2, MatrixS2, MatrixB22, MatrixP2);//P2 = S2 ? B22
Matrix_Mul(N / 2, MatrixS3, MatrixB11, MatrixP3);//P3 = S3 ? B11
Matrix_Mul(N / 2, MatrixA22, MatrixS4, MatrixP4);//P4 = A22 ? S4
Matrix_Mul(N / 2, MatrixS5, MatrixS6, MatrixP5);//P5 = S5 ? S6
Matrix_Mul(N / 2, MatrixS7, MatrixS8, MatrixP6);//P6 = S7 ? S8
Matrix_Mul(N / 2, MatrixS9, MatrixS10, MatrixP7);//P7 = S9 ? S10
//根据以上7个结果计算C2矩阵
Matrix_Sum(N / 2, MatrixP5, MatrixP4, MatrixC11); //C11 = P5 + P4 - P2 + P6
Matrix_Sub(N / 2, MatrixC11, MatrixP2, MatrixC11);
Matrix_Sum(N / 2, MatrixC11, MatrixP6, MatrixC11);
Matrix_Sum(N / 2, MatrixP1, MatrixP2, MatrixC12);//C12 = P1 + P2
Matrix_Sum(N / 2, MatrixP3, MatrixP4, MatrixC21); //C21 = P3 + P4
Matrix_Sum(N / 2, MatrixP5, MatrixP1, MatrixC22); //C22 = P5 + P1 - P3 - P7
Matrix_Sub(N / 2, MatrixC22, MatrixP3, MatrixC22);
Matrix_Sub(N / 2, MatrixC22, MatrixP7, MatrixC22);
//将C11,C12,C21,C21合并为C2矩阵
for (int i = 0; i < N / 2; i++){
for (int j = 0; j < N / 2; j++){
MatrixC2[i][j] = MatrixC11[i][j];
MatrixC2[i][j+N/2] = MatrixC12[i][j];
MatrixC2[i+N/2][j] = MatrixC21[i][j];
MatrixC2[i+N/2][j+N/2] = MatrixC22[i][j];
}
}
}
//并行Strassen的矩阵乘法,输入矩阵A,B,C
void Strassen(int N, int** MatrixA, int** MatrixB, int** MatrixC){
//新建矩阵
int** MatrixA11;
int** MatrixA12;
int** MatrixA21;
int** MatrixA22;
int** MatrixB11;
int** MatrixB12;
int** MatrixB21;
int** MatrixB22;
int** MatrixC11;
int** MatrixC12;
int** MatrixC21;
int** MatrixC22;
//初始化每个小矩阵的大小
MatrixA11 = new int*[N/2];//数组的第二维一定要显示指定
MatrixA12 = new int*[N/2];
MatrixA21 = new int*[N/2];
MatrixA22 = new int*[N/2];
MatrixB11 = new int*[N/2];
MatrixB12 = new int*[N/2];
MatrixB21 = new int*[N/2];
MatrixB22 = new int*[N/2];
MatrixC11 = new int*[N/2];
MatrixC12 = new int*[N/2];
MatrixC21 = new int*[N/2];
MatrixC22 = new int*[N/2];
//#pragma omp parallel for
for (int i = 0; i < N/2; i++)//分配连续内存
{
MatrixA11[i] = new int[N/2];
MatrixA12[i] = new int[N / 2];
MatrixA21[i] = new int[N / 2];
MatrixA22[i] = new int[N / 2];
MatrixB11[i] = new int[N / 2];
MatrixB12[i] = new int[N / 2];
MatrixB21[i] = new int[N / 2];
MatrixB22[i] = new int[N / 2];
MatrixC11[i] = new int[N / 2];
MatrixC12[i] = new int[N / 2];
MatrixC21[i] = new int[N / 2];
MatrixC22[i] = new int[N / 2];
}
//为每个小矩阵赋值,将大矩阵分割为4个小矩阵
//#pragma omp parallel for
for (int i = 0; i < N / 2; i++){
for (int j = 0; j < N / 2; j++){
MatrixA11[i][j] = MatrixA[i][j];
MatrixA12[i][j] = MatrixA[i][j + N / 2];
MatrixA21[i][j] = MatrixA[i + N / 2][j];
MatrixA22[i][j] = MatrixA[i + N / 2][j + N / 2];
MatrixB11[i][j] = MatrixB[i][j];
MatrixB12[i][j] = MatrixB[i][j + N / 2];
MatrixB21[i][j] = MatrixB[i + N / 2][j];
MatrixB22[i][j] = MatrixB[i + N / 2][j + N / 2];
}
}
//做10个辅助矩阵S,计算加法
int** MatrixS1=new int*[N/2];
int** MatrixS2 = new int*[N/2];
int** MatrixS3 = new int*[N/2];
int** MatrixS4 = new int*[N / 2];
int** MatrixS5 = new int*[N / 2];
int** MatrixS6 = new int*[N / 2];
int** MatrixS7 = new int*[N / 2];
int** MatrixS8 = new int*[N / 2];
int** MatrixS9 = new int*[N / 2];
int** MatrixS10 = new int*[N / 2];
//#pragma omp parallel for
for (int i = 0; i < N / 2; i++)//分配连续内存
{
MatrixS1[i] = new int[N / 2];
MatrixS2[i] = new int[N / 2];
MatrixS3[i] = new int[N / 2];
MatrixS4[i] = new int[N / 2];
MatrixS5[i] = new int[N / 2];
MatrixS6[i] = new int[N / 2];
MatrixS7[i] = new int[N / 2];
MatrixS8[i] = new int[N / 2];
MatrixS9[i] = new int[N / 2];
MatrixS10[i] = new int[N / 2];
}
Matrix_Sub(N/2, MatrixB12, MatrixB22, MatrixS1);//S1 = B12 - B22
Matrix_Sum(N / 2, MatrixA11, MatrixA12, MatrixS2);//S2 = A11 + A12
Matrix_Sum(N / 2, MatrixA21, MatrixA22, MatrixS3);//S3 = A21 + A22
Matrix_Sub(N / 2, MatrixB21, MatrixB11, MatrixS4);//S4 = B21 - B11
Matrix_Sum(N / 2, MatrixA11, MatrixA22, MatrixS5);//S5 = A11 + A22
Matrix_Sum(N / 2, MatrixB11, MatrixB22, MatrixS6);//S6 = B11 + B22
Matrix_Sub(N / 2, MatrixA12, MatrixA22, MatrixS7);//S7 = A12 - A22
Matrix_Sum(N / 2, MatrixB21, MatrixB22, MatrixS8);//S8 = B21 + B22
Matrix_Sub(N / 2, MatrixA11, MatrixA21, MatrixS9);//S9 = A11 - A21
Matrix_Sum(N / 2, MatrixB11, MatrixB12, MatrixS10);//S10 = B11 + B12
//做7个辅助矩阵P,计算乘法
int** MatrixP1 = new int*[N / 2];
int** MatrixP2 = new int*[N / 2];
int** MatrixP3 = new int*[N / 2];
int** MatrixP4 = new int*[N / 2];
int** MatrixP5 = new int*[N / 2];
int** MatrixP6 = new int*[N / 2];
int** MatrixP7 = new int*[N / 2];
//#pragma omp parallel for
for (int i = 0; i < N / 2; i++)//分配连续内存
{
MatrixP1[i] = new int[N / 2];
MatrixP2[i] = new int[N / 2];
MatrixP3[i] = new int[N / 2];
MatrixP4[i] = new int[N / 2];
MatrixP5[i] = new int[N / 2];
MatrixP6[i] = new int[N / 2];
MatrixP7[i] = new int[N / 2];
}
Matrix_Mul1(N / 2, MatrixA11, MatrixS1, MatrixP1);//P1 = A11 ? S1
Matrix_Mul1(N / 2, MatrixS2, MatrixB22, MatrixP2);//P2 = S2 ? B22
Matrix_Mul1(N / 2, MatrixS3, MatrixB11, MatrixP3);//P3 = S3 ? B11
Matrix_Mul1(N / 2, MatrixA22, MatrixS4, MatrixP4);//P4 = A22 ? S4
Matrix_Mul1(N / 2, MatrixS5, MatrixS6, MatrixP5);//P5 = S5 ? S6
Matrix_Mul1(N / 2, MatrixS7, MatrixS8, MatrixP6);//P6 = S7 ? S8
Matrix_Mul1(N / 2, MatrixS9, MatrixS10, MatrixP7);//P7 = S9 ? S10
//根据以上7个结果计算C矩阵
Matrix_Sum(N / 2, MatrixP5, MatrixP4, MatrixC11); //C11 = P5 + P4 - P2 + P6
Matrix_Sub(N / 2, MatrixC11, MatrixP2, MatrixC11);
Matrix_Sum(N / 2, MatrixC11, MatrixP6, MatrixC11);
Matrix_Sum(N / 2, MatrixP1, MatrixP2, MatrixC12);//C12 = P1 + P2
Matrix_Sum(N / 2, MatrixP3, MatrixP4, MatrixC21); //C21 = P3 + P4
Matrix_Sum(N / 2, MatrixP5, MatrixP1, MatrixC22); //C22 = P5 + P1 - P3 - P7
Matrix_Sub(N / 2, MatrixC22, MatrixP3, MatrixC22);
Matrix_Sub(N / 2, MatrixC22, MatrixP7, MatrixC22);
//将C11,C12,C21,C21合并为C矩阵
//#pragma omp parallel for
for (int i = 0; i < N / 2; i++){
for (int j = 0; j < N / 2; j++){
MatrixC[i][j] = MatrixC11[i][j];
MatrixC[i][j+N/2] = MatrixC12[i][j];
MatrixC[i+N/2][j] = MatrixC21[i][j];
MatrixC[i+N/2][j+N/2] = MatrixC22[i][j];
}
}
}
//朴素矩阵相乘算法
void NormalMul_Matrix(int N, int **MatrixA, int **MatrixB, int **MatrixC){
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
MatrixC[i][j] = 0;
for (int k = 0; k < N; k++){
MatrixC[i][j] = MatrixC[i][j]+MatrixA[i][k] * MatrixB[k][j];
}
}
}
}
void NormalMul_Matrix1(int N, int **MatrixA, int **MatrixB, int **MatrixC3){
#pragma omp parallel for
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
MatrixC3[i][j] = 0;
for (int k = 0; k < N; k++){
MatrixC3[i][j] = MatrixC3[i][j]+MatrixA[i][k] * MatrixB[k][j];
}
}
}
}
//初始化矩阵A,B
void Init_Matrix(int N,int** MatrixA, int** MatrixB){
srand(time(NULL)+rand());
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
MatrixA[i][j] = rand() % 10 + 1;//产生1~10
MatrixB[i][j] = rand() % 10 + 1;
}
}
/*
C++中rand() 函数的用法
1、rand()不需要参数,它会返回一个从0到最大随机数的任意整数,最大随机数的大小通常是固定的一个大整数。
2、如果你要产生0~99这100个整数中的一个随机整数,可以表达为:int num = rand() % 100;
这样,num的值就是一个0~99中的一个随机数了。
3、如果要产生1~100,则是这样:int num = rand() % 100 + 1;
4、总结来说,可以表示为:int num = rand() % n +a;
其中的a是起始值,n-1+a是终止值,n是整数的范围。
*/
}
//矩阵A,B测试用例
void Test_Matrix(int N, int** MatrixA, int** MatrixB){
for (int i = 0; i < N; i++){
for (int j = 0; j < N; j++){
MatrixA[i][j] = 1;
MatrixB[i][j] = 2;
}
}
}
int main(){
for (int i = 0; i < N; i++)//分配连续内存
{
MatrixA[i] = new int[N];
MatrixB[i] = new int[N];
MatrixC[i] = new int[N];
MatrixC1[i] = new int[N];
MatrixC2[i] = new int[N];
}
Init_Matrix(N, MatrixA, MatrixB);//给矩阵A B赋值
//Test_Matrix(N, MatrixA, MatrixB);
//cout << "A矩阵为:" << endl;
//PrintMatrix(MatrixA, N);
//cout << "B矩阵为:" << endl;
//PrintMatrix(MatrixB, N);
//cout << "朴素矩矩阵为:" << endl;
double starttime=omp_get_wtime();
NormalMul_Matrix(N, MatrixA, MatrixB, MatrixC1);
double endtime=omp_get_wtime();
printf("\n朴素串行时间:%lfs\n",endtime-starttime);
double starttime3=omp_get_wtime();
NormalMul_Matrix1(N, MatrixA, MatrixB, MatrixC1);
double endtime3=omp_get_wtime();
printf("\n朴素并行时间:%lfs\n",endtime3-starttime3);
//PrintMatrix(MatrixC1, N);
//cout << "Strassen矩阵为:" << endl;
double starttime2=omp_get_wtime();
Strassen1(N, MatrixA, MatrixB, MatrixC2);
double endtime2=omp_get_wtime();
printf("\nStrassen串行时间:%lfs\n",endtime2-starttime2);
double starttime1=omp_get_wtime();
Strassen(N, MatrixA, MatrixB, MatrixC);
double endtime1=omp_get_wtime();
printf("\nStrassen并行时间:%lfs\n",endtime1-starttime1);
//PrintMatrix(MatrixC, N);
//system("pause");
return 0; //等待按任意键退出
}
编译:g++ -fopenMP stressen.c -o stressen
运行:./stressen
结果如下:
所有矩阵均为随机生成的大小统一为1024*1024 可自行修改。