python 加速计算矩阵乘法

python 加速计算矩阵乘法

小记

2022.4.30

做的毕设要使用C++对python做加速,用的时pybind11接口。

原以为即使用C++加速了,但自己写的代码总不上库里的,加速不了,结果确是真的可以大大加速!

暂时不用Cache Blocking,目前用这个调换循环顺序的算法就可以加速很多了。

参考网站: https://zhuanlan.zhihu.com/p/362854057

方案1:用C++实现朴素算法

//2维矩阵相乘,朴素算法
py::array_t matmul_2d(py::array_t& input1, py::array_t& input2)
{
    // 获取input1, input2的信息
    py::buffer_info buf1 = input1.request();
    py::buffer_info buf2 = input2.request();
  
    if (buf1.ndim != 2 || buf2.ndim != 2)
    {
        throw std::runtime_error("Number of dimensions must be two!");
    }
    if (buf1.shape[1] != buf2.shape[0])
    {
        throw std::runtime_error("Input shape must match");
    }

    const int buf1_r = buf1.shape[0];
    const int buf1_c = buf1.shape[1];
    const int buf2_r = buf2.shape[0];
    const int buf2_c = buf2.shape[1];
    const int buf3_r = buf1.shape[0];
    const int buf3_c = buf2.shape[1];

    //申请空间,确定结果矩阵的形状
    auto result = py::array_t({ buf3_r,buf3_c });
    py::buffer_info buf3 = result.request();

    //获取numpy.ndarray 数据指针
    double* ptr1 = (double*)buf1.ptr;
    double* ptr2 = (double*)buf2.ptr;
    double* ptr3 = (double*)buf3.ptr;

    //指针访问numpy.ndarray
    //loc1,loc3的提出会影响代码易读性,但是可以加速
    for (int i = 0; i < buf3_r; ++i)
    {
        int loc3 = i * buf3_c;
        for (int j = 0; j < buf3_c;++j)
        {
            double sum = 0;
            int loc1 = i * buf1_c;
            for (int k = 0; k < buf1_c; ++k)
            {
                sum += ptr1[ loc1+ k] * ptr2[k * buf2_c + j];
            }
            ptr3[loc3 + j] = sum;
        }
    }
    return result;
}

方案二:C++实现,调换循环顺序的算法

算法思想来自于这里 : https://zhuanlan.zhihu.com/p/146250334

/// 2d矩阵相乘,调换循环顺序的算法
py::array_t matmul_2d2 (py::array_t & input1, py::array_t & input2)
{
    // 获取input1, input2的信息
    py::buffer_info buf1 = input1.request();
    py::buffer_info buf2 = input2.request();

    if (buf1.ndim != 2 || buf2.ndim != 2)
    {
        throw std::runtime_error("Number of dimensions must be two!");
    }

    if (buf1.shape[1] != buf2.shape[0])
    {
        throw std::runtime_error("Input shape must match");
    }
    const int buf1_r = buf1.shape[0];
    const int buf1_c = buf1.shape[1];
    const int buf2_r = buf2.shape[0];
    const int buf2_c = buf2.shape[1];
    const int buf3_r = buf1.shape[0];
    const int buf3_c = buf2.shape[1];
    //申请空间,确定结果矩阵的形状
    auto result = py::array_t({ buf3_r,buf3_c });
    py::buffer_info buf3 = result.request();

   

    //获取numpy.ndarray 数据指针
    double* ptr1 = (double*)buf1.ptr;
    double* ptr2 = (double*)buf2.ptr;
    double* ptr3 = (double*)buf3.ptr;

    for (int i = 0; i < buf3_r; ++i)
        for (int j = 0; j < buf3_c; ++j)
            ptr3[i * buf3_c + j] = 0.0;
    //return result;
    //指针访问numpy.ndarray
    for (int i = 0; i < buf3_r; ++i)
    {
        for (int k = 0; k < buf1_c; ++k)
        {
            double v = ptr1[i * buf1_c+k];
            int loc3 = i * buf3_c;
            int loc2 = k * buf2_c;
            for (int j = 0; j < buf3_c; ++j)
            {
               ptr3[loc3+j]+= v* ptr2[loc2 + j];
            }
        }
    }
    return result;
}

方案三:使用numpy

测速代码

import numpy as np
from icecream import ic
import example
import time
import platform

#生成矩阵
a=np.random.randint(2,4,(400,300))
b=np.random.randint(2,4,(300,500))

#测试
T1 = time.perf_counter()
c1=example.matmul_2d(a,b)
T2 =time.perf_counter()
print('算法1(朴素C++)——程序运行时间:%s毫秒' % ((T2 - T1)*1000))

#算法2
T1 = time.perf_counter()
c2=example.matmul_2d2(a,b)
T2 =time.perf_counter()
print('算法2(调换循环顺序C++)——程序运行时间:%s毫秒' % ((T2 - T1)*1000))

#使用python算
T1 = time.perf_counter()
c3=np.matmul(a,b)
T2 =time.perf_counter()
print('算法3(numpy)——程序运行时间:%s毫秒' % ((T2 - T1)*1000))

#确保计算结果相同
print((c1==c2).all())
print((c1==c3).all())

"""
算法1(朴素C++)——程序运行时间:74.98659999999995毫秒
算法2(调换循环顺序C++)——程序运行时间:44.84659999999996毫秒
算法3(numpy)——程序运行时间:80.46960000000003毫秒
True
True

"""

你可能感兴趣的:(深度学习,python)