SIMD

SIMD:single instruction multiply data
单指令多数据,多用于矢量运算中,可以加速指令运算,比如矩阵乘

SIMD的思想在不同平台架构下进行了实现

  • X86:SSE指令
  • ARM:NEON指令
  • MIPS:MSA

不同平台架构下为实现SIMD思想也添加了不同的硬件支持(专用寄存器):

  • X86:目前支持128bit寄存器,256bit,甚至是512bit
  • ARM:128bit
  • MIPS:128bit

为何SIMD可以更加快速的进行矩阵运算
一般流程:每次从内存加载一个数据寄存器进行运算
SIMD:一次加载多个数据到专用寄存器,则一条指令就能进行多个数据的加乘运算

SSE指令下:矩阵运算主要流程如下:
SIMD_第1张图片
SIMD.png

简单来说就是:加载 -> 运算 -> 存储

  1. 加载:将数据从内存加载到专用寄存器,一次可以加载多个数据
  2. 运算:使用SSE乘/加命令讲点积和累加
  3. 存储:将结果保存到内存中
    具体命令可参考Intel SSE指令网址

MSA与NEON指令类似。
在矩阵运算中,SIMD优化视具体情况而定,除了SIMD,还可进行循环展开,缓存控制(将数据拆分至缓存可以存储大小,方便数据读取),通用寄存器使用(频繁使用的数据直接放至寄存器,C++使用register关键字)

具体代码参考:
Matrix.h

#include 
#include 
#include 
#include 
using namespace std;



template    //T is char, short or int;
class Matrix{

public:
    T **_Matrix;
    size_t _Row, _Column;
    Matrix() :_Matrix(nullptr), _Row(0), _Column(0) {}
    Matrix(size_t r, size_t c) :_Row(r), _Column(c) {
        if (!_Column || !_Row) return;
        _Matrix = (T**)_mm_malloc(_Row * sizeof(T*), 16);
        T **p = _Matrix, **end = _Matrix + _Row;
        do {
            *(p++) = (T*)_mm_malloc(_Column * sizeof(T), 16);
        } while (p != end);
    }
    Matrix(size_t r, size_t c, const T init) :_Row(r), _Column(c) {//use init to initialize the matrix
        if (!_Column || !_Row) return;
        _Matrix = (T**)_mm_malloc(_Row * sizeof(T*), 16);
        T **pr = _Matrix, **endr = _Matrix + _Row, *p, *end;
        do {
            p = *(pr++) = (T*)_mm_malloc(_Column * sizeof(T), 16);
            end = p + _Column;
            do {
                *(p++) = init;
            } while (p != end);
        } while (pr != endr);
    }
    Matrix(const Matrix& B) {//拷贝构造
        _Row = B._Row;
        _Column = B._Column;
        _Matrix = (T**)_mm_malloc(_Row * sizeof(T*), 16);
        T **pbr = B._Matrix, **endbr = B._Matrix + _Row, *pb, *endb,
            **par = _Matrix, **endar = _Matrix + _Row, *pa, *enda;
        do {
            pa = *(par++) = (T*)_mm_malloc(_Column * sizeof(T), 16);
            enda = pa + _Column;
            pb = *(pbr++);
            endb = pb + _Column;
            do {
                *(pa++) = *(pb++);
            } while (pa != enda);
        } while (par != endar);
    }
    ~Matrix() {//析构
        if (!_Matrix || !_Column || !_Row) return;
        T **p = _Matrix, **end = _Matrix + _Row;
        do {
            _mm_free(*(p++));
        } while (p != end);
        _Column = _Row = 0;
        _mm_free(_Matrix);
    }
    T& operator()(size_t i, size_t j) { return _Matrix[i][j]; }//get the  elements of row i and column j

    const T operator()(size_t i, size_t j)const { return _Matrix[i][j]; }//get the  elements of row i and column j

    Matrix& operator=(Matrix& B) {//
        if (_Matrix && _Row && _Column) {
            T **p = _Matrix, **end = _Matrix + _Row;
            do {
                _mm_free(*(p++));
            } while (p != end);
            _mm_free(_Matrix);
        }
        _Row = B._Row;
        _Column = B._Column;
        _Matrix = B._Matrix;
        B._Matrix = nullptr;
        return *this;
    }

    //transpose the matrix
    Matrix Tranpose() const{
        Matrix temp(_Column, _Row);
        for (int i = 0; i < _Row; i++) {
            for (int j = 0; j < _Column; j++) {
                temp(j, i) = (*this)(i, j);
            }
        }
        return temp;
    }

    //alignas
    Matrix _alignas(int align_row, int align_col) {
        int rest_row = align_row - _Row % align_row, rest_col = align_col - _Column % align_col;
        if (rest_row || rest_col) {
            int row = _Row + rest_row, col = _Column + rest_col;
            Matrix temp(row, col, 0);
            T **pbr = _Matrix, **endbr = _Matrix + _Row, *pb, *endb,
                **par = temp._Matrix, *pa;
            do {
                pa = *(par++);
                pb = *(pbr++);
                endb = pb + _Column;
                do {
                    *(pa++) = *(pb++);
                } while (pb != endb);
            } while (pbr != endbr);
            return temp;
        }
        return *this;
    }
    void print() {
        cout << "the matrix is:\n" << endl;
        int column_min = 10 < _Column ? 10 : _Column;
        int row_min = 10 <_Row ? 10 : _Row;
        for (short i = 0; i < row_min; i++) {
            for (short j = 0; j < column_min; j++) {
                cout << (short)_Matrix[i][j] << "  ";
            }
            cout << endl;
        }
    }
};

Vector.h

#pragma once
#include 
#include 
#include 
#include 
using namespace std;



template    //T is char, short or int;
class Vector {

public:
    T *_Vector;
    size_t _Column;
    Vector() :_Vector(NULL),  _Column(0) {}
    Vector(size_t c) : _Column(c) {
        if (!_Column) return;
        _Vector = (T*)_mm_malloc(_Column * sizeof(T), 16);

    }
    Vector(size_t c, const T init) : _Column(c) {//use init to initialize the Vector
        if (!_Column) return;
        _Vector = (T*)_mm_malloc(_Column * sizeof(T), 16);
        T *pr = _Vector, *endr = _Vector + _Column, *p, *end;
        do {
            *(pr++) = init; 
        } while (pr != endr);
    }
    Vector(const Vector& B) {//拷贝构造
        _Column = B._Column;
        _Vector = (T*)_mm_malloc(_Column * sizeof(T*), 16);
        T *pbr = B._Vector, *endbr = B._Vector + _Column, *par = _Vector;
        do {
            *(par++) = *(pbr++);
        } while (pbr != endbr);
    }
    ~Vector() {//析构
        if (!_Vector) return;
        T *p = _Vector;
        _mm_free(_Vector);
    }
    T& operator()(size_t i) { return _Vector[i]; }//get the  elements of row i and column j

    const T operator()(size_t i)const { return _Vector[i]; }//get the  elements of row i and column j

    
    Vector& operator=(Vector& B) {//
        if (_Column && _Vector != NULL) {
            _mm_free(_Vector);
        }
        _Column = B._Column;
        _Vector = B._Vector;
        B._Vector = nullptr;
        return *this;
    }
    

    //alignas(16)
    Vector _alignas(int align_size) {
        int rest = align_size - _Column % align_size;
        if (rest) {
            int col = _Column + rest;
            Vector temp(col, 0);
            T *p = _Vector, *end = _Vector + _Column, *t = temp._Vector;
            do {
                *(t++) = *(p++);
            } while (p != end);
            return temp;
        }
        return *this;
    }
    //print
    void print() {
        cout << "the Vector Col is:  " <<_Column << "\nThe Vector is:\n" << endl;
        int column_min = 10 < _Column ? 10 : _Column;
        for (int i = 0; i < column_min; i++) {
            cout << (short)_Vector[i] << "  ";
        }
        cout << endl;
    }

};


Operator.h

#pragma once
#include 
#include 
#include 
#include "Matrix.h"
#include "Vector.h"
#include                  
#include  
using namespace std;

///Vec*Matrix
template 
Vector Vec_Mul_Mat_C(const Vector &V, const Matrix &A) {
    if (V._Column != A._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return NULL;
    }
    Vector ret(A._Column, 0);
    int sum = 0;
    for (int i = 0; i < A._Column; ++i) {
        for (int j = 0; j < V._Column; ++j) {
            ret(i) += V(j) * A(j, i);
        }
    }
    return ret;
}


template 
Vector Vec_Mul_Mat_SSE(const Vector &V, const Matrix &A) {
    if (V._Column != A._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return NULL;
    }

    Vector ret(A._Column, 0);
    Matrix C = A.Tranpose();
    
    if (sizeof(T) == 1) {
        __m128i X, Y;
        __m128i acc;
        alignas(16) short temp[8];
        int i(0), j(0);
        for (i = 0; i < C._Row; ++i) {
            acc = _mm_setzero_si128();
            for (j = 0; j + 16 <= V._Column; j += 16) {
                X = _mm_load_si128((__m128i*)V._Vector);
                Y = _mm_load_si128(((__m128i*)&(C._Matrix[i][j])));

                acc = _mm_add_epi16(acc, _mm_maddubs_epi16(X, Y));
            }
            _mm_store_si128((__m128i*)&temp[0], acc);
            ret(i) = temp[0] + temp[1] + temp[2] + temp[3] + temp[4] + temp[5] + temp[6] + temp[7];
        }
    }
    else if (sizeof(T) == 2) {
        Vector ret(A._Column, 0);
        __m128i X, Y;
        __m128i acc;
        alignas(16) int temp[4];
        int i(0), j(0);
        for (i = 0; i < C._Row; ++i) {
            acc = _mm_setzero_si128();
            for (j = 0; j + 8 <= V._Column; j += 8) {
                X = _mm_load_si128((__m128i*)V._Vector);
                Y = _mm_load_si128(((__m128i*)&(C._Matrix[i][j])));

                acc = _mm_add_epi32(acc, _mm_madd_epi16(X, Y));
            }
            _mm_store_si128((__m128i*)&temp[0], acc);
            ret(i) = temp[0] + temp[1] + temp[2] + temp[3];
        }
    }
    return ret;
}



//matrix *matrix C
template 
Matrix Matrix_Mul_C(const Matrix &A, const Matrix &B) {

    Matrix retC(A._Row, B._Column, 0);
    if (A._Column != B._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return retC;
    }
    
    for (short i = 0; i < A._Row; i++) {
        for (short j = 0; j < B._Column; j++) {
            for (short k = 0; k < A._Column; k++) {
                retC(i, j) += A(i, k)*B(k, j);
            }
        }
    }
    return retC;
}


//unroll-loop for 1*8
template 
Matrix Matrix_Mul_C_(const Matrix &A, const Matrix &B) {
    
    Matrix retC(A._Row, B._Column, 0);
    if (A._Column != B._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return retC;
    }

    
    int i, j, k;
    for (i = 0; i < A._Row; ++i) {
        for (j = 0; j < B._Column; j++) {
            for (k = 0; k + 8 <= A._Column; k += 8) {
                retC(i, j) += A(i, k)*B(k, j) + A(i, k + 1)*B(k + 1, j) + A(i, k + 2)*B(k + 2, j) + A(i, k + 3)*B(k + 3, j)
                    + A(i, k + 4)*B(k + 4, j) + A(i, k + 5)*B(k + 5, j) + A(i, k + 6)*B(k + 6, j) + A(i, k + 7)*B(k + 7, j);
            }
        }
    }
    return retC;
}


//unroll-loop for 8*8
template 
Matrix Matrix_Mul_CC_(const Matrix &A, const Matrix &B) {
    Matrix retC(A._Row, B._Column, 0);
    int i, j, k;
    for (i = 0; i + 8 <= A._Row; i += 8) {
        for (j = 0; j < B._Column; j++) {
            for (k = 0; k + 8 <= A._Column; k += 8) {
                for (int t = 0; t < 8; t++) {
                    retC(i + t, j) += A(i + t, k)*B(k, j) + A(i + t, k + 1)*B(k + 1, j) + A(i + t, k + 2)*B(k + 2, j) + A(i + t, k + 3)*B(k + 3, j)
                        + A(i + t, k + 4)*B(k + 4, j) + A(i + t, k + 5)*B(k + 5, j) + A(i + t, k + 6)*B(k + 6, j) + A(i + t, k + 7)*B(k + 7, j);
                }
            }
        }
    }
    return retC;
}



template
Matrix Matrix_Mul_SSE(const Matrix &A, const Matrix &B) {
    Matrix retSSE(A._Row, B._Column, 0);
    if (A._Column != B._Row) {
        cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
        return retSSE;
    }
    Matrix C = B.Tranpose();
    __m128i X, Y;
    __m128i acc = _mm_setzero_si128();

    int i, j, k;
    if(sizeof(T) == 1) {
        alignas(16) short temp[8];
        for (i = 0; i < A._Row; ++i) {
            for (j = 0; j < C._Row; ++j) {
                acc = _mm_setzero_si128();
                for (k = 0; k + 16 <= A._Column; k += 16) {
                    X = _mm_load_si128((__m128i*)&(A._Matrix[i][k]));
                    Y = _mm_load_si128((__m128i*)&(C._Matrix[j][k]));
                    acc = _mm_add_epi16(acc, _mm_maddubs_epi16(X, Y));
                }
                _mm_store_si128((__m128i*)&temp[0], acc);
                retSSE(i, j) = temp[0] + temp[1] + temp[2] + temp[3] + temp[4] + temp[5] + temp[6] + temp[7];
            }
        }
    }
    else if(sizeof(T) == 2) {
        alignas(16) int temp[4];
        for (i = 0; i < A._Row; ++i) {
            for (j = 0; j < C._Row; ++j) {
                acc = _mm_setzero_si128();
                for (k = 0; k + 8 <= A._Column; k += 8) {
                    X = _mm_load_si128((__m128i*)&(A._Matrix[i][k]));
                    Y = _mm_load_si128((__m128i*)&(C._Matrix[j][k]));
                    acc = _mm_add_epi32(acc, _mm_madd_epi16(X, Y));
                }
                _mm_store_si128((__m128i*)&temp[0], acc);
                retSSE(i, j) = temp[0] + temp[1] + temp[2] + temp[3];
                //retC(i, j) = inner_prod;
            }
        }
    }   
    return retSSE;
}


///compare the result of C and SSE
template 
bool C_Compare_SSE(Matrix &C_Matrix, Matrix SSE_Matrix) {
    int i(C_Matrix._Row);
    while (--i >= 0) {
        if (memcmp(C_Matrix._Matrix[i], SSE_Matrix._Matrix[i], C_Matrix._Column * sizeof(T)) != 0) {
            return false;
        }
    }
    return true;
}

main.c

#include 
#include "Operate.h"
#include 
#include 

//Vec * Matrix
template 
void Vec_Mat() {
    int N = 251;
    Matrix A(10 * N, N*3, 2);
    double start, dt;
    A(3, 1) = 3;
    Vector C(10 * N, 2);
    Vector C1 = C._alignas(16);
    Matrix C_Trans = A._alignas(16, 16); 

    cout << "\nThe time of C Vec * Matrix: " << endl;
    start = clock();
    int cir = 100;
    Vector ret, retSSE;
    while (cir-- > 0) {
        Vec_Mul_Mat_C(C1, C_Trans);
    }
    dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    ret = Vec_Mul_Mat_C(C1, C_Trans);
    //ret.print();


    cout << "\nThe time of SSE Vec * Matrix :" << endl;
    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Vec_Mul_Mat_SSE(C1, C_Trans);
    }

    dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    retSSE = Vec_Mul_Mat_SSE(C1, C_Trans);
    //retSSE.print();

    if (memcmp(ret._Vector, retSSE._Vector, ret._Column * sizeof(char)) == 0) {
        cout << "C and SSE have the same result " << endl;
    }
}

template 
void Mat_Mat() {
    int N = 254;
    Matrix A(N, N, 2), B(N, N, 1);
    A(3, 1) = 3, B(1, 0) = 4; B(2, 0) = 3;
    Matrix A_align = A._alignas(16, 16);
    Matrix B_align = B._alignas(16, 16);
    double start, dt;

    cout << "\nThe time of C Matrix * Matrix: " << endl;
    start = clock();
    int cir = 100;
    Matrix ret, retSSE;
    while (cir-- > 0) {
        Matrix_Mul_C(A_align, B_align);
    }
    dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    ret = Matrix_Mul_C(A_align, B_align);
    //ret.print();
    

    /*
    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Matrix_Mul_C_(A_align, B_align);
    }

    dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;

    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Matrix_Mul_CC_(A_align, B_align);
    }

    dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;

*/

    cout << "\nThe time of SSE Matrix * Matrix :" << endl;
    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Matrix_Mul_SSE(A_align, B_align);
    }

    dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    retSSE = Matrix_Mul_SSE(A_align, B_align);
    retSSE.print();


    start = clock();
    cir = 100;
    while (cir-- > 0) {
        Matrix_Mul_SSE_8(A_align, B_align);
    }

    dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 100 << "s" << endl;
    retSSE = Matrix_Mul_SSE_8(A_align, B_align);
    retSSE.print();

    /*
    cout << "\nThe time of SSE Vec * Matrix :" << endl;
    start = clock();
    cir = 10;
    while (cir-- > 0) {
        retSSE = Matrix_Mul_SSE_8(A_align, B_align);
    }

    dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
    cout << dt / 10 << "s" << endl;
    retSSE.print();
    */
    if (C_Compare_SSE(ret, retSSE)) {
        cout << "C and SSE have the same result " << endl;
    }
    
}


void main(short argc, char* argv[]) {

    //Vec_Mat();

    Mat_Mat();
    cout << "hello" << endl;
    return;
}

其中,Operator.h主要是进行矩阵运算,为了方便进行char和short类型的数据运算,使用了模板,初次使用,莫怪
本文思想[参考网址]。(https://blog.csdn.net/artorias123/article/details/86527456)

你可能感兴趣的:(SIMD)