SIMD:Single Instruction Multiple Data
单指令多数据,多用于矢量运算中,可以加速指令运算,比如矩阵乘
SIMD的思想在不同平台架构下进行了实现
- X86:SSE指令
- ARM:NEON指令
- MIPS:MSA
不同平台架构下为实现SIMD思想也添加了不同的硬件支持(专用寄存器):
- X86:目前支持128bit寄存器,256bit,甚至是512bit
- ARM:128bit
- MIPS:128bit
为何SIMD可以更加快速的进行矩阵运算
一般流程:每次从内存加载一个数据到寄存器进行运算
SIMD:一次加载多个数据到专用寄存器,则一条指令就能进行多个数据的加乘运算
简单来说就是:加载 -> 运算 -> 存储
- 加载:将数据从内存加载到专用寄存器,一次可以加载多个数据
- 运算:使用SSE乘/加指令进行点积和累加
- 存储:将结果保存到内存中
具体命令可参考Intel SSE指令网址
MSA与NEON指令类似。
在矩阵运算中,SIMD优化视具体情况而定。除了SIMD,还可以进行循环展开、缓存控制(将数据拆分为缓存可以容纳的大小,方便数据读取)、通用寄存器使用(频繁使用的数据直接放到寄存器中;C++中曾使用register关键字,但该关键字在C++11中已被弃用,并在C++17中被移除,现代编译器会自动进行寄存器分配)
具体代码参考:
Matrix.h
#include <cstring>
#include <ctime>
#include <iostream>
#include <immintrin.h>
using namespace std;
/// Dense matrix stored as an array of row pointers. Every row is a separate
/// buffer obtained from _mm_malloc with 16-byte alignment, so an aligned SSE
/// load may start at the beginning of any row.
template <typename T> // T is char, short or int
class Matrix {
public:
    T **_Matrix;          // _Row row pointers, or nullptr when no storage is owned
    size_t _Row, _Column; // dimensions

    /// Creates an empty 0x0 matrix that owns no storage.
    Matrix() : _Matrix(nullptr), _Row(0), _Column(0) {}

    /// Creates an r x c matrix with uninitialized elements.
    /// When either dimension is zero no storage is allocated and _Matrix is nullptr.
    Matrix(size_t r, size_t c) : _Matrix(nullptr), _Row(r), _Column(c) {
        allocate();
    }

    /// Creates an r x c matrix with every element set to init.
    Matrix(size_t r, size_t c, const T init) : _Matrix(nullptr), _Row(r), _Column(c) {
        allocate();
        if (!_Matrix) return;
        for (size_t i = 0; i < _Row; ++i) {
            for (size_t j = 0; j < _Column; ++j) {
                _Matrix[i][j] = init;
            }
        }
    }

    /// Deep copy. Copying a matrix without storage yields another matrix without storage.
    Matrix(const Matrix& B) : _Matrix(nullptr), _Row(B._Row), _Column(B._Column) {
        if (!B._Matrix) return;
        allocate();
        if (!_Matrix) return;
        for (size_t i = 0; i < _Row; ++i) {
            for (size_t j = 0; j < _Column; ++j) {
                _Matrix[i][j] = B._Matrix[i][j];
            }
        }
    }

    /// Move construction: takes over B's rows and leaves B empty (0x0).
    Matrix(Matrix&& B) noexcept : _Matrix(B._Matrix), _Row(B._Row), _Column(B._Column) {
        B._Matrix = nullptr;
        B._Row = B._Column = 0;
    }

    ~Matrix() { release(); }

    /// Mutable access to the element at row i, column j (no bounds checking).
    T& operator()(size_t i, size_t j) { return _Matrix[i][j]; }
    /// Read-only access to the element at row i, column j (no bounds checking).
    T operator()(size_t i, size_t j) const { return _Matrix[i][j]; }

    /// Copy assignment: deep-copies B and leaves B untouched. Self-assignment is a no-op.
    Matrix& operator=(const Matrix& B) {
        if (this != &B) {
            Matrix copy(B); // copy first so *this is only modified once the copy exists
            take(copy);
        }
        return *this;
    }

    /// Move assignment: releases the current rows and takes over B's, leaving B empty.
    /// This is what `m = SomeFunctionReturningMatrix();` selects.
    Matrix& operator=(Matrix&& B) noexcept {
        if (this != &B) take(B);
        return *this;
    }

    /// Returns the transposed (_Column x _Row) matrix.
    Matrix Tranpose() const {
        Matrix temp(_Column, _Row);
        if (!_Matrix) return temp;
        for (size_t i = 0; i < _Row; ++i) {
            for (size_t j = 0; j < _Column; ++j) {
                temp(j, i) = (*this)(i, j);
            }
        }
        return temp;
    }

    /// Returns a copy whose row count is rounded up to a multiple of align_row
    /// and whose column count is rounded up to a multiple of align_col; the
    /// added elements are zero. A dimension that is already a multiple is left
    /// unchanged, and a non-positive alignment requests no padding.
    Matrix _alignas(int align_row, int align_col) const {
        const size_t extra_row = padding(_Row, align_row);
        const size_t extra_col = padding(_Column, align_col);
        if (!_Matrix || (!extra_row && !extra_col)) return *this;
        Matrix temp(_Row + extra_row, _Column + extra_col, 0);
        for (size_t i = 0; i < _Row; ++i) {
            for (size_t j = 0; j < _Column; ++j) {
                temp._Matrix[i][j] = _Matrix[i][j];
            }
        }
        return temp;
    }

    /// Prints at most the top-left 10x10 corner, each element as a number.
    void print() const {
        std::cout << "the matrix is:\n" << std::endl;
        const size_t shown_cols = _Column < 10 ? _Column : 10;
        const size_t shown_rows = _Row < 10 ? _Row : 10;
        for (size_t i = 0; i < shown_rows; ++i) {
            for (size_t j = 0; j < shown_cols; ++j) {
                // Widen to int so char prints as a number and int is not truncated.
                std::cout << static_cast<int>(_Matrix[i][j]) << " ";
            }
            std::cout << std::endl;
        }
    }

private:
    /// Number of elements needed to round n up to a multiple of align.
    static size_t padding(size_t n, int align) {
        if (align <= 0) return 0;
        const size_t a = static_cast<size_t>(align);
        return (a - n % a) % a;
    }

    /// Allocates _Row rows of _Column elements; leaves _Matrix null when either is zero.
    void allocate() {
        _Matrix = nullptr;
        if (!_Row || !_Column) return;
        _Matrix = static_cast<T**>(_mm_malloc(_Row * sizeof(T*), 16));
        for (size_t i = 0; i < _Row; ++i) {
            _Matrix[i] = static_cast<T*>(_mm_malloc(_Column * sizeof(T), 16));
        }
    }

    /// Frees all rows and resets the matrix to the empty 0x0 state.
    void release() {
        if (_Matrix) {
            for (size_t i = 0; i < _Row; ++i) {
                _mm_free(_Matrix[i]);
            }
            _mm_free(_Matrix);
        }
        _Matrix = nullptr;
        _Row = _Column = 0;
    }

    /// Releases the current storage and takes ownership of other's, leaving other empty.
    void take(Matrix& other) {
        release();
        _Matrix = other._Matrix;
        _Row = other._Row;
        _Column = other._Column;
        other._Matrix = nullptr;
        other._Row = other._Column = 0;
    }
};
Vector.h
#pragma once
#include
#include
#include
#include
using namespace std;
/// Row vector backed by a single 16-byte-aligned buffer obtained from
/// _mm_malloc, so an aligned SSE load may start at element 0.
template <typename T> // T is char, short or int
class Vector {
public:
    T *_Vector;     // element buffer, or nullptr when no storage is owned
    size_t _Column; // number of elements

    /// Creates an empty vector that owns no storage.
    Vector() : _Vector(nullptr), _Column(0) {}

    /// Creates a vector of c uninitialized elements; c == 0 allocates nothing.
    /// Deliberately not `explicit`: existing code converts an integer (e.g.
    /// `return NULL;`) into an empty vector through this constructor.
    Vector(size_t c) : _Vector(nullptr), _Column(c) {
        allocate();
    }

    /// Creates a vector of c elements, each set to init.
    Vector(size_t c, const T init) : _Vector(nullptr), _Column(c) {
        allocate();
        for (size_t i = 0; i < _Column; ++i) {
            _Vector[i] = init;
        }
    }

    /// Deep copy. Copying a vector without storage yields another vector without storage.
    Vector(const Vector& B) : _Vector(nullptr), _Column(B._Column) {
        if (!B._Vector) return;
        allocate();
        for (size_t i = 0; i < _Column; ++i) {
            _Vector[i] = B._Vector[i];
        }
    }

    /// Move construction: takes over B's buffer and leaves B empty.
    Vector(Vector&& B) noexcept : _Vector(B._Vector), _Column(B._Column) {
        B._Vector = nullptr;
        B._Column = 0;
    }

    ~Vector() { release(); }

    /// Mutable access to element i (no bounds checking).
    T& operator()(size_t i) { return _Vector[i]; }
    /// Read-only access to element i (no bounds checking).
    T operator()(size_t i) const { return _Vector[i]; }

    /// Copy assignment: deep-copies B and leaves B untouched. Self-assignment is a no-op.
    Vector& operator=(const Vector& B) {
        if (this != &B) {
            Vector copy(B); // copy first so *this is only modified once the copy exists
            take(copy);
        }
        return *this;
    }

    /// Move assignment: releases the current buffer and takes over B's, leaving B empty.
    Vector& operator=(Vector&& B) noexcept {
        if (this != &B) take(B);
        return *this;
    }

    /// Returns a copy whose length is rounded up to a multiple of align_size;
    /// the added elements are zero. A length that is already a multiple is
    /// left unchanged, and a non-positive alignment requests no padding.
    Vector _alignas(int align_size) const {
        size_t extra = 0;
        if (align_size > 0) {
            const size_t a = static_cast<size_t>(align_size);
            extra = (a - _Column % a) % a;
        }
        if (!_Vector || !extra) return *this;
        Vector temp(_Column + extra, 0);
        for (size_t i = 0; i < _Column; ++i) {
            temp._Vector[i] = _Vector[i];
        }
        return temp;
    }

    /// Prints the length and at most the first 10 elements, each as a number.
    void print() const {
        std::cout << "the Vector Col is: " << _Column << "\nThe Vector is:\n" << std::endl;
        const size_t shown = _Column < 10 ? _Column : 10;
        for (size_t i = 0; i < shown; ++i) {
            // Widen to int so char prints as a number and int is not truncated.
            std::cout << static_cast<int>(_Vector[i]) << " ";
        }
        std::cout << std::endl;
    }

private:
    /// Allocates _Column elements; leaves _Vector null when _Column is zero.
    void allocate() {
        _Vector = nullptr;
        if (!_Column) return;
        _Vector = static_cast<T*>(_mm_malloc(_Column * sizeof(T), 16));
    }

    /// Frees the buffer and resets the vector to the empty state.
    void release() {
        if (_Vector) _mm_free(_Vector);
        _Vector = nullptr;
        _Column = 0;
    }

    /// Releases the current storage and takes ownership of other's, leaving other empty.
    void take(Vector& other) {
        release();
        _Vector = other._Vector;
        _Column = other._Column;
        other._Vector = nullptr;
        other._Column = 0;
    }
};
Operator.h
#pragma once
#include
#include
#include
#include "Matrix.h"
#include "Vector.h"
#include
#include
using namespace std;
///Vec*Matrix
template
Vector Vec_Mul_Mat_C(const Vector &V, const Matrix &A) {
if (V._Column != A._Row) {
cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
return NULL;
}
Vector ret(A._Column, 0);
int sum = 0;
for (int i = 0; i < A._Column; ++i) {
for (int j = 0; j < V._Column; ++j) {
ret(i) += V(j) * A(j, i);
}
}
return ret;
}
template
Vector Vec_Mul_Mat_SSE(const Vector &V, const Matrix &A) {
if (V._Column != A._Row) {
cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
return NULL;
}
Vector ret(A._Column, 0);
Matrix C = A.Tranpose();
if (sizeof(T) == 1) {
__m128i X, Y;
__m128i acc;
alignas(16) short temp[8];
int i(0), j(0);
for (i = 0; i < C._Row; ++i) {
acc = _mm_setzero_si128();
for (j = 0; j + 16 <= V._Column; j += 16) {
X = _mm_load_si128((__m128i*)V._Vector);
Y = _mm_load_si128(((__m128i*)&(C._Matrix[i][j])));
acc = _mm_add_epi16(acc, _mm_maddubs_epi16(X, Y));
}
_mm_store_si128((__m128i*)&temp[0], acc);
ret(i) = temp[0] + temp[1] + temp[2] + temp[3] + temp[4] + temp[5] + temp[6] + temp[7];
}
}
else if (sizeof(T) == 2) {
Vector ret(A._Column, 0);
__m128i X, Y;
__m128i acc;
alignas(16) int temp[4];
int i(0), j(0);
for (i = 0; i < C._Row; ++i) {
acc = _mm_setzero_si128();
for (j = 0; j + 8 <= V._Column; j += 8) {
X = _mm_load_si128((__m128i*)V._Vector);
Y = _mm_load_si128(((__m128i*)&(C._Matrix[i][j])));
acc = _mm_add_epi32(acc, _mm_madd_epi16(X, Y));
}
_mm_store_si128((__m128i*)&temp[0], acc);
ret(i) = temp[0] + temp[1] + temp[2] + temp[3];
}
}
return ret;
}
//matrix *matrix C
template
Matrix Matrix_Mul_C(const Matrix &A, const Matrix &B) {
Matrix retC(A._Row, B._Column, 0);
if (A._Column != B._Row) {
cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
return retC;
}
for (short i = 0; i < A._Row; i++) {
for (short j = 0; j < B._Column; j++) {
for (short k = 0; k < A._Column; k++) {
retC(i, j) += A(i, k)*B(k, j);
}
}
}
return retC;
}
//unroll-loop for 1*8
template
Matrix Matrix_Mul_C_(const Matrix &A, const Matrix &B) {
Matrix retC(A._Row, B._Column, 0);
if (A._Column != B._Row) {
cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
return retC;
}
int i, j, k;
for (i = 0; i < A._Row; ++i) {
for (j = 0; j < B._Column; j++) {
for (k = 0; k + 8 <= A._Column; k += 8) {
retC(i, j) += A(i, k)*B(k, j) + A(i, k + 1)*B(k + 1, j) + A(i, k + 2)*B(k + 2, j) + A(i, k + 3)*B(k + 3, j)
+ A(i, k + 4)*B(k + 4, j) + A(i, k + 5)*B(k + 5, j) + A(i, k + 6)*B(k + 6, j) + A(i, k + 7)*B(k + 7, j);
}
}
}
return retC;
}
//unroll-loop for 8*8
template
Matrix Matrix_Mul_CC_(const Matrix &A, const Matrix &B) {
Matrix retC(A._Row, B._Column, 0);
int i, j, k;
for (i = 0; i + 8 <= A._Row; i += 8) {
for (j = 0; j < B._Column; j++) {
for (k = 0; k + 8 <= A._Column; k += 8) {
for (int t = 0; t < 8; t++) {
retC(i + t, j) += A(i + t, k)*B(k, j) + A(i + t, k + 1)*B(k + 1, j) + A(i + t, k + 2)*B(k + 2, j) + A(i + t, k + 3)*B(k + 3, j)
+ A(i + t, k + 4)*B(k + 4, j) + A(i + t, k + 5)*B(k + 5, j) + A(i + t, k + 6)*B(k + 6, j) + A(i + t, k + 7)*B(k + 7, j);
}
}
}
}
return retC;
}
template
Matrix Matrix_Mul_SSE(const Matrix &A, const Matrix &B) {
Matrix retSSE(A._Row, B._Column, 0);
if (A._Column != B._Row) {
cout << "The col of vector and row of matrix is not same, please check!!!" << endl;
return retSSE;
}
Matrix C = B.Tranpose();
__m128i X, Y;
__m128i acc = _mm_setzero_si128();
int i, j, k;
if(sizeof(T) == 1) {
alignas(16) short temp[8];
for (i = 0; i < A._Row; ++i) {
for (j = 0; j < C._Row; ++j) {
acc = _mm_setzero_si128();
for (k = 0; k + 16 <= A._Column; k += 16) {
X = _mm_load_si128((__m128i*)&(A._Matrix[i][k]));
Y = _mm_load_si128((__m128i*)&(C._Matrix[j][k]));
acc = _mm_add_epi16(acc, _mm_maddubs_epi16(X, Y));
}
_mm_store_si128((__m128i*)&temp[0], acc);
retSSE(i, j) = temp[0] + temp[1] + temp[2] + temp[3] + temp[4] + temp[5] + temp[6] + temp[7];
}
}
}
else if(sizeof(T) == 2) {
alignas(16) int temp[4];
for (i = 0; i < A._Row; ++i) {
for (j = 0; j < C._Row; ++j) {
acc = _mm_setzero_si128();
for (k = 0; k + 8 <= A._Column; k += 8) {
X = _mm_load_si128((__m128i*)&(A._Matrix[i][k]));
Y = _mm_load_si128((__m128i*)&(C._Matrix[j][k]));
acc = _mm_add_epi32(acc, _mm_madd_epi16(X, Y));
}
_mm_store_si128((__m128i*)&temp[0], acc);
retSSE(i, j) = temp[0] + temp[1] + temp[2] + temp[3];
//retC(i, j) = inner_prod;
}
}
}
return retSSE;
}
///compare the result of C and SSE
template
bool C_Compare_SSE(Matrix &C_Matrix, Matrix SSE_Matrix) {
int i(C_Matrix._Row);
while (--i >= 0) {
if (memcmp(C_Matrix._Matrix[i], SSE_Matrix._Matrix[i], C_Matrix._Column * sizeof(T)) != 0) {
return false;
}
}
return true;
}
main.cpp
#include
#include "Operate.h"
#include
#include
//Vec * Matrix
template
void Vec_Mat() {
int N = 251;
Matrix A(10 * N, N*3, 2);
double start, dt;
A(3, 1) = 3;
Vector C(10 * N, 2);
Vector C1 = C._alignas(16);
Matrix C_Trans = A._alignas(16, 16);
cout << "\nThe time of C Vec * Matrix: " << endl;
start = clock();
int cir = 100;
Vector ret, retSSE;
while (cir-- > 0) {
Vec_Mul_Mat_C(C1, C_Trans);
}
dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
cout << dt / 100 << "s" << endl;
ret = Vec_Mul_Mat_C(C1, C_Trans);
//ret.print();
cout << "\nThe time of SSE Vec * Matrix :" << endl;
start = clock();
cir = 100;
while (cir-- > 0) {
Vec_Mul_Mat_SSE(C1, C_Trans);
}
dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
cout << dt / 100 << "s" << endl;
retSSE = Vec_Mul_Mat_SSE(C1, C_Trans);
//retSSE.print();
if (memcmp(ret._Vector, retSSE._Vector, ret._Column * sizeof(char)) == 0) {
cout << "C and SSE have the same result " << endl;
}
}
template
void Mat_Mat() {
int N = 254;
Matrix A(N, N, 2), B(N, N, 1);
A(3, 1) = 3, B(1, 0) = 4; B(2, 0) = 3;
Matrix A_align = A._alignas(16, 16);
Matrix B_align = B._alignas(16, 16);
double start, dt;
cout << "\nThe time of C Matrix * Matrix: " << endl;
start = clock();
int cir = 100;
Matrix ret, retSSE;
while (cir-- > 0) {
Matrix_Mul_C(A_align, B_align);
}
dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
cout << dt / 100 << "s" << endl;
ret = Matrix_Mul_C(A_align, B_align);
//ret.print();
/*
start = clock();
cir = 100;
while (cir-- > 0) {
Matrix_Mul_C_(A_align, B_align);
}
dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
cout << dt / 100 << "s" << endl;
start = clock();
cir = 100;
while (cir-- > 0) {
Matrix_Mul_CC_(A_align, B_align);
}
dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
cout << dt / 100 << "s" << endl;
*/
cout << "\nThe time of SSE Matrix * Matrix :" << endl;
start = clock();
cir = 100;
while (cir-- > 0) {
Matrix_Mul_SSE(A_align, B_align);
}
dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
cout << dt / 100 << "s" << endl;
retSSE = Matrix_Mul_SSE(A_align, B_align);
retSSE.print();
start = clock();
cir = 100;
while (cir-- > 0) {
Matrix_Mul_SSE_8(A_align, B_align);
}
dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
cout << dt / 100 << "s" << endl;
retSSE = Matrix_Mul_SSE_8(A_align, B_align);
retSSE.print();
/*
cout << "\nThe time of SSE Vec * Matrix :" << endl;
start = clock();
cir = 10;
while (cir-- > 0) {
retSSE = Matrix_Mul_SSE_8(A_align, B_align);
}
dt = static_cast(clock() - start) / CLOCKS_PER_SEC;
cout << dt / 10 << "s" << endl;
retSSE.print();
*/
if (C_Compare_SSE(ret, retSSE)) {
cout << "C and SSE have the same result " << endl;
}
}
void main(short argc, char* argv[]) {
//Vec_Mat();
Mat_Mat();
cout << "hello" << endl;
return;
}
其中,Operator.h主要是进行矩阵运算,为了方便进行char和short类型的数据运算,使用了模板,初次使用,莫怪
本文思想参考:[参考网址](https://blog.csdn.net/artorias123/article/details/86527456)。