Installing Eigen is simple. There are two options: install it directly from the command line (via the package manager), or install it from source. Both are easy.
Eigen installation reference: https://zhuanlan.zhihu.com/p/462494086
Eigen official website: http://eigen.tuxfamily.org/index.php?title=Main_Page
Check where it was installed:
locate eigen3
After installation, the headers are placed in /usr/local/include/eigen3/. The compiler's default search path usually already contains /usr/local/include/, so if you want to write #include <Eigen/Dense> instead of #include <eigen3/Eigen/Dense>, you can either copy the headers:
sudo cp -r /usr/local/include/eigen3/Eigen /usr/local/include
or create a symbolic link:
cd /usr/local/include/
sudo ln -s eigen3/Eigen Eigen
Eigen is an open-source C++ template library for linear algebra, covering vector and matrix operations, numerical analysis, and related algorithms. Because Eigen consists only of header files, there is nothing to build; just write #include <Eigen/Dense> at the top of your cpp file.
Then compile directly with g++, for example:
g++ src.cpp -o out
I downloaded and installed Eigen following the first link above.
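For example, a minimal src.cpp to verify that the headers are found could look like this (just a sketch; any small Eigen program will do):
#include <iostream>
#include <Eigen/Dense>

int main() {
    // build a 2x2 matrix and a 2-vector, then multiply them
    Eigen::Matrix2d A;
    A << 1, 2,
         3, 4;
    Eigen::Vector2d b(5, 6);
    Eigen::Vector2d x = A * b; // expected result: (17, 39)
    std::cout << "A * b =\n" << x << std::endl;
    return 0;
}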
Installing, deploying, and using the MKL library on Linux
The following links are also useful references:
cpp, an example of accelerating Eigen with MKL
Installing and using the Linux version of Intel MKL
Compile command:
gcc -I/opt/mkl/mkl/include test_mkl.c /opt/mkl/mkl/lib/intel64/libmkl_rt.so -L/opt/mkl/mkl/lib/intel64 -L/opt/mkl/lib/intel64
If it compiles and runs successfully, the installation worked.
The test file test_mkl.c:
#define min(x,y) (((x) < (y)) ? (x) : (y))
#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"
int main()
{
double *A, *B, *C;
int m, n, p, i, j;
double alpha, beta;
printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
" Intel(R) MKL function dgemm, where A, B, and C are matrices and \n"
" alpha and beta are double precision scalars\n\n");
m = 2000, p = 200, n = 1000;
printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
" A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
alpha = 1.0; beta = 0.0;
printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
" performance \n\n");
A = (double *)mkl_malloc( m*p*sizeof( double ), 64 );
B = (double *)mkl_malloc( p*n*sizeof( double ), 64 );
C = (double *)mkl_malloc( m*n*sizeof( double ), 64 );
if (A == NULL || B == NULL || C == NULL) {
printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
mkl_free(A);
mkl_free(B);
mkl_free(C);
return 1;
}
printf (" Intializing matrix data \n\n");
for (i = 0; i < (m*p); i++) {
A[i] = (double)(i+1);
}
for (i = 0; i < (p*n); i++) {
B[i] = (double)(-i-1);
}
for (i = 0; i < (m*n); i++) {
C[i] = 0.0;
}
printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n");
cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
m, n, p, alpha, A, p, B, n, beta, C, n);
printf ("\n Computations completed.\n\n");
printf (" Top left corner of matrix A: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(p,6); j++) {
printf ("%12.0f", A[j+i*p]);
}
printf ("\n");
}
printf ("\n Top left corner of matrix B: \n");
for (i=0; i<min(p,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.0f", B[j+i*n]);
}
printf ("\n");
}
printf ("\n Top left corner of matrix C: \n");
for (i=0; i<min(m,6); i++) {
for (j=0; j<min(n,6); j++) {
printf ("%12.5G", C[j+i*n]);
}
printf ("\n");
}
printf ("\n Deallocating memory \n\n");
mkl_free(A);
mkl_free(B);
mkl_free(C);
printf (" Example completed. \n\n");
return 0;
}
Eigen is a matrix library; to use it you only need to include its headers.
MKL is Intel's math library and is quite fast.
Eigen can normally be used without MKL, but enabling MKL as Eigen's backend may help speed up the program.
To use MKL inside Eigen, just add at the top of the program:
#define EIGEN_USE_MKL_ALL
#define EIGEN_VECTORIZE_SSE4_2
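For instance, a minimal sketch that times one large matrix product with these defines enabled (compile it with the MKL include and link flags from the command below, or drop the two defines to compare against plain Eigen):
#define EIGEN_USE_MKL_ALL
#define EIGEN_VECTORIZE_SSE4_2
#include <iostream>
#include <chrono>
#include <Eigen/Dense>

int main() {
    // a large dense product is the kind of workload where MKL is most likely to help
    Eigen::MatrixXf A = Eigen::MatrixXf::Random(2000, 2000);
    Eigen::MatrixXf B = Eigen::MatrixXf::Random(2000, 2000);
    auto t0 = std::chrono::high_resolution_clock::now();
    Eigen::MatrixXf C = A * B;
    auto t1 = std::chrono::high_resolution_clock::now();
    std::cout << "C(0,0) = " << C(0, 0) << ", time(ms) = "
              << std::chrono::duration<double, std::milli>(t1 - t0).count() << std::endl;
    return 0;
}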
Below, Eigen is used to implement a simple fully connected network:
eigen2.cpp is a 7-layer fully connected network.
Without MKL it takes about 9 microseconds per forward pass: g++ -march=native -O2 eigen2.cpp -o eigen2
With MKL it is also around 9 microseconds: g++ -march=native -O2 eigen2.cpp -o eigen2 /opt/mkl/mkl/lib/intel64/libmkl_rt.so -I/opt/mkl/mkl/include -L/opt/mkl/mkl/lib/intel64
For this program, enabling MKL makes little difference, most likely because the matrices involved are small.
#define EIGEN_USE_MKL_ALL
#define EIGEN_VECTORIZE_SSE4_2
#include <Eigen/Dense>
#include <iostream>
#include <chrono>
#include <cmath>
using namespace Eigen;
using namespace std;
typedef MatrixXf mat;
typedef VectorXf vec;
constexpr int d_in = 20;
constexpr int d_inter = 100;
constexpr int d_out = 30;
constexpr int num_round = 1000;
vec prelu(vec in, const vec &a){
for (int i=0; i<in.size(); i++){
if (in[i] < 0)
in[i] = in[i] * a[i];
}
return in;
}
double one_run(){
// eigen has no normal distributed initialization. approximate it with uniform distribution
// weights
mat w1 = mat::Random(d_inter, d_in);
mat w2 = mat::Random(d_inter, d_inter);
mat w3 = mat::Random(d_inter, d_inter);
mat w4 = mat::Random(d_inter, d_inter);
mat w5 = mat::Random(d_inter, d_inter);
mat w6 = mat::Random(d_inter, d_inter);
mat w7 = mat::Random(d_out, d_inter);
// bias
vec b1 = vec::Random(d_inter);
vec b2 = vec::Random(d_inter);
vec b3 = vec::Random(d_inter);
vec b4 = vec::Random(d_inter);
vec b5 = vec::Random(d_inter);
vec b6 = vec::Random(d_inter);
vec b7 = vec::Random(d_out);
// param for prelu
vec a1 = vec::Random(d_inter);
vec a2 = vec::Random(d_inter);
vec a3 = vec::Random(d_inter);
vec a4 = vec::Random(d_inter);
vec a5 = vec::Random(d_inter);
vec a6 = vec::Random(d_inter);
auto t_start = std::chrono::high_resolution_clock::now();
// random input
vec input = vec::Random(d_in);
// forward
vec result;
result = prelu(w1 * input + b1, a1);
result = prelu(w2 * result + b2, a2);
result = prelu(w3 * result + b3, a3);
result = prelu(w4 * result + b4, a4);
result = prelu(w5 * result + b5, a5);
result = prelu(w6 * result + b6, a6);
result = (w7 * result + b7).eval(); // force evaluation. just in case.
auto t_end = std::chrono::high_resolution_clock::now();
double elapsed_time_us = std::chrono::duration<double, std::micro>(t_end-t_start).count();
return elapsed_time_us;
}
int main(){
VectorXd all = VectorXd::Random(num_round);
for (int i=0; i< num_round; ++i){
all[i] = one_run();
}
cout << "time in micro second" << endl;
cout << "mean: " << all.mean() << endl;
cout << "max: " << all.maxCoeff() << endl;
cout << "min: " << all.minCoeff() << endl;
VectorXd err = all - VectorXd::Constant(num_round, all.mean());
err = err.array() * err.array();
float std = err.mean();
std = sqrt(std);
cout << "std: " << std << endl;
return 0;
}
The next example combines Eigen with OpenCV to apply a 3D LUT; each function below contains explanatory comments.
#define EIGEN_USE_MKL_ALL
#define EIGEN_VECTORIZE_SSE4_2
#include <iostream>
#include <chrono>
#include <vector>
#include <string>
#include <sys/time.h>
#include <Eigen/Dense>
#include "lut3d.h"
using namespace Eigen;
using namespace std;
#include "opencv2/opencv.hpp"
using namespace cv;
// matrix
typedef MatrixXd matd;
typedef VectorXd vecd;
typedef MatrixXf matf;
typedef VectorXf vecf;
typedef MatrixXi mati;
typedef VectorXi veci;
// print eigen matrix
#define printMat(M) cout << #M <<"= "<<endl; \
cout << M <<endl; \
cout << "========================================" <<endl; \
void test_eigen_lookuptable(){
// 1. select row and col
matd m1 = matd::Random(5, 5);
printMat(m1);
vector<int> keep_rows = {0,1,3,4};
veci keep_cols = veci::LinSpaced(m1.cols(), 0, m1.cols()-1); // valid column indices 0..cols-1
vector<int> keep_cols2 = {0,1,3,4,2,2,2};
matd m1_sel = m1(keep_rows, keep_cols);
printMat(m1_sel);
// 2. condition and setting new value
Eigen::MatrixXi m(1, 5);
m << 1, 2, 3, 4, 5;
m = (m.array() == 3).select(5666, m);
std::cout << m << std::endl; // 1,2,5666,4,5
}
void run_eigen_test2()
{
MatrixXf M1 = MatrixXf::Random(3, 8);
cout << "Column major input:" << endl << M1 << "\n";
// 1. Eigen stores matrices in column-major order by default
// innerStride: how far to move in memory to step one element along the storage direction.
// outerStride: how far to move in memory to step to the next column (column-major) or row (row-major).
cout << "M1.outerStride() = " << M1.outerStride() << endl;
cout << "M1.innerStride() = " << M1.innerStride() << endl;
//2. the most common operations: extracting a single row or column
cout << "Column major input:" << endl << M1 << "\n";
cout << "The first column is:" << endl << M1.col(0) << "\n";
cout << "The last column is: " << endl << M1.rightCols(1) << "\n";
cout << "The first row is: " << endl << M1.topRows<1>() << endl;
cout << "The last row is: " << endl << M1.bottomRows<1>() << endl;
//3. read matrix file and select index row. notice transpose
Map<MatrixXf> lut(lut3d, 3, DIM*DIM*DIM);
printMat(lut.transpose());
vector<int> index={0,0,0,1,3,4,17*17*17-1,17*17*17-2};
matf lut_sel = lut.transpose()(index, Eigen::all);
printMat(lut_sel);
cout<< lut_sel.rows()<<" "<<lut_sel.cols()<<endl;
}
void test_convert()
{
// 1. opencv to eigen, rgb data
Mat mat_opencv = Mat::zeros(4, 4, CV_32FC3);
float* mat_opencv_p = mat_opencv.ptr<float>(0);
for(int i=0;i<16*3;i++){
mat_opencv_p[i] = 0.1+i;
}
cout<<"opencv mat: "<<mat_opencv<<endl;
Map<MatrixXf> mat_eigen2(mat_opencv_p, 12, 4);
printMat(mat_eigen2.transpose());
// 2. opencv to eigen, split r, g, b channels
vector<Mat> rgb;
split(mat_opencv, rgb);
Map<MatrixXf> r_eigen((float*)rgb[0].data, rgb[0].cols, rgb[0].rows);
Map<MatrixXf> g_eigen((float*)rgb[1].data, rgb[1].cols, rgb[1].rows);
Map<MatrixXf> b_eigen((float*)rgb[2].data, rgb[2].cols, rgb[2].rows);
printMat(r_eigen.transpose());
printMat(g_eigen.transpose());
int output_size = rgb[0].rows * rgb[0].cols;
// 3. eigen 2 opencv
Mat rr(rgb[0].rows, rgb[0].cols, CV_32FC1, r_eigen.data());
cout<<rr.rows<<" dd "<<rr.cols<<endl;
cout<<rr<<endl;
// 4. RowVectorXf init
// Map<RowVectorXf> r_eigen1((float*)rgb[0].data, output_size);
// Map<RowVectorXf> g_eigen1((float*)rgb[1].data, output_size);
// Map<RowVectorXf> b_eigen1((float*)rgb[2].data, output_size);
// 5. ArrayXf, one-dimensional. Unlike Matrix, arithmetic on Array is element-wise.
Map<ArrayXf> r_eigen1((float*)rgb[0].data, output_size);
Map<ArrayXf> g_eigen1((float*)rgb[1].data, output_size);
Map<ArrayXf> b_eigen1((float*)rgb[2].data, output_size);
cout<<r_eigen1.size()<<endl;
ArrayXf c = r_eigen1*0.2 + 45 ;
cout << " c = "<< c<<endl;
cout << " c = "<< c.cast<int>()<<endl;
// 6. ArrayXXf , two dim
ArrayXXf a(2,2);
ArrayXXf b(2,2);
a << 1,2,
3,4;
b << 5,6,
7,8;
cout << "a * b = " << endl << a * b << endl;
cout << "a / b = " << endl << a / b << endl;
}
// Trilinear interpolation: apply a 3D LUT
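// Each pixel (r,g,b) is scaled into LUT coordinates, split into an integer cell
// index (r_id,g_id,b_id) and a fractional part (r_d,g_d,b_d), and the output is
// the weighted sum of the 8 surrounding LUT entries:
//   out = w000*LUT[id000] + w100*LUT[id100] + ... + w111*LUT[id111]
// where w000 = (1-r_d)(1-g_d)(1-b_d), ..., w111 = r_d*g_d*b_d.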
void TriLinearForwardCpu_eigen(float* lut3d, Mat& image, Mat& output, const int dim, const float binsize, const int width, const int height)
{
vector<Mat> rgb;
split(image, rgb);
int output_size = height * width;
Map<ArrayXf> r_eigen((float*)rgb[0].data, output_size);
Map<ArrayXf> g_eigen((float*)rgb[1].data, output_size);
Map<ArrayXf> b_eigen((float*)rgb[2].data, output_size);
ArrayXf r = r_eigen / binsize;
ArrayXf g = g_eigen / binsize;
ArrayXf b = b_eigen / binsize;
ArrayXi r_id, g_id, b_id;
r_id = r.cast<int>();
g_id = g.cast<int>();
b_id = b.cast<int>();
// ArrayXf r_d(output_size), g_d(output_size), b_d(output_size);
ArrayXf r_d = r - r_id.cast<float>();
ArrayXf g_d = g - g_id.cast<float>();
ArrayXf b_d = b - b_id.cast<float>();
ArrayXi id000, id100, id010, id110, id001, id101, id011, id111;
id000 = r_id + g_id * dim + b_id * dim * dim;
id100 = r_id + 1 + g_id * dim + b_id * dim * dim;
id010 = r_id + (g_id + 1) * dim + b_id * dim * dim;
id110 = r_id + 1 + (g_id + 1) * dim + b_id * dim * dim;
id001 = r_id + g_id * dim + (b_id + 1) * dim * dim;
id101 = r_id + 1 + g_id * dim + (b_id + 1) * dim * dim;
id011 = r_id + (g_id + 1) * dim + (b_id + 1) * dim * dim;
id111 = r_id + 1 + (g_id + 1) * dim + (b_id + 1) * dim * dim;
ArrayXf w000, w100, w010, w110, w001, w101, w011, w111;
w000 = (1 - r_d) * (1 - g_d) * (1 - b_d);
w100 = r_d * (1 - g_d) * (1 - b_d);
w010 = (1 - r_d) * g_d * (1 - b_d);
w110 = r_d * g_d * (1 - b_d);
w001 = (1 - r_d) * (1 - g_d) * b_d;
w101 = r_d * (1 - g_d) * b_d;
w011 = (1 - r_d) * g_d * b_d;
w111 = r_d * g_d * b_d;
Map<ArrayXXf> lutt(lut3d, 3, DIM*DIM*DIM);
ArrayXXf lut = lutt.transpose();
r = w000 * lut(id000, 0) + w100 * lut(id100, 0) +
w010 * lut(id010, 0) + w110 * lut(id110, 0) +
w001 * lut(id001, 0) + w101 * lut(id101, 0) +
w011 * lut(id011, 0) + w111 * lut(id111, 0);
g = w000 * lut(id000, 1) + w100 * lut(id100, 1) +
w010 * lut(id010, 1) + w110 * lut(id110, 1) +
w001 * lut(id001, 1) + w101 * lut(id101, 1) +
w011 * lut(id011, 1) + w111 * lut(id111, 1);
b = w000 * lut(id000, 2) + w100 * lut(id100, 2) +
w010 * lut(id010, 2) + w110 * lut(id110, 2) +
w001 * lut(id001, 2) + w101 * lut(id101, 2) +
w011 * lut(id011, 2) + w111 * lut( id111, 2);
Mat r1(height, width, CV_32FC1, r.data());
Mat g1(height, width, CV_32FC1, g.data());
Mat b1(height, width, CV_32FC1, b.data());
rgb[0] = r1;
rgb[1] = g1;
rgb[2] = b1;
merge(rgb, output);
}
int test_trilinear_eigen()
{
std::string file = "image/IMG_0002.tif";
cv::Mat img0 = cv::imread(file);
cv::imshow("window", img0);
//cv::waitKey(0); // wait for a key press; without this call the window is never shown
cv::Mat img1;
cv::cvtColor(img0, img1, cv::COLOR_BGR2RGB);
cv::Mat img;
img1.convertTo(img, CV_32F, 1.0 / 255);
int height = img.rows;
int width = img.cols;
int channels = img.channels();
printf("hello height, width: %d,%d\n", height, width);
cv::Vec3f color_value = img.ptr<cv::Vec3f>(0)[0];
cout << img.ptr<float>(0)[0] << " " << img.ptr<float>(0)[1] << " " << img.ptr<float>(0)[2] << endl;
cout << color_value << endl;
int N = 10;
vector<double> all(N, 0);
float* lut = lut3d;
float* image = (float*)img.data;
cv::Mat output_img = Mat::zeros(height, width, CV_32FC3);
float* output = (float*)output_img.data;
const int shift = 1;
const float binsize = 1.00001 / (DIM - 1);
for (int i = 0; i < N; i++) {
auto t_start = std::chrono::high_resolution_clock::now();
struct timeval t1,t2;
double timeuse;
gettimeofday(&t1,NULL);
TriLinearForwardCpu_eigen(lut, img, output_img, 17, binsize, width, height);
gettimeofday(&t2,NULL);
timeuse = (t2.tv_sec - t1.tv_sec) + (double)(t2.tv_usec - t1.tv_usec)/1000000.0;
printf("timeuse:%lf\n", timeuse);
auto t_end = std::chrono::high_resolution_clock::now();
double elapsed_time_us = std::chrono::duration<double, std::micro>(t_end - t_start).count();
cout << "time in micro second " << elapsed_time_us / 1000 << endl;
all[i] = elapsed_time_us;
}
output_img.convertTo(output_img, CV_8UC3, 255);
cv::cvtColor(output_img, output_img, cv::COLOR_RGB2BGR);
cv::imshow("window3", output_img);
cv::waitKey(0); // wait for a key press; without this call the window is never shown
return 0;
}
int main()
{
test_trilinear_eigen();
//test_subset();
//run_eigen_test2();
//test_eigen_lookuptable();
return 0;
}
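A possible compile command for this Eigen + OpenCV example (the source file name lut3d_eigen.cpp, the MKL paths, and the opencv4 pkg-config name are assumptions; adjust them for your setup):
g++ -march=native -O2 lut3d_eigen.cpp -o lut3d_eigen -I/opt/mkl/mkl/include /opt/mkl/mkl/lib/intel64/libmkl_rt.so -L/opt/mkl/mkl/lib/intel64 $(pkg-config --cflags --libs opencv4)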