【gcc, cmake, eigen, opencv,ubuntu】三.eigen和mkl安装和使用


eigen 和 mkl介绍




locate eigen3

安装后,头文件位于 /usr/local/include/eigen3/,而编译器默认的头文件搜索路径一般只包含 /usr/local/include/。
所以如果包含头文件时想写 #include <Eigen/Dense>,而不是 #include <eigen3/Eigen/Dense>,可以采用下面两种方法之一(复制目录,或者建立软链接):

sudo cp -r /usr/local/include/eigen3/Eigen /usr/local/include 


cd   /usr/local/include/
sudo  ln  -s   eigen3/Eigen   Eigen


Eigen是一个C++语言中的开源模板库,支持线性代数运算,包括向量运算、矩阵运算、数值分析等相关算法。因为Eigen只包含头文件,所以使用时不需要单独编译库,只需要在cpp文件开头写 #include <Eigen/Dense> 就好。

直接 g++编译即可。
比如g++ src.cpp -o out



cpp, mkl 加速 eigen 实例
Linux 版的 Intel MKL 的安装使用



gcc -I/opt/mkl/mkl/include test_mkl.c /opt/mkl/mkl/lib/intel64/libmkl_rt.so -L/opt/mkl/mkl/lib/intel64 -L/opt/mkl/lib/intel64

测试文件test_mkl.c 代码:

/*
 * test_mkl.c - computes C = alpha*A*B + beta*C with Intel MKL's cblas_dgemm
 * and prints the top-left 6x6 corner of each matrix.
 *
 * A is m x p, B is p x n and C is m x n, all stored row-major in buffers
 * obtained from mkl_malloc with 64-byte alignment.
 */
#include <stdio.h>
#include <stdlib.h>
#include "mkl.h"

/* Defined after the includes so the macro cannot rewrite identifiers inside headers. */
#define min(x,y) (((x) < (y)) ? (x) : (y))

int main()
{
    double *A, *B, *C;
    int m, n, p, i, j;
    double alpha, beta;

    printf ("\n This example computes real matrix C=alpha*A*B+beta*C using \n"
            " Intel(R) MKL function dgemm, where A, B, and  C are matrices and \n"
            " alpha and beta are double precision scalars\n\n");

    m = 2000, p = 200, n = 1000;
    printf (" Initializing data for matrix multiplication C=A*B for matrix \n"
            " A(%ix%i) and matrix B(%ix%i)\n\n", m, p, p, n);
    alpha = 1.0; beta = 0.0;

    printf (" Allocating memory for matrices aligned on 64-byte boundary for better \n"
            " performance \n\n");
    /* Widen to size_t before multiplying so the byte count is not computed in int. */
    A = (double *)mkl_malloc( (size_t)m*p*sizeof( double ), 64 );
    B = (double *)mkl_malloc( (size_t)p*n*sizeof( double ), 64 );
    C = (double *)mkl_malloc( (size_t)m*n*sizeof( double ), 64 );
    if (A == NULL || B == NULL || C == NULL) {
        printf( "\n ERROR: Can't allocate memory for matrices. Aborting... \n\n");
        /* Release whichever buffers were obtained before bailing out. */
        mkl_free(A);
        mkl_free(B);
        mkl_free(C);
        return 1;
    }

    printf (" Intializing matrix data \n\n");
    for (i = 0; i < (m*p); i++) {
        A[i] = (double)(i+1);
    }
    for (i = 0; i < (p*n); i++) {
        B[i] = (double)(-i-1);
    }
    for (i = 0; i < (m*n); i++) {
        C[i] = 0.0;
    }

    printf (" Computing matrix product using Intel(R) MKL dgemm function via CBLAS interface \n\n");
    /* Row-major storage: the leading dimensions are the column counts p, n and n. */
    cblas_dgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, p, alpha, A, p, B, n, beta, C, n);
    printf ("\n Computations completed.\n\n");

    printf (" Top left corner of matrix A: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(p,6); j++) {
            printf ("%12.0f", A[j+i*p]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix B: \n");
    for (i=0; i<min(p,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.0f", B[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Top left corner of matrix C: \n");
    for (i=0; i<min(m,6); i++) {
        for (j=0; j<min(n,6); j++) {
            printf ("%12.5G", C[j+i*n]);
        }
        printf ("\n");
    }

    printf ("\n Deallocating memory \n\n");
    /* Memory from mkl_malloc must be returned with mkl_free. */
    mkl_free(A);
    mkl_free(B);
    mkl_free(C);

    printf (" Example completed. \n\n");
    return 0;
}



一般使用eigen时可以不使用mkl;启用mkl后端可能有助于提升程序的运行速度(对较大的矩阵效果更明显)。注意:仅仅链接mkl库并不会让eigen调用mkl,还需要在包含任何Eigen头文件之前定义宏 EIGEN_USE_MKL_ALL(例如编译时加上 -DEIGEN_USE_MKL_ALL)。



eigen2.cpp 是一个7层的全连接网络。

不使用mkl的时候 9微秒,g++ -march=native -O2 eigen2.cpp -o eigen2
使用mkl后 也是9微秒左右,g++ -march=native -O2 eigen2.cpp -o eigen2 /opt/mkl/mkl/lib/intel64/libmkl_rt.so -I/opt/mkl/mkl/include -L/opt/mkl/mkl/lib/intel64



using namespace Eigen;
using namespace std;
typedef MatrixXf mat;
typedef VectorXf vec;

constexpr int d_in = 20;
constexpr int d_inter = 100;
constexpr int d_out = 30;
constexpr int num_round = 1000;

/// Element-wise PReLU: negative entries of `in` are scaled by the matching
/// entry of `a`; non-negative entries pass through unchanged.
///
/// @param in  activations; taken by value so the copy can be edited in place.
/// @param a   per-element slopes; must hold at least in.size() entries.
/// @return    the activated vector.
vec prelu(vec in, const vec &a){
    // Eigen sizes are the signed Eigen::Index type, so use it for the counter.
    for (Index i = 0; i < in.size(); ++i) {
        if (in[i] < 0) {
            in[i] = in[i] * a[i];
        }
    }
    return in;
}

/// Builds a randomly initialised 7-layer fully connected network and times a
/// single forward pass through it.
///
/// @return elapsed time of the timed section in microseconds. The timed
///         section also covers generating the random input vector.
double one_run(){
    // eigen has no normal distributed initialization. approximate it with uniform distribution
    // weights
    mat w1 = mat::Random(d_inter, d_in);
    mat w2 = mat::Random(d_inter, d_inter);
    mat w3 = mat::Random(d_inter, d_inter);
    mat w4 = mat::Random(d_inter, d_inter);
    mat w5 = mat::Random(d_inter, d_inter);
    mat w6 = mat::Random(d_inter, d_inter);
    mat w7 = mat::Random(d_out, d_inter);

    // bias
    vec b1 = vec::Random(d_inter);
    vec b2 = vec::Random(d_inter);
    vec b3 = vec::Random(d_inter);
    vec b4 = vec::Random(d_inter);
    vec b5 = vec::Random(d_inter);
    vec b6 = vec::Random(d_inter);
    vec b7 = vec::Random(d_out);

    // param for prelu (the output layer has no activation, hence only six)
    vec a1 = vec::Random(d_inter);
    vec a2 = vec::Random(d_inter);
    vec a3 = vec::Random(d_inter);
    vec a4 = vec::Random(d_inter);
    vec a5 = vec::Random(d_inter);
    vec a6 = vec::Random(d_inter);

    auto t_start = std::chrono::high_resolution_clock::now();

    // random input
    vec input = vec::Random(d_in);

    // forward
    vec result;

    result = prelu(w1 * input + b1, a1);
    result = prelu(w2 * result + b2, a2);
    result = prelu(w3 * result + b3, a3);
    result = prelu(w4 * result + b4, a4);
    result = prelu(w5 * result + b5, a5);
    result = prelu(w6 * result + b6, a6);
    result = (w7 * result + b7).eval();         // force evaluation. just in case.

    auto t_end = std::chrono::high_resolution_clock::now();
    double elapsed_time_us = std::chrono::duration<double, std::micro>(t_end-t_start).count();

    return elapsed_time_us;
}

/// Runs one_run() num_round times and prints the mean, max, min and
/// (population) standard deviation of the measured times.
int main(){
    // Every slot is overwritten below, so no initial values are needed.
    VectorXd all(num_round);

    for (int i = 0; i < num_round; ++i) {
        all[i] = one_run();
    }

    const double mean = all.mean();

    cout << "time in micro second" << endl;
    cout << "mean: " << mean << endl;
    cout << "max: " << all.maxCoeff() << endl;
    cout << "min: " << all.minCoeff() << endl;

    // Mean squared deviation from the mean, kept in double and not stored in
    // a variable called `std`, which would hide the namespace of that name.
    const double variance = (all.array() - mean).square().mean();
    const double stddev = std::sqrt(variance);
    cout << "std: " << stddev << endl;

    return 0;
}



#include "lut3d.h"
using namespace Eigen;
using namespace std;

#include "opencv2/opencv.hpp"
using namespace cv;

// matrix
typedef MatrixXd matd;
typedef VectorXd vecd;
typedef MatrixXf matf;
typedef VectorXf vecf;
typedef MatrixXi mati;
typedef VectorXi veci;

// print eigen matrix
#define printMat(M) cout << #M <<"= "<<endl; \
cout << M <<endl; \
cout << "========================================" <<endl; \

/// Demonstrates two Eigen features: selecting rows/columns through index
/// lists, and conditionally replacing coefficients with select().
void test_eigen_lookuptable(){
    // 1. select row and col
    matd m1 = matd::Random(5, 5);

    vector<int> keep_rows = {0,1,3,4};
    // Column indices 0 .. cols-1; the upper bound is the last valid index.
    veci keep_cols = veci::LinSpaced(m1.cols(), 0, static_cast<int>(m1.cols() - 1));
    // Alternative index list with a different order and repeated entries (not used below).
    vector<int> keep_cols2 = {0,1,3,4,2,2,2};
    matd m1_sel = m1(keep_rows, keep_cols);

    // 2. condition and setting new value
    Eigen::MatrixXi m(1, 5);
    m << 1, 2, 3, 4, 5;
    // Coefficients equal to 3 become 5666; all others are kept.
    m = (m.array() == 3).select(5666, m);
    std::cout << m << std::endl; //1,2,5666,4,5
}

void run_eigen_test2()
	MatrixXf M1 = MatrixXf::Random(3, 8);
	cout << "Column major input:" << endl << M1 << "\n";
    // 1. eigen is col priority
	cout << "M1.outerStride() = " << M1.outerStride() << endl;
	cout << "M1.innerStride() = " << M1.innerStride() << endl;

	//2. 最经常用的就是取出一行或者一列的操作
	cout << "Column major input:" << endl << M1 << "\n";
	cout << "The first column is:" << endl << M1.col(0) << "\n";
	cout << "The last column is: " << endl << M1.rightCols(1) << "\n";
	cout << "The first row is: " << endl << M1.topRows<1>() << endl;
	cout << "The last row is: " << endl << M1.bottomRows<1>() << endl;
    //3. read matrix file and select index row. notice transpose
    Map<MatrixXf> lut(lut3d, 3, DIM*DIM*DIM);
    vector<int> index={0,0,0,1,3,4,17*17*17-1,17*17*17-2};
    matf lut_sel = lut.transpose()(index, Eigen::all);
    cout<< lut_sel.rows()<<"  "<<lut_sel.cols()<<endl;
void test_convert()

    // 1. opencv to eigen, rgb data
    Mat mat_opencv = Mat::zeros(4, 4, CV_32FC3);
    float* mat_opencv_p = mat_opencv.ptr<float>(0);
    for(int i=0;i<16*3;i++){
        mat_opencv_p[i] = 0.1+i;
    cout<<"opencv mat: "<<mat_opencv<<endl;

    Map<MatrixXf> mat_eigen2(mat_opencv_p, 12, 4);

    // 2. opencv to eigen, splited r,g, b data
    vector<Mat> rgb;
    split(mat_opencv, rgb);

    Map<MatrixXf> r_eigen((float*)rgb[0].data, rgb[0].cols, rgb[0].rows);
    Map<MatrixXf> g_eigen((float*)rgb[1].data, rgb[1].cols, rgb[1].rows);
    Map<MatrixXf> b_eigen((float*)rgb[2].data, rgb[2].cols, rgb[2].rows);

    int output_size = rgb[0].rows * rgb[0].cols;

    // 3. eigen 2 opencv
    Mat rr(rgb[0].rows, rgb[0].cols, CV_32FC1, r_eigen.data());
    cout<<rr.rows<<"   dd "<<rr.cols<<endl;

    // 4. RowVectorXf init
    // Map r_eigen1((float*)rgb[0].data, output_size);
    // Map g_eigen1((float*)rgb[1].data, output_size);
    // Map b_eigen1((float*)rgb[2].data, output_size);

    // 5. ArrayXf , one dim, Array和Matrix不同,Array的一般算术运算是element-wise.
    Map<ArrayXf> r_eigen1((float*)rgb[0].data, output_size);
    Map<ArrayXf> g_eigen1((float*)rgb[1].data, output_size);
    Map<ArrayXf> b_eigen1((float*)rgb[2].data, output_size);
    ArrayXf c = r_eigen1*0.2 + 45 ;
    cout << " c = "<< c<<endl;
    cout << " c = "<< c.cast<int>()<<endl;
    // 6. ArrayXXf , two dim
    ArrayXXf a(2,2);
    ArrayXXf b(2,2);
    a << 1,2,
    b << 5,6,
    cout << "a * b = " << endl << a * b << endl;
    cout << "a / b = " << endl << a / b << endl;


//三线性插值:apply 3dlut
void TriLinearForwardCpu_eigen(float* lut3d, Mat& image, Mat& output, const int dim, const float binsize, const int width, const int height)
    vector<Mat> rgb;
    split(image, rgb);

    int output_size = height * width;
    Map<ArrayXf> r_eigen((float*)rgb[0].data, output_size);
    Map<ArrayXf> g_eigen((float*)rgb[1].data, output_size);
    Map<ArrayXf> b_eigen((float*)rgb[2].data, output_size);

    ArrayXf r = r_eigen / binsize;
    ArrayXf g = g_eigen / binsize;
    ArrayXf b = b_eigen / binsize;
    ArrayXi r_id, g_id, b_id;
    r_id = r.cast<int>();
    g_id = g.cast<int>();
    b_id = b.cast<int>();
    // ArrayXf r_d(output_size), g_d(output_size), b_d(output_size);
    ArrayXf r_d = r - r_id.cast<float>();
    ArrayXf g_d = g - g_id.cast<float>();
    ArrayXf b_d = b - b_id.cast<float>();

    ArrayXi id000, id100, id010, id110, id001, id101, id011, id111;
    id000 = r_id + g_id * dim + b_id * dim * dim;
    id100 = r_id + 1 + g_id * dim + b_id * dim * dim;
    id010 = r_id + (g_id + 1) * dim + b_id * dim * dim;
    id110 = r_id + 1 + (g_id + 1) * dim + b_id * dim * dim;
    id001 = r_id + g_id * dim + (b_id + 1) * dim * dim;
    id101 = r_id + 1 + g_id * dim + (b_id + 1) * dim * dim;
    id011 = r_id + (g_id + 1) * dim + (b_id + 1) * dim * dim;
    id111 = r_id + 1 + (g_id + 1) * dim + (b_id + 1) * dim * dim;

    ArrayXf w000, w100, w010, w110, w001, w101, w011, w111;
    w000 = (1 - r_d) * (1 - g_d) * (1 - b_d);
    w100 = r_d * (1 - g_d) * (1 - b_d);
    w010 = (1 - r_d) * g_d * (1 - b_d);
    w110 = r_d * g_d * (1 - b_d);
    w001 = (1 - r_d) * (1 - g_d) * b_d;
    w101 = r_d * (1 - g_d) * b_d;
    w011 = (1 - r_d) * g_d * b_d;
    w111 = r_d * g_d * b_d;

    Map<ArrayXXf> lutt(lut3d, 3, DIM*DIM*DIM);
    ArrayXXf lut = lutt.transpose();

    r = w000 * lut(id000, 0) + w100 * lut(id100, 0) +
        w010 * lut(id010, 0) + w110 * lut(id110, 0) +
        w001 * lut(id001, 0) + w101 * lut(id101, 0) +
        w011 * lut(id011, 0) + w111 * lut(id111, 0);

    g = w000 * lut(id000, 1) + w100 * lut(id100, 1) +
            w010 * lut(id010, 1) + w110 * lut(id110, 1) +
            w001 * lut(id001, 1) + w101 * lut(id101, 1) +
            w011 * lut(id011, 1) + w111 * lut(id111, 1);

    b = w000 * lut(id000, 2) + w100 * lut(id100, 2) +
            w010 * lut(id010, 2) + w110 * lut(id110, 2) +
            w001 * lut(id001, 2) + w101 * lut(id101, 2) +
            w011 * lut(id011, 2) + w111 * lut( id111, 2);
     Mat r1(height, width, CV_32FC1, r.data());
    Mat g1(height, width, CV_32FC1, g.data());
    Mat b1(height, width, CV_32FC1, b.data());
    rgb[0] = r1;
    rgb[1] = g1;
    rgb[2] = b1;
    merge(rgb, output);

int test_trilinear_eigen()
    std::string file = "image/IMG_0002.tif";
    cv::Mat img0 = cv::imread(file);
    cv::imshow("window", img0);
    //cv::waitKey(0);// 按任意键在0秒后退出窗口,不写这句话是不会显示出窗口的
    cv::Mat img1;
    cv::cvtColor(img0, img1, cv::COLOR_BGR2RGB);
    cv::Mat img;
    img1.convertTo(img, CV_32F, 1.0 / 255);

    int height = img.rows;
    int width = img.cols;
    int channels = img.channels();
    printf("hello  height, width: %d,%d\n", height, width);
    cv::Vec3f color_value = img.ptr<cv::Vec3f>(0)[0];
    cout << img.ptr<float>(0)[0] << "  " << img.ptr<float>(0)[1] << "" << img.ptr<float>(0)[2] << endl;
    cout << color_value << endl;

    int N = 10;
    vector<double> all(N, 0);

    float* lut = lut3d;
    float* image = (float*)img.data;

    cv::Mat output_img = Mat::zeros(height, width, CV_32FC3); 
    float* output = (float*)output_img.data;

    const int shift = 1;
    const float binsize = 1.00001 / (DIM - 1);

    for (int i = 0; i < N; i++) {
        auto t_start = std::chrono::high_resolution_clock::now();

        struct timeval t1,t2;
        double timeuse;

        TriLinearForwardCpu_eigen(lut, img, output_img, 17, binsize, width, height);
        timeuse = (t2.tv_sec - t1.tv_sec) + (double)(t2.tv_usec - t1.tv_usec)/1000000.0;

        printf("timeuse:%lf\n", timeuse);

        auto t_end = std::chrono::high_resolution_clock::now();
        double elapsed_time_us = std::chrono::duration<double, std::micro>(t_end - t_start).count();
        cout << "time in micro second " << elapsed_time_us / 1000 << endl;
        all[i] = elapsed_time_us;
    output_img.convertTo(output_img, CV_8UC3, 255);
    cv::cvtColor(output_img, output_img, cv::COLOR_RGB2BGR);
    cv::imshow("window3", output_img);
    cv::waitKey(0);// 按任意键在0秒后退出窗口,不写这句话是不会显示出窗口的
    return 0;
/// Program entry point. As written it runs none of the demo functions above;
/// call the one you want to try before the return.
int main()
{
    return 0;
}
