TensorFlow中的一个重要op---MatMul的实现(一)

本文的目的是以TensorFlow中的MatMul这个具有代表性又比较简单的op为例,介绍一下TensorFlow中图的节点是怎么实现的。我个人认为TensorFlow中的ops是整个TensorFlow的核心,如果理解了这个,那么对TensorFlow就有了比较深的认识。
在阅读这段代码前看一下官方文档中的添加新的op会很有帮助:
中文翻译:http://www.tensorfly.cn/tfdoc/how_tos/adding_an_op.html
英文原文:http://www.tensorflow.org/how_tos/adding_an_op/index.html#adding-a-new-op
好了对添加一个新的op的方法有一定的了解后我们看真实的可实用的例子:MatMul
一个ops其实包含两个计算节点,一个是正向计算节点,一个是梯度计算节点。
正向计算节点在源代码的:core/kernels/matmul_op.cc这个文件里面。

/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/math_ops.cc.

#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/matmul_op.h"

#include "tensorflow/core/framework/op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/kernels/fill_functor.h"

#if GOOGLE_CUDA
#include "cuda/include/cuda.h"
#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA

namespace tensorflow {

#if GOOGLE_CUDA

namespace {
template 
perftools::gputools::DeviceMemory AsDeviceMemory(const T* cuda_memory) {
  perftools::gputools::DeviceMemoryBase wrapped(const_cast(cuda_memory));
  perftools::gputools::DeviceMemory typed(wrapped);
  return typed;
}
}  // namespace

#endif  // GOOGLE_CUDA

// Short aliases for the Eigen device types that are used as the Device
// template argument throughout this file.
using CPUDevice = Eigen::ThreadPoolDevice;
using GPUDevice = Eigen::GpuDevice;
#ifdef TENSORFLOW_USE_SYCL
using SYCLDevice = Eigen::SyclDevice;
#endif  // TENSORFLOW_USE_SYCL

// Primary template of the per-device matmul launcher. It is only declared
// here; the CPU, SYCL and GPU versions are supplied as specializations
// further down in this file.
template <typename Device, typename T, bool USE_CUBLAS>
struct LaunchMatMul;

namespace {
// Converts a TensorFlow Tensor to an Eigen Matrix.
template 
Eigen::Map<
    const Eigen::Matrix>
ToEigenMatrix(const Tensor& tensor) {
  auto matrix = tensor.matrix();
  return Eigen::Matrix::Map(
      matrix.data(), matrix.dimension(0), matrix.dimension(1));
}

// Converts a TensorFlow Tensor to an Eigen Vector.
template 
Eigen::Map> ToEigenVector(Tensor* tensor) {
  auto v = tensor->flat();
  return Eigen::Matrix::Map(v.data(), v.dimension(0));
}
template 
Eigen::Map> ToEigenVector(
    const Tensor& tensor) {
  auto v = tensor.flat();
  return Eigen::Matrix::Map(v.data(), v.dimension(0));
}
}  // namespace

// If either side can be represented as a vector, do an explicit vector
// matrix multiply and return true; else return false.
//
// Note: this uses plain Eigen and not Eigen Tensor because it is more
// efficient.
template 
bool ExplicitVectorMatrixOptimization(
    const Tensor& a, const Tensor& b,
    const Eigen::array, 1>& dim_pair,
    Tensor* out) {
  if (out->dim_size(0) == 1) {
    if (dim_pair[0].second == 0) {
      // Note: this case is optimized in Eigen Tensors.
      return false;
    } else {
      auto out_v = ToEigenVector(out);
      auto a_v = ToEigenVector(a);
      auto b_m = ToEigenMatrix(b);
      out_v.noalias() = b_m * a_v;
    }
    return true;
  } else if (out->dim_size(1) == 1) {
    auto out_v = ToEigenVector(out);
    auto a_m = ToEigenMatrix(a);
    auto b_v = ToEigenVector(b);
    if (dim_pair[0].first == 0) {
      out_v.noalias() = a_m.transpose() * b_v;
    } else {
      out_v.noalias() = a_m * b_v;
    }
    return true;
  }
  return false;
}
// Half is not supported.
template <>
bool ExplicitVectorMatrixOptimization(
    const Tensor& a, const Tensor& b,
    const Eigen::array, 1>& dim_pair,
    Tensor* out) {
  return false;
}

template 
struct LaunchMatMulBase {
  static void launch(
      OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
      const Eigen::array, 1>& dim_pair,
      Tensor* out) {
#ifndef TENSORFLOW_USE_SYCL
    // An explicit vector-matrix multiply is much better optimized than an
    // implicit one and this is a bottleneck during non-batched inference.
    bool was_vector = ExplicitVectorMatrixOptimization(a, b, dim_pair, out);
    if (!was_vector) {
#endif  // TENSORFLOW_USE_SYCL
      functor::MatMulFunctor()(ctx->eigen_device(),
                                          out->matrix(), a.matrix(),
                                          b.matrix(), dim_pair);
#ifndef TENSORFLOW_USE_SYCL
    }
#endif  // TENSORFLOW_USE_SYCL
  }
};
// On CPUs, we ignore USE_CUBLAS
template 
struct LaunchMatMulCPU : LaunchMatMulBase {};

template 
struct LaunchMatMul : public LaunchMatMulCPU {};

#ifdef TENSORFLOW_USE_SYCL
template 
struct LaunchMatMulSYCL : LaunchMatMulBase {};

template 
struct LaunchMatMul : public LaunchMatMulSYCL {};
#endif  // TENSORFLOW_USE_SYCL

#if GOOGLE_CUDA

namespace {
template 
struct LaunchBlasGemv {
  static void Compute(OpKernelContext* ctx, perftools::gputools::Stream* stream,
                      bool trans, uint64 m, uint64 n,
                      const perftools::gputools::DeviceMemory& a,
                      const perftools::gputools::DeviceMemory& b,
                      perftools::gputools::DeviceMemory* c) {
    const auto blas_trans =
        trans ? perftools::gputools::blas::Transpose::kTranspose
              : perftools::gputools::blas::Transpose::kNoTranspose;
    bool blas_launch_status =
        stream
            ->ThenBlasGemv(blas_trans, m, n, static_cast(1.0), a, m, b, 1,
                           static_cast(0.0), c, 1)
            .ok();
    if (!blas_launch_status) {
      ctx->SetStatus(
          errors::Internal("Blas GEMV launch failed:  m=", m, ", n=", n));
    }
  }

  static bool IsSupported() { return true; }
};

template <>
void LaunchBlasGemv::Compute(
    OpKernelContext* ctx, perftools::gputools::Stream* stream, bool trans,
    uint64 m, uint64 n, const perftools::gputools::DeviceMemory& a,
    const perftools::gputools::DeviceMemory& b,
    perftools::gputools::DeviceMemory* c) {
  ctx->SetStatus(errors::Internal(
      "Blas GEMV launch failed: GEMV is not implemented for float16."));
}

template <>
bool LaunchBlasGemv::IsSupported() {
  return false;
}

}  // namespace

template 
struct LaunchMatMul {
  static void launch(
      OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
      const Eigen::array, 1>& dim_pair,
      Tensor* out) {
    perftools::gputools::blas::Transpose trans[] = {
        perftools::gputools::blas::Transpose::kNoTranspose,
        perftools::gputools::blas::Transpose::kTranspose};
    const uint64 m = a.dim_size(1 - dim_pair[0].first);
    const uint64 k = a.dim_size(dim_pair[0].first);
    const uint64 n = b.dim_size(1 - dim_pair[0].second);
    bool transpose_a = dim_pair[0].first == 0;
    bool transpose_b = dim_pair[0].second == 1;
    auto blas_transpose_a = trans[transpose_a];
    auto blas_transpose_b = trans[transpose_b];

    auto* stream = ctx->op_device_context()->stream();
    OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));

    auto a_ptr = AsDeviceMemory(a.template flat().data());
    auto b_ptr = AsDeviceMemory(b.template flat().data());
    auto c_ptr = AsDeviceMemory(out->template flat().data());
    // Cublas does
    // C = A x B
    // where A, B and C are assumed to be in column major.
    // We want the output to be in row-major, so we can compute
    // C' = B' x A' (' stands for transpose)
    if (LaunchBlasGemv::IsSupported() && n == 1) {
      // This is a matrix*vector multiply so use GEMV to compute A * b.
      // Here we are multiplying in the natural order, so we have to flip
      // the transposition flag to compensate for the tensor being stored
      // row-major.
      LaunchBlasGemv::Compute(ctx, stream, !transpose_a, transpose_a ? m : k,
                                 transpose_a ? k : m, a_ptr, b_ptr, &c_ptr);
    } else {
      bool blas_launch_status =
          stream
              ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, 1.0f,
                             b_ptr, transpose_b ? k : n, a_ptr,
                             transpose_a ? m : k, 0.0f, &c_ptr, n)
              .ok();
      if (!blas_launch_status) {
        ctx->SetStatus(errors::Internal(
            "Blas GEMM launch failed : a.shape=(", a.dim_size(0), ", ",
            a.dim_size(1), "), b.shape=(", b.dim_size(0), ", ", b.dim_size(1),
            "), m=", m, ", n=", n, ", k=", k));
      }
    }
  }
};

#endif  // GOOGLE_CUDA

template 
class MatMulOp : public OpKernel {
 public:
  explicit MatMulOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_a", &transpose_a_));
    OP_REQUIRES_OK(ctx, ctx->GetAttr("transpose_b", &transpose_b_));
  }

  void Compute(OpKernelContext* ctx) override {
    const Tensor& a = ctx->input(0);
    const Tensor& b = ctx->input(1);

    // Check that the dimensions of the two matrices are valid.
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(a.shape()),
                errors::InvalidArgument("In[0] is not a matrix"));
    OP_REQUIRES(ctx, TensorShapeUtils::IsMatrix(b.shape()),
                errors::InvalidArgument("In[1] is not a matrix"));
    Eigen::array, 1> dim_pair;
    dim_pair[0].first = transpose_a_ ? 0 : 1;
    dim_pair[0].second = transpose_b_ ? 1 : 0;

    OP_REQUIRES(
        ctx, a.dim_size(dim_pair[0].first) == b.dim_size(dim_pair[0].second),
        errors::InvalidArgument(
            "Matrix size-incompatible: In[0]: ", a.shape().DebugString(),
            ", In[1]: ", b.shape().DebugString()));
    int a_dim_remaining = 1 - dim_pair[0].first;
    int b_dim_remaining = 1 - dim_pair[0].second;
    TensorShape out_shape(
        {a.dim_size(a_dim_remaining), b.dim_size(b_dim_remaining)});
    Tensor* out = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, out_shape, &out));

    if (out->NumElements() == 0) {
      // If a has shape [0, x] or b has shape [x, 0], the output shape
      // is a 0-element matrix, so there is nothing to do.
      return;
    }

    if (a.NumElements() == 0 || b.NumElements() == 0) {
      // If a has shape [x, 0] and b has shape [0, y], the
      // output shape is [x, y] where x and y are non-zero, so we fill
      // the output with zeros.
      functor::SetZeroFunctor f;
      f(ctx->eigen_device(), out->flat());
      return;
    }

    LaunchMatMul::launch(ctx, this, a, b, dim_pair, out);
  }

 private:
  bool transpose_a_;
  bool transpose_b_;
};

namespace functor {

// Partial specialization MatMulFunctor.
template 
struct MatMulFunctor {
  void operator()(
      const CPUDevice& d, typename MatMulTypes::out_type out,
      typename MatMulTypes::in_type in0,
      typename MatMulTypes::in_type in1,
      const Eigen::array, 1>& dim_pair) {
    MatMul(d, out, in0, in1, dim_pair);
  }
};

#ifdef TENSORFLOW_USE_SYCL
// Partial specialization MatMulFunctor.
template 
struct MatMulFunctor {
  void operator()(
      const SYCLDevice& d, typename MatMulTypes::out_type out,
      typename MatMulTypes::in_type in0,
      typename MatMulTypes::in_type in1,
      const Eigen::array, 1>& dim_pair) {
    MatMul(d, out, in0, in1, dim_pair);
  }
};
#endif  // TENSORFLOW_USE_SYCL

}  // end namespace functor

// Registers the CPU MatMul kernel for element type T twice: once as the
// default kernel and once under the "eigen" label. Both registrations use
// the same MatMulOp instantiation.
#define REGISTER_CPU(T)                                                        \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T"),                \
      MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>);            \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("MatMul").Device(DEVICE_CPU).TypeConstraint<T>("T").Label("eigen"), \
      MatMulOp<CPUDevice, T, false /* cublas, ignored for CPU */>)

// Registers the GPU MatMul kernel for element type T twice: once as the
// default kernel and once under the "cublas" label. Both registrations use
// the cuBLAS-backed MatMulOp instantiation.
#define REGISTER_GPU(T)                                            \
  REGISTER_KERNEL_BUILDER(                                         \
      Name("MatMul").Device(DEVICE_GPU).TypeConstraint<T>("T"),    \
      MatMulOp<GPUDevice, T, true /* cublas, true by default */>); \
  REGISTER_KERNEL_BUILDER(Name("MatMul")                           \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<T>("T")              \
                              .Label("cublas"),                    \
                          MatMulOp<GPUDevice, T, true /* cublas */>)

// CPU kernel registrations. Which element types are registered here depends
// on whether INTEL_MKL is defined.
#if defined(INTEL_MKL)
// MKL does not support half and int32 types for matrix-multiplication, so
// register the kernel to use default Eigen based implementations for these
// types
// NOTE(review): the other element types are presumably registered by an
// MKL-specific kernel elsewhere -- not visible in this file.
TF_CALL_half(REGISTER_CPU);
TF_CALL_int32(REGISTER_CPU);
#else
// Without MKL, every supported element type uses the Eigen-based CPU kernel.
TF_CALL_float(REGISTER_CPU);
TF_CALL_double(REGISTER_CPU);
TF_CALL_half(REGISTER_CPU);

TF_CALL_int32(REGISTER_CPU);
TF_CALL_complex64(REGISTER_CPU);
TF_CALL_complex128(REGISTER_CPU);
#endif

// GPU kernel registrations (CUDA builds only). Note that int32 is not
// registered for the GPU.
#if GOOGLE_CUDA
TF_CALL_float(REGISTER_GPU);
TF_CALL_double(REGISTER_GPU);
TF_CALL_complex64(REGISTER_GPU);
TF_CALL_complex128(REGISTER_GPU);
// half is only registered when CUDA_VERSION >= 7050 (presumably CUDA 7.5 or
// newer -- confirm against the CUDA_VERSION encoding in cuda.h).
#if CUDA_VERSION >= 7050
TF_CALL_half(REGISTER_GPU);
#endif
#endif  // GOOGLE_CUDA

#ifdef TENSORFLOW_USE_SYCL
// Registers the SYCL MatMul kernel for element type T twice: once as the
// default kernel and once under the "eigen" label. Only float is registered
// below.
#define REGISTER_SYCL(T)                                         \
  REGISTER_KERNEL_BUILDER(                                       \
      Name("MatMul").Device(DEVICE_SYCL).TypeConstraint<T>("T"), \
      MatMulOp<SYCLDevice, T, false /* cublas */>);              \
  REGISTER_KERNEL_BUILDER(Name("MatMul")                         \
                              .Device(DEVICE_SYCL)               \
                              .TypeConstraint<T>("T")            \
                              .Label("eigen"),                   \
                          MatMulOp<SYCLDevice, T, false /* cublas */>)
TF_CALL_float(REGISTER_SYCL);

#endif  // TENSORFLOW_USE_SYCL
}  // namespace tensorflow
我们知道一个op,他的核心处理函数是:

void Compute(OpKernelContext* ctx) override

在这里面其实并没有做太多的事情,只是把参数和shape信息获取了一下,真正的计算操作又交给了函数:

LaunchMatMul<Device, T, USE_CUBLAS>::launch(ctx, this, a, b, dim_pair, out);

LaunchMatMul是一个类模板,依据Device的不同,这个类模板有三个不同的实现(模板特化),分别是:LaunchMatMul<CPUDevice, T, USE_CUBLAS>、LaunchMatMul<SYCLDevice, T, USE_CUBLAS>、LaunchMatMul<GPUDevice, T, true>,这三个特化分别对应CPU版本、SYCL版本、GPU版本。
我们主要看这些类的launch方法,可以看到CPU和SYCL版本的方法是一样的(二者都继承自LaunchMatMulBase),如下:

template 
struct LaunchMatMulBase {
  static void launch(
      OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
      const Eigen::array, 1>& dim_pair,
      Tensor* out) {
#ifndef TENSORFLOW_USE_SYCL
    // An explicit vector-matrix multiply is much better optimized than an
    // implicit one and this is a bottleneck during non-batched inference.
    bool was_vector = ExplicitVectorMatrixOptimization(a, b, dim_pair, out);
    if (!was_vector) {
#endif  // TENSORFLOW_USE_SYCL
      functor::MatMulFunctor()(ctx->eigen_device(),
                                          out->matrix(), a.matrix(),
                                          b.matrix(), dim_pair);
#ifndef TENSORFLOW_USE_SYCL
    }
#endif  // TENSORFLOW_USE_SYCL
  }
};

这段代码主要是调用了函数对象(functor)MatMulFunctor:

      functor::MatMulFunctor<Device, T>()(ctx->eigen_device<Device>(),
                                          out->matrix<T>(), a.matrix<T>(),
                                          b.matrix<T>(), dim_pair);

这个函数对象的实现如下:

// Partial specialization MatMulFunctor.
template 
struct MatMulFunctor {
  void operator()(
      const CPUDevice& d, typename MatMulTypes::out_type out,
      typename MatMulTypes::in_type in0,
      typename MatMulTypes::in_type in1,
      const Eigen::array, 1>& dim_pair) {
    MatMul(d, out, in0, in1, dim_pair);
  }
};

可以看到主要是调用了函数MatMul,MatMul这个函数定义在core/kernels/matmul_op.h文件中:

// Generic matmul helper shared by all MatMulFunctor specializations.
// Evaluates the contraction of in0 with in1 over the dimension pairs in
// dim_pair on device `d` and assigns the result to `out`.
template <typename Device, typename In0, typename In1, typename Out,
          typename DimPair>
void MatMul(const Device& d, Out out, In0 in0, In1 in1,
            const DimPair& dim_pair) {
  out.device(d) = in0.contract(in1, dim_pair);
}

contract是Eigen Tensor模块提供的张量缩并(contraction)方法,对两个二维张量按dim_pair指定的维度做缩并,就等价于矩阵相乘。Eigen是一套高效的C++线性代数模板库,里面实现了很多通用的数学运算。到这里,CPU版本的整个计算就结束了。
接下来我们看另一个GPU版本,回到launch方法,他的GPU版本的实现如下:

template 
struct LaunchMatMul {
  static void launch(
      OpKernelContext* ctx, OpKernel* kernel, const Tensor& a, const Tensor& b,
      const Eigen::array, 1>& dim_pair,
      Tensor* out) {
    perftools::gputools::blas::Transpose trans[] = {
        perftools::gputools::blas::Transpose::kNoTranspose,
        perftools::gputools::blas::Transpose::kTranspose};
    const uint64 m = a.dim_size(1 - dim_pair[0].first);
    const uint64 k = a.dim_size(dim_pair[0].first);
    const uint64 n = b.dim_size(1 - dim_pair[0].second);
    bool transpose_a = dim_pair[0].first == 0;
    bool transpose_b = dim_pair[0].second == 1;
    auto blas_transpose_a = trans[transpose_a];
    auto blas_transpose_b = trans[transpose_b];

    auto* stream = ctx->op_device_context()->stream();
    OP_REQUIRES(ctx, stream, errors::Internal("No GPU stream available."));

    auto a_ptr = AsDeviceMemory(a.template flat().data());
    auto b_ptr = AsDeviceMemory(b.template flat().data());
    auto c_ptr = AsDeviceMemory(out->template flat().data());
    // Cublas does
    // C = A x B
    // where A, B and C are assumed to be in column major.
    // We want the output to be in row-major, so we can compute
    // C' = B' x A' (' stands for transpose)
    if (LaunchBlasGemv::IsSupported() && n == 1) {
      // This is a matrix*vector multiply so use GEMV to compute A * b.
      // Here we are multiplying in the natural order, so we have to flip
      // the transposition flag to compensate for the tensor being stored
      // row-major.
      LaunchBlasGemv::Compute(ctx, stream, !transpose_a, transpose_a ? m : k,
                                 transpose_a ? k : m, a_ptr, b_ptr, &c_ptr);
    } else {
      bool blas_launch_status =
          stream
              ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, 1.0f,
                             b_ptr, transpose_b ? k : n, a_ptr,
                             transpose_a ? m : k, 0.0f, &c_ptr, n)
              .ok();
      if (!blas_launch_status) {
        ctx->SetStatus(errors::Internal(
            "Blas GEMM launch failed : a.shape=(", a.dim_size(0), ", ",
            a.dim_size(1), "), b.shape=(", b.dim_size(0), ", ", b.dim_size(1),
            "), m=", m, ", n=", n, ", k=", k));
      }
    }
  }
};

从上面可以看出,前面主要是一些预处理,然后如果当前类型支持GEMV并且n == 1(即第二个矩阵未参与缩并的那一维大小为1,相当于一个向量)的话就调用方法:

LaunchBlasGemv<T>::Compute(ctx, stream, !transpose_a, transpose_a ? m : k,
                                 transpose_a ? k : m, a_ptr, b_ptr, &c_ptr);

否则调用方法:

 stream
              ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, 1.0f,
                             b_ptr, transpose_b ? k : n, a_ptr,
                             transpose_a ? m : k, 0.0f, &c_ptr, n)
              .ok();

这里我们先看非GEMV的情况。不过首先要说明什么是GEMV,这就需要提一下BLAS——基本线性代数子程序库(Basic Linear Algebra Subprograms)。它定义了一套基本线性代数运算的接口规范,这个规范下有很多实现(之前提到的Eigen,按我的理解也是其中的一个)。这个规范的内容很多,简单的介绍可以看下面这篇文章:
http://blog.csdn.net/g_spider/article/details/6054990
对照这篇文章,我们知道GEMV就是general matrix-vector,也就是通用的矩阵与向量的乘法运算,所以要走这个分支,第二个矩阵必须有一个维度为1(代码中的条件是n == 1)。
接下来我们看非GEMV的情况的实现:

 stream
              ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, 1.0f,
                             b_ptr, transpose_b ? k : n, a_ptr,
                             transpose_a ? m : k, 0.0f, &c_ptr, n)
              .ok();

直接调用了stream的ThenBlasGemm()方法,stream的定义如下:

auto* stream = ctx->op_device_context()->stream();

这里GPU情况下使用的是cuBLAS库(cublas是NVIDIA的一个GPU的blas库,提供的计算函数都在GPU上执行),准确而言是基于cuBLAS的stream_executor库。Stream executor是google开发的开源并行计算库,调用方式如下:

auto* stream = ctx->op_device_context()->stream();
      bool blas_launch_status =
          stream
              ->ThenBlasGemm(blas_transpose_b, blas_transpose_a, n, m, k, 1.0f,
                             b_ptr, transpose_b ? k : n, a_ptr,
                             transpose_a ? m : k, 0.0f, &c_ptr, n)

说白了就是Google自己“封装”一下,以便可以直接调用stream的方法实现这些运算。这里是调用了GEMM的方法ThenBlasGemm(), 这样就实现了非GEMV的情况的运算。然后看GEMV情况的:

static void Compute(OpKernelContext* ctx, perftools::gputools::Stream* stream,
                      bool trans, uint64 m, uint64 n,
                      const perftools::gputools::DeviceMemory& a,
                      const perftools::gputools::DeviceMemory& b,
                      perftools::gputools::DeviceMemory* c) {
    const auto blas_trans =
        trans ? perftools::gputools::blas::Transpose::kTranspose
              : perftools::gputools::blas::Transpose::kNoTranspose;
    bool blas_launch_status =
        stream
            ->ThenBlasGemv(blas_trans, m, n, static_cast(1.0), a, m, b, 1,
                           static_cast(0.0), c, 1)
            .ok();
    if (!blas_launch_status) {
      ctx->SetStatus(
          errors::Internal("Blas GEMV launch failed:  m=", m, ", n=", n));
    }
  }

这个实现与GEMM的情况类似,只是调用的方法相应地换成了GEMV的方法:ThenBlasGemv()

你可能感兴趣的:(机器学习)