TensorFlow 中的 LRNOp 与 Caffe 的差异:
sqr_sum[a, b, c, d] =
sum(input[a, b, c, d - depth_radius : d + depth_radius + 1] ** 2)
output = input / (bias + alpha * sqr_sum) ** beta
下面对于框架内部的 CPU 实现进行介绍。
OP_REQUIRES_OK 在错误时打印文件名、行号和错误码。
OP_REQUIRES 接受表达式和返回值。
FastBoundsCheck 检查depth_radius64
有没有超出 int 的范围。
OpKernel 是所有 Op 类的基类。
template <typename Device, typename T>
class LRNOp : public OpKernel {
public:
explicit LRNOp(OpKernelConstruction* context) : OpKernel(context) {
int64_t depth_radius64;
OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
OP_REQUIRES(
context,
FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
errors::InvalidArgument("depth_radius = ", depth_radius64,
" larger than int max"));
depth_radius_ = static_cast<int>(depth_radius64);
float tmp;
OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp));
bias_ = T(tmp);
OP_REQUIRES_OK(context, context->GetAttr("alpha", &tmp));
alpha_ = T(tmp);
OP_REQUIRES_OK(context, context->GetAttr("beta", &tmp));
beta_ = T(tmp);
}
LRNOp::Compute
输入维度要求是4维。
输入元素数量在int
的表示范围内。
仅支持 NHWC 布局。
通道数量加填充也需要在int
的表示范围内。
void Compute(OpKernelContext* context) override {
const Tensor& in = context->input(0);
OP_REQUIRES(context, in.dims() == 4,
errors::InvalidArgument("in must be 4-dimensional"));
OP_REQUIRES(
context,
FastBoundsCheck(in.NumElements(), std::numeric_limits<int>::max()),
errors::InvalidArgument("argument to LRN too large"));
// Cast to platform-specific int to avoid conversion warnings.
const int batch = static_cast<int>(in.dim_size(0));
const int rows = static_cast<int>(in.dim_size(1));
const int cols = static_cast<int>(in.dim_size(2));
const int depth = static_cast<int>(in.dim_size(3));
OP_REQUIRES(context,
(depth + depth_radius_) <= std::numeric_limits<int>::max(),
errors::InvalidArgument("depth ", depth, " + depth_radius ",
depth_radius_, " exceeds int max."));
OpKernelContext::allocate_output 为输出分配内存。
LaunchLRN
Tensor* output = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(
0, TensorShape({batch, rows, cols, depth}), &output));
LaunchLRN<Device, T> launcher(depth_radius_, bias_, alpha_, beta_);
launcher.launch(context, this, in, output);
}
private:
int depth_radius_;
T bias_;
T alpha_;
T beta_;
};
LaunchLRN::launch
beta_
是特定值时且通道数量大于 kSingleThreadedLRNDepthCutoff 才会调用 LaunchLRN::SingleThreadedLRN 来实现。
template <typename T>
struct LaunchLRN<CPUDevice, T> {
LaunchLRN(int depth_radius, T bias, T alpha, T beta)
: depth_radius_(depth_radius), bias_(bias), alpha_(alpha), beta_(beta) {}
void launch(OpKernelContext* context, OpKernel* kernel, const Tensor& in,
Tensor* output) {
const int batch = static_cast<int>(in.dim_size(0));
const int rows = static_cast<int>(in.dim_size(1));
const int cols = static_cast<int>(in.dim_size(2));
const int depth = static_cast<int>(in.dim_size(3));
#if defined(IS_MOBILE_PLATFORM)
SingleThreadedLRN(in, batch, rows, cols, depth, output);
#else
if (depth > kSingleThreadedLRNDepthCutoff &&
(beta_ == T(0.5) || beta_ == T(1))) {
SingleThreadedLRN(in, batch, rows, cols, depth, output);
return;
}
nodes
为特征图的元素数量。
Tensor::shaped 将现有 Tensor 中的数据映射到一个对应维度和类型的 Eigen::TensorMap。
将输入转为二维矩阵in_shaped
。
const int nodes = cols * rows;
auto in_shaped = in.shaped<T, 2>({nodes * batch, depth});
multiplier
是一个二维的 Eigen::Tensor。
GetBandMatrix 生成带状矩阵。将输入与带矩阵相乘,可减少沿深度方向的适合分片数量。
// Multiplying the input with the band matrix has the effect of reducing the
// correct patch along the depth.
Eigen::Tensor<T, 2, Eigen::RowMajor> multiplier(depth, depth);
GetBandMatrix<T>(depth, depth_radius_, &multiplier);
将输出转为二维矩阵out_shaped
。
Eigen::Tensor::contract 在二维情况下是矩阵乘。
in_shaped
在平方后与multiplier
矩阵相乘实现求和,得到 b i a s + α ( ∑ j , i − n i + n x j 2 ) bias + \alpha(\sum_{j,{i - n}}^{i + n} x_j^2) bias+α(∑j,i−ni+nxj2)。
auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};
auto tmp = in_shaped.square().contract(multiplier, dims) * alpha_ + bias_;
激活的计算开销较大所以对特殊情况进行优化:
beta_=1
:对tmp
取倒数;beta_=0.5
:则直接使用rsqrt
函数; if (beta_ == T(1)) {
out_shaped.device(context->eigen_cpu_device()) =
in_shaped * tmp.inverse();
} else if (beta_ == T(0.5)) {
out_shaped.device(context->eigen_cpu_device()) = in_shaped * tmp.rsqrt();
} else {
out_shaped.device(context->eigen_cpu_device()) =
in_shaped * (tmp.log() * -beta_).exp();
}
#endif
}
private:
typedef typename Eigen::Tensor<T, 1, Eigen::RowMajor>::DimensionPair DimPair;
Eigen::Map 映射现有数据数组的矩阵或向量表达式。
Eigen::Matrix 有六个模板参数,一般只需指定前三个参数。
将in
映射为二维矩阵。由于 Eigen 矩阵默认为列优先存储顺序,所以不需要进行转置。
向量表达式的大小 为depth
,跨度为batch * rows * cols
。
padded_square
的大小为depth+depth_radius_ * 2
DenseBase::setZero
void SingleThreadedLRN(const Tensor& in, const int batch, const int rows,
const int cols, const int depth, Tensor* out) {
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> data_in(
in.flat<T>().data(), depth, batch * rows * cols);
Eigen::Map<Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic>> data_out(
out->flat<T>().data(), depth, batch * rows * cols);
const int double_depth_radius = depth_radius_ * 2;
Eigen::Matrix<T, Eigen::Dynamic, 1> padded_square(data_in.rows() +
double_depth_radius);
padded_square.setZero();
每次取一个空间位置上的所有通道,求平方;滑动窗口求和。需要 N H W × C NHW \times C NHW×C 两层循环。仅在求平方时存在向量化。
块操作跳过padded_square
的前depth_radius_
行,存储 α x 2 \alpha x^2 αx2。
accumulated_scale
为当前位置上的 α ( ∑ j , i − n i + n x j 2 \alpha(\sum_{j,{i - n}}^{i + n} x_j^2 α(∑j,i−ni+nxj2。
data_out
中为 N = ( b i a s + α ( ∑ j , i − n i + n x j 2 ) ) N = (bias + \alpha(\sum_{j,{i - n}}^{i + n} x_j^2)) N=(bias+α(∑j,i−ni+nxj2))
for (int r = 0; r < data_in.cols(); ++r) {
// Do local response normalization for data_in(:, r). First, compute the
// square and store them in buffer for repeated use.
padded_square.block(depth_radius_, 0, data_out.rows(), 1) =
data_in.col(r).cwiseProduct(data_in.col(r)) * alpha_;
// Then, compute the scale and write it to data_out.
T accumulated_scale(0);
for (int i = 0; i < double_depth_radius; ++i) {
accumulated_scale += padded_square(i);
}
for (int i = 0; i < data_in.rows(); ++i) {
accumulated_scale += padded_square(i + double_depth_radius);
data_out(i, r) = bias_ + accumulated_scale;
accumulated_scale -= padded_square(i);
}
}
根据不同情况计算激活。
if (beta_ == T(1)) {
data_out.array() = data_in.array() * data_out.array().inverse();
} else if (beta_ == T(0.5)) {
data_out.array() = data_in.array() * data_out.array().rsqrt();
} else {
data_out.array() =
data_in.array() * (data_out.array().log() * -beta_).exp();
}
}
int depth_radius_;
T bias_;
T alpha_;
T beta_;
};
沿着对角线上大小为(2*深度_半径+1)的扫描线,创建一个逐深的带状矩阵,其中有1个。
沿对角线周围的一条尺寸为( 2 深度_半径+ 1)的条带创建一个深度为1s的深度带状矩阵。
沿对角线周围的大小(2depth_radius+1)创建一个带1的逐深度带矩阵。
// Create a depth-by-depth band matrix with 1s along a swath of size (2 *
// depth_radius + 1) around the diagonal.
template <typename T>
void GetBandMatrix(int depth, int depth_radius,
Eigen::Tensor<T, 2, Eigen::RowMajor>* result) {
result->setZero();
for (int row = 0; row < depth; ++row) {
const int begin = std::max<int>(0, row - depth_radius);
const int end = std::min<int>(depth, row + depth_radius + 1);
Eigen::DSizes<Eigen::DenseIndex, 2> start(row, begin);
Eigen::DSizes<Eigen::DenseIndex, 2> sizes(1, end - begin);
result->slice(start, sizes).setConstant(T(1));
}
}
template <typename Device, typename T>
class LRNGradOp : public OpKernel {
public:
explicit LRNGradOp(OpKernelConstruction* context) : OpKernel(context) {
int64_t depth_radius64;
OP_REQUIRES_OK(context, context->GetAttr("depth_radius", &depth_radius64));
OP_REQUIRES(
context,
FastBoundsCheck(depth_radius64, std::numeric_limits<int>::max()),
errors::InvalidArgument("depth_radius = ", depth_radius64,
" larger than int max"));
depth_radius_ = static_cast<int>(depth_radius64);
float tmp;
OP_REQUIRES_OK(context, context->GetAttr("bias", &tmp));
bias_ = T(tmp);
OP_REQUIRES_OK(context, context->GetAttr("alpha", &tmp));
alpha_ = T(tmp);
OP_REQUIRES_OK(context, context->GetAttr("beta", &tmp));
beta_ = T(tmp);
}
创建一个 LaunchLRNGrad 对象并调用。
void Compute(OpKernelContext* context) override {
const Tensor& in_grads = context->input(0);
const Tensor& in_image = context->input(1);
const Tensor& out_image = context->input(2);
OP_REQUIRES(context, in_grads.dims() == 4 && in_image.dims() == 4,
errors::InvalidArgument("inputs must be 4-dimensional"));
const int64_t batch = in_grads.dim_size(0);
const int64_t rows = in_grads.dim_size(1);
const int64_t cols = in_grads.dim_size(2);
const int64_t depth = in_grads.dim_size(3);
OP_REQUIRES(
context,
in_image.dim_size(0) == batch && in_image.dim_size(1) == rows &&
in_image.dim_size(2) == cols && in_image.dim_size(3) == depth &&
out_image.dim_size(0) == batch && out_image.dim_size(1) == rows &&
out_image.dim_size(2) == cols && out_image.dim_size(3) == depth &&
out_image.dims() == 4,
errors::InvalidArgument(
"input_grads, input_image, and out_image should have the same "
"shape"));
Tensor* output = nullptr;
OP_REQUIRES_OK(context,
context->allocate_output(
0, TensorShape({batch, rows, cols, depth}), &output));
LaunchLRNGrad<Device, T> launcher(depth_radius_, bias_, alpha_, beta_);
launcher.launch(context, this, in_grads, in_image, out_image, output);
}
private:
int depth_radius_;
T bias_;
T alpha_;
T beta_;
};
template <typename T>
struct LaunchLRNGrad<CPUDevice, T> {
LaunchLRNGrad(int depth_radius, T bias, T alpha, T beta)
: depth_radius_(depth_radius),
bias_(bias),
alpha_(alpha),
beta_(beta),
alpha_beta_2_(T(-2) * alpha * beta) {}
void launch(OpKernelContext* context, OpKernel* kernel,
const Tensor& in_grads, const Tensor& in_image,
const Tensor& out_image, Tensor* output) {
const int64_t batch = in_grads.dim_size(0);
const int64_t rows = in_grads.dim_size(1);
const int64_t cols = in_grads.dim_size(2);
const int64_t depth = in_grads.dim_size(3);
const auto nodes = cols * rows;
auto grads_shaped = in_grads.shaped<T, 2>({nodes * batch, depth});
auto in_shaped = in_image.shaped<T, 2>({nodes * batch, depth});
auto activations = out_image.shaped<T, 2>({nodes * batch, depth});
auto out_shaped = output->shaped<T, 2>({nodes * batch, depth});
out_shaped.setZero();
y i = x i ( b i a s + α ( ∑ j , i − n i + n x j 2 ) ) β y_i = \frac{x_i}{(bias + \alpha(\sum_{j,{i - n}}^{i + n} x_j^2))^\beta} yi=(bias+α(∑j,i−ni+nxj2))βxi
令
N = ( b i a s + α ( ∑ j , i − n i + n x j 2 ) ) = ( x i y i ) 1 β N = (bias + \alpha(\sum_{j,{i - n}}^{i + n} x_j^2)) = (\frac{x_i}{y_i})^{\frac{1}{\beta}} N=(bias+α(j,i−n∑i+nxj2))=(yixi)β1
N − β = y i x i N^{-{\beta}} = \frac{y_i}{x_i} N−β=xiyi
则
d y i d x i = N β − x i β ⋅ N β − 1 ⋅ 2 α x i N 2 β = 1 − 2 α β N − 1 ⋅ x i x i N β d y i d x j = − x i β ⋅ N β − 1 ⋅ 2 α x j N 2 β = − 2 α β ⋅ N − 1 ⋅ x i x j N β \begin{aligned} \frac{dy_i}{dx_i} &= \frac{N^\beta - x_i\beta\cdot N^{\beta-1}\cdot 2\alpha x_i}{N^{2\beta}}\\ &= \frac{1 - 2\alpha\beta N^{-1}\cdot x_i x_i}{N^{\beta}}\\ \frac{dy_i}{dx_j} &= \frac{ - x_i\beta \cdot N^{\beta-1}\cdot 2\alpha x_j}{N^{2\beta}}\\ &= \frac{ - 2\alpha\beta \cdot N^{-1}\cdot x_i x_j}{N^{\beta}} \end{aligned} dxidyidxjdyi=N2βNβ−xiβ⋅Nβ−1⋅2αxi=Nβ1−2αβN−1⋅xixi=N2β−xiβ⋅Nβ−1⋅2αxj=Nβ−2αβ⋅N−1⋅xixj
将 d y dy dy 修改为两维。
对于每一个输出元素,norm
为 N N N,pre_computed_pow
为 N − β N^{-\beta} N−β。
activations_ab2
为 − 2 α β y i -2\alpha\beta y_i −2αβyi
d E d x i = d E d y i d y i d x i = d y i 1 − 2 α β N − 1 ⋅ x i x j N β = g s ⋅ − 2 α β x i y i N \begin{aligned} \frac{dE}{dx_i} &= \frac{dE}{dy_i} \frac{dy_i}{dx_i}\\ &= dy_i \frac{1 - 2\alpha\beta N^{-1}\cdot x_i x_j}{N^{\beta}} \\ &= gs \cdot \frac{-2\alpha\beta x_i y_i}{N} \end{aligned} dxidE=dyidEdxidyi=dyiNβ1−2αβN−1⋅xixj=gs⋅N−2αβxiyi
所以 y i = N − β ⋅ x j y_i = N^{-{\beta}}\cdot x_j yi=N−β⋅xj。
求 N N N 时需要求和。
auto shard = [this, activations, in_shaped, grads_shaped, out_shaped,
depth](int64_t begin, int64_t end) {
for (int64_t i = begin; i < end; ++i) {
for (int64_t j = 0; j < depth; ++j) {
// Let y be the LRN activations and x be the inputs along the depth
// dimension. (LRN operates independently along rows, cols, and
// batch).
// We have
// yi = xi / (bias + alpha(sum_j_{i - depth_radius}^{i + depth_radius}
// x_j^2))^beta
//
// Let N = (bias + alpha(sum_j_{i - depth_radius}^{i + depth_radius}
// x_j^2))
// dy_i/dx_i = (N^beta - xi. beta*N^(beta-1)*2*alpha*xi)/N^(2*beta)
// dy_i/dx_j = ( - xi. beta*N^(beta-1)*2*alpha*xj)/N^(2*beta)
//
// NOTE(keveman) : We can compute N by doing (yi/xi) ^ (1/beta).
// However, this is numerically unstable for small values of xi. We
// compute N explicitly here to avoid that.
T gs = grads_shaped(i, j);
if (gs == T(0)) continue;
int64_t depth_begin = std::max<int64_t>(0, j - depth_radius_);
int64_t depth_end = std::min<int64_t>(depth, j + depth_radius_ + 1);
T norm(0);
for (int64_t k = depth_begin; k < depth_end; ++k) {
norm += in_shaped(i, k) * in_shaped(i, k);
}
norm = alpha_ * norm + bias_;
DCHECK_GT(norm, T(1e-6));
T pre_computed_pow = Eigen::numext::pow(norm, -beta_);
T activations_ab2 = alpha_beta_2_ * activations(i, j);
for (int64_t k = depth_begin; k < depth_end; ++k) {
T dyi = in_shaped(i, k) * activations_ab2 / norm;
if (k == j) {
dyi += pre_computed_pow;
}
dyi *= gs;
const_cast<typename TTypes<T, 2>::Tensor&>(out_shaped)(i, k) += dyi;
}
}
}
};
Shard 使用线程池执行函数。
auto worker_threads = *(context->device()->tensorflow_cpu_worker_threads());
Shard(worker_threads.num_threads, worker_threads.workers, nodes * batch,
depth * depth, shard);
}
int depth_radius_;
T bias_;
T alpha_;
T beta_;
T alpha_beta_2_;
};