tvm tutorial (1.1)

Continuing from where the notes on tensor_expr_get_started.py left off last time.

The remaining code is shown below; the example given targets an NVIDIA GPU device:

run_cuda = False
if run_cuda:
    # Change this target to the correct backend for your GPU. For example: cuda (NVIDIA GPUs),
    # rocm (Radeon GPUs), OpenCL (opencl).
    tgt_gpu = tvm.target.Target(target="cuda", host="llvm")

    # Recreate the schedule
    n = te.var("n")
    A = te.placeholder((n,), name="A")
    B = te.placeholder((n,), name="B")
    C = te.compute(A.shape, lambda i: A[i] + B[i], name="C")

    s = te.create_schedule(C.op)

    bx, tx = s[C].split(C.op.axis[0], factor=64)

    ################################################################################
    # Finally we must bind the iteration axis bx and tx to threads in the GPU
    # compute grid. The naive schedule is not valid for GPUs, and these are
    # specific constructs that allow us to generate code that runs on a GPU.

    s[C].bind(bx, te.thread_axis("blockIdx.x"))
    s[C].bind(tx, te.thread_axis("threadIdx.x"))

    ######################################################################
    # Compilation
    # -----------
    # After we have finished specifying the schedule, we can compile it
    # into a TVM function. By default TVM compiles into a type-erased
    # function that can be directly called from the python side.
    #
    # In the following line, we use tvm.build to create a function.
    # The build function takes the schedule, the desired signature of the
    # function (including the inputs and outputs) as well as target language
    # we want to compile to.
    #
    # The result of compilation fadd is a GPU device function (if GPU is
    # involved) as well as a host wrapper that calls into the GPU
    # function. fadd is the generated host wrapper function, it contains
    # a reference to the generated device function internally.

    fadd = tvm.build(s, [A, B, C], target=tgt_gpu, name="myadd")

    ################################################################################
    # The compiled TVM function exposes a concise C API that can be invoked from
    # any language.
    #
    # We provide a minimal array API in python to aid quick testing and prototyping.
    # The array API is based on the DLPack standard.
    #
    # - We first create a GPU device.
    # - Then tvm.nd.array copies the data to the GPU.
    # - ``fadd`` runs the actual computation
    # - ``numpy()`` copies the GPU array back to the CPU (so we can verify correctness).
    #
    # Note that copying the data to and from the memory on the GPU is a required step.

    dev = tvm.device(tgt_gpu.kind.name, 0)

    n = 1024
    a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), dev)
    b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), dev)
    c = tvm.nd.array(np.zeros(n, dtype=C.dtype), dev)
    fadd(a, b, c)
    tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

    ################################################################################
    # Inspect the Generated GPU Code
    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    # You can inspect the generated code in TVM. The result of tvm.build is a TVM
    # Module. fadd is the host module that contains the host wrapper, it also
    # contains a device module for the CUDA (GPU) function.
    #
    # The following code fetches the device module and prints the content code.

    if (
        tgt_gpu.kind.name == "cuda"
        or tgt_gpu.kind.name == "rocm"
        or tgt_gpu.kind.name.startswith("opencl")
    ):
        dev_module = fadd.imported_modules[0]
        print("-----GPU code-----")
        print(dev_module.get_source())
    else:
        print(fadd.get_source())

The most important lines here are the following:

bx, tx = s[C].split(C.op.axis[0], factor=64)

################################################################################
# Finally we must bind the iteration axis bx and tx to threads in the GPU
# compute grid. The naive schedule is not valid for GPUs, and these are
# specific constructs that allow us to generate code that runs on a GPU.
s[C].bind(bx, te.thread_axis("blockIdx.x"))
s[C].bind(tx, te.thread_axis("threadIdx.x"))

Understanding this requires some knowledge of the CUDA programming model. The split above breaks the single loop into two nested loops; with factor=64, the inner loop has 64 iterations. CUDA's threadIdx.x is then bound to the inner loop and blockIdx.x to the outer loop.
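
As a rough illustration (a hand-written sketch of the resulting loop structure, not the code TVM actually emits), the split plus bind corresponds conceptually to the following C++ loops, where on the GPU the outer index is supplied by blockIdx.x and the inner index by threadIdx.x:

// Conceptual view of split(factor=64) followed by bind: the single loop over i
// becomes two nested loops; the outer loop index maps to blockIdx.x and the
// inner loop index maps to threadIdx.x.
void vector_add_split(const float* A, const float* B, float* C, int n) {
    const int factor = 64;
    for (int bx = 0; bx < (n + factor - 1) / factor; bx++) {  // -> blockIdx.x
        for (int tx = 0; tx < factor; tx++) {                 // -> threadIdx.x
            int i = bx * factor + tx;
            if (i < n) {
                C[i] = A[i] + B[i];
            }
        }
    }
}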

The rest is essentially the same: compile and run. The part that prints the generated code is as follows:

    if (
        tgt_gpu.kind.name == "cuda"
        or tgt_gpu.kind.name == "rocm"
        or tgt_gpu.kind.name.startswith("opencl")
    ):
        dev_module = fadd.imported_modules[0]
        print("-----GPU code-----")
        print(dev_module.get_source())
    else:
        print(fadd.get_source())

Depending on the device in use, the corresponding device-side code is generated, along with the host-side code.

Saving and loading compiled modules

################################################################################
# Saving and Loading Compiled Modules
# -----------------------------------
# Besides runtime compilation, we can save the compiled modules into a file and
# load them back later.
#
# The following code first performs the following steps:
#
# - It saves the compiled host module into an object file.
# - Then it saves the device module into a ptx file.
# - cc.create_shared calls a compiler (gcc) to create a shared library

from tvm.contrib import cc
from tvm.contrib import utils

temp = utils.tempdir()
fadd.save(temp.relpath("myadd.o"))
if tgt.kind.name == "cuda":
    fadd.imported_modules[0].save(temp.relpath("myadd.ptx"))
if tgt.kind.name == "rocm":
    fadd.imported_modules[0].save(temp.relpath("myadd.hsaco"))
if tgt.kind.name.startswith("opencl"):
    fadd.imported_modules[0].save(temp.relpath("myadd.cl"))
cc.create_shared(temp.relpath("myadd.so"), [temp.relpath("myadd.o")])
print(temp.listdir())

################################################################################
# .. note:: Module Storage Format
#
#   The CPU (host) module is directly saved as a shared library (.so). There
#   can be multiple customized formats of the device code. In our example, the
#   device code is stored in ptx, as well as a meta data json file. They can be
#   loaded and linked separately via import.

################################################################################
# Load Compiled Module
# ~~~~~~~~~~~~~~~~~~~~
# We can load the compiled module from the file system and run the code. The
# following code loads the host and device module separately and links them
# together. We can verify that the newly loaded function works.

fadd1 = tvm.runtime.load_module(temp.relpath("myadd.so"))
if tgt.kind.name == "cuda":
    fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.ptx"))
    fadd1.import_module(fadd1_dev)

if tgt.kind.name == "rocm":
    fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.hsaco"))
    fadd1.import_module(fadd1_dev)

if tgt.kind.name.startswith("opencl"):
    fadd1_dev = tvm.runtime.load_module(temp.relpath("myadd.cl"))
    fadd1.import_module(fadd1_dev)

fadd1(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

################################################################################
# Pack Everything into One Library
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# In the above example, we store the device and host code separately. TVM also
# supports exporting everything as one shared library. Under the hood, we pack
# the device modules into binary blobs and link them together with the host
# code. Currently we support packing of Metal, OpenCL and CUDA modules.

fadd.export_library(temp.relpath("myadd_pack.so"))
fadd2 = tvm.runtime.load_module(temp.relpath("myadd_pack.so"))
fadd2(a, b, c)
tvm.testing.assert_allclose(c.numpy(), a.numpy() + b.numpy())

Looking at the CUDA case: when an NVIDIA GPU is used, the device module is saved as a PTX file, which contains the GPU assembly-level code.
Later, the runtime can load the PTX file directly and run it.
As the comments in the code also point out, the program above saves the host-side and device-side code separately; they can also be packed together into a single shared library.

Matrix multiplication optimization

Naive method

This part uses TE to optimize matrix multiplication, mainly on the CPU (indeed, this machine has no GPU).
Matrix multiplication is a compute-intensive op; the main optimizations fall into two categories (taken from the tutorial comments):
1. Improve the cache hit rate, which requires changing the memory access pattern to match how the cache works.
2. Use vector instructions; with the right loop optimizations, the LLVM backend can lower the code to vector instructions.

import tvm
import tvm.testing
from tvm import te
import numpy
import timeit

# The size of the matrix
# (M, K) x (K, N)
# You are free to try out different shapes, sometimes TVM optimization outperforms numpy with MKL.
M = 1024
K = 1024
N = 1024

# The default tensor data type in tvm
dtype = "float32"

# You will want to adjust the target to match any CPU vector extensions you
# might have. For example, if you're using the Intel AVX2 (Advanced Vector
# Extensions) ISA for SIMD, you can get the best performance by changing the
# following line to ``llvm -mcpu=core-avx2``, or the specific type of CPU you use.
# Recall that you're using llvm; you can run ``llc --version`` to get the CPU
# type, and you can check ``/proc/cpuinfo`` for additional extensions that your
# processor might support.

target = tvm.target.Target(target="llvm", host="llvm")
dev = tvm.device(target.kind.name, 0)

# Random generated tensor for testing
a = tvm.nd.array(numpy.random.rand(M, K).astype(dtype), dev)
b = tvm.nd.array(numpy.random.rand(K, N).astype(dtype), dev)

# Repeatedly perform a matrix multiplication to get a performance baseline
# for the default numpy implementation
np_repeat = 100
np_running_time = timeit.timeit(
    setup="import numpy\n"
    "M = " + str(M) + "\n"
    "K = " + str(K) + "\n"
    "N = " + str(N) + "\n"
    'dtype = "float32"\n'
    "a = numpy.random.rand(M, K).astype(dtype)\n"
    "b = numpy.random.rand(K, N).astype(dtype)\n",
    stmt="answer = numpy.dot(a, b)",
    number=np_repeat,
)
print("Numpy running time: %f" % (np_running_time / np_repeat))

answer = numpy.dot(a.numpy(), b.numpy())
################################################################################
# Now we write a basic matrix multiplication using TVM TE and verify that it
# produces the same results as the numpy implementation. We also write a
# function that will help us measure the performance of the schedule
# optimizations.

# TVM Matrix Multiplication using TE
k = te.reduce_axis((0, K), "k")
A = te.placeholder((M, K), name="A")
B = te.placeholder((K, N), name="B")
C = te.compute((M, N), lambda x, y: te.sum(A[x, k] * B[k, y], axis=k), name="C")

# Default schedule
s = te.create_schedule(C.op)
func = tvm.build(s, [A, B, C], target=target, name="mmult")

c = tvm.nd.array(numpy.zeros((M, N), dtype=dtype), dev)
func(a, b, c)
tvm.testing.assert_allclose(c.numpy(), answer, rtol=1e-5)

The code above computes the matrix multiplication in the most naive way; the corresponding C++ logic is:

#include <cstring>  // for memset

// A[N][K] * B[K][M] = C[N][M]
void gemm_naive(float* A, float* B, float* C, int N, int M, int K) {
    memset(C, 0, sizeof(float) * N * M);
    for (int n = 0; n < N; n++) {
        for (int m = 0; m < M; m++) {
            float tmp = 0.0;
            for (int k = 0; k < K; k++) {
                // C[n][m] += A[n][k] * B[k][m];
                tmp += (*(A + n * K + k)) * (*(B + k * M + m));
            }
            (*(C + n * M + m)) = tmp;
        }
    }
}

The IR generated by the TVM code above is:

primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
             C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
             B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  for (x: int32, 0, 1024) {
    for (y: int32, 0, 1024) {
      C_2[((x*1024) + y)] = 0f32
      for (k: int32, 0, 1024) {
        C_2[((x*1024) + y)] = ((float32*)C_2[((x*1024) + y)] + ((float32*)A_2[((x*1024) + k)]*(float32*)B_2[((k*1024) + y)]))
      }
    }
  }
}

Blocking & tiling optimization

What is blocking/tiling optimization? It exploits the locality of the cache: by making memory accesses contiguous and reducing the cache misses caused by strided accesses, it improves execution efficiency.
To get contiguous accesses and keep the working set in the cache, two primitives are used: tile and reorder. A tiled block of data can fit entirely into the cache.

On a CPU, this technique increases the cache hit rate.
The TVM code is as follows:

# Optimization 1: Blocking
# ~~~~~~~~~~~~~~~~~~~~~~~~
#
# An important trick to enhance the cache hit rate is blocking, where you
# structure memory access such that the memory inside a block is a small neighborhood
# that has high memory locality. In this tutorial, we pick a block factor of
# 32. This will result in a block that will fill a 32 * 32 * sizeof(float) area
# of memory. This corresponds to a cache size of 4KB, in relation to a
# reference cache size of 32 KB for L1 cache.
#
# We begin by creating a default schedule for the ``C`` operation, then apply a
# ``tile`` scheduling primitive to it with the specified block factor, with the
# scheduling primitive returning the resulting loop order from outermost to
# innermost, as a vector ``[x_outer, y_outer, x_inner, y_inner]``. We then get
# the reduction axis for output of the operation, and perform a split operation
# on it using a factor of 4. This factor doesn't directly impact the blocking
# optimization we're working on right now, but will be useful later when we
# apply vectorization.
#
# Now that the operation has been blocked, we can reorder the computation to
# put the reduction operation into the outermost loop of the computation,
# helping to guarantee that the blocked data remains in cache. This completes
# the schedule, and we can build and test the performance compared to the naive
# schedule.

bn = 32

# Blocking by loop tiling
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
(k,) = s[C].op.reduce_axis
ko, ki = s[C].split(k, factor=4)

# Hoist reduction domain outside the blocking loop
s[C].reorder(xo, yo, ko, ki, xi, yi)

evaluate_operation(s, [A, B, C], target=target, name="mmult", optimization="blocking", log=log)

################################################################################
# By reordering the computation to take advantage of caching, you should see a
# significant improvement in the performance of the computation. Now, print the
# internal representation and compare it to the original:

print(tvm.lower(s, [A, B, C], simple_mode=True))

Tiling only

Here the tile factor is 32, i.e. a 32×32 block that can fit into the cache.
If only the tile schedule is applied, the IR looks like this; the extents of the inner loops are the tile factor:

primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
             C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
             B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  for (x.outer: int32, 0, 32) {
    for (y.outer: int32, 0, 32) {
      for (x.inner: int32, 0, 32) {
        for (y.inner: int32, 0, 32) {
          C_2[((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)) + y.inner)] = 0f32
          for (k: int32, 0, 1024) {
            C_2[((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)) + y.inner)] = ((float32*)C_2[((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)) + y.inner)] + ((float32*)A_2[(((x.outer*32768) + (x.inner*1024)) + k)]*(float32*)B_2[(((k*1024) + (y.outer*32)) + y.inner)]))
          }
        }
      }
    }
  }
}

The stride of x.outer is 32768, i.e. extent(y.outer) × extent(x.inner) × extent(y.inner) = 32 × 32 × 32 = 32768.

The corresponding C++ code is as follows (note that, unlike the tile-only IR above, it also tiles the k loop):

template <typename T>
void gemm_tile(T* A, T* B, T* C, int N, int M, int K, int fac) {
    memset(C, 0, sizeof(T) * N * M);
    for (int n_outer = 0; n_outer < (N + fac - 1) / fac; n_outer++) {
        for (int m_outer = 0; m_outer < (M + fac - 1) / fac; m_outer++) {
            for (int k_outer = 0; k_outer < (K + fac - 1) / fac; k_outer++) {

                for (int n_inner = 0; n_inner < fac; n_inner++) {
                    int index_n = n_outer * fac + n_inner;
                    if (index_n >= N) break;

                    for (int m_inner = 0; m_inner < fac; m_inner++) {
                        int index_m = m_outer * fac + m_inner;
                        if (index_m >= M) break;

                        for (int k_inner = 0; k_inner < fac; k_inner++) {
                            int index_k = k_outer * fac + k_inner;
                            if (index_k >= K) break;

                            C[index_n * M + index_m] += (*(A + index_n * K + index_k)) * (*(B + index_k * M + index_m));
                        }
                    }
                }
            }
        }
    }
}

Reordering only

The traversal pattern is illustrated below:
[Figure 1: matrix traversal pattern]
In the naive method the innermost loop runs over k, so matrix B is accessed down a column (a stride-N jump between consecutive accesses), which is very unfriendly to data locality. By reordering so that y becomes the innermost loop, B is traversed contiguously along its rows. The TVM code using only reorder is:

bn = 32
s = te.create_schedule(C.op)  # start from a fresh default schedule
(k,) = s[C].op.reduce_axis
(x, y) = C.op.axis
s[C].reorder(x, k, y)
print(tvm.lower(s, [A, B, C], simple_mode=True))

The generated IR is as follows (note that this particular run used 512×512 matrices):

primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {C: Buffer(C_2: Pointer(float32), float32, [512, 512], []),
             A: Buffer(A_2: Pointer(float32), float32, [512, 512], []),
             B: Buffer(B_2: Pointer(float32), float32, [512, 512], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  for (x: int32, 0, 512) {
    for (y.init: int32, 0, 512) {
      C_2[((x*512) + y.init)] = 0f32
    }
    for (k: int32, 0, 512) {
      for (y: int32, 0, 512) {
        C_2[((x*512) + y)] = ((float32*)C_2[((x*512) + y)] + ((float32*)A_2[((x*512) + k)]*(float32*)B_2[((k*512) + y)]))
      }
    }
  }
}

The corresponding C++ implementation is:

template <typename T>
void gemm_reordering(T* A, T* B, T* C, int N, int M, int K) {
    memset(C, 0, sizeof(T) * N * M);
    for (int n = 0; n < N; n++) {
        for (int k = 0; k < K; k++) {
            for (int m = 0; m < M; m++) {
                C[n * M + m] += A[n * K + k] * B[k * M + m];
            }
        }
    }
}

Reordering & tiling

Combining the reordering and tiling strategies:

# Blocking by loop tiling
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
(k,) = s[C].op.reduce_axis
ko, ki = s[C].split(k, factor=4)
# Hoist reduction domain outside the blocking loop
s[C].reorder(xo, yo, ko, ki, xi, yi)

print(tvm.lower(s, [A, B, C], simple_mode=True))

The resulting IR is:

primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
             A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
             B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  for (x.outer: int32, 0, 32) {
    for (y.outer: int32, 0, 32) {
      for (x.inner.init: int32, 0, 32) {
        for (y.inner.init: int32, 0, 32) {
          C_2[((((x.outer*32768) + (x.inner.init*1024)) + (y.outer*32)) + y.inner.init)] = 0f32
        }
      }
      for (k.outer: int32, 0, 256) {
        for (k.inner: int32, 0, 4) {
          for (x.inner: int32, 0, 32) {
            for (y.inner: int32, 0, 32) {
              C_2[((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)) + y.inner)] = ((float32*)C_2[((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)) + y.inner)] + ((float32*)A_2[((((x.outer*32768) + (x.inner*1024)) + (k.outer*4)) + k.inner)]*(float32*)B_2[((((k.outer*4096) + (k.inner*1024)) + (y.outer*32)) + y.inner)]))
            }
          }
        }
      }
    }
  }
}

The corresponding C++ code is:

template <typename T>
void gemm_reordering_tile(T* A, T* B, T* C, int N, int M, int K, int fac) {
    memset(C, 0, sizeof(T) * N * M);
    for (int n_outer = 0; n_outer < (N + fac - 1) / fac; n_outer++) {
        for (int k_outer = 0; k_outer < (K + fac - 1) / fac; k_outer++) {

            for (int m_outer = 0; m_outer < (M + fac - 1) / fac; m_outer++) {
                for (int n_inner = 0; n_inner < fac; n_inner++) {
                    int index_n = n_outer * fac + n_inner;
                    if (index_n >= N) break;

                    for (int k_inner = 0; k_inner < fac; k_inner++) {
                        int index_k = k_outer * fac + k_inner;
                        if (index_k >= K) break;

                        for (int m_inner = 0; m_inner < fac; m_inner++) {
                            int index_m = m_outer * fac + m_inner;
                            if (index_m >= M) break;

                            C[index_n * M + index_m] += (*(A + index_n * K + index_k)) * (*(B + index_k * M + index_m));
                        }
                    }
                }
            }
        }
    }
}

Vectorization

The TVM code for this step is as follows:

bn = 32

# Blocking by loop tiling
xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
(k,) = s[C].op.reduce_axis
ko, ki = s[C].split(k, factor=4)

# Hoist reduction domain outside the blocking loop
s[C].reorder(xo, yo, ko, ki, xi, yi)
s[C].vectorize(yi)

Note the order in the reorder above: yi is placed last, and ki is placed before xi and yi.

The generated IR is:

primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
             A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
             B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  for (x.outer: int32, 0, 32) {
    for (y.outer: int32, 0, 32) {
      for (x.inner.init: int32, 0, 32) {
        C_2[ramp((((x.outer*32768) + (x.inner.init*1024)) + (y.outer*32)), 1, 32)] = broadcast(0f32, 32)
      }
      for (k.outer: int32, 0, 256) {
        for (k.inner: int32, 0, 4) {
          for (x.inner: int32, 0, 32) {
            C_2[ramp((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)), 1, 32)] = ((float32x32*)C_2[ramp((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)), 1, 32)] + (broadcast((float32*)A_2[((((x.outer*32768) + (x.inner*1024)) + (k.outer*4)) + k.inner)], 32)*(float32x32*)B_2[ramp((((k.outer*4096) + (k.inner*1024)) + (y.outer*32)), 1, 32)]))
          }
        }
      }
    }
  }
}

Notice that the y.inner loop has disappeared from the IR above, because the y axis has been vectorized: each statement now operates on 32 consecutive floats (the ramp/broadcast expressions).

The C++ code using vector instructions is not written out here for now. (Remember to fill it in.)
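
As a stopgap for that TODO, here is a rough C++ sketch of the loop structure this schedule corresponds to (a hand-written approximation, not TVM output; it assumes N, M and K are divisible by the tile factors fac and kfac). The innermost m_inner loop walks over fac contiguous floats and plays the role of the vectorized y.inner axis; written as a plain loop, a compiler can auto-vectorize it:

#include <cstring>

// Loop structure sketch for reorder(xo, yo, ko, ki, xi, yi) + vectorize(yi).
// The contiguous m_inner loop corresponds to the vectorized y.inner axis.
template <typename T>
void gemm_tile_vectorized(T* A, T* B, T* C, int N, int M, int K, int fac, int kfac) {
    memset(C, 0, sizeof(T) * N * M);
    for (int n_outer = 0; n_outer < N / fac; n_outer++) {               // x.outer
        for (int m_outer = 0; m_outer < M / fac; m_outer++) {           // y.outer
            for (int k_outer = 0; k_outer < K / kfac; k_outer++) {      // k.outer
                for (int k_inner = 0; k_inner < kfac; k_inner++) {      // k.inner
                    for (int n_inner = 0; n_inner < fac; n_inner++) {   // x.inner
                        int n = n_outer * fac + n_inner;
                        int k = k_outer * kfac + k_inner;
                        T a = A[n * K + k];
                        // y.inner: fac consecutive elements of B and C, vectorizable
                        for (int m_inner = 0; m_inner < fac; m_inner++) {
                            int m = m_outer * fac + m_inner;
                            C[n * M + m] += a * B[k * M + m];
                        }
                    }
                }
            }
        }
    }
}

With fac = 32, this inner loop is what shows up as the float32x32 ramp/broadcast operations in the IR above.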

Loop permutation

In the loop arrangement above, matrix A is accessed column-wise (k.inner sits outside x.inner), so the cache hit rate is poor.

The TVM code is:

 s = te.create_schedule(C.op)
 xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
 (k,) = s[C].op.reduce_axis
 ko, ki = s[C].split(k, factor=4)

 # re-ordering
 s[C].reorder(xo, yo, ko, xi, ki, yi)
 s[C].vectorize(yi)

The generated IR is:

primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
             A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
             B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  for (x.outer: int32, 0, 32) {
    for (y.outer: int32, 0, 32) {
      for (x.inner.init: int32, 0, 32) {
        C_2[ramp((((x.outer*32768) + (x.inner.init*1024)) + (y.outer*32)), 1, 32)] = broadcast(0f32, 32)
      }
      for (k.outer: int32, 0, 256) {
        for (x.inner: int32, 0, 32) {
          for (k.inner: int32, 0, 4) {
            C_2[ramp((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)), 1, 32)] = ((float32x32*)C_2[ramp((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)), 1, 32)] + (broadcast((float32*)A_2[((((x.outer*32768) + (x.inner*1024)) + (k.outer*4)) + k.inner)], 32)*(float32x32*)B_2[ramp((((k.outer*4096) + (k.inner*1024)) + (y.outer*32)), 1, 32)]))
          }
        }
      }
    }
  }
}

Array packing

This optimization strategy is a bit more involved.
The original layout of B is B[K][N]; it is repacked as B[N/bn][K][bn], where bn is the extent of the innermost loop over B.
Note that the K and N dimensions of B are swapped, and N is split into chunks of size bn.
In the following, write
bigN = N / bn
littleN = bn
Running only the packedB computation gives the following IR:

primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
             C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
             B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  allocate(packedB: Pointer(global float32), float32, [1048576]), storage_scope = global;
  for (bigN: int32, 0, 32) {
    for (k: int32, 0, 1024) {
      for (littleN: int32, 0, 32) {
        packedB[(((bigN*32768) + (k*32)) + littleN)] = (float32*)B_2[(((k*1024) + (bigN*32)) + littleN)]
      }
    }
  }
}

You can see that the packedB computation iterates in the order bigN, k, then littleN.
Note the indexing in the assignment:

 packedB[(((bigN*32768) + (k*32)) + littleN)] = (float32*)B_2[(((k*1024) + (bigN*32)) + littleN)]

The packing process performed by the IR above can be illustrated by the figure below:
[Figure 2: repacking B[K][N] into packedB[N/bn][K][bn]]

The corresponding C++ logic is:

template <typename T>
void packedB(T* B, T* packed_b, int K, int N, int fac) {
    for (int big_n = 0; big_n < N / fac; big_n++) {
        for (int k = 0; k < K; k++) {
            for (int little_n = 0; little_n < fac; little_n++) {
                packed_b[big_n * K * fac + k * fac + little_n] = B[k * N + big_n * fac + little_n];
            }
        }
    }
}

Next, let's look at how matrix C is computed using packedB:

 C = te.compute(
     (M, N),
     lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),
     name="C",
 )

There is a tvm.tir.indexmod in the code above; the official documentation explains it as follows:

Compute the remainder of indexdiv. a and b are non-negative.

Note that the indexdiv mentioned there is also an op; it is documented as:

Compute floor(a / b) where a and b are non-negative. (i.e. floor division)

So presumably indexmod(y, bn) simply gives y % bn (why not just call it mod?).

The generated IR is:

primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
             A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
             B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  allocate(packedB: Pointer(global float32), float32, [1048576]), storage_scope = global {
    for (bigN: int32, 0, 32) {
      for (k: int32, 0, 1024) {
        for (littleN: int32, 0, 32) {
          packedB[(((bigN*32768) + (k*32)) + littleN)] = (float32*)B_2[(((k*1024) + (bigN*32)) + littleN)]
        }
      }
    }
    for (x: int32, 0, 1024) {
      for (y: int32, 0, 1024) {
        C_2[((x*1024) + y)] = 0f32
        for (k_1: int32, 0, 1024) {
          C_2[((x*1024) + y)] = ((float32*)C_2[((x*1024) + y)] + ((float32*)A_2[((x*1024) + k_1)]*(float32*)packedB[(((floordiv(y, 32)*32768) + (k_1*32)) + floormod(y, 32))]))
        }
      }
    }
  }
}

The logic above is simply matrix A multiplied by packedB. Matrix A is indexed in the standard row-major way. As for packedB, since its layout is [N/bn][K][bn], floordiv(y, 32) in the IR is the index into the N/bn dimension (its stride 32768 is 1024 × 32, i.e. K × bn), k_1 is the index into the K dimension, and floormod(y, 32) is the index into the bn dimension.
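
To make this indexing concrete, here is a C++ sketch in the same spirit as the earlier C++ equivalents (a hand-written illustration, not TVM output; the name gemm_packed and the assumption that N is divisible by fac are mine) of a plain matmul reading from packed_b with layout [N/fac][K][fac]:

// C (M x N) = A (M x K) * packed_b, where packed_b has layout [N/fac][K][fac]
template <typename T>
void gemm_packed(T* A, T* packed_b, T* C, int M, int N, int K, int fac) {
    for (int x = 0; x < M; x++) {
        for (int y = 0; y < N; y++) {
            T acc = 0;
            for (int k = 0; k < K; k++) {
                // (y / fac) indexes the N/fac dimension (stride K * fac),
                // k indexes the K dimension (stride fac),
                // (y % fac) indexes the innermost dimension
                acc += A[x * K + k] * packed_b[(y / fac) * (K * fac) + k * fac + (y % fac)];
            }
            C[x * N + y] = acc;
        }
    }
}

With fac = 32 and K = 1024, the term (y / fac) * (K * fac) is exactly the floordiv(y, 32)*32768 seen in the IR.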

The complete code is:

 packedB = te.compute((N / bn, K, bn), lambda x, y, z: B[y, x * bn + z], name="packedB")
 C = te.compute(
     (M, N),
     lambda x, y: te.sum(A[x, k] * packedB[y // bn, k, tvm.tir.indexmod(y, bn)], axis=k),
     name="C",
 )

 s = te.create_schedule(C.op)

 xo, yo, xi, yi = s[C].tile(C.op.axis[0], C.op.axis[1], bn, bn)
 (k,) = s[C].op.reduce_axis
 ko, ki = s[C].split(k, factor=4)

 s[C].reorder(xo, yo, ko, xi, ki, yi)
 s[C].vectorize(yi)

 x, y, z = s[packedB].op.axis
 s[packedB].vectorize(z)
 s[packedB].parallel(x)

In the schedule above, the two axes of matrix C are tiled. In addition, the innermost loop of C is vectorized. C is the result matrix that gets written back; it needs vectorization as well because the vectorized packedB computation produces multiple consecutive result values at once, so the write-back memory accesses should also be vectorized.

The generated IR is:

primfn(A_1: handle, B_1: handle, C_1: handle) -> ()
  attr = {"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}
  buffers = {C: Buffer(C_2: Pointer(float32), float32, [1024, 1024], []),
             A: Buffer(A_2: Pointer(float32), float32, [1024, 1024], []),
             B: Buffer(B_2: Pointer(float32), float32, [1024, 1024], [])}
  buffer_map = {A_1: A, B_1: B, C_1: C} {
  allocate(packedB: Pointer(global float32x32), float32x32, [32768]), storage_scope = global {
    for (bigN: int32, 0, 32) "parallel" {
      for (k: int32, 0, 1024) {
        packedB[ramp(((bigN*32768) + (k*32)), 1, 32)] = (float32x32*)B_2[ramp(((k*1024) + (bigN*32)), 1, 32)]
      }
    }
    for (x.outer: int32, 0, 32) {
      for (y.outer: int32, 0, 32) {
        for (x.inner.init: int32, 0, 32) {
          C_2[ramp((((x.outer*32768) + (x.inner.init*1024)) + (y.outer*32)), 1, 32)] = broadcast(0f32, 32)
        }
        for (k.outer: int32, 0, 256) {
          for (x.inner: int32, 0, 32) {
            for (k.inner: int32, 0, 4) {
              C_2[ramp((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)), 1, 32)] = ((float32x32*)C_2[ramp((((x.outer*32768) + (x.inner*1024)) + (y.outer*32)), 1, 32)] + (broadcast((float32*)A_2[((((x.outer*32768) + (x.inner*1024)) + (k.outer*4)) + k.inner)], 32)*(float32x32*)packedB[ramp((((y.outer*32768) + (k.outer*128)) + (k.inner*32)), 1, 32)]))
            }
          }
        }
      }
    }
  }
}
