本文展示如何只添加18行代码,就在矩阵乘法上获得200+倍的加速(在本文的实测环境中,最终版本相对Baseline的加速约为143倍,见第6节的结果)。
通常,CPU上的计算密集型任务有2个优化点:
- 提高内存访问的缓存命中率
- SIMD指令加速
GEMM的优化手段已有现成的总结,基本都可以在《How to optimize GEMM》这篇文档中找到。
TVM已经实现了其中的一部分优化方法,但受TVM自身的限制,还有一些方法尚未实现。
本文逐步优化,不断提升程序性能。首先用没有优化的code和numpy运行结果做对比,如下:
这是在我自己的容器里运行的结果:
Numpy running time: 0.004862
Baseline: 2.646903
原始ir如下:
produce C {
for (x, 0, 1024) {
for (y, 0, 1024) {
C[((x*1024) + y)] = 0.000000f
for (k, 0, 1024) {
C[((x*1024) + y)] = (C[((x*1024) + y)] + (A[((x*1024) + k)]*B[(y + (k*1024))]))
}
}
}
}
1 Blocking
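使用blocking(分块)技术可以显著提升缓存命中率:数据被分成小块逐块计算,块内的内存访问集中在一小段相邻的地址范围内,局部性更好。从下面优化后的IR可以看到,x、y两个轴被拆成32×32的块(x.outer/x.inner、y.outer/y.inner),k轴按4拆分(k.outer/k.inner)。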
结果如下:
Numpy running time: 0.004732
Baseline: 2.892019
Opt1: 0.701635
优化后ir
produce C {
for (x.outer, 0, 32) {
for (y.outer, 0, 32) {
for (x.inner.init, 0, 32) {
for (y.inner.init, 0, 32) {
C[(((((x.outer*1024) + y.outer) + (x.inner.init*32))*32) + y.inner.init)] = 0.000000f
}
}
for (k.outer, 0, 256) {
for (k.inner, 0, 4) {
for (x.inner, 0, 32) {
for (y.inner, 0, 32) {
C[(((((x.outer*1024) + y.outer) + (x.inner*32))*32) + y.inner)] = (C[(((((x.outer*1024) + y.outer) + (x.inner*32))*32) + y.inner)] + (A[(((((x.outer*8192) + k.outer)*4) + k.inner) + (x.inner*1024))]*B[((((y.outer + (k.outer*128)) + (k.inner*32))*32) + y.inner)]))
}
}
}
}
}
}
}
2 Vectorization
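向量化:把最内层长度为32的y.inner循环改写为向量操作,以便利用SIMD指令。对比上一节的IR,y.inner循环消失了,对C和B的访问变成了ramp(..., 1, 32)形式的连续32个元素的向量读写,A中的标量则通过x32(...)广播成向量。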
结果:
Numpy running time: 0.004964
Baseline: 2.884543
Opt1: 0.713341
Opt2: 0.331218
优化后ir
produce C {
for (x.outer, 0, 32) {
for (y.outer, 0, 32) {
for (x.inner.init, 0, 32) {
C[ramp(((((x.outer*1024) + y.outer) + (x.inner.init*32))*32), 1, 32)] = x32(0.000000f)
}
for (k.outer, 0, 256) {
for (k.inner, 0, 4) {
for (x.inner, 0, 32) {
C[ramp(((((x.outer*1024) + y.outer) + (x.inner*32))*32), 1, 32)] = (C[ramp(((((x.outer*1024) + y.outer) + (x.inner*32))*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer)*4) + k.inner) + (x.inner*1024))])*B[ramp((((y.outer + (k.outer*128)) + (k.inner*32))*32), 1, 32)]))
}
}
}
}
}
}
3 Loop Permutation
交换x.inner和k.inner两层循环的顺序(对比下面的IR与上一节的IR:现在x.inner在外,k.inner在内),这样内层循环对A的访问是连续的4个元素。
结果:
Numpy running time: 0.005203
Baseline: 2.646298
Opt1: 0.691242
Opt2: 0.330293
Opt3: 0.147917
优化后ir:
produce C {
for (x.outer, 0, 32) {
for (y.outer, 0, 32) {
for (x.inner.init, 0, 32) {
C[ramp(((((x.outer*1024) + y.outer) + (x.inner.init*32))*32), 1, 32)] = x32(0.000000f)
}
for (k.outer, 0, 256) {
for (x.inner, 0, 32) {
for (k.inner, 0, 4) {
C[ramp(((((x.outer*1024) + y.outer) + (x.inner*32))*32), 1, 32)] = (C[ramp(((((x.outer*1024) + y.outer) + (x.inner*32))*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.inner*256))*4) + k.inner)])*B[ramp((((y.outer + (k.outer*128)) + (k.inner*32))*32), 1, 32)]))
}
}
}
}
}
}
4 Array Packing
先把B重排为packedB(见下面IR中的produce packedB),使内层循环对B的访问在内存中连续。
结果:
Numpy running time: 0.005159
Baseline: 2.884619
Opt1: 0.693074
Opt2: 0.332173
Opt3: 0.149278
Opt4: 0.233195
这一步优化我连续跑了两次,性能都反而变差了(Opt4为0.233195,慢于Opt3的0.149278)。
优化后ir:
// attr [packedB] storage_scope = "global"
allocate packedB[float32x32 * 32 * 1024 * 1]
produce packedB {
parallel (x, 0, 32) {
for (y, 0, 1024) {
packedB[ramp((((x*1024) + y)*32), 1, 32)] = B[ramp(((x + (y*32))*32), 1, 32)]
}
}
}
produce C {
for (x.outer, 0, 32) {
for (y.outer, 0, 32) {
for (x.inner.init, 0, 32) {
C[ramp(((((x.outer*1024) + y.outer) + (x.inner.init*32))*32), 1, 32)] = x32(0.000000f)
}
for (k.outer, 0, 256) {
for (x.inner, 0, 32) {
for (k.inner, 0, 4) {
C[ramp(((((x.outer*1024) + y.outer) + (x.inner*32))*32), 1, 32)] = (C[ramp(((((x.outer*1024) + y.outer) + (x.inner*32))*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.inner*256))*4) + k.inner)])*packedB[ramp((((((y.outer*256) + k.outer)*4) + k.inner)*32), 1, 32)]))
}
}
}
}
}
}
5 Write cache for blocks
先把每个块的结果累加到局部缓冲区C.global中,整个块计算完成后再一次性写回C(见下面IR中的produce C.global以及其后的拷贝循环)。
结果:
Numpy running time: 0.005358
Baseline: 2.654734
Opt1: 0.689408
Opt2: 0.329072
Opt3: 0.148742
Opt4: 0.231431
Opt5: 0.211086
优化后ir:
// attr [packedB] storage_scope = "global"
allocate packedB[float32x32 * 32 * 1024 * 1]
// attr [C.global] storage_scope = "global"
allocate C.global[float32 * 32 * 32]
produce packedB {
parallel (x, 0, 32) {
for (y, 0, 1024) {
packedB[ramp((((x*1024) + y)*32), 1, 32)] = B[ramp(((x + (y*32))*32), 1, 32)]
}
}
}
produce C {
for (x.outer, 0, 32) {
for (y.outer, 0, 32) {
produce C.global {
for (x.c.init, 0, 32) {
C.global[ramp((x.c.init*32), 1, 32)] = x32(0.000000f)
}
for (k.outer, 0, 256) {
for (x.c, 0, 32) {
C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[((((x.outer*8192) + k.outer) + (x.c*256))*4)])*packedB[ramp((((y.outer*256) + k.outer)*128), 1, 32)]))
C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.c*256))*4) + 1)])*packedB[ramp(((((y.outer*256) + k.outer)*128) + 32), 1, 32)]))
C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.c*256))*4) + 2)])*packedB[ramp(((((y.outer*256) + k.outer)*128) + 64), 1, 32)]))
C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.c*256))*4) + 3)])*packedB[ramp(((((y.outer*256) + k.outer)*128) + 96), 1, 32)]))
}
}
}
for (x.inner, 0, 32) {
for (y.inner, 0, 32) {
C[(((((x.outer*1024) + y.outer) + (x.inner*32))*32) + y.inner)] = C.global[((x.inner*32) + y.inner)]
}
}
}
}
}
6 Parallel
把最外层的x.outer循环改为并行执行(下面IR中的parallel (x.outer, 0, 32)),每个并行迭代各自分配一份C.global。
结果:
Numpy running time: 0.005989
Baseline: 2.635383
Opt1: 0.691006
Opt2: 0.328837
Opt3: 0.149464
Opt4: 0.233010
Opt5: 0.213697
Opt6: 0.018374
优化后ir:
// attr [packedB] storage_scope = "global"
allocate packedB[float32x32 * 32 * 1024 * 1]
produce packedB {
parallel (x, 0, 32) {
for (y, 0, 1024) {
packedB[ramp((((x*1024) + y)*32), 1, 32)] = B[ramp(((x + (y*32))*32), 1, 32)]
}
}
}
produce C {
parallel (x.outer, 0, 32) {
// attr [C.global] storage_scope = "global"
allocate C.global[float32 * 32 * 32]
for (y.outer, 0, 32) {
produce C.global {
for (x.c.init, 0, 32) {
C.global[ramp((x.c.init*32), 1, 32)] = x32(0.000000f)
}
for (k.outer, 0, 256) {
for (x.c, 0, 32) {
C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[((((x.outer*8192) + k.outer) + (x.c*256))*4)])*packedB[ramp((((y.outer*256) + k.outer)*128), 1, 32)]))
C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.c*256))*4) + 1)])*packedB[ramp(((((y.outer*256) + k.outer)*128) + 32), 1, 32)]))
C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.c*256))*4) + 2)])*packedB[ramp(((((y.outer*256) + k.outer)*128) + 64), 1, 32)]))
C.global[ramp((x.c*32), 1, 32)] = (C.global[ramp((x.c*32), 1, 32)] + (x32(A[(((((x.outer*8192) + k.outer) + (x.c*256))*4) + 3)])*packedB[ramp(((((y.outer*256) + k.outer)*128) + 96), 1, 32)]))
}
}
}
for (x.inner, 0, 32) {
for (y.inner, 0, 32) {
C[(((((x.outer*1024) + y.outer) + (x.inner*32))*32) + y.inner)] = C.global[((x.inner*32) + y.inner)]
}
}
}
}
}
参考: https://docs.tvm.ai/tutorials/optimize/opt_gemm.html