<2022-04-28 周四>
ScaleImage() 的硬件加速函数(七)
其实“如何写 ScaleImage() 的硬件加速函数(六)”中的实现,只是把 ResizeHorizontalFilter() 里的 y 改成 y / xFactor 之后的精简版,并不是真正的 ScaleImage() 硬件加速函数。虽然它不是,但至少省掉了 ResizeVerticalFilter() 的调用,速度上更快了。
但是目前仍然存在竖条纹问题:连续多次缩小一半后,图片最终会被黑色竖条纹完全覆盖;反复缩小或者放大时,右侧会出现密集的竖条纹,等等。
经过分析,黑色竖纹产生的原因是 kernel 函数 ScaleFilter() 最内层的循环没有执行,导致初始值 0.0f 被写入了目标地址。
// Excerpt: innermost accumulation loop of the ScaleFilter kernel.
// Each iteration reads one cached pixel (4 consecutive CLQuantum values
// starting at inputImageCache + cacheIndex * 4), converts the four channels
// to float and adds them to filteredPixel.
// When startStep >= stopStep the body never runs, so filteredPixel is left
// at whatever value it held before the loop.
for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
{
float4 cp = (float4)0.0f;
__local CLQuantum* p = inputImageCache + (cacheIndex * 4);
cp.x = (float)*(p);
cp.y = (float)*(p + 1);
cp.z = (float)*(p + 2);
cp.w = (float)*(p + 3);
filteredPixel += cp;
}
可以这样解决:
/*
  ScaleFilter: OpenCL kernel that produces one row of the scaled image per
  work-group row (y = get_global_id(1)), sampling the source image through a
  local-memory cache.

  Work distribution (as visible in this block):
    - The kernel requires a work-group size of 256 x 1 x 1.
    - Work-group g (dimension 0) handles the output columns
      [g * pixelPerWorkgroup, min(g * pixelPerWorkgroup + pixelPerWorkgroup,
      filteredColumns)).
    - Those columns are processed in chunks of pixelChunkSize; inside a chunk
      the work-items are mapped to output pixels by pixelToCompute() and
      getNumWorkItemsPerPixel() (both defined outside this block).

  Parameters:
    inputImage        source pixels, read as 4 CLQuantum values per pixel.
    inputColumns      width of the source image in pixels.
    filteredImage     destination pixels, written through WriteAllChannels().
    filteredColumns   width of the destination image in pixels.
    inputImageCache   local buffer receiving the cached source pixels.
    numCachedPixels   maximum number of source pixels cached per work-group.
    pixelPerWorkgroup number of output columns assigned to each work-group.
    pixelChunkSize    number of output columns processed per chunk.
    outputPixelCache  local buffer holding one float4 accumulator per output
                      pixel of the current chunk.

  matte_or_cmyk, inputRows, filteredRows, resizeFilterScale, densityCache and
  gammaCache are not referenced anywhere in the body of this kernel.
*/
STRINGIFY(
__kernel __attribute__((reqd_work_group_size(256, 1, 1)))
void ScaleFilter(const __global CLQuantum* inputImage, const unsigned int matte_or_cmyk,
const unsigned int inputColumns, const unsigned int inputRows, __global CLQuantum* filteredImage,
const unsigned int filteredColumns, const unsigned int filteredRows,
const float resizeFilterScale,
__local CLQuantum* inputImageCache, const int numCachedPixels,
const unsigned int pixelPerWorkgroup, const unsigned int pixelChunkSize,
__local float4* outputPixelCache, __local float* densityCache, __local float* gammaCache)
{
// calculate the range of resized image pixels computed by this workgroup
const unsigned int startX = get_group_id(0) * pixelPerWorkgroup;
const unsigned int stopX = MagickMin(startX + pixelPerWorkgroup, filteredColumns);
const unsigned int actualNumPixelToCompute = stopX - startX;
// horizontal scale ratio: destination width divided by source width
float xFactor = (float)filteredColumns / inputColumns;
// calculate the range of input image pixels to cache
// (starts at the source column that maps to startX, clamped to [0, inputColumns])
const int cacheRangeStartX = MagickMax((int)((startX + 0.5f) / xFactor), (int)(0));
const int cacheRangeEndX = MagickMin((int)(cacheRangeStartX + numCachedPixels), (int)inputColumns);
// cache the input pixels into local memory
const unsigned int y = get_global_id(1);
// NOTE(review): the source row is derived as y / xFactor, i.e. with the
// horizontal ratio; filteredRows / inputRows is never consulted. This looks
// like it assumes an aspect-preserving scale - confirm against the caller.
const unsigned int pos = getPixelIndex(4, inputColumns, cacheRangeStartX, y / xFactor);
// 4 CLQuantum elements are copied per cached pixel
const unsigned int num_elements = (cacheRangeEndX - cacheRangeStartX) * 4;
event_t e = async_work_group_copy(inputImageCache, inputImage + pos, num_elements, 0);
wait_group_events(1, &e);
// number of chunks needed to cover this workgroup's pixels (rounded up)
unsigned int totalNumChunks = (actualNumPixelToCompute + pixelChunkSize - 1) / pixelChunkSize;
for (unsigned int chunk = 0; chunk < totalNumChunks; chunk++)
{
const unsigned int chunkStartX = startX + chunk * pixelChunkSize;
const unsigned int chunkStopX = MagickMin(chunkStartX + pixelChunkSize, stopX);
const unsigned int actualNumPixelInThisChunk = chunkStopX - chunkStartX;
// determine which resized pixel computed by this workitem
const unsigned int itemID = get_local_id(0);
const unsigned int numItems = getNumWorkItemsPerPixel(actualNumPixelInThisChunk, get_local_size(0));
const int pixelIndex = pixelToCompute(itemID, actualNumPixelInThisChunk, get_local_size(0));
// per-workitem partial sum; stays 0 if the accumulation loop below never runs
float4 filteredPixel = (float4)0.0f;
// -1 means this workitem doesn't participate in the computation
if (pixelIndex != -1)
{
// x coordinate of the resized pixel computed by this workitem
const int x = chunkStartX + pixelIndex;
// calculate how many steps required for this pixel
// NOTE(review): 0.5 is written without the f suffix here, unlike the 0.5f
// used for cacheRangeStartX above - confirm this is intentional.
const float bisect = (x + 0.5) / xFactor + MagickEpsilon;
// source column range [start, stop) contributing to this output pixel;
// stop is derived from bisect + 1 and clamped to inputColumns.
// NOTE(review): n is presumably 1, or 0 when the clamp makes stop == start
// at the right edge - confirm.
const unsigned int start = (unsigned int)MagickMax(bisect, 0.0f);
const unsigned int stop = (unsigned int)MagickMin(bisect + 1, (float)inputColumns);
const unsigned int n = stop - start;
// calculate how many steps this workitem will contribute
// (n / numItems, rounded up)
unsigned int numStepsPerWorkItem = n / numItems;
numStepsPerWorkItem += ((numItems * numStepsPerWorkItem) == n ? 0 : 1);
const unsigned int startStep = (itemID % numItems) * numStepsPerWorkItem;
// workitems whose first step lies beyond n contribute nothing
if (startStep < n)
{
const unsigned int stopStep = MagickMin(startStep + numStepsPerWorkItem, n);
// index of the first source pixel for this workitem, relative to the cache start
unsigned int cacheIndex = start + startStep - cacheRangeStartX;
// sum the four channels of each cached source pixel in [startStep, stopStep)
for (unsigned int i = startStep; i < stopStep; i++, cacheIndex++)
{
float4 cp = (float4)0.0f;
__local CLQuantum* p = inputImageCache + (cacheIndex * 4);
cp.x = (float)*(p);
cp.y = (float)*(p + 1);
cp.z = (float)*(p + 2);
cp.w = (float)*(p + 3);
filteredPixel += cp;
}
}
}
// clear one accumulator per output pixel of this chunk before merging
if (itemID < actualNumPixelInThisChunk) {
outputPixelCache[itemID] = (float4)0.0f;
}
barrier(CLK_LOCAL_MEM_FENCE);
// merge the partial sums in numItems rounds separated by barriers: in round i
// only the workitems with itemID % numItems == i add into outputPixelCache
// (presumably so that workitems sharing a pixel never update it in the same
// round - depends on the pixelToCompute mapping, verify)
for (unsigned int i = 0; i < numItems; i++) {
if (pixelIndex != -1) {
if (itemID % numItems == i) {
outputPixelCache[pixelIndex] += filteredPixel;
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
// the first actualNumPixelInThisChunk workitems each write one finished pixel;
// the accumulated sum is written as-is (no division by n in this kernel)
if (itemID < actualNumPixelInThisChunk)
{
float4 filteredPixel = outputPixelCache[itemID];
WriteAllChannels(filteredImage, 4, filteredColumns, chunkStartX + itemID, y, filteredPixel);
}
}
}
)
测试了一下性能,感觉提升不少(原图缩小一半,共三次操作,原图连续放大一倍两次,共三次操作):
ScaleImage()
加速版本:
20220428104719 0:3.229821 1.672 11552 opencl.c AcquireOpenCLKernel 744 Accelerate Event Using kernel: ScaleFilter
20220428104719 0:3.230185 1.672 11552 resize.c ScaleImage 1764 Accelerate Event accelerate scale: 1360
20220428104725 0:9.628057 1.875 11552 opencl.c AcquireOpenCLKernel 744 Accelerate Event Using kernel: ScaleFilter
20220428104725 0:9.628288 1.875 11552 resize.c ScaleImage 1764 Accelerate Event accelerate scale: 0
20220428104732 0:16.078872 2.234 11552 opencl.c AcquireOpenCLKernel 744 Accelerate Event Using kernel: ScaleFilter
20220428104732 0:16.079057 2.234 11552 resize.c ScaleImage 1764 Accelerate Event accelerate scale: 0
20220428104740 0:24.253815 2.484 11552 opencl.c AcquireOpenCLKernel 744 Accelerate Event Using kernel: ScaleFilter
20220428104740 0:24.254118 2.484 11552 resize.c ScaleImage 1764 Accelerate Event accelerate scale: 0
20220428104749 0:33.888819 2.875 11552 opencl.c AcquireOpenCLKernel 744 Accelerate Event Using kernel: ScaleFilter
20220428104749 0:33.889007 2.875 11552 resize.c ScaleImage 1764 Accelerate Event accelerate scale: 31
20220428104752 0:36.173104 3.047 11552 opencl.c AcquireOpenCLKernel 744 Accelerate Event Using kernel: ScaleFilter
20220428104752 0:36.173301 3.047 11552 resize.c ScaleImage 1764 Accelerate Event accelerate scale: 156
20220428104800 0:44.287153 3.469 11552 opencl.c AcquireOpenCLKernel 744 Accelerate Event Using kernel: ScaleFilter
20220428104800 0:44.287372 3.469 11552 resize.c ScaleImage 1764 Accelerate Event accelerate scale: 47
20220428104801 0:45.546271 3.656 11552 opencl.c AcquireOpenCLKernel 744 Accelerate Event Using kernel: ScaleFilter
20220428104801 0:45.546588 3.656 11552 resize.c ScaleImage 1764 Accelerate Event accelerate scale: 140
20220428104806 0:49.973027 4.047 11552 opencl.c AcquireOpenCLKernel 744 Accelerate Event Using kernel: ScaleFilter
20220428104806 0:49.973217 4.047 11552 resize.c ScaleImage 1764 Accelerate Event accelerate scale: 31
20220428104806 0:50.640522 4.250 11552 opencl.c AcquireOpenCLKernel 744 Accelerate Event Using kernel: ScaleFilter
20220428104806 0:50.640730 4.250 11552 resize.c ScaleImage 1764 Accelerate Event accelerate scale: 141
ScaleImage()
原先版本:
20220428104934 0:1.982873 0.266 10052 resize.c ScaleImage 1770 Accelerate Event AccelerateScaleImage null
20220428104934 0:2.040677 0.328 10052 resize.c ScaleImage 2116 Accelerate Event normal scale: 63
20220428104940 0:7.854823 0.578 10052 resize.c ScaleImage 1770 Accelerate Event AccelerateScaleImage null
20220428104940 0:7.913365 0.625 10052 resize.c ScaleImage 2116 Accelerate Event normal scale: 47
20220428104944 0:11.896725 0.875 10052 resize.c ScaleImage 1770 Accelerate Event AccelerateScaleImage null
20220428104944 0:11.956722 0.938 10052 resize.c ScaleImage 2116 Accelerate Event normal scale: 63
20220428104951 0:18.070817 1.219 10052 resize.c ScaleImage 1770 Accelerate Event AccelerateScaleImage null
20220428104951 0:18.378405 1.516 10052 resize.c ScaleImage 2116 Accelerate Event normal scale: 297
20220428104952 0:19.394056 1.531 10052 resize.c ScaleImage 1770 Accelerate Event AccelerateScaleImage null
20220428104953 0:20.634341 2.781 10052 resize.c ScaleImage 2116 Accelerate Event normal scale: 1250
20220428104958 0:25.534006 3.063 10052 resize.c ScaleImage 1770 Accelerate Event AccelerateScaleImage null
20220428104958 0:25.836584 3.375 10052 resize.c ScaleImage 2116 Accelerate Event normal scale: 312
20220428104959 0:26.729520 3.406 10052 resize.c ScaleImage 1770 Accelerate Event AccelerateScaleImage null
20220428105000 0:27.930533 4.609 10052 resize.c ScaleImage 2116 Accelerate Event normal scale: 1203
20220428105011 0:38.879392 5.438 10052 resize.c ScaleImage 1770 Accelerate Event AccelerateScaleImage null
20220428105012 0:39.210382 5.766 10052 resize.c ScaleImage 2116 Accelerate Event normal scale: 328
20220428105012 0:39.872525 5.797 10052 resize.c ScaleImage 1770 Accelerate Event AccelerateScaleImage null
20220428105014 0:41.176969 7.094 10052 resize.c ScaleImage 2116 Accelerate Event normal scale: 1297