在C#中的GPU加速——AleaGPU学习笔记(三)

如今,越来越多的软件公司选择使用.NET等框架进行从桌面到移动应用的跨平台开发,用单一的代码库来降低成本,并利用现有的库来应对不断变化的趋势。虽然开发人员可以使用诸如Task Parallel Library(TPL)之类的库轻松地为.NET上的多核CPU编写并行代码,但是使用GPU来处理计算密集型任务则是一个更大的挑战。为了用GPU加速.NET应用程序,开发人员必须用CUDA C/C++编写函数,并编写或生成代码在.NET和CUDA C/C++之间进行互操作。Alea GPU通过将GPU计算直接引入.NET生态系统来弥补这一差距。使用Alea GPU,你可以用任何你喜欢的.NET语言(C#、F#、VB)来编写GPU函数,用标准的.NET编译工具来编译,并用GPU来加速。Alea GPU提供了所有CUDA功能的完整实现,用Alea GPU编译的代码的执行性能与CUDA C/C++代码相当。

CUDA on .NET with Alea GPU

Alea GPU可以方便地应用于各种并行问题。开发者可以用任何.NET语言编写GPU代码,使用NVIDIA LibDevice提供的完整CUDA设备函数集,以及CUDA设备并行固有函数,如 thread synchronization、warp vote functions、warp shuffle functions 和 atomic functions 等。让我们考虑一个简单的示例,该示例用于矩阵乘法计算,并在Main函数中用两个1000×1000矩阵作为测试,对比CPU计算、GPU计算与打包(Packed,即先将二维数组展平为一维数组)后的GPU计算的耗时。

using System;
using System.Diagnostics;
using NUnit.Framework;
using Alea;
using Alea.CSharp;

namespace Samples.CSharp
{
    /// <summary>
    /// Tiled matrix multiplication c = a * b. Provides a plain CPU implementation, a GPU kernel
    /// that works directly on 2D arrays, and a GPU kernel that works on row-major flattened
    /// ("packed") 1D arrays.
    /// </summary>
    class MatrixMultShared
    {
        // Edge length of a thread block and of the square shared-memory tiles:
        // every block is launched with BlockSize x BlockSize threads.
        private const int BlockSize = 32;

        /// <summary>
        /// Reads one element of a 2D matrix addressed by tile coordinates plus the offset inside the tile.
        /// </summary>
        /// <returns>The element, or 0.0 when the resulting position lies outside the matrix.</returns>
        private static double GetMatrixElement(double[,] matrix, int blockRow, int blockCol, int row, int col)
        {
            // Translate (tile, offset-in-tile) into a global row / column.
            var globalRow = blockRow * BlockSize + row;
            var globalCol = blockCol * BlockSize + col;
            // GetLength(0) is the number of rows, GetLength(1) the number of columns.
            if (globalRow < matrix.GetLength(0) && globalCol < matrix.GetLength(1))
                return matrix[globalRow, globalCol];
            else
                return 0.0;
        }

        /// <summary>
        /// Reads one element of a row-major flattened matrix addressed by tile coordinates plus the
        /// offset inside the tile.
        /// </summary>
        /// <param name="ld">Row length of the flattened matrix, i.e. its number of columns.</param>
        /// <returns>The element, or 0.0 when the resulting position lies outside the matrix.</returns>
        private static double GetMatrixElement(int ld, double[] matrix, int blockRow, int blockCol, int row, int col)
        {
            var globalRow = blockRow * BlockSize + row;
            var globalCol = blockCol * BlockSize + col;
            var globalIdx = globalRow * ld + globalCol;
            // The column has to be checked against ld on its own: a column past the end of a row
            // still maps to a valid flat index (an element of the next row), so testing only
            // globalIdx against matrix.Length would return a wrong element instead of 0.0.
            if (globalCol < ld && globalIdx < matrix.Length)
                return matrix[globalIdx];
            else
                return 0.0;
        }

        /// <summary>
        /// Writes one element of a 2D matrix addressed by tile coordinates plus the offset inside the
        /// tile. Positions outside the matrix are ignored.
        /// </summary>
        private static void SetMatrixElement(double[,] matrix, int blockRow, int blockCol, int row, int col, double value)
        {
            var globalRow = blockRow * BlockSize + row;
            var globalCol = blockCol * BlockSize + col;
            if (globalRow < matrix.GetLength(0) && globalCol < matrix.GetLength(1))
                matrix[globalRow, globalCol] = value;
        }

        /// <summary>
        /// Writes one element of a row-major flattened matrix addressed by tile coordinates plus the
        /// offset inside the tile. Positions outside the matrix are ignored.
        /// </summary>
        /// <param name="ld">Row length of the flattened matrix, i.e. its number of columns.</param>
        private static void SetMatrixElement(int ld, double[] matrix, int blockRow, int blockCol, int row, int col,
            double value)
        {
            var globalRow = blockRow * BlockSize + row;
            var globalCol = blockCol * BlockSize + col;
            var globalIdx = globalRow * ld + globalCol;
            // Same reasoning as in the flattened GetMatrixElement: without the column check a thread
            // beyond the last column would overwrite an element at the start of the next row.
            if (globalCol < ld && globalIdx < matrix.Length)
                matrix[globalIdx] = value;
        }

        /// <summary>
        /// Integer division rounded up, e.g. 33 columns with a tile size of 32 need 2 tiles.
        /// </summary>
        private static int DivUp(int num, int den)
        {
            return (num + den - 1) / den;
        }

        /// <summary>
        /// GPU kernel computing c = a * b on 2D arrays. Each thread accumulates one element of c,
        /// walking over the tiles of a and b that are staged in shared memory.
        /// </summary>
        private static void Kernel(double[,] a, double[,] b, double[,] c)
        {
            var colsA = a.GetLength(1); // number of columns of a (the shared dimension)
            var blockRow = blockIdx.x;  // tile row handled by this block
            var blockCol = blockIdx.y;  // tile column handled by this block

            var valueC = 0.0;

            var row = threadIdx.x;      // row of this thread inside the tile
            var col = threadIdx.y;      // column of this thread inside the tile

            // Number of tiles along the shared dimension; rounded up so a partial tile is covered too.
            var tiles = DivUp(colsA, BlockSize);

            for (var m = 0; m < tiles; ++m)
            {
                // Two tiles in shared memory, one for each input matrix.
                var subA = __shared__.Array2D<double>(BlockSize, BlockSize);
                var subB = __shared__.Array2D<double>(BlockSize, BlockSize);

                // Every thread loads one element of each tile (0.0 outside the matrices).
                subA[row, col] = GetMatrixElement(a, blockRow, m, row, col);
                subB[row, col] = GetMatrixElement(b, m, blockCol, row, col);
                // Wait until the whole tile has been loaded before anybody reads from it.
                DeviceFunction.SyncThreads();

                for (var e = 0; e < BlockSize; ++e)
                {
                    // Partial dot product of this thread's row of subA and column of subB.
                    valueC += subA[row, e] * subB[e, col];
                }
                // Wait until every thread is done with the tiles before the next iteration overwrites them.
                DeviceFunction.SyncThreads();
            }

            // Store the accumulated value; threads outside of c are filtered out by the bounds check.
            SetMatrixElement(c, blockRow, blockCol, row, col, valueC);
        }

        /// <summary>
        /// Same algorithm as <see cref="Kernel"/>, but on row-major flattened arrays. The column
        /// counts of a, b and c are passed in explicitly because they cannot be derived from a 1D array.
        /// </summary>
        private static void KernelPacked(double[] a, double[] b, double[] c, int colsA, int colsB, int colsC)
        {
            var blockRow = blockIdx.x;
            var blockCol = blockIdx.y;

            var valueC = 0.0;

            var row = threadIdx.x;
            var col = threadIdx.y;

            // Number of tiles along the shared dimension, rounded up.
            var tiles = DivUp(colsA, BlockSize);

            for (var m = 0; m < tiles; ++m)
            {
                var subA = __shared__.Array2D<double>(BlockSize, BlockSize);
                var subB = __shared__.Array2D<double>(BlockSize, BlockSize);

                subA[row, col] = GetMatrixElement(colsA, a, blockRow, m, row, col);
                subB[row, col] = GetMatrixElement(colsB, b, m, blockCol, row, col);
                // Tiles must be completely loaded before use ...
                DeviceFunction.SyncThreads();

                for (var e = 0; e < BlockSize; ++e)
                {
                    valueC += subA[row, e] * subB[e, col];
                }
                // ... and completely consumed before they are overwritten.
                DeviceFunction.SyncThreads();
            }

            SetMatrixElement(colsC, c, blockRow, blockCol, row, col, valueC);
        }

        /// <summary>
        /// Computes c = a * b on the GPU using the packed kernel: a and b are flattened to 1D arrays,
        /// the kernel writes a flat result, and the result is copied back into c.
        /// </summary>
        [GpuManaged]
        public static void RunGpuPacked(double[,] a, double[,] b, double[,] c)
        {
            var lp = LaunchParam(a, b, c);
            // Flatten the inputs and allocate a flat buffer for the result.
            var aFlat = Pack(a);
            var bFlat = Pack(b);
            var cFlat = new double[c.Length];
            Gpu.Default.Launch(KernelPacked, lp, aFlat, bFlat, cFlat, a.GetLength(1), b.GetLength(1), c.GetLength(1));
            Unpack(cFlat, c);
        }

        /// <summary>
        /// Computes c = a * b on the GPU using the kernel that works directly on 2D arrays.
        /// </summary>
        [GpuManaged]
        public static void RunGpu(double[,] a, double[,] b, double[,] c)
        {
            var lp = LaunchParam(a, b, c);
            Gpu.Default.Launch(Kernel, lp, a, b, c);
        }

        /// <summary>
        /// Reference implementation: computes c = a * b with three nested loops on the CPU.
        /// </summary>
        public static void RunCpu(double[,] a, double[,] b, double[,] c)
        {
            for (var row = 0; row < c.GetLength(0); ++row)
            {
                for (var col = 0; col < c.GetLength(1); ++col)
                {
                    var sum = 0.0;
                    for (var k = 0; k < a.GetLength(1); ++k)
                    {
                        sum += a[row, k] * b[k, col];
                    }
                    c[row, col] = sum;
                }
            }
        }

        /// <summary>
        /// Validates the arguments and builds the launch configuration: BlockSize x BlockSize threads
        /// per block and enough blocks to cover every row of a and every column of b.
        /// </summary>
        private static LaunchParam LaunchParam(double[,] a, double[,] b, double[,] c)
        {
            Check(a, b, c);
            // Two-dimensional thread block.
            var blockSize = new dim3(BlockSize, BlockSize);
            // Two-dimensional grid; DivUp rounds up so that a trailing partial tile
            // (e.g. 33 rows with a tile size of 32) still gets a block.
            var gridSize = new dim3(DivUp(a.GetLength(0), BlockSize), DivUp(b.GetLength(1), BlockSize));
            return new LaunchParam(gridSize, blockSize);
        }

        /// <summary>
        /// Copies a 2D matrix into a new 1D array in row-major order.
        /// </summary>
        private static double[] Pack(double[,] a)
        {
            var flat = new double[a.Length];
            var rows = a.GetLength(0);
            var cols = a.GetLength(1);
            for (var i = 0; i < rows; i++)
                for (var j = 0; j < cols; j++)
                    flat[i * cols + j] = a[i, j];
            return flat;
        }

        /// <summary>
        /// Copies a row-major flattened matrix back into the 2D array <paramref name="a"/>.
        /// </summary>
        [GpuManaged]
        private static void Unpack(double[] aFlat, double[,] a)
        {
            var rows = a.GetLength(0);
            var cols = a.GetLength(1);
            for (var i = 0; i < rows; i++)
                for (var j = 0; j < cols; j++)
                    a[i, j] = aFlat[i * cols + j];
        }

        /// <summary>
        /// Rejects null matrices and asserts that the dimensions are compatible for c = a * b.
        /// </summary>
        /// <exception cref="ArgumentNullException">Any of the three matrices is null.</exception>
        private static void Check(double[,] a, double[,] b, double[,] c)
        {
            if (a == null) throw new ArgumentNullException(nameof(a));
            if (b == null) throw new ArgumentNullException(nameof(b));
            if (c == null) throw new ArgumentNullException(nameof(c));
            // Dimension checks are Debug.Assert only, so they are not enforced in builds
            // where DEBUG is not defined.
            Debug.Assert(a.GetLength(1) == b.GetLength(0));
            Debug.Assert(a.GetLength(0) == c.GetLength(0));
            Debug.Assert(b.GetLength(1) == c.GetLength(1));
        }
    }

    /// <summary>
    /// Compares the GPU implementations of <see cref="MatrixMultShared"/> against its CPU
    /// implementation on random square matrices.
    /// </summary>
    class MatrixMultSharedTest
    {
        // Seeded with a fixed value, so the generated sequence is reproducible
        // for a given order of calls.
        static readonly Random rng = new Random(42);

        /// <summary>
        /// Creates a rows x cols matrix filled row by row with values from <c>NextDouble()</c>.
        /// </summary>
        public static double[,] RandomMatrix(int rows, int cols)
        {
            var matrix = new double[rows, cols];
            for (var r = 0; r < rows; ++r)
            {
                for (var c = 0; c < cols; ++c)
                {
                    matrix[r, c] = rng.NextDouble();
                }
            }
            return matrix;
        }

        /// <summary>
        /// Multiplies two random n x n matrices on the CPU and with both GPU variants and asserts
        /// that each GPU result equals the CPU result within <paramref name="tolerance"/>.
        /// </summary>
        private static void Run(int n, double tolerance)
        {
            var left = RandomMatrix(n, n);
            var right = RandomMatrix(n, n);

            // CPU result serves as the expected value for both GPU variants.
            var expected = new double[n, n];
            MatrixMultShared.RunCpu(left, right, expected);

            var fromGpu = new double[n, n];
            MatrixMultShared.RunGpu(left, right, fromGpu);
            Assert.That(fromGpu, Is.EqualTo(expected).Within(tolerance));

            var fromGpuPacked = new double[n, n];
            MatrixMultShared.RunGpuPacked(left, right, fromGpuPacked);
            Assert.That(fromGpuPacked, Is.EqualTo(expected).Within(tolerance));
        }

        /// <summary>128 x 128 case.</summary>
        [GpuManaged, Test]
        public static void Small()
        {
            Run(128, 1e-4);
        }

        /// <summary>
        /// 1024 x 1024 case. Carries no [Test] attribute, so it only runs when called explicitly.
        /// </summary>
        public static void Large()
        {
            Run(1024, 1e-4);
        }
    }
}

  // Times the CPU, GPU and packed-GPU multiplication of two random 1000 x 1000 matrices and
  // prints the three elapsed times in milliseconds, one per line, in that order.
  static void Main()
        {
            const int size = 1000;
            var left = MatrixMultSharedTest.RandomMatrix(size, size);
            var right = MatrixMultSharedTest.RandomMatrix(size, size);
            var cpuResult = new double[size, size];

            // CPU run; the result buffer was allocated before the timer started.
            var timer = Stopwatch.StartNew();
            MatrixMultShared.RunCpu(left, right, cpuResult);
            timer.Stop();
            var cpuMs = timer.Elapsed.TotalMilliseconds;

            // GPU run on 2D arrays; the timed region includes allocating the result buffer.
            // NOTE(review): this is the first GPU launch, so it presumably also pays any
            // one-time start-up cost - confirm before comparing the numbers.
            timer.Reset();
            timer.Start();
            var gpuResult = new double[size, size];
            MatrixMultShared.RunGpu(left, right, gpuResult);
            timer.Stop();
            var gpuMs = timer.Elapsed.TotalMilliseconds;

            // GPU run on packed arrays; again the allocation is part of the timed region.
            timer.Reset();
            timer.Start();
            var gpuPackedResult = new double[size, size];
            MatrixMultShared.RunGpuPacked(left, right, gpuPackedResult);
            timer.Stop();
            var gpuPackedMs = timer.Elapsed.TotalMilliseconds;

            foreach (var elapsedMs in new[] { cpuMs, gpuMs, gpuPackedMs })
            {
                Console.WriteLine(elapsedMs);
            }

            // Keep the console window open until a key is pressed.
            Console.ReadKey();
        }

运行结果:


在C#中的GPU加速——AleaGPU学习笔记(三)_第1张图片
两个1000×1000矩阵相乘计算效率对比

你可能感兴趣的:(在C#中的GPU加速——AleaGPU学习笔记(三))