如今,越来越多的软件公司选择使用.NET等框架来针对从桌面到移动应用的跨平台开发,使用单一的代码库来降低成本,利用现有的库来应对不断变化的趋势。虽然开发人员可以使用诸如task parallel library之类的库轻松地为.NET上的多核cpu编写并行代码,但是使用gpu来处理计算密集型任务则是一个更大的挑战。为了用gpu加速.NET应用程序,开发人员必须用CUDA C/C++编写函数,并编写或生成代码在.NET和CUDA C/ c++之间进行交互操作。Alea GPU通过将GPU计算直接引入.NET生态系统来弥补这一差距。使用Alea GPU,你可以用任何你喜欢的.NET语言(C#、F#、VB)来编写GPU函数,用你标准的.NET编译工具来编译,用GPU来加速。Alea GPU提供了所有CUDA功能的完整实现,用Alea GPU编译的代码执行起来与CUDA C/C++代码相当。
CUDA on .NET with Alea GPU
Alea GPU易于用于各种并行问题。开发者可以用任何.NET语言编写GPU代码,使用NVIDIA LibDevice提供的完整CUDA设备函数集,以及CUDA设备并行固有函数,如 thread synchrhonization, warp vote functions, warp shuffle functions, 和 atomic functions等。让我们考虑一个简单的示例,该示例用于矩阵乘法计算,并在Main函数中用两个1000×1000矩阵作为测试,对比CPU计算、GPU计算与闭包GPU计算的耗时。
using System;
using System.Diagnostics;
using NUnit.Framework;
using Alea;
using Alea.CSharp;
namespace Samples.CSharp
{
class MatrixMultShared
{
//Block中的线程数量
private const int BlockSize = 32;
//通过指定行列搜索矩阵元素:对于二维数组构造的矩阵,将矩阵行列根据block的行列所计算出的id来找到对应元素
private static double GetMatrixElement(double[,] matrix, int blockRow, int blockCol, int row, int col)
{
//根据线程数和行数确定线程中行的索引,列同理
var globalRow = blockRow * BlockSize + row;
var globalCol = blockCol * BlockSize + col;
if (globalRow < matrix.GetLength(0) && globalCol < matrix.GetLength(1))//GetLength()获取数组中指定维数的数组的数量
return matrix[globalRow, globalCol];
else
return 0.0;
}
//通过指定行列搜索矩阵元素:对于一维数组构造的矩阵,将矩阵行列根据block的行列所计算出的id来找到对应元素
private static double GetMatrixElement(int ld, double[] matrix, int blockRow, int blockCol, int row, int col)
{
var globalRow = blockRow * BlockSize + row;
var globalCol = blockCol * BlockSize + col;
var globalIdx = globalRow * ld + globalCol;
if (globalIdx < matrix.Length)
return matrix[globalIdx];
else
return 0.0;
}
//通过指定行列设定矩阵元素:对于二维数组构造的矩阵,将矩阵行列根据block的行列所计算出的id来设置对应元素
private static void SetMatrixElement(double[,] matrix, int blockRow, int blockCol, int row, int col, double value)
{
var globalRow = blockRow * BlockSize + row;
var globalCol = blockCol * BlockSize + col;
if (globalRow < matrix.GetLength(0) && globalCol < matrix.GetLength(1))
matrix[globalRow, globalCol] = value;
}
//通过指定行列设定矩阵元素:对于一维数组构造的矩阵,将矩阵行列根据block的行列所计算出的id来设置对应元素
private static void SetMatrixElement(int ld, double[] matrix, int blockRow, int blockCol, int row, int col,
double value)
{
var globalRow = blockRow * BlockSize + row;
var globalCol = blockCol * BlockSize + col;
var globalIdx = globalRow * ld + globalCol;
if (globalIdx < matrix.Length)
matrix[globalIdx] = value;
}
private static int DivUp(int num, int den)
{
return (num + den - 1) / den;
}
//核函数:输入矩阵a和b,返回矩阵c。将输入和输出放在一起的写法
private static void Kernel(double[,] a, double[,] b, double[,] c)
{
var colsA = a.GetLength(1);//colsA为矩阵A的列数组
var blockRow = blockIdx.x;//二维的block行数
var blockCol = blockIdx.y;//二维的block列数
var valueC = 0.0;
var row = threadIdx.x;//二维的线程行数
var col = threadIdx.y;//二维的线程列数
//这里DivUP是向上取整,相当于ceil操作。例如我们有矩阵A有33列,线程数为32,
//那么我们需要多分配一个block用来计算,因此向上取整
for (var m = 0; m < DivUp(colsA, BlockSize); ++m)
{
//构建两个共享内存中的二维数组
var subA = __shared__.Array2D(BlockSize, BlockSize);
var subB = __shared__.Array2D(BlockSize, BlockSize);
//填充两二维数组
subA[row, col] = GetMatrixElement(a, blockRow, m, row, col);
subB[row, col] = GetMatrixElement(b, m, blockCol, row, col);
//同步线程,等所有线程都拿到数据再开始计算
DeviceFunction.SyncThreads();
for (var e = 0; e < BlockSize; ++e)
{
//计算每个线程的值
valueC += subA[row, e] * subB[e, col];
}
//同步线程,得出结果
DeviceFunction.SyncThreads();
}
//把计算出来的值赋值给各行各列
SetMatrixElement(c, blockRow, blockCol, row, col, valueC);
}
//这里先对二维数组进行降维(PACK方法)处理,变成一维数组,效率会提高很多,这里直接计算出A和B的列数输入进去
private static void KernelPacked(double[] a, double[] b, double[] c, int colsA, int colsB, int colsC)
{
var blockRow = blockIdx.x;
var blockCol = blockIdx.y;
var valueC = 0.0;
var row = threadIdx.x;
var col = threadIdx.y;
for (var m = 0; m < DivUp(colsA, BlockSize); ++m)
{
var subA = __shared__.Array2D(BlockSize, BlockSize);
var subB = __shared__.Array2D(BlockSize, BlockSize);
subA[row, col] = GetMatrixElement(colsA, a, blockRow, m, row, col);
subB[row, col] = GetMatrixElement(colsB, b, m, blockCol, row, col);
DeviceFunction.SyncThreads();
for (var e = 0; e < BlockSize; ++e)
{
valueC += subA[row, e] * subB[e, col];
}
DeviceFunction.SyncThreads();
}
SetMatrixElement(colsC, c, blockRow, blockCol, row, col, valueC);
}
[GpuManaged]
public static void RunGpuPacked(double[,] a, double[,] b, double[,] c)
{
//声明三个二维数组
var lp = LaunchParam(a, b, c);
var aFlat = Pack(a);
var bFlat = Pack(b);
var cFlat = new double[c.Length];
Gpu.Default.Launch(KernelPacked, lp, aFlat, bFlat, cFlat, a.GetLength(1), b.GetLength(1), c.GetLength(1));
Unpack(cFlat, c);
}
[GpuManaged]
public static void RunGpu(double[,] a, double[,] b, double[,] c)
{
var lp = LaunchParam(a, b, c);
Gpu.Default.Launch(Kernel, lp, a, b, c);
}
public static void RunCpu(double[,] a, double[,] b, double[,] c)
{
for (var row = 0; row < c.GetLength(0); ++row)
{
for (var col = 0; col < c.GetLength(1); ++col)
{
var sum = 0.0;
for (var k = 0; k < a.GetLength(1); ++k)
{
sum += a[row, k] * b[k, col];
}
c[row, col] = sum;
}
}
}
private static LaunchParam LaunchParam(double[,] a, double[,] b, double[,] c)
{
Check(a, b, c);
//定义二维线程数
var blockSize = new dim3(BlockSize, BlockSize);
//定义二维block,这里DivUP是向上取整,相当于ceil操作。例如我们有矩阵A有33列,线程数为32,
//那么我们需要多分配一个block用来计算,因此向上取整
var gridSize = new dim3(DivUp(a.GetLength(0), BlockSize), DivUp(b.GetLength(1), BlockSize));
return new LaunchParam(gridSize, blockSize);
}
private static double[] Pack(double[,] a)
{
var flat = new double[a.Length];
var rows = a.GetLength(0);
var cols = a.GetLength(1);
for (var i = 0; i < rows; i++)
for (var j = 0; j < cols; j++)
flat[i * cols + j] = a[i, j];
return flat;
}
[GpuManaged]
private static void Unpack(double[] aFlat, double[,] a)
{
var rows = a.GetLength(0);
var cols = a.GetLength(1);
for (var i = 0; i < rows; i++)
for (var j = 0; j < cols; j++)
a[i, j] = aFlat[i * cols + j];
}
private static void Check(double[,] a, double[,] b, double[,] c)
{
if (a == null) throw new ArgumentNullException(nameof(a));
if (b == null) throw new ArgumentNullException(nameof(b));
if (c == null) throw new ArgumentNullException(nameof(c));
Debug.Assert(a.GetLength(1) == b.GetLength(0));
Debug.Assert(a.GetLength(0) == c.GetLength(0));
Debug.Assert(b.GetLength(1) == c.GetLength(1));
}
}
class MatrixMultSharedTest
{
static readonly Random rng = new Random(42);
public static double[,] RandomMatrix(int rows, int cols)
{
var a = new double[rows, cols];
for (var i = 0; i < rows; ++i)
for (var j = 0; j < cols; ++j)
a[i, j] = rng.NextDouble();
return a;
}
private static void Run(int n, double tolerance)
{
var a = RandomMatrix(n, n);
var b = RandomMatrix(n, n);
var c = new double[n, n];
MatrixMultShared.RunCpu(a, b, c);
var cGpu = new double[n, n];
MatrixMultShared.RunGpu(a, b, cGpu);
Assert.That(cGpu, Is.EqualTo(c).Within(tolerance));
var cGpuPacked = new double[n, n];
MatrixMultShared.RunGpuPacked(a, b, cGpuPacked);
Assert.That(cGpuPacked, Is.EqualTo(c).Within(tolerance));
}
[GpuManaged, Test]
public static void Small()
{
Run(128, 1e-4);
}
public static void Large()
{
Run(1024, 1e-4);
}
}
}
static void Main()
{
int n = 1000;
var a = MatrixMultSharedTest.RandomMatrix(n, n);
var b = MatrixMultSharedTest.RandomMatrix(n, n);
var c = new double[n, n];
Stopwatch sw = new Stopwatch();
sw.Start();
MatrixMultShared.RunCpu(a, b, c);
sw.Stop();
double time1 = sw.Elapsed.TotalMilliseconds;
sw.Restart();
var cGpu = new double[n, n];
MatrixMultShared.RunGpu(a, b, cGpu);
sw.Stop();
double time2 = sw.Elapsed.TotalMilliseconds;
sw.Restart();
var cGpuPacked = new double[n, n];
MatrixMultShared.RunGpuPacked(a, b, cGpuPacked);
sw.Stop();
double time3 = sw.Elapsed.TotalMilliseconds;
Console.WriteLine(time1);
Console.WriteLine(time2);
Console.WriteLine(time3);
Console.ReadKey();
}
运行结果: