C#调用GPU计算。
opencl 和 cuda 是两大老牌 GPU 计算库。这里选择了 opencl,因为它不挑显卡,而且手机上也能用。没有独显时,用集显也行;连集显都没有,用 CPU 也行——在 cpu 上跑也有少量性能提升。
NOpenCL库
这里用到 C# 的 NOpenCL库,通过它调用 opencl 来实现显卡 GPU 计算。一般开发 opencl 程序用 C++;为了让 C# 也能用,我尝试了 opencl.net库 和 NOpenCL库,这两者都能实现 C# 调用 opencl。
然而 opencl.net 出现了严重的内存泄漏(不确定是不是我的使用方式不正确),所以放弃了它,选择了 NOpenCL库,并在其基础上自己做了进一步封装,以方便使用。
opencl.net库,和NOpenCL库,在github上可以下载到。
CLBLAS库
这是 AMD 发布的 BLAS 线性代数数学库的 opencl 版本,里面包含矩阵乘法等各种常用的数学计算函数,性能已经过优化,比较成熟。不过它不是 C# 的库,而是 C 的库,因此要借助另一个库,让 C# 能够调用。
clMathLibraries-dotnet 库
C#对CLBLAS库的封装。包含了CLBLAS,CLFFT,CLSparse三个库。FFT自然是傅里叶变换。Sparse则是稀疏矩阵运算。都是常用数学库。
以下贴一个矩阵乘法的测试案例。OpenCLMode 是本人对 NOpenCL 的二次封装,方便调用。
using CLMathLibraries.CLBLAS;
using NOpenCL;
using NOpenCL.SafeHandles;
using OpenCLMode.OCL;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Buffer = NOpenCL.Buffer;
using size_t = System.UInt32;
using cl_float = System.Single;
using cl_double = System.Double;
using cl_uint = System.UInt32;
using cl_mem = System.IntPtr;
using cl_event = System.IntPtr;
using cl_command_queue = System.IntPtr;
using TestTools;
namespace CAmathTest
{
class Program
{
/// <summary>
/// Compares two values for equality, writes the comparison and its outcome
/// to the console, and returns the outcome.
/// </summary>
/// <param name="a">First value to compare; may be null.</param>
/// <param name="b">Second value to compare; may be null.</param>
/// <returns>
/// <c>true</c> when <paramref name="a"/> and <paramref name="b"/> are equal
/// (two nulls count as equal); otherwise <c>false</c>.
/// </returns>
public static bool ConsoleEquale(object a, object b)
{
    // The static Equals is null-safe; calling a.Equals(b) directly would
    // throw a NullReferenceException when a is null.
    bool re = object.Equals(a, b);
    Console.WriteLine("state:{0} equale {1} is {2}", a, b, re);
    return re;
}
/// <summary>
/// Console entry point: loads the OpenCL kernel source, initialises clBLAS,
/// times repeated single-precision matrix multiplications (Sgemm) on device
/// buffers, and prints the average time per call and the derived GFLOPS figure.
/// </summary>
/// <param name="args">Command-line arguments; not used.</param>
static void Main(string[] args)
{
    // Source of the author's own OpenCL math library. It is not needed by this
    // particular test, but oclModel is constructed from it.
    string dirbase = System.AppDomain.CurrentDomain.BaseDirectory;
    string filename = Path.GetPathRoot(dirbase) + @"projectHJ\UTestOpenCLMode\OCL\OCLCode2.c";
    if (!File.Exists(filename))
    {
        filename = Application.StartupPath + @"\OCL\OCLCode2.c";
    }
    string code = OpenCLMode.OCL.oclModel.LoadCode(filename);
    oclModel oclm = new OpenCLMode.OCL.oclModel(code);

    var s1 = CLBLAS.Setup();
    bool state = ConsoleEquale(s1, CLBLASStatus.clblasSuccess);
    if (!state)
    {
        return;
    }

    // Problem size: C (M x N) = alpha * A (M x K) * B (K x N) + beta * C, row-major.
    const int M = 1024;
    const int N = 1024;
    const int K = 512;
    float alpha = 1f; // 10f was also tried
    float[] A = new float[M * K];
    uint lda = K; /* i.e. lda = K */
    float[] B = new float[K * N];
    uint ldb = N; /* i.e. ldb = N */
    float beta = 1f; // 20f was also tried
    float[] C = new float[M * N];
    uint ldc = N; /* i.e. ldc = N */
    RandomData(A);
    RandomData(B);
    Buffer Abuffer = oclm.BufferWriteAddCache(A, "A");
    Buffer Bbuffer = oclm.BufferWriteAddCache(B, "B");
    Buffer Cbuffer = oclm.CreateBufferFloatArray(C.Length, "C");
    // Equivalent OpenCL calls when written in C++:
    //err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
    //    M * K * sizeof(*A), A, 0, NULL, NULL);
    //err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
    //    K * N * sizeof(*B), B, 0, NULL, NULL);
    //err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
    //    M * N * sizeof(*C), C, 0, NULL, NULL);

    // Take a reference on every handle before extracting the raw pointers
    // that are handed to the clBLAS call below.
    oclModelBase.OCLBase.CommandQueue.DangerousAddRef();
    if (!oclModelBase.OCLBase.CommandQueue.IsDangerousAddRefOk)
    {
        Console.WriteLine("error CommandQueue.IsDangerousAddRefOk:" + oclModelBase.OCLBase.CommandQueue.IsDangerousAddRefOk);
    }
    Abuffer.DangerousAddRef();
    Bbuffer.DangerousAddRef();
    Cbuffer.DangerousAddRef();
    IntPtr aintptr = Abuffer.DangerousGetHandle();
    IntPtr bintptr = Bbuffer.DangerousGetHandle();
    IntPtr cintptr = Cbuffer.DangerousGetHandle();
    IntPtr queue = oclModelBase.OCLBase.CommandQueue.DangerousGetHandle();
    IntPtr[] queues = new IntPtr[] { queue }; // loop-invariant, so built once

    var error = CLBLASStatus.clblasSuccess;
    TimeToolConsole ttc = new TimeToolConsole(); // timer
    int p1 = 12;
    int p2 = 50;
    // NOTE(review): the element type was missing in the original listing.
    // TimeSpan is assumed from how the samples are subtracted and summed
    // below - confirm against the return type of TimeToolConsole.End2().
    List<TimeSpan> lts = new List<TimeSpan>();
    for (int t = 1; t < p1; t++)
    {
        ttc.Begin(); // start timing this step
        int loop = p2 * t;
        Console.WriteLine("loop:" + loop);
        for (int i = 0; i < loop; i++)
        {
            var status = CLBLAS.Sgemm(CLBLASOrder.clblasRowMajor, CLBLASTranspose.clblasNoTrans, CLBLASTranspose.clblasNoTrans,
                M, N, K,
                alpha, aintptr, 0, lda,
                bintptr, 0, ldb, beta,
                cintptr, 0, ldc,
                1, queues, 0, null, null);
            // Keep the first failure so a later successful call cannot hide it.
            if (error == CLBLASStatus.clblasSuccess)
            {
                error = status;
            }
        }
        // Presumably blocks until the queued work has finished, so the
        // sample covers execution and not just enqueueing - TODO confirm.
        oclm.RunWaitFinisth();
        lts.Add(ttc.End2()); // one timing sample per step
    }
    Console.WriteLine(error);

    // Step t issues p2 * t calls, so (assuming each sample is the elapsed time
    // of its step) the difference between consecutive samples is the cost of
    // p2 additional calls. Non-positive differences are ignored.
    TimeSpan ts = new TimeSpan(0);
    int count = 0;
    for (int i = 1; i < lts.Count; i++)
    {
        var temp = (lts[i] - lts[i - 1]);
        if (temp.TotalMilliseconds > 0)
        {
            ts += temp;
            count++;
        }
    }
    if (count == 0)
    {
        // Avoids the division by zero the averaging below would otherwise hit.
        Console.WriteLine("no valid timing samples; throughput not computed");
    }
    else
    {
        long AVGticks = ts.Ticks / (count) / p2;
        ts = new TimeSpan(AVGticks);
        Console.WriteLine("单次用时:{0}/s", ts.ToString());
        double xn = ts.TotalMilliseconds / 1000; // seconds per call
        // 2*M*N*K floating-point operations per GEMM; computed in double so
        // larger problem sizes cannot overflow int.
        xn = 2.0 * M * N * K / xn / Math.Pow(10, 9);
        Console.WriteLine("M={0},N={1},K={2}", M, N, K);
        Console.WriteLine("性能:{0} GFlops", xn.ToString("F3"));
    }
    // Reference figures recorded by the author:
    //gtx650ti (AIDA64 test: 1.244 tflops = 1244 gflops)
    //cpu e3-1230v2 (AIDA64 test: 0.229 tflops = 229 gflops)
    //task: i=64 j=64 k=64, compute c=c+a*b
    //r9 280x
    //a=0.000029
    //18 gflops
    //task: i=32 j=32 k=32, compute c=c+a*b
    //r9 280x
    //a=0.00001578
    //4.15 gflops
    //The results above used hand-written OpenCL code; the results below used the clblas library. 20170926
    //--------------clblas library-----------
    //task: i=512 j=512 k=512, compute c=c+a*b
    //clblas gtx650ti
    //a=0.009515s
    //282 gflops
    //task: i=1024 j=1024 k=512, compute c=c+a*b
    //clblas r9 280x
    //2219.3 gflops
    ///* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
    //err = clblasSgemm(clblasRowMajor, clblasNoTrans, clblasNoTrans,
    //    M, N, K,
    //    alpha, bufA, 0, lda,
    //    bufB, 0, ldb, beta,
    //    bufC, 0, ldc,
    //    1, &queue, 0, NULL, &event);
    ///* Wait for calculations to be finished. */
    //err = clWaitForEvents(1, &event);

    // The raw buffer pointers are no longer used past this point.
    Abuffer.DangerousRelease();
    Bbuffer.DangerousRelease();
    Cbuffer.DangerousRelease();
    ///* Fetch results of calculations from GPU memory. */
    //err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0,
    //    M * N * sizeof(*result),
    //    result, 0, NULL, NULL);
    oclm.ReadBuffer(C, Cbuffer);
    oclModelBase.OCLBase.CommandQueue.DangerousRelease();
    CLBLAS.Teardown();

    // Optional CPU comparison run; disabled by default.
    bool mode2cpu = false;
    if (mode2cpu)
    {
        float[] c2 = new cl_float[M * N];
        for (int t = 0; t < p1; t++)
        {
            ttc.Begin();
            int loop = p2 * t;
            Console.WriteLine("loop:" + loop);
            for (int i = 0; i < loop; i++)
            {
                Dot(c2, A, B, M, N, K);
            }
            ttc.End2();
        }
        // Verify correctness: count elements that differ by more than 1% of
        // the smaller of the two values.
        int countError = 0;
        for (int m = 0; m < M * N; m++)
        {
            if (Math.Abs(c2[m] - C[m]) > Math.Min(c2[m], C[m]) * 0.01)
            {
                countError++;
            }
        }
        Console.WriteLine("data err:" + DateTime.Now.ToString("HH:mm:ss:fff"));
        Console.WriteLine("countError=" + countError);
    }
    // Setup/teardown smoke checks for the other wrapped libraries, kept for reference:
    //CLSparse.Setup();
    //var s = CLBLAS.Setup();
    //do something...
    //CLBLAS.Teardown();
    ////Console.WriteLine("right:" + (s == CLBLASStatus.clblasSuccess));
    //ConsoleEquale(s, CLBLASStatus.clblasSuccess);
    //var setupData = new CLFFTSetupData();
    //var s2 = CLFFT.Setup(setupData);
    //ConsoleEquale(s2, CLFFTStatus.CL_SUCCESS);
    //ConsoleEquale(CLFFT.Teardown(), CLFFTStatus.CL_SUCCESS);
    //ConsoleEquale(CLSparse.Setup(), CLSparseStatus.Success);
    //ConsoleEquale(CLSparse.Teardown(), CLSparseStatus.Success);
    Console.WriteLine("over");
    Console.ReadKey();
}
// Shared random source used by the RandomData overloads.
static Random rand = new Random();
/// <summary>
/// Overwrites every element of a two-dimensional array with a random value
/// obtained from <c>rand.NextDouble()</c> and narrowed to <c>float</c>.
/// Elements are filled row by row.
/// </summary>
/// <param name="a">The matrix to fill in place.</param>
public static void RandomData(float[,] a)
{
    int rowCount = a.GetLength(0);
    int columnCount = a.GetLength(1);
    for (int row = 0; row < rowCount; row++)
    {
        for (int column = 0; column < columnCount; column++)
        {
            double sample = rand.NextDouble();
            a[row, column] = (float)sample;
        }
    }
}
/// <summary>
/// Overwrites every element of a one-dimensional array, in index order, with
/// a random value obtained from <c>rand.NextDouble()</c> and narrowed to
/// <c>float</c>.
/// </summary>
/// <param name="a">The array to fill in place.</param>
public static void RandomData(float[] a)
{
    int length = a.Length;
    int index = 0;
    while (index < length)
    {
        double sample = rand.NextDouble();
        a[index] = (float)sample;
        index++;
    }
}
public static void Dot(float[] c, float[] a, float[] b, int M, int N, int K) // mnozy dwie macierze 2D
{
for (int i = 0; i < M; i++)
{
if (M > 255 && N > 255)
{
Parallel.For(0, N, j =>
{
//c[i * N + j] = 0;
for (int k = 0; k < K; k++) // OR k 255 && c.GetLength(1) > 255)
{
Parallel.For(0, c.GetLength(1), j =>
{
c[i, j] = 0;
for (int k = 0; k < a.GetLength(1); k++) // OR k