C#(Csharp)+OpenCL+CLBLAS库,实现GPU高性能计算。

C#调用GPU计算。

OpenCL 和 CUDA 是两大老牌 GPU 计算库。这里选择了 OpenCL,因为它不挑显卡,而且手机上也能用。即使没有独显,用集显也行;连集显都没有时,还可以直接在 CPU 上运行,同样能获得少量性能提升。


NOpenCL库

这里用到 C#的 NOpenCL库,调用 opencl 实现调用 显卡GPU计算。一般开发opencl 用C++。这里为了C#能用,尝试了 opencl.net库,和NOpenCL库,这两者都能实现C#调用opencl。

然而使用 opencl.net 时出现了严重的内存泄漏(也可能是我的用法不正确),于是放弃了它,最终选择 NOpenCL库,并在其基础上自己再做一层封装,使用起来更方便。

opencl.net库,和NOpenCL库,在github上可以下载到。


CLBLAS库

这是 AMD 发布的 BLAS 线性代数数学库的 OpenCL 版本,里面包含矩阵乘法等各种常用的数学计算函数,性能已经过优化,比较成熟。当然它不是 C# 的库,而是 C 的库,因此要借助另一个库,才能让 C# 调用它。


clMathLibraries-dotnet 库

C#对CLBLAS库的封装。包含了CLBLAS,CLFFT,CLSparse三个库。FFT自然是傅里叶变换。Sparse则是稀疏矩阵运算。都是常用数学库。


以下贴一个矩阵乘法的测试案例。OpenCLMode 是本人对 NOpenCL 的二次封装,方便调用。


using CLMathLibraries.CLBLAS;
using NOpenCL;
using NOpenCL.SafeHandles;
using OpenCLMode.OCL;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using System.Windows.Forms;
using Buffer = NOpenCL.Buffer;

using size_t = System.UInt32;
using cl_float = System.Single;
using cl_double = System.Double;
using cl_uint = System.UInt32;
using cl_mem = System.IntPtr;
using cl_event = System.IntPtr;
using cl_command_queue = System.IntPtr;
using TestTools;

namespace CAmathTest
{
    class Program
    {
        /// <summary>
        /// Compares two values for equality, prints the comparison and its outcome to the
        /// console, and returns the outcome.
        /// </summary>
        /// <param name="a">Left-hand value; may be <c>null</c>.</param>
        /// <param name="b">Right-hand value; may be <c>null</c>.</param>
        /// <returns><c>true</c> when both values are equal (two nulls count as equal).</returns>
        public static bool ConsoleEquale(object a, object b)
        {
            // The static Equals handles a null left operand instead of throwing
            // NullReferenceException, and otherwise delegates to a.Equals(b).
            bool re = object.Equals(a, b);
            Console.WriteLine("state:{0} equale {1} is {2}", a, b, re);
            return re;
        }


        /// <summary>
        /// Benchmarks clBLAS single-precision GEMM (C = alpha*A*B + beta*C) on the command
        /// queue exposed by <c>oclModelBase.OCLBase</c>, prints the average time per call and
        /// the resulting GFLOPS figure, then reads the result back. A CPU cross-check is
        /// present but disabled through the <c>mode2cpu</c> flag.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Source of the author's own OpenCL math kernels; not strictly required for this test.
            string dirbase = System.AppDomain.CurrentDomain.BaseDirectory;
            string filename = Path.GetPathRoot(dirbase) + @"projectHJ\UTestOpenCLMode\OCL\OCLCode2.c";
            if (!File.Exists(filename))
            {
                filename = Application.StartupPath + @"\OCL\OCLCode2.c";
            }

            string code = OpenCLMode.OCL.oclModel.LoadCode(filename);
            oclModel oclm = new OpenCLMode.OCL.oclModel(code);

            var s1 = CLBLAS.Setup();
            bool state = ConsoleEquale(s1, CLBLASStatus.clblasSuccess);
            if (!state)
            {
                return;
            }

            // Problem size: A is M x K, B is K x N, C is M x N (row-major, see the Sgemm call).
            const int M = 1024;
            const int N = 1024;
            const int K = 512;

            //float alpha = 10f;
            float alpha = 1f;
            //float beta = 20f;
            float beta = 1f;

            float[] A = new float[M * K];
            uint lda = K;        /* i.e. lda = K */

            float[] B = new float[K * N];
            uint ldb = N;        /* i.e. ldb = N */

            float[] C = new float[M * N];
            uint ldc = N;        /* i.e. ldc = N */

            RandomData(A);
            RandomData(B);

            Buffer Abuffer = oclm.BufferWriteAddCache(A, "A");
            Buffer Bbuffer = oclm.BufferWriteAddCache(B, "B");
            Buffer Cbuffer = oclm.CreateBufferFloatArray(C.Length, "C");
            // Equivalent OpenCL calls in C++:
            //err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0,
            //    M * K * sizeof(*A), A, 0, NULL, NULL);
            //err = clEnqueueWriteBuffer(queue, bufB, CL_TRUE, 0,
            //    K * N * sizeof(*B), B, 0, NULL, NULL);
            //err = clEnqueueWriteBuffer(queue, bufC, CL_TRUE, 0,
            //    M * N * sizeof(*C), C, 0, NULL, NULL);

            // Add a reference to each safe handle before handing its raw pointer to clBLAS;
            // the matching DangerousRelease calls follow the benchmark.
            oclModelBase.OCLBase.CommandQueue.DangerousAddRef();
            if (!oclModelBase.OCLBase.CommandQueue.IsDangerousAddRefOk)
            {
                Console.WriteLine("error CommandQueue.IsDangerousAddRefOk:" + oclModelBase.OCLBase.CommandQueue.IsDangerousAddRefOk);
            }

            Abuffer.DangerousAddRef();
            Bbuffer.DangerousAddRef();
            Cbuffer.DangerousAddRef();

            IntPtr aintptr = Abuffer.DangerousGetHandle();
            IntPtr bintptr = Bbuffer.DangerousGetHandle();
            IntPtr cintptr = Cbuffer.DangerousGetHandle();
            IntPtr queue = oclModelBase.OCLBase.CommandQueue.DangerousGetHandle();

            var error = CLBLASStatus.clblasSuccess;
            TimeToolConsole ttc = new TimeToolConsole(); // timer
            int p1 = 12;
            int p2 = 50;

            // NOTE(review): the element type was lost in the published listing ("List lts = new List()").
            // TimeSpan is inferred from the subtraction / TotalMilliseconds / "ts +=" usage below;
            // confirm against the return type of TimeToolConsole.End2().
            List<TimeSpan> lts = new List<TimeSpan>();

            // The queue array does not change between calls, so build it once.
            IntPtr[] queues = new IntPtr[] { queue };

            for (int t = 1; t < p1; t++)
            {
                ttc.Begin(); // start timing

                // Each round enqueues p2 more GEMM calls than the previous one.
                int loop = p2 * t;
                Console.WriteLine("loop:" + loop);
                for (int i = 0; i < loop; i++)
                {
                    error = CLBLAS.Sgemm(CLBLASOrder.clblasRowMajor, CLBLASTranspose.clblasNoTrans, CLBLASTranspose.clblasNoTrans,
                             M, N, K,
                             alpha, aintptr, 0, lda,
                            bintptr, 0, ldb, beta,
                           cintptr, 0, ldc,
                             1, queues, 0, null, null);
                }
                // Presumably blocks until the queued work has finished - confirm in oclModel.
                oclm.RunWaitFinisth();
                lts.Add(ttc.End2()); // one measurement finished
            }

            Console.WriteLine(error);
            TimeSpan ts = new TimeSpan(0);
            int count = 0;

            // Accumulate the differences between consecutive rounds; each difference
            // corresponds to p2 additional GEMM calls.
            for (int i = 1; i < lts.Count; i++)
            {
                var temp = (lts[i] - lts[i - 1]);
                if (temp.TotalMilliseconds > 0)
                {
                    ts += temp;
                    count++;
                }
            }

            if (count == 0)
            {
                // Without this guard the average below would divide by zero.
                Console.WriteLine("no positive timing delta was measured; throughput not reported");
            }
            else
            {
                long AVGticks = ts.Ticks / count / p2;
                ts = new TimeSpan(AVGticks);
                Console.WriteLine("单次用时:{0}/s", ts.ToString());
                double xn = ts.TotalMilliseconds / 1000; // seconds
                // 2*M*N*K floating-point operations per GEMM; computed in double so that
                // larger problem sizes cannot overflow int arithmetic.
                xn = 2.0 * M * N * K / xn / Math.Pow(10, 9);
                Console.WriteLine("M={0},N={1},K={2}", M, N, K);
                Console.WriteLine("性能:{0} GFlops", xn.ToString("F3"));
            }

            // Reference figures recorded by the author:
            // GTX 650 Ti      (AIDA64 benchmark: 1.244 TFLOPS = 1244 GFLOPS)
            // CPU E3-1230 v2  (AIDA64 benchmark: 0.229 TFLOPS = 229 GFLOPS)

            // Task: i=64 j=64 k=64, compute c = c + a*b
            // R9 280X
            // a=0.000029
            // 18 GFLOPS

            // Task: i=32 j=32 k=32, compute c = c + a*b
            // R9 280X
            // a=0.00001578
            // 4.15 GFLOPS

            // The results above come from hand-written OpenCL code; the results below use clBLAS (2017-09-26).
            // -------------- clBLAS --------------
            // Task: i=512 j=512 k=512, compute c = c + a*b
            // clBLAS, GTX 650 Ti
            // a=0.009515s
            // 282 GFLOPS

            // Task: i=1024 j=1024 k=512, compute c = c + a*b
            // clBLAS, R9 280X
            // 2219.3 GFLOPS

            ///* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */
            //err = clblasSgemm(clblasRowMajor, clblasNoTrans, clblasNoTrans,
            //    M, N, K,
            //    alpha, bufA, 0, lda,
            //    bufB, 0, ldb, beta,
            //    bufC, 0, ldc,
            //    1, &queue, 0, NULL, &event);

            ///* Wait for calculations to be finished. */
            //err = clWaitForEvents(1, &event);

            Abuffer.DangerousRelease();
            Bbuffer.DangerousRelease();
            Cbuffer.DangerousRelease();

            ///* Fetch results of calculations from GPU memory. */
            //err = clEnqueueReadBuffer(queue, bufC, CL_TRUE, 0,
            //    M * N * sizeof(*result),
            //    result, 0, NULL, NULL);

            oclm.ReadBuffer(C, Cbuffer);

            oclModelBase.OCLBase.CommandQueue.DangerousRelease();

            CLBLAS.Teardown();

            // Optional CPU cross-check; disabled by default.
            bool mode2cpu = false;
            if (mode2cpu)
            {
                float[] c2 = new cl_float[M * N];

                for (int t = 0; t < p1; t++)
                {
                    ttc.Begin();

                    int loop = p2 * t;
                    Console.WriteLine("loop:" + loop);

                    for (int i = 0; i < loop; i++)
                    {
                        Dot(c2, A, B, M, N, K);
                    }
                    ttc.End2();
                }

                // Verify correctness: count elements whose difference exceeds 1% of the smaller value.
                int countError = 0;
                for (int m = 0; m < M * N; m++)
                {
                    if (Math.Abs(c2[m] - C[m]) > Math.Min(c2[m], C[m]) * 0.01)
                    {
                        countError++;
                    }
                }

                Console.WriteLine("data err:" + DateTime.Now.ToString("HH:mm:ss:fff"));

                Console.WriteLine("countError=" + countError);
            }

            // Setup/teardown usage of the other wrapped libraries, kept for reference:
            //CLSparse.Setup();

            //var s = CLBLAS.Setup();
            //do something...
            //CLBLAS.Teardown();
            ////Console.WriteLine("right:" + (s == CLBLASStatus.clblasSuccess));
            //ConsoleEquale(s, CLBLASStatus.clblasSuccess);

            //var setupData = new CLFFTSetupData();
            //var s2 = CLFFT.Setup(setupData);
            //ConsoleEquale(s2, CLFFTStatus.CL_SUCCESS);
            //ConsoleEquale(CLFFT.Teardown(), CLFFTStatus.CL_SUCCESS);

            //ConsoleEquale(CLSparse.Setup(), CLSparseStatus.Success);
            //ConsoleEquale(CLSparse.Teardown(), CLSparseStatus.Success);

            Console.WriteLine("over");
            Console.ReadKey();

        }

        // Shared pseudo-random generator used by both RandomData overloads below.
        static Random rand = new Random();

        /// <summary>
        /// Overwrites every element of a two-dimensional array with a value obtained from
        /// <c>rand.NextDouble()</c> narrowed to <c>float</c>, visiting elements row by row.
        /// </summary>
        /// <param name="a">Array to fill in place.</param>
        public static void RandomData(float[,] a)
        {
            int rowCount = a.GetLength(0);
            int columnCount = a.GetLength(1);

            for (int row = 0; row < rowCount; row++)
            {
                for (int column = 0; column < columnCount; column++)
                {
                    double sample = rand.NextDouble();
                    a[row, column] = (float)sample;
                }
            }
        }



        /// <summary>
        /// Overwrites every element of a one-dimensional array with a value obtained from
        /// <c>rand.NextDouble()</c> narrowed to <c>float</c>, in index order.
        /// </summary>
        /// <param name="a">Array to fill in place.</param>
        public static void RandomData(float[] a)
        {
            int length = a.Length;
            int index = 0;
            while (index < length)
            {
                double sample = rand.NextDouble();
                a[index] = (float)sample;
                index++;
            }
        }


        public static void Dot(float[] c, float[] a, float[] b, int M, int N, int K) // mnozy dwie macierze 2D
        {
            for (int i = 0; i < M; i++)
            {
                if (M > 255 && N > 255)
                {
                    Parallel.For(0, N, j =>
                    {
                        //c[i * N + j] = 0;
                        for (int k = 0; k < K; k++) // OR k 255 && c.GetLength(1) > 255)
                    {
                        Parallel.For(0, c.GetLength(1), j =>
                        {
                            c[i, j] = 0;
                            for (int k = 0; k < a.GetLength(1); k++) // OR k



你可能感兴趣的:(C#学习,OpenCL)