pycuda 之 安装与简单使用

 

pycuda安装:

    (1)查看CUDA版本:cat /usr/local/cuda/version.txt   (目前实验CUDA版本为:CUDA Version 9.0.176)

    (2)查看cudnn版本:cat /usr/local/cuda/include/cudnn.h | grep CUDNN_MAJOR -A 2   (目前实验cudnn版本:7.0)

    (3)直接安装:pip install pycuda==2017.1.1       (目前实验pycuda版本:2017.1.1)

    注意:pycuda 各版本的信息可以在 pycuda 官网查看(参看 pycuda 与 CUDA 版本之间的对应关系)。

 

实验:

hello_gpu.py

import pycuda.autoinit
import pycuda.driver as drv
import numpy

from pycuda.compiler import SourceModule
# Compile the CUDA C source at runtime. The kernel multiplies two float
# arrays element-wise; each thread handles the element at its threadIdx.x,
# so the kernel is only meaningful when launched as a single block.
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b)
{
  const int i = threadIdx.x;
  dest[i] = a[i] * b[i];
}
""")

# Python-callable handle to the compiled kernel.
multiply_them = mod.get_function("multiply_them")

# Two random float32 input vectors and a zero-filled output of the same shape.
a = numpy.random.randn(400).astype(numpy.float32)
b = numpy.random.randn(400).astype(numpy.float32)
dest = numpy.zeros_like(a)

# drv.In copies a host array to the device before the launch and drv.Out
# copies the result back afterwards (see the PyCUDA driver documentation).
kernel_args = (drv.Out(dest), drv.In(a), drv.In(b))

# One block of 400 threads: one thread per array element.
multiply_them(*kernel_args, block=(400, 1, 1), grid=(1, 1))

# Difference between the GPU result and the NumPy reference product.
print(dest - a * b)

或者:

import pycuda.autoinit
import pycuda.driver as drv
import numpy

from pycuda.compiler import SourceModule
# Multi-block variant of the element-wise multiply kernel.
#
# The kernel receives the element count `n` and guards against it, so the
# launch no longer has to cover the arrays with an exact multiple of the
# block size (the unguarded version was only correct because 40 * 10 == 400).
mod = SourceModule("""
__global__ void multiply_them(float *dest, float *a, float *b, int n)
{
  // Global index across all blocks; threadIdx.x alone would only be
  // unique within a single block.
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  // The grid is rounded up to whole blocks, so surplus threads in the
  // last block must not touch memory.
  if (i < n)
  {
    dest[i] = a[i] * b[i];
  }
}
""")

# Python-callable handle to the compiled kernel.
multiply_them = mod.get_function("multiply_them")

# Problem size and launch shape are defined once and reused everywhere.
n = 400
threads_per_block = 40
# Ceiling division: enough blocks to cover all n elements (10 for n = 400).
blocks = (n + threads_per_block - 1) // threads_per_block

a = numpy.random.randn(n).astype(numpy.float32)
b = numpy.random.randn(n).astype(numpy.float32)

dest = numpy.zeros_like(a)
# numpy.int32 matches the kernel's `int n`: PyCUDA passes NumPy scalars by
# value using the size of their dtype.
multiply_them(
        drv.Out(dest), drv.In(a), drv.In(b), numpy.int32(n),
        block=(threads_per_block, 1, 1), grid=(blocks, 1))

# Difference between the GPU result and the NumPy reference product.
print(dest - a * b)

 

cpu与gpu计算效率的对比:  test.py

import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from timeit import default_timer as timer

from pycuda.compiler import SourceModule
# Benchmark kernel: for every index i < N it overwrites a[i] with
#   (a[i] * 10 + 2) * ((b[i] + 2) * 10 - 5) * 5
# which is the same expression the CPU reference evaluates with NumPy.
#
# The global index is computed and stored as size_t so that it has the same
# type as N: this avoids a mixed signed/unsigned comparison and 32-bit
# wrap-around of blockIdx.x * blockDim.x for very large launches.
mod = SourceModule("""
__global__ void func(float *a, float *b, size_t N)
{
 const size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
 // The grid is rounded up to whole blocks; surplus threads do nothing.
 if (i >= N)
 {
  return;
 }
 const float temp_a = a[i];
 const float temp_b = b[i];
 a[i] = (temp_a * 10 + 2 ) * ((temp_b + 2) * 10 - 5 ) * 5;
}
""")

# Python-callable handle to the compiled kernel.
func = mod.get_function("func")

def test(N):
    """Time the same element-wise formula on the GPU and on the CPU.

    Generates two random float32 vectors of length ``N``, runs the ``func``
    kernel on them, evaluates the same expression with NumPy, prints both
    wall-clock times and finally the smallest and largest element of the
    difference between the two results.

    Args:
        N: number of float32 elements in each input vector.
    """
    print("N = %d" % N)

    n = int(N)

    a = np.random.randn(n).astype(np.float32)
    b = np.random.randn(n).astype(np.float32)
    # The kernel overwrites `a` in place, so keep a copy for the CPU run.
    aa = a.copy()

    # GPU run
    n_threads = 256
    # Integer ceiling division: enough blocks to cover all n elements.
    n_blocks = (n + n_threads - 1) // n_threads
    start = timer()
    # The kernel declares N as size_t. PyCUDA passes NumPy scalars by value
    # using the size of their dtype, so a pointer-sized unsigned integer is
    # passed here instead of a 4-byte np.int32.
    # The timed interval also contains the host<->device copies performed by
    # drv.InOut / drv.In, not just the kernel execution.
    func(
        drv.InOut(a), drv.In(b), np.uintp(n),
        block=(n_threads, 1, 1), grid=(n_blocks, 1))
    run_time = timer() - start
    print("gpu run time %f seconds " % run_time)

    # CPU run
    start = timer()
    aa = (aa * 10 + 2 ) * ((b + 2) * 10 - 5 ) * 5
    run_time = timer() - start

    print("cpu run time %f seconds " % run_time)

    # Check result: ndarray.min()/max() reduce in native code, whereas the
    # builtin min()/max() would iterate every element in Python.
    r = a - aa
    print(r.min(), r.max())

def main():
    """Run the benchmark for nine sizes: 10, 20, ..., 90 times 2**20 elements."""
    step_elements = 10 * 1024 * 1024
    sizes = range(step_elements, 10 * step_elements, step_elements)
    for round_no, total in enumerate(sizes, start=1):
        print("------------%d---------------" % round_no)
        test(total)


if __name__ == '__main__':
    main()

结果:

------------1---------------
N = 10485760
gpu run time 0.023215 seconds
cpu run time 0.068797 seconds
-0.0014648438 0.0014648438
------------2---------------
N = 20971520
gpu run time 0.032089 seconds
cpu run time 0.124529 seconds
-0.0014648438 0.0014648438
------------3---------------
N = 31457280
gpu run time 0.046203 seconds
cpu run time 0.187157 seconds
-0.0014648438 0.0014648438
------------4---------------
N = 41943040
gpu run time 0.055805 seconds
cpu run time 0.244947 seconds
-0.0014648438 0.0014648438
------------5---------------
N = 52428800
gpu run time 0.075256 seconds
cpu run time 0.317744 seconds
-0.0014648438 0.0014648438
------------6---------------
N = 62914560
gpu run time 0.080560 seconds
cpu run time 0.378609 seconds
-0.0014648438 0.0014648438
------------7---------------
N = 73400320
gpu run time 0.101881 seconds
cpu run time 0.439889 seconds
-0.0014648438 0.0014648438
------------8---------------
N = 83886080
gpu run time 0.112525 seconds
cpu run time 0.504098 seconds
-0.0014648438 0.0014648438
------------9---------------
N = 94371840
gpu run time 0.139425 seconds
cpu run time 0.576029 seconds
-0.0014648438 0.0014648438

 

参考文献:

(1)GPU共享内存:pycuda使用教程:https://blog.csdn.net/qq_36387683/article/details/81075870

(2)pycuda教程:https://documen.tician.de/pycuda/tutorial.html

(3)理论性指导可以看这篇:https://blog.csdn.net/hujingshuang/article/details/53097222

你可能感兴趣的:(Python学习,pycuda安装,pycuda使用,pycuda性能比较)