【CUDA-FORTRAN】实现任意维度矩阵乘

此前我写过一篇关于矩阵乘法的文章,但那个实现只支持方阵,详见 https://blog.csdn.net/xll_bit/article/details/103271788?spm=1001.2014.3001.5501

这次更新的矩阵乘法支持任意维度的矩阵(a 为 nx×ny,b 为 ny×nz,不要求是方阵),同样提供全局内存和共享内存两个版本的实现。

module simpleOps_m
  ! Integer matrix-multiplication kernels (c = a x b) for CUDA Fortran.
  !
  ! Both kernels read the problem size from the array arguments themselves
  ! (size(a,1), size(a,2), size(b,2)), so the matrices do not have to be
  ! square.  Every thread computes at most one element c(i,j), where i is
  ! derived from the x launch dimension and j from the y launch dimension.
  implicit none
contains

  ! Global-memory kernel: each thread reads one row of a and one column of b
  ! directly from the dummy arrays and writes one element of c.
  !
  !   a : left operand; its extents give the row count and the inner dimension
  !   b : right operand; indexed as b(k,j) for k = 1..size(a,2), j = 1..size(b,2)
  !   c : result; c(i,j) is written for i <= size(a,1) and j <= size(b,2)
  attributes(global) subroutine mulmatrix(a, b, c)
    implicit none
    integer, intent(in)    :: a(:,:), b(:,:)
    integer, intent(inout) :: c(:,:)
    integer :: i, j, k, n(3)
    integer :: tmp

    ! 1-based global row (i) and column (j) owned by this thread.
    i = (blockIdx%x-1)*blockDim%x + threadIdx%x
    j = (blockIdx%y-1)*blockDim%y + threadIdx%y
    n(1) = size(a,1)   ! rows of a
    n(2) = size(a,2)   ! inner (summation) dimension
    n(3) = size(b,2)   ! columns of b

    ! Threads that fall outside the result matrix do nothing.
    if (i <= n(1) .and. j <= n(3)) then
      tmp = 0
      do k = 1, n(2)
        tmp = tmp + a(i,k) * b(k,j)
      end do
      c(i,j) = tmp
    end if
  end subroutine mulmatrix

  ! Shared-memory (tiled) kernel: the inner dimension is processed in chunks
  ! of TILE_SIZE; for every chunk the thread block stages one tile of a and
  ! one tile of b in shared memory and accumulates the partial products.
  !
  !   a, b, c   : same meaning as in mulmatrix
  !   TILE_SIZE : edge length of the square shared tiles (passed by value)
  !
  ! Launch requirements:
  !   * The tiles are indexed with threadIdx%x / threadIdx%y, so the kernel
  !     must be launched with blockDim%x == blockDim%y == TILE_SIZE.
  !   * sa and sb are automatic shared arrays (their bounds depend on a value
  !     argument).  Per the CUDA Fortran Programming Guide such arrays are
  !     carved out of the dynamic shared-memory area, so the launch must pass
  !     at least 2*TILE_SIZE*TILE_SIZE integers worth of bytes as the third
  !     chevron argument.
  attributes(global) subroutine mulmatrix_shared(a, b, c, TILE_SIZE)
    implicit none
    integer, intent(in)    :: a(:,:), b(:,:)
    integer, intent(inout) :: c(:,:)
    integer, value :: TILE_SIZE
    integer, shared :: sa(TILE_SIZE,TILE_SIZE), sb(TILE_SIZE,TILE_SIZE)
    integer :: i, j, k, n(3), ii, jj, s
    integer :: tmp

    n(1) = size(a,1)   ! rows of a
    n(2) = size(a,2)   ! inner (summation) dimension
    n(3) = size(b,2)   ! columns of b

    ! Global element (i,j) owned by this thread and its position (ii,jj)
    ! inside the thread block / tile.
    i = (blockIdx%x-1)*blockDim%x + threadIdx%x
    j = (blockIdx%y-1)*blockDim%y + threadIdx%y
    ii = threadIdx%x
    jj = threadIdx%y

    tmp = 0
    ! s is the first inner-dimension index covered by the current tile pair.
    do s = 1, n(2), TILE_SIZE
      ! Wait until every thread has finished reading the previous tiles
      ! before they are overwritten.
      call syncthreads()

      ! The tile of a moves left to right along row block i, the tile of b
      ! moves top to bottom along column block j; the two directions must
      ! not be mixed up.  Positions past the matrix edge are zero-filled so
      ! they contribute nothing to the sum.
      if (i <= n(1) .and. jj+s-1 <= n(2)) then
        sa(ii,jj) = a(i, jj+s-1)
      else
        sa(ii,jj) = 0
      end if
      if (ii+s-1 <= n(2) .and. j <= n(3)) then
        sb(ii,jj) = b(ii+s-1, j)
      else
        sb(ii,jj) = 0
      end if

      ! Both tiles must be completely loaded before anyone reads them.
      call syncthreads()

      do k = 1, TILE_SIZE
        tmp = tmp + sa(ii,k) * sb(k,jj)
      end do
    end do

    ! All threads take part in the tile loads above (the barriers require
    ! it); only threads inside the result matrix store a value.
    if (i <= n(1) .and. j <= n(3)) then
      c(i,j) = tmp
    end if
  end subroutine mulmatrix_shared
end module simpleOps_m



program incrementTest
  ! Host driver for the matrix-multiply kernels in simpleOps_m.
  !
  ! Builds an nx x ny matrix a and an ny x nz matrix b, computes the reference
  ! product d on the CPU (timed with cpu_time), then launches both GPU kernels
  ! NUM_REPS times between two CUDA events and compares the device result
  ! with the reference.  Both kernels write to the same c_d, so the array
  ! that is finally checked is the one left by mulmatrix_shared, and the
  ! reported GPU time per repetition covers one launch of each kernel.
  use cudafor
  use simpleOps_m
  implicit none
  integer, parameter :: nx=1000, ny=1024, nz=1024, NUM_REPS = 100
  integer, parameter :: TILE_SIZE = 16
  integer :: a(nx,ny), b(ny,nz), c(nx,nz), d(nx,nz)
  integer, device :: a_d(nx,ny), b_d(ny, nz), c_d(nx, nz)
  type(dim3) :: grid, tBlock
  integer :: i, j, k, istat
  integer :: shared_bytes
  type (cudaEvent) :: startEvent, stopEvent
  real :: gtime, t1, t2, ctime

  ! Deterministic test data.
  do i = 1,nx
        do j = 1,ny
                a(i,j) = i - j
        enddo
  enddo
  do i = 1,ny
        do j = 1,nz
                b(i,j) = j - i
        enddo
  enddo

  ! CPU reference product (equivalent to d = matmul(a,b)), timed as the
  ! baseline for the speedup figure.
  call cpu_time(t1)
  do i = 1,nx
        do j = 1, nz
                d(i,j) = 0
                do k = 1, ny
                        d(i,j) = d(i, j) + a(i,k) * b(k,j)
                enddo
        enddo
  enddo
  call cpu_time(t2)
  ctime = t2 - t1

  ! One TILE_SIZE x TILE_SIZE thread block per output tile; the grid is
  ! rounded up so partial tiles at the matrix edge are covered.
  tBlock = dim3(TILE_SIZE,TILE_SIZE,1)
  grid = dim3((nx + TILE_SIZE - 1)/TILE_SIZE, &
              (nz + TILE_SIZE - 1)/TILE_SIZE, 1)

  ! mulmatrix_shared declares two automatic shared TILE_SIZE x TILE_SIZE
  ! integer arrays; their storage comes from the dynamic shared-memory size
  ! given as the third launch-configuration argument (in bytes).
  shared_bytes = 2 * TILE_SIZE * TILE_SIZE * (storage_size(a) / 8)

  a_d = a
  b_d = b
  istat = cudaEventCreate(startEvent)
  istat = cudaEventCreate(stopEvent)

  istat = cudaEventRecord(startEvent, 0)
  do i=1, NUM_REPS
        call mulmatrix<<<grid, tBlock>>>(a_d, b_d, c_d)
        call mulmatrix_shared<<<grid, tBlock, shared_bytes>>>(a_d, b_d, c_d, TILE_SIZE)
  end do
  istat = cudaEventRecord(stopEvent, 0)
  istat = cudaEventSynchronize(stopEvent)
  ! Kernel launches report failures asynchronously; surface them here
  ! instead of silently comparing against stale device data.
  if (istat == cudaSuccess) istat = cudaGetLastError()
  if (istat /= cudaSuccess) then
     write(*,*) 'CUDA error: ', trim(cudaGetErrorString(istat))
     stop 1
  endif
  istat = cudaEventElapsedTime(gtime, startEvent, stopEvent)
  c = c_d

  ! Integer data: the GPU result must match the reference exactly.
  if (maxval(abs(c-d)) /= 0) then
     write(*,*) '**** Program Failed ****', maxval(abs(c-d))
     write(*,*) '*** c gpumatmul ***'
     write(*,*) c
     write(*,*) '*** d matmul ***'
     write(*,*) d
  else
     write(*,*) 'Program Passed'
     write(*,*) 'cpu time(ms):', ctime * 1000
     write(*,*) 'gpu time(ms):', gtime/NUM_REPS
     write(*,*) 'speedup:', ctime * 1000 / gtime * NUM_REPS
  endif
  istat = cudaEventDestroy(startEvent)
  istat = cudaEventDestroy(stopEvent)
end program incrementTest

 

你可能感兴趣的:(FORTRAN,CUDA,linux,fortran,算法)