I previously wrote a post on matrix multiplication, but it only handled square matrices; see https://blog.csdn.net/xll_bit/article/details/103271788?spm=1001.2014.3001.5501
This updated version multiplies matrices of arbitrary (compatible) dimensions, again with both a global-memory and a shared-memory implementation. Each thread computes one element c(i,j); the shared-memory version additionally tiles the inner dimension in TILE_SIZE-wide chunks to reduce global-memory traffic.
module simpleOps_m
contains
  ! Global-memory version: each thread computes one element c(i,j)
  ! directly from global memory.
  attributes(global) subroutine mulmatrix(a, b, c)
    implicit none
    integer :: a(:,:), b(:,:), c(:,:)
    integer :: i, j, k, n(3)
    integer :: tmp

    i = (blockIdx%x-1)*blockDim%x + threadIdx%x   ! global row index
    j = (blockIdx%y-1)*blockDim%y + threadIdx%y   ! global column index
    n(1) = size(a,1)   ! rows of a (and of c)
    n(2) = size(a,2)   ! inner dimension
    n(3) = size(b,2)   ! columns of b (and of c)
    if (i <= n(1) .and. j <= n(3)) then
      tmp = 0
      do k = 1, n(2)
        tmp = tmp + a(i,k) * b(k,j)
      enddo
      c(i,j) = tmp
    endif
  end subroutine mulmatrix
  ! Shared-memory version: the inner (k) dimension is processed in
  ! TILE_SIZE-wide tiles staged through shared memory. Because TILE_SIZE
  ! is a dummy argument rather than a compile-time constant, sa and sb
  ! are automatic arrays in dynamic shared memory (see the note after
  ! the module).
  attributes(global) subroutine mulmatrix_shared(a, b, c, TILE_SIZE)
    implicit none
    integer :: a(:,:), b(:,:), c(:,:)
    integer, value :: TILE_SIZE
    integer :: i, j, k, n(3), ii, jj, s
    integer :: tmp
    integer, shared :: sa(TILE_SIZE,TILE_SIZE), sb(TILE_SIZE,TILE_SIZE)

    n(1) = size(a,1)
    n(2) = size(a,2)
    n(3) = size(b,2)
    i = (blockIdx%x-1)*blockDim%x + threadIdx%x   ! global row index
    j = (blockIdx%y-1)*blockDim%y + threadIdx%y   ! global column index
    ii = threadIdx%x                              ! row within the tile
    jj = threadIdx%y                              ! column within the tile
    tmp = 0
    do s = 1, n(2), TILE_SIZE
      call syncthreads()
      ! Zero-pad so partial tiles at the matrix edges contribute nothing.
      sa(ii,jj) = 0
      sb(ii,jj) = 0
      ! The tile slides left to right across a and top to bottom down b;
      ! be careful not to mix up the two directions.
      if (i <= n(1) .and. jj+s-1 <= n(2)) then
        sa(ii,jj) = a(i, jj+s-1)
      endif
      if (ii+s-1 <= n(2) .and. j <= n(3)) then
        sb(ii,jj) = b(ii+s-1, j)
      endif
      call syncthreads()
      do k = 1, TILE_SIZE
        tmp = tmp + sa(ii,k) * sb(k,jj)
      enddo
    enddo
    if (i <= n(1) .and. j <= n(3)) then
      c(i,j) = tmp
    endif
  end subroutine mulmatrix_shared
end module simpleOps_m
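One detail worth calling out: sa and sb are automatic shared-memory arrays whose extent comes from the TILE_SIZE dummy argument rather than a compile-time constant, so CUDA Fortran places them in dynamic shared memory. The launch must then pass the required byte count as the third chevron argument; with two integer tiles that is 2*TILE_SIZE*TILE_SIZE*4 bytes. A sketch of the launch form (this matches the call used in the test program below):

call mulmatrix_shared<<<grid, tBlock, 2*TILE_SIZE*TILE_SIZE*4>>>(a_d, b_d, c_d, TILE_SIZE)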
program matmulTest
  use cudafor
  use simpleOps_m
  implicit none
  integer, parameter :: nx=1000, ny=1024, nz=1024, NUM_REPS = 100
  integer, parameter :: TILE_SIZE = 16
  integer :: a(nx,ny), b(ny,nz), c(nx,nz), d(nx,nz)
  integer, device :: a_d(nx,ny), b_d(ny,nz), c_d(nx,nz)
  type(dim3) :: grid, tBlock
  integer :: i, j, k, istat
  type(cudaEvent) :: startEvent, stopEvent
  real :: gtime, t1, t2, ctime

  ! Initialize the host matrices
  do i = 1, nx
    do j = 1, ny
      a(i,j) = i - j
    enddo
  enddo
  do i = 1, ny
    do j = 1, nz
      b(i,j) = j - i
    enddo
  enddo

  ! CPU reference result, timed with cpu_time
  call cpu_time(t1)
  do i = 1, nx
    do j = 1, nz
      d(i,j) = 0
      do k = 1, ny
        d(i,j) = d(i,j) + a(i,k) * b(k,j)
      enddo
    enddo
  enddo
  call cpu_time(t2)
  ctime = t2 - t1
  !d = matmul(a,b)

  ! One thread per output element; round the grid up so dimensions
  ! that are not multiples of TILE_SIZE are still covered.
  tBlock = dim3(TILE_SIZE, TILE_SIZE, 1)
  grid = dim3(ceiling(real(nx)/tBlock%x), &
              ceiling(real(nz)/tBlock%y), 1)
  a_d = a
  b_d = b
  istat = cudaEventCreate(startEvent)
  istat = cudaEventCreate(stopEvent)
  istat = cudaEventRecord(startEvent, 0)
  do i = 1, NUM_REPS
    call mulmatrix<<<grid, tBlock>>>(a_d, b_d, c_d)
    ! The shared-memory kernel needs dynamic shared memory for its two
    ! integer tiles: 2 * TILE_SIZE*TILE_SIZE * 4 bytes.
    call mulmatrix_shared<<<grid, tBlock, 2*TILE_SIZE*TILE_SIZE*4>>>(a_d, b_d, c_d, TILE_SIZE)
  end do
  istat = cudaEventRecord(stopEvent, 0)
  istat = cudaEventSynchronize(stopEvent)
  istat = cudaEventElapsedTime(gtime, startEvent, stopEvent)
  c = c_d   ! holds the result of the last kernel run (the shared version)
  ! The matrices are integers, so the check can be exact
  if (maxval(abs(c - d)) /= 0) then
    write(*,*) '**** Program Failed ****', maxval(abs(c-d))
    write(*,*) '*** c gpumatmul ***'
    write(*,*) c
    write(*,*) '*** d matmul ***'
    write(*,*) d
  else
    write(*,*) 'Program Passed'
    write(*,*) 'cpu time(ms):', ctime * 1000
    ! gtime covers both kernels, so this is the time per iteration pair
    write(*,*) 'gpu time(ms):', gtime / NUM_REPS
    write(*,*) 'speedup:', ctime * 1000 / gtime * NUM_REPS
  endif
  istat = cudaEventDestroy(startEvent)
  istat = cudaEventDestroy(stopEvent)
end program matmulTest
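To build and run with the NVIDIA HPC SDK (the file name matmul.cuf is just an example; older PGI toolchains used pgfortran -Mcuda in place of nvfortran):

nvfortran -O3 matmul.cuf -o matmul
./matmul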