340 lines
10 KiB
Fortran
340 lines
10 KiB
Fortran
|
#define MAMUL1_VERSION "1.0.0"
|
||
|
|
||
|
#define magma_devptr_t integer(kind=8)
|
||
|
subroutine print_usage(prog_path)
|
||
|
character(len=*), intent(in) :: prog_path
|
||
|
character(len=80) :: build_variant
|
||
|
#if defined(USE_MAGMA_DGEMM_GPU)
|
||
|
build_variant='gpu'
|
||
|
#elif defined(USE_DGEMM)
|
||
|
build_variant='cpu'
|
||
|
#else
|
||
|
build_variant='unknown'
|
||
|
#endif
|
||
|
write(6,'("mamul1 v",a," (variant:",a,"): benchmark performs a square matrix multiplication in double precision")') MAMUL1_VERSION, trim(build_variant);
|
||
|
write(6,'()');
|
||
|
write(6,'("Usage: ",a," <NDIM> <NUM_LOOPS>")') trim(prog_path);
|
||
|
write(6,'(" <NDIM> positive integer representing the size of the square matrices to multiply ")');
|
||
|
write(6,'(" <NUM_LOOPS> positive integer representing the number of times the multiplication is performed")');
|
||
|
end subroutine
|
||
|
|
||
|
program mamul1
|
||
|
|
||
|
implicit none
|
||
|
|
||
|
|
||
|
integer :: argc, info, ndim, num_loops
|
||
|
|
||
|
character(len=32) :: arg0, arg1, arg2
|
||
|
|
||
|
|
||
|
call get_command_argument(0,arg0)
|
||
|
|
||
|
argc = command_argument_count()
|
||
|
if (argc /= 2) then
|
||
|
call print_usage(trim(arg0))
|
||
|
! write(6,'("Usage: ",a," NDIM NUM_LOOPS, where NDIM is a positive integer")') trim(arg0);
|
||
|
stop
|
||
|
end if
|
||
|
|
||
|
call get_command_argument(1,arg1,status=info)
|
||
|
if (info /= 0) then
|
||
|
write(6,'("Error reading argument: info = ",i2)') info
|
||
|
call print_usage(trim(arg0))
|
||
|
stop
|
||
|
end if
|
||
|
|
||
|
call get_command_argument(2,arg2,status=info)
|
||
|
if (info /= 0) then
|
||
|
write(6,'("Error reading argument: info = ",i2)') info
|
||
|
call print_usage(trim(arg0))
|
||
|
stop
|
||
|
end if
|
||
|
|
||
|
read(arg1,*,iostat=info) ndim
|
||
|
if (info /= 0) then
|
||
|
write(6,'("Error converting ndim argument to integer: info = ",i2)') info
|
||
|
call print_usage(trim(arg0))
|
||
|
stop
|
||
|
end if
|
||
|
|
||
|
read(arg2,*,iostat=info) num_loops
|
||
|
if (info /= 0) then
|
||
|
write(6,'("Error converting num_loops argument to integer: info = ",i2)') info
|
||
|
call print_usage(trim(arg0))
|
||
|
stop
|
||
|
end if
|
||
|
|
||
|
|
||
|
if (ndim < 1) then
|
||
|
call print_usage(trim(arg0))
|
||
|
stop
|
||
|
end if
|
||
|
|
||
|
call test_dgemm(ndim, num_loops)
|
||
|
|
||
|
stop
|
||
|
end program mamul1
|
||
|
|
||
|
subroutine set_random_seed(seed)
|
||
|
integer :: seed
|
||
|
integer :: seed_array_size
|
||
|
INTEGER, ALLOCATABLE :: seed_array (:)
|
||
|
CALL RANDOM_SEED (SIZE = seed_array_size) ! I is set to the size of
|
||
|
! ! the seed array
|
||
|
ALLOCATE (seed_array(seed_array_size))
|
||
|
seed_array = seed
|
||
|
CALL RANDOM_SEED (PUT=seed_array(1:seed_array_size))
|
||
|
end subroutine
|
||
|
|
||
|
subroutine print_matrix(mat, ndim)
|
||
|
implicit none
|
||
|
integer, parameter :: dp = kind(1.0d0)
|
||
|
real(dp), intent(in) :: mat(ndim, ndim)
|
||
|
integer, intent(in) :: ndim
|
||
|
integer :: irow
|
||
|
do irow = 1, ndim
|
||
|
write(6, *) mat(irow,:)
|
||
|
end do
|
||
|
end subroutine
|
||
|
|
||
|
! square matrix multiplication
|
||
|
subroutine sqmatmul(amat, bmat, cmat, ndim)
|
||
|
#if defined(USE_MAGMA_DGEMM_GPU)
|
||
|
use magma, only: magmaf_init, magmaf_finalize
|
||
|
use magma, only: magmaf_queue_create, magmaf_queue_destroy
|
||
|
use magma, only: magmaf_dmalloc, magmaf_free
|
||
|
use magma, only: magmaf_dsetmatrix, magmaf_dgetmatrix
|
||
|
use magma, only: magmablasf_dgemm
|
||
|
#endif
|
||
|
real*8, intent(in) :: amat(ndim,ndim)
|
||
|
real*8, intent(in) :: bmat(ndim,ndim)
|
||
|
real*8, intent(out) :: cmat(ndim,ndim)
|
||
|
integer :: lda, ldb, ldc
|
||
|
integer :: info
|
||
|
|
||
|
real :: time_before, time_after
|
||
|
integer(8) :: num_ops
|
||
|
real :: gflops
|
||
|
|
||
|
#ifdef USE_MAGMA_DGEMM_GPU
|
||
|
magma_devptr_t :: d_amat
|
||
|
magma_devptr_t :: d_bmat
|
||
|
magma_devptr_t :: d_cmat
|
||
|
magma_devptr_t :: queue !! really a CPU pointer
|
||
|
#endif
|
||
|
lda = ceiling(real(ndim)/32)*32
|
||
|
ldb = ceiling(real(ndim)/32)*32
|
||
|
ldc = ceiling(real(ndim)/32)*32
|
||
|
|
||
|
|
||
|
#if defined(USE_MAGMA_DGEMM_GPU)
|
||
|
!! allocate GPU memory
|
||
|
write(6,'("DEBUG: before matrix A gpu memory allocation (",i0," doubles)")') lda * ndim
|
||
|
info = magmaf_dmalloc( d_amat, lda*ndim )
|
||
|
if (d_amat == 0) then
|
||
|
print "(a)", "failed to allocate d_amat"
|
||
|
return
|
||
|
endif
|
||
|
write(6,'("DEBUG: before matrix B gpu memory allocation (",i0," doubles)")') ldb * ndim
|
||
|
info = magmaf_dmalloc( d_bmat, ldb*ndim )
|
||
|
if (d_bmat == 0) then
|
||
|
print "(a)", "failed to allocate d_bmat"
|
||
|
return
|
||
|
endif
|
||
|
write(6,'("DEBUG: before matrix C gpu memory allocation (",i0," doubles)")') ldc * ndim
|
||
|
info = magmaf_dmalloc( d_cmat, ldc*ndim )
|
||
|
if (d_cmat == 0) then
|
||
|
print "(a)", "failed to allocate d_cmat"
|
||
|
return
|
||
|
endif
|
||
|
|
||
|
! copy A to dA and B to dB
|
||
|
call magmaf_queue_create( 0, queue )
|
||
|
write(6,'("DEBUG: queue = ",i0)') queue
|
||
|
if (queue == 0) then
|
||
|
print "(a)", "failed to create a queue"
|
||
|
return
|
||
|
endif
|
||
|
|
||
|
write(6,*) 'DEBUG: copying matrix A from CPU to GPU memory'
|
||
|
call magmaf_dsetmatrix( ndim, ndim, amat, ndim, d_amat, lda, queue )
|
||
|
write(6,*) 'DEBUG: copying matrix B from CPU to GPU memory'
|
||
|
call magmaf_dsetmatrix( ndim, ndim, bmat, ndim, d_bmat, ldb, queue )
|
||
|
|
||
|
call cpu_time(time_before)
|
||
|
write (6,*) 'before magmablasf_dgemm, time=', time_before
|
||
|
|
||
|
call magmablasf_dgemm ('N', 'N', ndim, ndim, ndim, 1.0d0, d_amat, lda, d_bmat, ldb, 0.0d0, d_cmat, ldc, queue)
|
||
|
call magmaf_queue_sync(queue)
|
||
|
|
||
|
call cpu_time(time_after)
|
||
|
num_ops = real(ndim) * real(ndim) * real(ndim) * 2
|
||
|
gflops = num_ops / (time_after - time_before) / 1.0e9
|
||
|
write (6,*) 'after magmablasf_dgemm, time=', time_after
|
||
|
write (6,*) 'magmablasf_dgemm (from gpu memory to gpu memory) duration :', (time_after - time_before), '(', gflops, ' gflops)'
|
||
|
|
||
|
write(6,*) 'DEBUG: copying matrix C from GPU to CPU memory'
|
||
|
call magmaf_dgetmatrix( ndim, ndim, d_cmat, ldc, cmat, ndim, queue )
|
||
|
call magmaf_queue_destroy( queue )
|
||
|
|
||
|
info = magmaf_free(d_cmat)
|
||
|
info = magmaf_free(d_bmat)
|
||
|
info = magmaf_free(d_amat)
|
||
|
|
||
|
#endif
|
||
|
|
||
|
#ifdef USE_DGEMM
|
||
|
! subroutine dgemm ( character TRANSA,
|
||
|
! character TRANSB,
|
||
|
! integer M,
|
||
|
! integer N,
|
||
|
! integer K,
|
||
|
! double precision ALPHA,
|
||
|
! double precision, dimension(lda,*) A,
|
||
|
! integer LDA,
|
||
|
! double precision, dimension(ldb,*) B,
|
||
|
! integer LDB,
|
||
|
! double precision BETA,
|
||
|
! double precision, dimension(ldc,*) C,
|
||
|
! integer LDC
|
||
|
! )
|
||
|
call dgemm('N', 'N', ndim, ndim, ndim, 1.0d0, amat, ndim, bmat, ndim, 0.0d0, cmat, ndim)
|
||
|
#endif
|
||
|
|
||
|
end subroutine
|
||
|
|
||
|
subroutine check_cmat_element(cmat, row, col, amat, bmat, ndim)
|
||
|
real(8), intent(in) :: cmat(ndim, ndim)
|
||
|
integer, intent(in) :: row
|
||
|
integer, intent(in) :: col
|
||
|
real(8), intent(in) :: amat(ndim, ndim)
|
||
|
real(8), intent(in) :: bmat(ndim, ndim)
|
||
|
integer, intent(in) :: ndim
|
||
|
|
||
|
real(8) :: x
|
||
|
x = 0.0d0
|
||
|
do i = 1, ndim
|
||
|
x = x + amat(row, i) * bmat(i, col)
|
||
|
end do
|
||
|
|
||
|
write(6, '("expected cmat(", i0, ", ", i0, ")", e23.15e3)') row, col, x
|
||
|
write(6, '("computed cmat(", i0, ", ", i0, ")", e23.15e3)') row, col, cmat(row, col)
|
||
|
if (abs(cmat(row, col) - x) > 1.0e-8) then
|
||
|
stop 'a computed element has a wrong value'
|
||
|
end if
|
||
|
end subroutine
|
||
|
|
||
|
|
||
|
subroutine test_dgemm(ndim, num_loops)
|
||
|
#if defined(USE_MAGMA_DGEMM_GPU)
|
||
|
use magma, only: magmaf_init, magmaf_finalize
|
||
|
use magma, only: magmablasf_dgemm !, magmaf_dgemm_gpu
|
||
|
#endif
|
||
|
|
||
|
implicit none
|
||
|
integer, intent(in) :: ndim
|
||
|
integer, intent(in) :: num_loops
|
||
|
integer, parameter :: dp = kind(1.0d0)
|
||
|
real :: ct_start, ct_stop ! elapsed cpu time relative to an arbitrary fixed time. Expressed in seconds with the granularity of 1 microsecond
|
||
|
integer(8) :: num_ops
|
||
|
real :: gflops
|
||
|
|
||
|
integer :: sc_start, sc_stop ! system clock time of start and stop events, expressed in ticks
|
||
|
integer :: sc_count_rate ! number of system clock ticks per second
|
||
|
integer :: sc_count_max ! the max possible number of system clock ticks returned by system_clock
|
||
|
integer :: s
|
||
|
REAL :: a_diff, diff
|
||
|
REAL :: num_sc_ticks_per_second ! the number of system clock ticks per second
|
||
|
|
||
|
real*8, allocatable :: amat(:,:)
|
||
|
real*8, allocatable :: bmat(:,:)
|
||
|
real*8, allocatable :: cmat(:,:)
|
||
|
real(dp) :: x
|
||
|
integer :: i, j
|
||
|
|
||
|
#if defined(USE_MAGMA_DGEMM_GPU)
|
||
|
write(6,*) 'DEBUG: init magma'
|
||
|
call magmaf_init()
|
||
|
#endif
|
||
|
|
||
|
! First initialize the system_clock
|
||
|
CALL system_clock(count_rate=sc_count_rate)
|
||
|
CALL system_clock(count_max=sc_count_max)
|
||
|
num_sc_ticks_per_second = REAL(sc_count_rate)
|
||
|
WRITE(*,*) "system_clock rate : ", num_sc_ticks_per_second, " ticks per second"
|
||
|
|
||
|
diff = 0.0
|
||
|
a_diff = 0.0
|
||
|
s = 0
|
||
|
|
||
|
allocate(amat(ndim, ndim))
|
||
|
allocate(bmat(ndim, ndim))
|
||
|
allocate(cmat(ndim, ndim))
|
||
|
|
||
|
call set_random_seed(42)
|
||
|
|
||
|
!call random_number(amat)
|
||
|
!amat = 0.5_dp*(amat + transpose(amat))
|
||
|
do j = 1, ndim
|
||
|
do i = 1, ndim
|
||
|
call random_number(x)
|
||
|
amat(i,j) = x
|
||
|
call random_number(x)
|
||
|
bmat(i,j) = x
|
||
|
end do
|
||
|
end do
|
||
|
|
||
|
call cpu_time(ct_start)
|
||
|
call system_clock(sc_start)
|
||
|
|
||
|
do j = 1, num_loops
|
||
|
! playmat = amat
|
||
|
|
||
|
call sqmatmul(amat, bmat, cmat, ndim)
|
||
|
|
||
|
end do
|
||
|
|
||
|
call cpu_time(ct_stop)
|
||
|
call system_clock(sc_stop)
|
||
|
if ( (sc_stop - sc_start)/num_sc_ticks_per_second < (ct_stop - ct_start) ) s = s + 1
|
||
|
diff = (sc_stop - sc_start)/num_sc_ticks_per_second - (ct_stop - ct_start) + diff
|
||
|
a_diff = ABS((sc_stop - sc_start)/num_sc_ticks_per_second - (ct_stop - ct_start)) + a_diff
|
||
|
|
||
|
! check one of the elements of cmat (the last one here: cmat(ndim, ndim))
|
||
|
call check_cmat_element(cmat, 1, 1, amat, bmat, ndim)
|
||
|
call check_cmat_element(cmat, 1, ndim, amat, bmat, ndim)
|
||
|
call check_cmat_element(cmat, ndim, 1, amat, bmat, ndim)
|
||
|
call check_cmat_element(cmat, ndim, ndim, amat, bmat, ndim)
|
||
|
|
||
|
! write(6, *) 'amat = '
|
||
|
! call print_matrix(amat, ndim)
|
||
|
|
||
|
! write(6, *) 'bmat = '
|
||
|
! call print_matrix(bmat, ndim)
|
||
|
|
||
|
! write(6, *) 'cmat = '
|
||
|
! call print_matrix(cmat, ndim)
|
||
|
|
||
|
num_ops = real(ndim) * real(ndim) * real(ndim) * 2 * num_loops
|
||
|
gflops = num_ops / (ct_stop-ct_start) / 1.0e9
|
||
|
|
||
|
|
||
|
write(6, '("Time taken by dgemm for matrix size ",i8," was ",f10.2," seconds")') ndim, ct_stop-ct_start
|
||
|
WRITE(*,*) "gflops (including potential memory transfers) : ", gflops
|
||
|
|
||
|
WRITE(*,*) "system_clock : ",(sc_stop - sc_start)/num_sc_ticks_per_second
|
||
|
WRITE(*,*) "cpu_time : ",(ct_stop - ct_start)
|
||
|
WRITE(*,*) "sys_clock < cpu_time : ",s
|
||
|
WRITE(*,*) "mean diff : ",diff
|
||
|
WRITE(*,*) "abs mean diff : ",a_diff
|
||
|
|
||
|
#if defined(USE_MAGMA_DGEMM_GPU)
|
||
|
write(6,*) 'DEBUG: deinit magma'
|
||
|
call magmaf_finalize()
|
||
|
#endif
|
||
|
|
||
|
|
||
|
deallocate(amat, bmat, cmat)
|
||
|
end
|