iprbench/test/mamul1/mamul1.F90

340 lines
10 KiB
Fortran
Raw Normal View History

refactored iprbench to separate ipr benchmark framework from the actual benchmarks This decoupling allows to write benchmarks as modules that can be used in various situations (from a benchmark job or directly from a user), but this design will allow automatic registering of the benchmark results in a user selectable form (sql database, stdout, etc.) - separated `hibenchonphysix.py` into `clusterbench.py` (tool to run a benchmark on a cluster) and `hibench.py` (hibridon benchmark module) so that `clusterbench.py` no longer has a knowledge about hibridon. - there are currently 2 ways to run a bechmark: 1. as a simple run through `clusterbench-run` command (which will eventually be renamed as iprbench-run since it might be completely independent from the concept of cluster) 2. as cluster jobs through `clusterbench-submit` command - added unit test - added another benchmark `mamul1` that is used as a unittest because it has 2 benefits over `hibench` benchmark: 1. it's standalone (no external resources needed) 2. it's quicker to execute note: this refactoring work is not complete yet, but the concept proof is complete (the 2 unittests pass): - still need to provide the user a way to switch between IpRCluster and DummyCluster(which is only intended to only be used for testing clusterbench)) - still need to run multiple configs of the same benchmark in one run (as hibenchonphysix did) work related to [https://bugzilla.ipr.univ-rennes.fr/show_bug.cgi?id=3958] and [https://bugzilla.ipr.univ-rennes.fr/show_bug.cgi?id=3372]
2024-10-22 09:16:41 +02:00
#define MAMUL1_VERSION "1.0.0"
#define magma_devptr_t integer(kind=8)
subroutine print_usage(prog_path)
character(len=*), intent(in) :: prog_path
character(len=80) :: build_variant
#if defined(USE_MAGMA_DGEMM_GPU)
build_variant='gpu'
#elif defined(USE_DGEMM)
build_variant='cpu'
#else
build_variant='unknown'
#endif
write(6,'("mamul1 v",a," (variant:",a,"): benchmark performs a square matrix multiplication in double precision")') MAMUL1_VERSION, trim(build_variant);
write(6,'()');
write(6,'("Usage: ",a," <NDIM> <NUM_LOOPS>")') trim(prog_path);
write(6,'(" <NDIM> positive integer representing the size of the square matrices to multiply ")');
write(6,'(" <NUM_LOOPS> positive integer representing the number of times the multiplication is performed")');
end subroutine
program mamul1
implicit none
integer :: argc, info, ndim, num_loops
character(len=32) :: arg0, arg1, arg2
call get_command_argument(0,arg0)
argc = command_argument_count()
if (argc /= 2) then
call print_usage(trim(arg0))
! write(6,'("Usage: ",a," NDIM NUM_LOOPS, where NDIM is a positive integer")') trim(arg0);
stop
end if
call get_command_argument(1,arg1,status=info)
if (info /= 0) then
write(6,'("Error reading argument: info = ",i2)') info
call print_usage(trim(arg0))
stop
end if
call get_command_argument(2,arg2,status=info)
if (info /= 0) then
write(6,'("Error reading argument: info = ",i2)') info
call print_usage(trim(arg0))
stop
end if
read(arg1,*,iostat=info) ndim
if (info /= 0) then
write(6,'("Error converting ndim argument to integer: info = ",i2)') info
call print_usage(trim(arg0))
stop
end if
read(arg2,*,iostat=info) num_loops
if (info /= 0) then
write(6,'("Error converting num_loops argument to integer: info = ",i2)') info
call print_usage(trim(arg0))
stop
end if
if (ndim < 1) then
call print_usage(trim(arg0))
stop
end if
call test_dgemm(ndim, num_loops)
stop
end program mamul1
subroutine set_random_seed(seed)
integer :: seed
integer :: seed_array_size
INTEGER, ALLOCATABLE :: seed_array (:)
CALL RANDOM_SEED (SIZE = seed_array_size) ! I is set to the size of
! ! the seed array
ALLOCATE (seed_array(seed_array_size))
seed_array = seed
CALL RANDOM_SEED (PUT=seed_array(1:seed_array_size))
end subroutine
subroutine print_matrix(mat, ndim)
implicit none
integer, parameter :: dp = kind(1.0d0)
real(dp), intent(in) :: mat(ndim, ndim)
integer, intent(in) :: ndim
integer :: irow
do irow = 1, ndim
write(6, *) mat(irow,:)
end do
end subroutine
! square matrix multiplication
subroutine sqmatmul(amat, bmat, cmat, ndim)
#if defined(USE_MAGMA_DGEMM_GPU)
use magma, only: magmaf_init, magmaf_finalize
use magma, only: magmaf_queue_create, magmaf_queue_destroy
use magma, only: magmaf_dmalloc, magmaf_free
use magma, only: magmaf_dsetmatrix, magmaf_dgetmatrix
use magma, only: magmablasf_dgemm
#endif
real*8, intent(in) :: amat(ndim,ndim)
real*8, intent(in) :: bmat(ndim,ndim)
real*8, intent(out) :: cmat(ndim,ndim)
integer :: lda, ldb, ldc
integer :: info
real :: time_before, time_after
integer(8) :: num_ops
real :: gflops
#ifdef USE_MAGMA_DGEMM_GPU
magma_devptr_t :: d_amat
magma_devptr_t :: d_bmat
magma_devptr_t :: d_cmat
magma_devptr_t :: queue !! really a CPU pointer
#endif
lda = ceiling(real(ndim)/32)*32
ldb = ceiling(real(ndim)/32)*32
ldc = ceiling(real(ndim)/32)*32
#if defined(USE_MAGMA_DGEMM_GPU)
!! allocate GPU memory
write(6,'("DEBUG: before matrix A gpu memory allocation (",i0," doubles)")') lda * ndim
info = magmaf_dmalloc( d_amat, lda*ndim )
if (d_amat == 0) then
print "(a)", "failed to allocate d_amat"
return
endif
write(6,'("DEBUG: before matrix B gpu memory allocation (",i0," doubles)")') ldb * ndim
info = magmaf_dmalloc( d_bmat, ldb*ndim )
if (d_bmat == 0) then
print "(a)", "failed to allocate d_bmat"
return
endif
write(6,'("DEBUG: before matrix C gpu memory allocation (",i0," doubles)")') ldc * ndim
info = magmaf_dmalloc( d_cmat, ldc*ndim )
if (d_cmat == 0) then
print "(a)", "failed to allocate d_cmat"
return
endif
! copy A to dA and B to dB
call magmaf_queue_create( 0, queue )
write(6,'("DEBUG: queue = ",i0)') queue
if (queue == 0) then
print "(a)", "failed to create a queue"
return
endif
write(6,*) 'DEBUG: copying matrix A from CPU to GPU memory'
call magmaf_dsetmatrix( ndim, ndim, amat, ndim, d_amat, lda, queue )
write(6,*) 'DEBUG: copying matrix B from CPU to GPU memory'
call magmaf_dsetmatrix( ndim, ndim, bmat, ndim, d_bmat, ldb, queue )
call cpu_time(time_before)
write (6,*) 'before magmablasf_dgemm, time=', time_before
call magmablasf_dgemm ('N', 'N', ndim, ndim, ndim, 1.0d0, d_amat, lda, d_bmat, ldb, 0.0d0, d_cmat, ldc, queue)
call magmaf_queue_sync(queue)
call cpu_time(time_after)
num_ops = real(ndim) * real(ndim) * real(ndim) * 2
gflops = num_ops / (time_after - time_before) / 1.0e9
write (6,*) 'after magmablasf_dgemm, time=', time_after
write (6,*) 'magmablasf_dgemm (from gpu memory to gpu memory) duration :', (time_after - time_before), '(', gflops, ' gflops)'
write(6,*) 'DEBUG: copying matrix C from GPU to CPU memory'
call magmaf_dgetmatrix( ndim, ndim, d_cmat, ldc, cmat, ndim, queue )
call magmaf_queue_destroy( queue )
info = magmaf_free(d_cmat)
info = magmaf_free(d_bmat)
info = magmaf_free(d_amat)
#endif
#ifdef USE_DGEMM
! subroutine dgemm ( character TRANSA,
! character TRANSB,
! integer M,
! integer N,
! integer K,
! double precision ALPHA,
! double precision, dimension(lda,*) A,
! integer LDA,
! double precision, dimension(ldb,*) B,
! integer LDB,
! double precision BETA,
! double precision, dimension(ldc,*) C,
! integer LDC
! )
call dgemm('N', 'N', ndim, ndim, ndim, 1.0d0, amat, ndim, bmat, ndim, 0.0d0, cmat, ndim)
#endif
end subroutine
subroutine check_cmat_element(cmat, row, col, amat, bmat, ndim)
real(8), intent(in) :: cmat(ndim, ndim)
integer, intent(in) :: row
integer, intent(in) :: col
real(8), intent(in) :: amat(ndim, ndim)
real(8), intent(in) :: bmat(ndim, ndim)
integer, intent(in) :: ndim
real(8) :: x
x = 0.0d0
do i = 1, ndim
x = x + amat(row, i) * bmat(i, col)
end do
write(6, '("expected cmat(", i0, ", ", i0, ")", e23.15e3)') row, col, x
write(6, '("computed cmat(", i0, ", ", i0, ")", e23.15e3)') row, col, cmat(row, col)
if (abs(cmat(row, col) - x) > 1.0e-8) then
stop 'a computed element has a wrong value'
end if
end subroutine
subroutine test_dgemm(ndim, num_loops)
#if defined(USE_MAGMA_DGEMM_GPU)
use magma, only: magmaf_init, magmaf_finalize
use magma, only: magmablasf_dgemm !, magmaf_dgemm_gpu
#endif
implicit none
integer, intent(in) :: ndim
integer, intent(in) :: num_loops
integer, parameter :: dp = kind(1.0d0)
real :: ct_start, ct_stop ! elapsed cpu time relative to an arbitrary fixed time. Expressed in seconds with the granularity of 1 microsecond
integer(8) :: num_ops
real :: gflops
integer :: sc_start, sc_stop ! system clock time of start and stop events, expressed in ticks
integer :: sc_count_rate ! number of system clock ticks per second
integer :: sc_count_max ! the max possible number of system clock ticks returned by system_clock
integer :: s
REAL :: a_diff, diff
REAL :: num_sc_ticks_per_second ! the number of system clock ticks per second
real*8, allocatable :: amat(:,:)
real*8, allocatable :: bmat(:,:)
real*8, allocatable :: cmat(:,:)
real(dp) :: x
integer :: i, j
#if defined(USE_MAGMA_DGEMM_GPU)
write(6,*) 'DEBUG: init magma'
call magmaf_init()
#endif
! First initialize the system_clock
CALL system_clock(count_rate=sc_count_rate)
CALL system_clock(count_max=sc_count_max)
num_sc_ticks_per_second = REAL(sc_count_rate)
WRITE(*,*) "system_clock rate : ", num_sc_ticks_per_second, " ticks per second"
diff = 0.0
a_diff = 0.0
s = 0
allocate(amat(ndim, ndim))
allocate(bmat(ndim, ndim))
allocate(cmat(ndim, ndim))
call set_random_seed(42)
!call random_number(amat)
!amat = 0.5_dp*(amat + transpose(amat))
do j = 1, ndim
do i = 1, ndim
call random_number(x)
amat(i,j) = x
call random_number(x)
bmat(i,j) = x
end do
end do
call cpu_time(ct_start)
call system_clock(sc_start)
do j = 1, num_loops
! playmat = amat
call sqmatmul(amat, bmat, cmat, ndim)
end do
call cpu_time(ct_stop)
call system_clock(sc_stop)
if ( (sc_stop - sc_start)/num_sc_ticks_per_second < (ct_stop - ct_start) ) s = s + 1
diff = (sc_stop - sc_start)/num_sc_ticks_per_second - (ct_stop - ct_start) + diff
a_diff = ABS((sc_stop - sc_start)/num_sc_ticks_per_second - (ct_stop - ct_start)) + a_diff
! check one of the elements of cmat (the last one here: cmat(ndim, ndim))
call check_cmat_element(cmat, 1, 1, amat, bmat, ndim)
call check_cmat_element(cmat, 1, ndim, amat, bmat, ndim)
call check_cmat_element(cmat, ndim, 1, amat, bmat, ndim)
call check_cmat_element(cmat, ndim, ndim, amat, bmat, ndim)
! write(6, *) 'amat = '
! call print_matrix(amat, ndim)
! write(6, *) 'bmat = '
! call print_matrix(bmat, ndim)
! write(6, *) 'cmat = '
! call print_matrix(cmat, ndim)
num_ops = real(ndim) * real(ndim) * real(ndim) * 2 * num_loops
gflops = num_ops / (ct_stop-ct_start) / 1.0e9
write(6, '("Time taken by dgemm for matrix size ",i8," was ",f10.2," seconds")') ndim, ct_stop-ct_start
WRITE(*,*) "gflops (including potential memory transfers) : ", gflops
WRITE(*,*) "system_clock : ",(sc_stop - sc_start)/num_sc_ticks_per_second
WRITE(*,*) "cpu_time : ",(ct_stop - ct_start)
WRITE(*,*) "sys_clock < cpu_time : ",s
WRITE(*,*) "mean diff : ",diff
WRITE(*,*) "abs mean diff : ",a_diff
#if defined(USE_MAGMA_DGEMM_GPU)
write(6,*) 'DEBUG: deinit magma'
call magmaf_finalize()
#endif
deallocate(amat, bmat, cmat)
end