added a complete standalone usage example (mamul1: multiplication of matrices) in the form of a unit test

note: [matmul] is a copy of [https://github.com/g-raffy/flobe/tree/main/benchmarks/mamul1]
This commit is contained in:
Guillaume Raffy 2024-06-30 15:53:03 +02:00
parent 3dc0d12307
commit c05ff89d29
8 changed files with 420 additions and 4 deletions

3
.gitignore vendored
View File

@ -1,3 +1,4 @@
src/starbench/__pycache__/ **/__pycache__/
dist/ dist/
src/starbench.egg-info/ src/starbench.egg-info/
tmp/

View File

@ -45,3 +45,9 @@ Installing collected packages: starbench
Successfully installed starbench-1.0.0 Successfully installed starbench-1.0.0
bob@bob-ws2:~/work/starbench$ starbench --git-repos-url https://github.com/hibridon/hibridon --code-version a3bed1c3ccfbca572003020d3e3d3b1ff3934fad --git-user g-raffy --git-pass-file "$HOME/.github/personal_access_tokens/bench.hibridon.cluster.ipr.univ-rennes1.fr.pat" --num-cores 2 --output-dir=/tmp/hibench --cmake-path=/opt/cmake/cmake-3.23.0/bin/cmake --cmake-option=-DCMAKE_BUILD_TYPE=Release --cmake-option=-DBUILD_TESTING=ON --benchmark-command='ctest --output-on-failure -L ^arch4_quick$' bob@bob-ws2:~/work/starbench$ starbench --git-repos-url https://github.com/hibridon/hibridon --code-version a3bed1c3ccfbca572003020d3e3d3b1ff3934fad --git-user g-raffy --git-pass-file "$HOME/.github/personal_access_tokens/bench.hibridon.cluster.ipr.univ-rennes1.fr.pat" --num-cores 2 --output-dir=/tmp/hibench --cmake-path=/opt/cmake/cmake-3.23.0/bin/cmake --cmake-option=-DCMAKE_BUILD_TYPE=Release --cmake-option=-DBUILD_TESTING=ON --benchmark-command='ctest --output-on-failure -L ^arch4_quick$'
``` ```
## how to test
```sh
(starbench.venv) graffy@graffy-ws2:~/work/starbench/starbench.git$ python3 -m unittest test.test_starbench
```

View File

@ -0,0 +1,2 @@
# from .main import starbench_cmake_app
# __all__ = [starbench_cmake_app]

View File

@ -233,6 +233,7 @@ class CommandPerfEstimator(): # (false positive) pylint: disable=function-redef
def _interpret_tags(tagged_string: str, tags_value: Dict[str, str]) -> str: def _interpret_tags(tagged_string: str, tags_value: Dict[str, str]) -> str:
untagged_string = tagged_string untagged_string = tagged_string
for tag_id, tag_value in tags_value.items(): for tag_id, tag_value in tags_value.items():
assert isinstance(untagged_string, str)
untagged_string = untagged_string.replace(tag_id, tag_value) untagged_string = untagged_string.replace(tag_id, tag_value)
return untagged_string return untagged_string
@ -246,9 +247,12 @@ class CommandPerfEstimator(): # (false positive) pylint: disable=function-redef
stdout_filepath = None stdout_filepath = None
if self.stdout_filepath is not None: if self.stdout_filepath is not None:
stdout_filepath = CommandPerfEstimator._interpret_tags(str(self.stdout_filepath), tags_value) stdout_filepath = CommandPerfEstimator._interpret_tags(str(self.stdout_filepath), tags_value)
Path(stdout_filepath).parent.mkdir(exist_ok=True)
stderr_filepath = None stderr_filepath = None
if self.stderr_filepath is not None: if self.stderr_filepath is not None:
stderr_filepath = CommandPerfEstimator._interpret_tags(str(self.stderr_filepath), tags_value) stderr_filepath = CommandPerfEstimator._interpret_tags(str(self.stderr_filepath), tags_value)
Path(stderr_filepath).parent.mkdir(exist_ok=True)
with self._runs_lock: with self._runs_lock:
run = Run(self._next_run_id, worker_id) run = Run(self._next_run_id, worker_id)
self._next_run_id += 1 self._next_run_id += 1
@ -335,7 +339,7 @@ class GitRepos(IFileTreeProvider):
return self.src_dir return self.src_dir
def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path, num_cores: int, benchmark_command: List[str], cmake_options: List[str] = None, cmake_exe_location: Path = None): def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path, num_cores: int, benchmark_command: List[str], cmake_options: List[str] = [], cmake_exe_location: Path = None):
""" """
tests_to_run : regular expression as understood by ctest's -L option. eg '^arch4_quick$' tests_to_run : regular expression as understood by ctest's -L option. eg '^arch4_quick$'
""" """
@ -345,7 +349,7 @@ def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path,
build_dir = worker_dir / 'build' build_dir = worker_dir / 'build'
print(f'creating build directory {worker_dir}') print(f'creating build directory {worker_dir}')
create_build_dir = CommandPerfEstimator( create_build_dir = CommandPerfEstimator(
run_command=['mkdir', '-p', build_dir], run_command=['mkdir', '-p', str(build_dir)],
num_cores_per_run=1, num_cores_per_run=1,
num_parallel_runs=num_cores, num_parallel_runs=num_cores,
max_num_cores=num_cores, max_num_cores=num_cores,
@ -360,7 +364,7 @@ def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path,
if cmake_exe_location: if cmake_exe_location:
cmake_prog = str(cmake_exe_location) cmake_prog = str(cmake_exe_location)
configure = CommandPerfEstimator( configure = CommandPerfEstimator(
run_command=[cmake_prog] + cmake_options + [src_dir], run_command=[cmake_prog] + cmake_options + [str(src_dir)],
num_cores_per_run=1, num_cores_per_run=1,
num_parallel_runs=num_cores, num_parallel_runs=num_cores,
max_num_cores=num_cores, max_num_cores=num_cores,

0
test/__init__.py Normal file
View File

View File

@ -0,0 +1,43 @@
# Build script of the mamul1 benchmark (square matrix multiplication).
#
# This directory is configured on its own (cmake <this directory>), so it has
# to declare its own minimum CMake version and project. 3.14 is the first
# release where install(TARGETS) may omit the DESTINATION, as done at the end
# of this file.
cmake_minimum_required(VERSION 3.14)
project(mamul1)

enable_language (Fortran)

set(MAMUL1_USE_MAGMA "OFF" CACHE BOOL "if set, mamul1 build uses magma (matrix algebra on gpu)")
set(MAMUL1_MAGMA_API "CPU_MEM_API" CACHE STRING "which magma API to use when building mamul1: CPU_MEM_API for BLAS compatible API (uses matrices stored on CPU memory) or GPU_MEM_API (use matrices stored on GPU memory)")

add_executable(mamul1 mamul1.F90)

# Compiler specific options. The compiler is identified through
# CMAKE_Fortran_COMPILER_ID, which CMake always sets once the language is
# enabled (the previously tested Fortran_COMPILER_NAME is not set anywhere in
# this file, so the Intel branch could never be taken).
if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
    # Allow arbitrary long lines. Needed as preprocessing could generate long line lengths.
    target_compile_options(mamul1 PUBLIC -ffree-line-length-none)
elseif (CMAKE_Fortran_COMPILER_ID MATCHES "^Intel")
    # Intel (ifort, and ifx which reports the id IntelLLVM)
    target_compile_options(mamul1 PUBLIC -no-wrap-margin)
endif()

if (MAMUL1_USE_MAGMA)
    find_package( MAGMA REQUIRED )
    # NOTE(review): mamul1.F90 only tests USE_MAGMA_DGEMM_GPU and USE_DGEMM;
    # nothing in it reacts to USE_MAGMA_DGEMM, so the CPU_MEM_API choice
    # currently builds a program that performs no multiplication.
    if( MAMUL1_MAGMA_API STREQUAL "CPU_MEM_API" )
        target_compile_definitions(mamul1 PUBLIC USE_MAGMA_DGEMM)
    elseif( MAMUL1_MAGMA_API STREQUAL "GPU_MEM_API" )
        target_compile_definitions(mamul1 PUBLIC USE_MAGMA_DGEMM_GPU)
    else()
        message(FATAL_ERROR "unexpected value for MAMUL1_MAGMA_API : ${MAMUL1_MAGMA_API}")
    endif()
    message(STATUS "MAGMA_INCLUDES=${MAGMA_INCLUDES}")
    # Scope the include path to the target instead of the whole directory.
    target_include_directories(mamul1 PRIVATE "${MAGMA_INCLUDES}")
    target_link_libraries(mamul1 "${MAGMA_LIBRARIES}")
else()
    find_package( BLAS REQUIRED )
    find_package( LAPACK REQUIRED )
    # message("BLAS_LIBRARIES=${BLAS_LIBRARIES}")
    # message("LAPACK_LIBRARIES=${LAPACK_LIBRARIES}")
    target_compile_definitions(mamul1 PUBLIC USE_DGEMM)
    # Link Blas and Lapack libraries
    target_link_libraries(mamul1 "${LAPACK_LIBRARIES}")
    target_link_libraries(mamul1 "${BLAS_LIBRARIES}")
endif()

install(TARGETS mamul1)

335
test/mamul1/mamul1.F90 Normal file
View File

@ -0,0 +1,335 @@
#define MAMUL1_VERSION "1.0.0"
#define magma_devptr_t integer(kind=8)
subroutine print_usage(prog_path)
    ! Writes the program banner (version and build variant) followed by the
    ! command line synopsis to unit 6.
    !
    ! prog_path : program name or path, shown in the "Usage:" line.
    character(len=*), intent(in) :: prog_path
    ! Label of the build variant, chosen at preprocessing time.
#if defined(USE_MAGMA_DGEMM_GPU)
    character(len=*), parameter :: variant = 'gpu'
#elif defined(USE_DGEMM)
    character(len=*), parameter :: variant = 'cpu'
#else
    character(len=*), parameter :: variant = 'unknown'
#endif

    write(6,'("mamul1 v",a," (variant:",a,"): benchmark performs a square matrix multiplication in double precision")') MAMUL1_VERSION, variant
    write(6,'()')
    write(6,'("Usage: ",a," <NDIM> <NUM_LOOPS>")') trim(prog_path)
    write(6,'(" <NDIM> positive integer representing the size of the square matrices to multiply ")')
    write(6,'(" <NUM_LOOPS> positive integer representing the number of times the multiplication is performed")')
end subroutine print_usage
program mamul1
    ! Command line driver of the benchmark.
    !
    ! Expects exactly two arguments, <NDIM> and <NUM_LOOPS>, both strictly
    ! positive integers, and passes them to test_dgemm. Every invalid
    ! invocation prints the usage text and ends with stop code 1, so that a
    ! calling script can tell a rejected command line from a completed run.
    implicit none
    integer :: argc, info, ndim, num_loops
    ! Large enough to hold a long program path without truncating it.
    character(len=1024) :: arg0
    character(len=32) :: arg1, arg2

    call get_command_argument(0, arg0)
    argc = command_argument_count()
    if (argc /= 2) call usage_error()

    ! A non-zero status also covers an argument that does not fit the buffer.
    call get_command_argument(1, arg1, status=info)
    if (info /= 0) then
        write(6,'("Error reading argument: info = ",i2)') info
        call usage_error()
    end if
    call get_command_argument(2, arg2, status=info)
    if (info /= 0) then
        write(6,'("Error reading argument: info = ",i2)') info
        call usage_error()
    end if

    ! iostat values can have more than two digits, hence the i0 descriptor.
    read(arg1, *, iostat=info) ndim
    if (info /= 0) then
        write(6,'("Error converting ndim argument to integer: info = ",i0)') info
        call usage_error()
    end if
    read(arg2, *, iostat=info) num_loops
    if (info /= 0) then
        write(6,'("Error converting num_loops argument to integer: info = ",i0)') info
        call usage_error()
    end if

    ! Both values are documented as positive integers in the usage text.
    if (ndim < 1 .or. num_loops < 1) call usage_error()

    call test_dgemm(ndim, num_loops)
    stop

contains

    ! Prints the usage text and terminates the program with stop code 1.
    subroutine usage_error()
        call print_usage(trim(arg0))
        stop 1
    end subroutine usage_error

end program mamul1
subroutine set_random_seed(seed)
    ! Seeds the intrinsic random number generator: every element of the
    ! generator seed array is set to the same value.
    !
    ! seed : value copied into each element of the seed array.
    integer :: seed
    integer :: n_slots
    integer, allocatable :: slots(:)

    ! ask the generator how many integers its seed array holds
    call random_seed(size=n_slots)
    allocate(slots(n_slots))
    slots(:) = seed
    call random_seed(put=slots)
end subroutine set_random_seed
subroutine print_matrix(mat, ndim)
    ! Writes a square matrix to unit 6, one matrix row per list-directed write.
    !
    ! mat  : matrix to print, ndim x ndim
    ! ndim : number of rows and columns of mat
    implicit none
    integer, parameter :: dp = kind(1.0d0)
    ! ndim is declared before mat because it appears in the bounds of mat:
    ! under implicit none a variable used in a specification expression must
    ! already have its type declared.
    integer, intent(in) :: ndim
    real(dp), intent(in) :: mat(ndim, ndim)
    integer :: irow

    do irow = 1, ndim
        write(6, *) mat(irow,:)
    end do
end subroutine print_matrix
! square matrix multiplication
!
! Computes cmat = amat * bmat for ndim x ndim matrices of 8-byte reals.
! The implementation is selected at preprocessing time:
!   - USE_MAGMA_DGEMM_GPU : the operands are copied to GPU buffers, multiplied
!     with magmablasf_dgemm and the product is copied back into cmat;
!   - USE_DGEMM : a single call to the BLAS routine dgemm.
! NOTE(review): when neither macro is defined, no statement assigns cmat.
subroutine sqmatmul(amat, bmat, cmat, ndim)
#if defined(USE_MAGMA_DGEMM_GPU)
use magma, only: magmaf_init, magmaf_finalize
use magma, only: magmaf_queue_create, magmaf_queue_destroy
use magma, only: magmaf_dmalloc, magmaf_free
use magma, only: magmaf_dsetmatrix, magmaf_dgetmatrix
use magma, only: magmablasf_dgemm
#endif
real*8, intent(in) :: amat(ndim,ndim)
real*8, intent(in) :: bmat(ndim,ndim)
real*8, intent(out) :: cmat(ndim,ndim)
! leading dimensions of the GPU buffers; only read by the GPU variant
integer :: lda, ldb, ldc
integer :: info
real :: time_before, time_after
integer(8) :: num_ops
real :: gflops
#ifdef USE_MAGMA_DGEMM_GPU
magma_devptr_t :: d_amat
magma_devptr_t :: d_bmat
magma_devptr_t :: d_cmat
magma_devptr_t :: queue !! really a CPU pointer
#endif
! ndim rounded up to the next multiple of 32
lda = ceiling(real(ndim)/32)*32
ldb = ceiling(real(ndim)/32)*32
ldc = ceiling(real(ndim)/32)*32
#if defined(USE_MAGMA_DGEMM_GPU)
!! allocate GPU memory
! NOTE(review): each early return below leaves the buffers allocated before
! it unfreed (and cmat unassigned); the value stored in info is never tested.
write(6,'("DEBUG: before matrix A gpu memory allocation (",i0," doubles)")') lda * ndim
info = magmaf_dmalloc( d_amat, lda*ndim )
if (d_amat == 0) then
print "(a)", "failed to allocate d_amat"
return
endif
write(6,'("DEBUG: before matrix B gpu memory allocation (",i0," doubles)")') ldb * ndim
info = magmaf_dmalloc( d_bmat, ldb*ndim )
if (d_bmat == 0) then
print "(a)", "failed to allocate d_bmat"
return
endif
write(6,'("DEBUG: before matrix C gpu memory allocation (",i0," doubles)")') ldc * ndim
info = magmaf_dmalloc( d_cmat, ldc*ndim )
if (d_cmat == 0) then
print "(a)", "failed to allocate d_cmat"
return
endif
! copy A to dA and B to dB
! (the first argument of magmaf_queue_create is presumably the device index - TODO confirm)
call magmaf_queue_create( 0, queue )
write(6,'("DEBUG: queue = ",i0)') queue
if (queue == 0) then
print "(a)", "failed to create a queue"
return
endif
write(6,*) 'DEBUG: copying matrix A from CPU to GPU memory'
call magmaf_dsetmatrix( ndim, ndim, amat, ndim, d_amat, lda, queue )
write(6,*) 'DEBUG: copying matrix B from CPU to GPU memory'
call magmaf_dsetmatrix( ndim, ndim, bmat, ndim, d_bmat, ldb, queue )
! time the multiplication itself; the queue is synchronised before the
! second time stamp is taken
call cpu_time(time_before)
write (6,*) 'before magmablasf_dgemm, time=', time_before
call magmablasf_dgemm ('N', 'N', ndim, ndim, ndim, 1.0d0, d_amat, lda, d_bmat, ldb, 0.0d0, d_cmat, ldc, queue)
! NOTE(review): magmaf_queue_sync is not listed in the use statements above
call magmaf_queue_sync(queue)
call cpu_time(time_after)
! 2 * ndim**3 operations, evaluated in default real arithmetic
num_ops = real(ndim) * real(ndim) * real(ndim) * 2
gflops = num_ops / (time_after - time_before) / 1.0e9
write (6,*) 'after magmablasf_dgemm, time=', time_after
write (6,*) 'magmablasf_dgemm (from gpu memory to gpu memory) duration :', (time_after - time_before), '(', gflops, ' gflops)'
write(6,*) 'DEBUG: copying matrix C from GPU to CPU memory'
call magmaf_dgetmatrix( ndim, ndim, d_cmat, ldc, cmat, ndim, queue )
! release the queue and the three GPU buffers
call magmaf_queue_destroy( queue )
info = magmaf_free(d_cmat)
info = magmaf_free(d_bmat)
info = magmaf_free(d_amat)
#endif
#ifdef USE_DGEMM
! subroutine dgemm ( character TRANSA,
! character TRANSB,
! integer M,
! integer N,
! integer K,
! double precision ALPHA,
! double precision, dimension(lda,*) A,
! integer LDA,
! double precision, dimension(ldb,*) B,
! integer LDB,
! double precision BETA,
! double precision, dimension(ldc,*) C,
! integer LDC
! )
! cmat = 1.0 * amat * bmat + 0.0 * cmat, all leading dimensions equal to ndim
call dgemm('N', 'N', ndim, ndim, ndim, 1.0d0, amat, ndim, bmat, ndim, 0.0d0, cmat, ndim)
#endif
end subroutine
subroutine check_cmat_element(cmat, row, col, amat, bmat, ndim)
    ! Verifies one element of the product cmat = amat * bmat.
    !
    ! The element (row, col) is recomputed with a plain dot product, both the
    ! expected and the computed values are written to unit 6, and the program
    ! is terminated with stop code 1 when they disagree.
    !
    ! cmat       : product to verify, ndim x ndim
    ! row, col   : indices of the element to verify
    ! amat, bmat : operands of the product, ndim x ndim
    ! ndim       : number of rows and columns of the matrices
    use, intrinsic :: iso_fortran_env, only: error_unit
    implicit none
    integer, intent(in) :: ndim
    real(8), intent(in) :: cmat(ndim, ndim)
    integer, intent(in) :: row
    integer, intent(in) :: col
    real(8), intent(in) :: amat(ndim, ndim)
    real(8), intent(in) :: bmat(ndim, ndim)
    ! Accepted error, relative to the magnitude of the expected value (and
    ! absolute for expected values smaller than 1).
    real(8), parameter :: tolerance = 1.0d-8
    real(8) :: x
    integer :: i

    x = 0.0d0
    do i = 1, ndim
        x = x + amat(row, i) * bmat(i, col)
    end do
    write(6, '("expected cmat(", i0, ", ", i0, ")", e23.15e3)') row, col, x
    write(6, '("computed cmat(", i0, ", ", i0, ")", e23.15e3)') row, col, cmat(row, col)
    ! Written as a negated "<=" so that a NaN in cmat is reported as an error
    ! (every comparison involving a NaN is false).
    if (.not. (abs(cmat(row, col) - x) <= tolerance * max(1.0d0, abs(x)))) then
        write(error_unit, '(a)') 'a computed element has a wrong value'
        stop 1
    end if
end subroutine check_cmat_element
subroutine test_dgemm(ndim, num_loops)
    ! Runs the benchmark: fills two ndim x ndim matrices with random numbers,
    ! multiplies them num_loops times through sqmatmul, verifies the four
    ! corner elements of the product and reports timings and gflops.
    !
    ! ndim      : number of rows and columns of the matrices
    ! num_loops : number of times the multiplication is performed
#if defined(USE_MAGMA_DGEMM_GPU)
    use magma, only: magmaf_init, magmaf_finalize
    use magma, only: magmablasf_dgemm
#endif
    implicit none
    integer, intent(in) :: ndim
    integer, intent(in) :: num_loops
    integer, parameter :: dp = kind(1.0d0)
    real :: cpu_start, cpu_stop, cpu_seconds, wall_seconds
    integer(8) :: num_ops
    real :: gflops
    integer :: tick_start, tick_stop, tick_rate, tick_max
    integer :: wall_below_cpu_count
    real :: abs_diff_sum, diff_sum, rate
    real*8, allocatable :: amat(:,:), bmat(:,:), cmat(:,:)
    integer :: irow, icol, iloop

#if defined(USE_MAGMA_DGEMM_GPU)
    write(6,*) 'DEBUG: init magma'
    call magmaf_init()
#endif

    ! query the resolution of system_clock
    call system_clock(count_rate=tick_rate, count_max=tick_max)
    rate = real(tick_rate)
    write(*,*) "system_clock rate ",rate

    diff_sum = 0.0
    abs_diff_sum = 0.0
    wall_below_cpu_count = 0

    allocate(amat(ndim, ndim), bmat(ndim, ndim), cmat(ndim, ndim))

    ! fixed seed: every run multiplies the same pair of matrices
    call set_random_seed(42)
    do icol = 1, ndim
        do irow = 1, ndim
            call random_number(amat(irow, icol))
            call random_number(bmat(irow, icol))
        end do
    end do

    ! time the whole series of multiplications with both clocks
    call cpu_time(cpu_start)
    call system_clock(tick_start)
    do iloop = 1, num_loops
        call sqmatmul(amat, bmat, cmat, ndim)
    end do
    call cpu_time(cpu_stop)
    call system_clock(tick_stop)

    wall_seconds = (tick_stop - tick_start)/rate
    cpu_seconds = cpu_stop - cpu_start
    if (wall_seconds < cpu_seconds) wall_below_cpu_count = wall_below_cpu_count + 1
    diff_sum = wall_seconds - cpu_seconds + diff_sum
    abs_diff_sum = abs(wall_seconds - cpu_seconds) + abs_diff_sum

    ! verify the four corner elements of the product
    call check_cmat_element(cmat, 1, 1, amat, bmat, ndim)
    call check_cmat_element(cmat, 1, ndim, amat, bmat, ndim)
    call check_cmat_element(cmat, ndim, 1, amat, bmat, ndim)
    call check_cmat_element(cmat, ndim, ndim, amat, bmat, ndim)

    ! 2 * ndim**3 operations per multiplication; gflops is based on cpu_time
    num_ops = real(ndim) * real(ndim) * real(ndim) * 2 * num_loops
    gflops = num_ops / cpu_seconds / 1.0e9

    write(6, '("Time taken by dgemm for matrix size ",i8," was ",f10.2," seconds")') ndim, cpu_seconds
    write(*,*) "gflops (from cpu memory to cpu memory) : ", gflops
    write(*,*) "system_clock : ",wall_seconds
    write(*,*) "cpu_time : ",cpu_seconds
    write(*,*) "sc < ct : ",wall_below_cpu_count
    write(*,*) "mean diff : ",diff_sum
    write(*,*) "abs mean diff: ",abs_diff_sum

#if defined(USE_MAGMA_DGEMM_GPU)
    write(6,*) 'DEBUG: deinit magma'
    call magmaf_finalize()
#endif

    deallocate(amat, bmat, cmat)
end subroutine test_dgemm

25
test/test_starbench.py Normal file
View File

@ -0,0 +1,25 @@
import unittest
import logging
from pathlib import Path
# from cocluto import ClusterController
from starbench.main import starbench_cmake_app, ExistingDir
class StarbenchTestCase(unittest.TestCase):
    """End-to-end test running starbench on the bundled mamul1 example."""

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    def setUp(self) -> None:
        return super().setUp()

    def test_mamul1_benchmark(self):
        """Build and benchmark the mamul1 sources with starbench_cmake_app on 2 cores."""
        logging.info('test_mamul1_benchmark')
        # The mamul1 sources live next to this test file; resolving them from
        # __file__ keeps the test independent of where the repository is
        # checked out.
        mamul1_dir = Path(__file__).resolve().parent / 'mamul1'
        source_code_provider = ExistingDir(mamul1_dir)
        # Build and run artefacts go to ./tmp under the current working directory.
        tmp_dir = Path('tmp').absolute()
        benchmark_command = ['./mamul1', '3000', '10']
        starbench_cmake_app(source_code_provider=source_code_provider, tmp_dir=tmp_dir, num_cores=2, benchmark_command=benchmark_command)
# Run the test cases of this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()