added a complete standalone usage example (mamul1: multiplication of matrices) in the form of a unit test
note: [matmul] is a copy of [https://github.com/g-raffy/flobe/tree/main/benchmarks/mamul1]
This commit is contained in:
parent
3dc0d12307
commit
c05ff89d29
|
@ -1,3 +1,4 @@
|
|||
src/starbench/__pycache__/
|
||||
**/__pycache__/
|
||||
dist/
|
||||
src/starbench.egg-info/
|
||||
tmp/
|
||||
|
|
|
@ -45,3 +45,9 @@ Installing collected packages: starbench
|
|||
Successfully installed starbench-1.0.0
|
||||
bob@bob-ws2:~/work/starbench$ starbench --git-repos-url https://github.com/hibridon/hibridon --code-version a3bed1c3ccfbca572003020d3e3d3b1ff3934fad --git-user g-raffy --git-pass-file "$HOME/.github/personal_access_tokens/bench.hibridon.cluster.ipr.univ-rennes1.fr.pat" --num-cores 2 --output-dir=/tmp/hibench --cmake-path=/opt/cmake/cmake-3.23.0/bin/cmake --cmake-option=-DCMAKE_BUILD_TYPE=Release --cmake-option=-DBUILD_TESTING=ON --benchmark-command='ctest --output-on-failure -L ^arch4_quick$'
|
||||
```
|
||||
|
||||
## how to test
|
||||
|
||||
```sh
|
||||
(starbench.venv) graffy@graffy-ws2:~/work/starbench/starbench.git$ python3 -m unittest test.test_starbench
|
||||
```
|
||||
|
|
|
@ -0,0 +1,2 @@
|
|||
# from .main import starbench_cmake_app
|
||||
# __all__ = [starbench_cmake_app]
|
|
@ -233,6 +233,7 @@ class CommandPerfEstimator(): # (false positive) pylint: disable=function-redef
|
|||
def _interpret_tags(tagged_string: str, tags_value: Dict[str, str]) -> str:
|
||||
untagged_string = tagged_string
|
||||
for tag_id, tag_value in tags_value.items():
|
||||
assert isinstance(untagged_string, str)
|
||||
untagged_string = untagged_string.replace(tag_id, tag_value)
|
||||
return untagged_string
|
||||
|
||||
|
@ -246,9 +247,12 @@ class CommandPerfEstimator(): # (false positive) pylint: disable=function-redef
|
|||
stdout_filepath = None
|
||||
if self.stdout_filepath is not None:
|
||||
stdout_filepath = CommandPerfEstimator._interpret_tags(str(self.stdout_filepath), tags_value)
|
||||
Path(stdout_filepath).parent.mkdir(exist_ok=True)
|
||||
stderr_filepath = None
|
||||
if self.stderr_filepath is not None:
|
||||
stderr_filepath = CommandPerfEstimator._interpret_tags(str(self.stderr_filepath), tags_value)
|
||||
Path(stderr_filepath).parent.mkdir(exist_ok=True)
|
||||
|
||||
with self._runs_lock:
|
||||
run = Run(self._next_run_id, worker_id)
|
||||
self._next_run_id += 1
|
||||
|
@ -335,7 +339,7 @@ class GitRepos(IFileTreeProvider):
|
|||
return self.src_dir
|
||||
|
||||
|
||||
def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path, num_cores: int, benchmark_command: List[str], cmake_options: List[str] = None, cmake_exe_location: Path = None):
|
||||
def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path, num_cores: int, benchmark_command: List[str], cmake_options: List[str] = [], cmake_exe_location: Path = None):
|
||||
"""
|
||||
tests_to_run : regular expression as understood by ctest's -L option. eg '^arch4_quick$'
|
||||
"""
|
||||
|
@ -345,7 +349,7 @@ def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path,
|
|||
build_dir = worker_dir / 'build'
|
||||
print(f'creating build directory {worker_dir}')
|
||||
create_build_dir = CommandPerfEstimator(
|
||||
run_command=['mkdir', '-p', build_dir],
|
||||
run_command=['mkdir', '-p', str(build_dir)],
|
||||
num_cores_per_run=1,
|
||||
num_parallel_runs=num_cores,
|
||||
max_num_cores=num_cores,
|
||||
|
@ -360,7 +364,7 @@ def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path,
|
|||
if cmake_exe_location:
|
||||
cmake_prog = str(cmake_exe_location)
|
||||
configure = CommandPerfEstimator(
|
||||
run_command=[cmake_prog] + cmake_options + [src_dir],
|
||||
run_command=[cmake_prog] + cmake_options + [str(src_dir)],
|
||||
num_cores_per_run=1,
|
||||
num_parallel_runs=num_cores,
|
||||
max_num_cores=num_cores,
|
||||
|
|
|
@ -0,0 +1,43 @@
|
|||
|
||||
# Build script for mamul1, a Fortran benchmark that multiplies square matrices
# in double precision. The multiplication backend is selected at configure
# time: BLAS/LAPACK by default, or MAGMA when MAMUL1_USE_MAGMA is ON.
#
# NOTE(review): if this file is meant to be configured as a top-level
# CMakeLists.txt, it should start with cmake_minimum_required() and project()
# - confirm how it is used before adding them.
enable_language(Fortran)

set(MAMUL1_USE_MAGMA "OFF" CACHE BOOL "if set, mamul1 build uses magma (matrix algebra on gpu)")

set(MAMUL1_MAGMA_API "CPU_MEM_API" CACHE STRING "which magma API to use when building mamul1: CPU_MEM_API for BLAS compatible API (uses matrices stored on CPU memory) or GPU_MEM_API (use matrices stored on GPU memory)")
# Advertise the accepted values to cmake-gui / ccmake.
set_property(CACHE MAMUL1_MAGMA_API PROPERTY STRINGS CPU_MEM_API GPU_MEM_API)

add_executable(mamul1 mamul1.F90)

# Compiler-specific flags. The compiler is identified through
# CMAKE_Fortran_COMPILER_ID, which CMake sets itself.
if(CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
  # Allow arbitrary long lines. Needed as preprocessing could generate long line lengths.
  target_compile_options(mamul1 PRIVATE -ffree-line-length-none)
elseif(CMAKE_Fortran_COMPILER_ID MATCHES "^Intel")
  # Intel compilers (ifort is "Intel", ifx is "IntelLLVM").
  # NOTE(review): this is the Linux/macOS spelling of the option - confirm
  # whether Windows builds need to be supported.
  target_compile_options(mamul1 PRIVATE -no-wrap-margin)
endif()

if(MAMUL1_USE_MAGMA)
  find_package(MAGMA REQUIRED)
  # Select which code path of mamul1.F90 is compiled.
  if(MAMUL1_MAGMA_API STREQUAL "CPU_MEM_API")
    target_compile_definitions(mamul1 PRIVATE USE_MAGMA_DGEMM)
  elseif(MAMUL1_MAGMA_API STREQUAL "GPU_MEM_API")
    target_compile_definitions(mamul1 PRIVATE USE_MAGMA_DGEMM_GPU)
  else()
    message(FATAL_ERROR "unexpected value for MAMUL1_MAGMA_API : ${MAMUL1_MAGMA_API}")
  endif()
  message(STATUS "MAGMA_INCLUDES=${MAGMA_INCLUDES}")
  # Scoped to the mamul1 target so the MAGMA include path does not leak
  # to other targets defined in this directory.
  target_include_directories(mamul1 PRIVATE "${MAGMA_INCLUDES}")
  target_link_libraries(mamul1 PRIVATE "${MAGMA_LIBRARIES}")
else()
  find_package(BLAS REQUIRED)
  find_package(LAPACK REQUIRED)
  target_compile_definitions(mamul1 PRIVATE USE_DGEMM)

  # Link Blas and Lapack libraries (LAPACK first, as it depends on BLAS)
  target_link_libraries(mamul1 PRIVATE "${LAPACK_LIBRARIES}")
  target_link_libraries(mamul1 PRIVATE "${BLAS_LIBRARIES}")
endif()

install(TARGETS mamul1)
|
|
@ -0,0 +1,335 @@
|
|||
#define MAMUL1_VERSION "1.0.0"
|
||||
|
||||
#define magma_devptr_t integer(kind=8)
|
||||
subroutine print_usage(prog_path)
    ! Prints the program banner (version and build variant) followed by the
    ! command line syntax on unit 6.
    !
    ! prog_path: text shown as the program name in the usage line.
    !
    ! NOTE(review): only USE_MAGMA_DGEMM_GPU and USE_DGEMM are recognised
    ! below, so a build that defines only USE_MAGMA_DGEMM reports the
    ! variant as unknown.
    character(len=*), intent(in) :: prog_path
    character(len=80) :: variant_label

    ! the label is chosen at preprocessing time from the build definitions
#if defined(USE_MAGMA_DGEMM_GPU)
    variant_label = 'gpu'
#elif defined(USE_DGEMM)
    variant_label = 'cpu'
#else
    variant_label = 'unknown'
#endif

    write(6, '("mamul1 v",a," (variant:",a,"): benchmark performs a square matrix multiplication in double precision")') MAMUL1_VERSION, trim(variant_label)
    write(6, '()')
    write(6, '("Usage: ",a," <NDIM> <NUM_LOOPS>")') trim(prog_path)
    write(6, '(" <NDIM> positive integer representing the size of the square matrices to multiply ")')
    write(6, '(" <NUM_LOOPS> positive integer representing the number of times the multiplication is performed")')
end subroutine
|
||||
|
||||
program mamul1
    ! Entry point of the benchmark.
    !
    ! Expects exactly two command line arguments, <NDIM> and <NUM_LOOPS>,
    ! both strictly positive integers, and forwards them to test_dgemm.
    ! On any argument error the usage text is printed and the program
    ! terminates with stop code 1, so that a calling script can tell the
    ! run did not take place.

    implicit none

    integer :: argc, info, ndim, num_loops

    ! arg0 only feeds the usage text; it is sized generously so that a long
    ! program path is not cut short.
    character(len=1024) :: arg0
    character(len=32) :: arg1, arg2

    call get_command_argument(0, arg0)

    argc = command_argument_count()
    if (argc /= 2) then
        call print_usage(trim(arg0))
        stop 1
    end if

    call get_command_argument(1, arg1, status=info)
    if (info /= 0) then
        write(6,'("Error reading argument: info = ",i2)') info
        call print_usage(trim(arg0))
        stop 1
    end if

    call get_command_argument(2, arg2, status=info)
    if (info /= 0) then
        write(6,'("Error reading argument: info = ",i2)') info
        call print_usage(trim(arg0))
        stop 1
    end if

    read(arg1, *, iostat=info) ndim
    if (info /= 0) then
        write(6,'("Error converting ndim argument to integer: info = ",i2)') info
        call print_usage(trim(arg0))
        stop 1
    end if

    read(arg2, *, iostat=info) num_loops
    if (info /= 0) then
        write(6,'("Error converting num_loops argument to integer: info = ",i2)') info
        call print_usage(trim(arg0))
        stop 1
    end if

    ! both values are documented as positive integers in the usage text;
    ! with num_loops < 1 no product would be computed at all
    if (ndim < 1 .or. num_loops < 1) then
        call print_usage(trim(arg0))
        stop 1
    end if

    call test_dgemm(ndim, num_loops)

end program mamul1
|
||||
|
||||
subroutine set_random_seed(seed)
    ! Seeds the intrinsic random number generator in a reproducible way:
    ! every element of the generator seed array is set to the same value.
    !
    ! seed: value copied into each element of the seed array.
    integer :: seed
    integer :: num_seed_words
    integer, allocatable :: seed_words(:)

    ! ask the runtime how many integers make up a seed
    call random_seed(size=num_seed_words)

    allocate(seed_words(num_seed_words))
    seed_words(:) = seed
    call random_seed(put=seed_words)
end subroutine
|
||||
|
||||
subroutine print_matrix(mat, ndim)
    ! Writes a square matrix to unit 6 with list-directed formatting,
    ! one write statement per matrix row.
    !
    ! mat:  the ndim x ndim matrix to print.
    ! ndim: number of rows and columns of mat.
    implicit none
    integer, parameter :: dp = kind(1.0d0)
    ! ndim is declared before mat because it appears in the bounds of mat
    ! and implicit typing is disabled in this scope
    integer, intent(in) :: ndim
    real(dp), intent(in) :: mat(ndim, ndim)
    integer :: irow

    do irow = 1, ndim
        write(6, *) mat(irow,:)
    end do
end subroutine
|
||||
|
||||
! square matrix multiplication
|
||||
subroutine sqmatmul(amat, bmat, cmat, ndim)
    ! Computes cmat = amat * bmat for ndim x ndim double precision matrices.
    !
    ! The implementation is selected at preprocessing time:
    !   USE_MAGMA_DGEMM_GPU : the operands are copied to GPU memory, multiplied
    !                         with magmablasf_dgemm, and the result is copied back
    !   USE_DGEMM           : a single call to dgemm on the CPU matrices
    !
    ! NOTE(review): no code path is compiled when only USE_MAGMA_DGEMM is
    ! defined; in that configuration cmat is left unassigned - confirm.
    !
    ! amat, bmat: input matrices.
    ! cmat:       receives the product.
    ! ndim:       number of rows and columns of the three matrices.
#if defined(USE_MAGMA_DGEMM_GPU)
    use magma, only: magmaf_init, magmaf_finalize
    use magma, only: magmaf_queue_create, magmaf_queue_destroy
    use magma, only: magmaf_dmalloc, magmaf_free
    use magma, only: magmaf_dsetmatrix, magmaf_dgetmatrix
    use magma, only: magmablasf_dgemm
#endif
    real*8, intent(in) :: amat(ndim,ndim)
    real*8, intent(in) :: bmat(ndim,ndim)
    real*8, intent(out) :: cmat(ndim,ndim)
    ! leading dimensions of the GPU copies (only used by the GPU code path)
    integer :: lda, ldb, ldc
    integer :: info

    ! timing and throughput of the GPU kernel (only used by the GPU code path)
    real :: time_before, time_after
    integer(8) :: num_ops
    real :: gflops

#ifdef USE_MAGMA_DGEMM_GPU
    ! handles to the GPU copies of the three matrices
    magma_devptr_t :: d_amat
    magma_devptr_t :: d_bmat
    magma_devptr_t :: d_cmat
    magma_devptr_t :: queue !! really a CPU pointer
#endif
    ! ndim rounded up to the next multiple of 32
    lda = ceiling(real(ndim)/32)*32
    ldb = ceiling(real(ndim)/32)*32
    ldc = ceiling(real(ndim)/32)*32


#if defined(USE_MAGMA_DGEMM_GPU)
    !! allocate GPU memory
    ! NOTE(review): the status returned in info is not inspected, only the
    ! returned pointer is; the early returns below also leave the buffers
    ! allocated so far unreleased.
    write(6,'("DEBUG: before matrix A gpu memory allocation (",i0," doubles)")') lda * ndim
    info = magmaf_dmalloc( d_amat, lda*ndim )
    if (d_amat == 0) then
        print "(a)", "failed to allocate d_amat"
        return
    endif
    write(6,'("DEBUG: before matrix B gpu memory allocation (",i0," doubles)")') ldb * ndim
    info = magmaf_dmalloc( d_bmat, ldb*ndim )
    if (d_bmat == 0) then
        print "(a)", "failed to allocate d_bmat"
        return
    endif
    write(6,'("DEBUG: before matrix C gpu memory allocation (",i0," doubles)")') ldc * ndim
    info = magmaf_dmalloc( d_cmat, ldc*ndim )
    if (d_cmat == 0) then
        print "(a)", "failed to allocate d_cmat"
        return
    endif

    ! copy A to dA and B to dB
    call magmaf_queue_create( 0, queue )
    write(6,'("DEBUG: queue = ",i0)') queue
    if (queue == 0) then
        print "(a)", "failed to create a queue"
        return
    endif

    write(6,*) 'DEBUG: copying matrix A from CPU to GPU memory'
    call magmaf_dsetmatrix( ndim, ndim, amat, ndim, d_amat, lda, queue )
    write(6,*) 'DEBUG: copying matrix B from CPU to GPU memory'
    call magmaf_dsetmatrix( ndim, ndim, bmat, ndim, d_bmat, ldb, queue )

    ! the timed section covers the multiplication and the queue
    ! synchronisation, not the transfers
    call cpu_time(time_before)
    write (6,*) 'before magmablasf_dgemm, time=', time_before

    call magmablasf_dgemm ('N', 'N', ndim, ndim, ndim, 1.0d0, d_amat, lda, d_bmat, ldb, 0.0d0, d_cmat, ldc, queue)
    call magmaf_queue_sync(queue)

    call cpu_time(time_after)
    ! 2 * ndim**3 floating point operations for one product
    num_ops = real(ndim) * real(ndim) * real(ndim) * 2
    gflops = num_ops / (time_after - time_before) / 1.0e9
    write (6,*) 'after magmablasf_dgemm, time=', time_after
    write (6,*) 'magmablasf_dgemm (from gpu memory to gpu memory) duration :', (time_after - time_before), '(', gflops, ' gflops)'

    write(6,*) 'DEBUG: copying matrix C from GPU to CPU memory'
    call magmaf_dgetmatrix( ndim, ndim, d_cmat, ldc, cmat, ndim, queue )
    call magmaf_queue_destroy( queue )

    ! release the GPU buffers
    info = magmaf_free(d_cmat)
    info = magmaf_free(d_bmat)
    info = magmaf_free(d_amat)

#endif

#ifdef USE_DGEMM
    ! subroutine dgemm ( character TRANSA,
    ! character TRANSB,
    ! integer M,
    ! integer N,
    ! integer K,
    ! double precision ALPHA,
    ! double precision, dimension(lda,*) A,
    ! integer LDA,
    ! double precision, dimension(ldb,*) B,
    ! integer LDB,
    ! double precision BETA,
    ! double precision, dimension(ldc,*) C,
    ! integer LDC
    ! )
    ! the CPU matrices are passed with leading dimension ndim
    call dgemm('N', 'N', ndim, ndim, ndim, 1.0d0, amat, ndim, bmat, ndim, 0.0d0, cmat, ndim)
#endif

end subroutine
|
||||
|
||||
subroutine check_cmat_element(cmat, row, col, amat, bmat, ndim)
    ! Verifies one element of a computed matrix product.
    !
    ! The expected value of element (row, col) of amat * bmat is recomputed
    ! with a plain loop, both the expected and the computed values are
    ! printed, and the program is terminated with an error status when they
    ! differ by more than 1.0e-8 in absolute value.
    !
    ! cmat:       the product to check.
    ! row, col:   indices of the element to check.
    ! amat, bmat: the operands cmat was computed from.
    ! ndim:       number of rows and columns of the three matrices.
    implicit none
    integer, intent(in) :: ndim
    real(8), intent(in) :: cmat(ndim, ndim)
    integer, intent(in) :: row
    integer, intent(in) :: col
    real(8), intent(in) :: amat(ndim, ndim)
    real(8), intent(in) :: bmat(ndim, ndim)

    real(8) :: x
    integer :: i

    x = 0.0d0
    do i = 1, ndim
        x = x + amat(row, i) * bmat(i, col)
    end do

    write(6, '("expected cmat(", i0, ", ", i0, ")", e23.15e3)') row, col, x
    write(6, '("computed cmat(", i0, ", ", i0, ")", e23.15e3)') row, col, cmat(row, col)
    if (abs(cmat(row, col) - x) > 1.0e-8) then
        ! error termination, so that the failure is visible in the exit status
        error stop 'a computed element has a wrong value'
    end if
end subroutine
|
||||
|
||||
|
||||
subroutine test_dgemm(ndim, num_loops)
    ! Runs the benchmark: fills two ndim x ndim matrices with random values
    ! (fixed seed 42), multiplies them num_loops times with sqmatmul, checks
    ! the four corner elements of the result and prints timing figures.
    !
    ! ndim:      number of rows and columns of the matrices.
    ! num_loops: number of times the product is computed.
#if defined(USE_MAGMA_DGEMM_GPU)
    use magma, only: magmaf_init, magmaf_finalize
    use magma, only: magmablasf_dgemm !, magmaf_dgemm_gpu
#endif

    implicit none
    integer, intent(in) :: ndim
    integer, intent(in) :: num_loops
    integer, parameter :: dp = kind(1.0d0)
    ! cpu_time samples taken around the multiplication loop
    real :: tstart, tstop
    integer(8) :: num_ops
    real :: gflops

    ! system_clock samples (c1, c2), clock rate (cr) and maximum count (cm);
    ! s counts the samples where the system_clock duration is the smaller one
    INTEGER :: c1,c2,cr,cm,s
    REAL :: a_diff, diff, rate

    real*8, allocatable :: amat(:,:)
    real*8, allocatable :: bmat(:,:)
    real*8, allocatable :: cmat(:,:)
    real(dp) :: x
    integer :: i, j

#if defined(USE_MAGMA_DGEMM_GPU)
    write(6,*) 'DEBUG: init magma'
    call magmaf_init()
#endif

    ! First initialize the system_clock
    CALL system_clock(count_rate=cr)
    ! NOTE(review): cm is queried but not used afterwards
    CALL system_clock(count_max=cm)
    rate = REAL(cr)
    WRITE(*,*) "system_clock rate ",rate

    diff = 0.0
    a_diff = 0.0
    s = 0

    allocate(amat(ndim, ndim))
    allocate(bmat(ndim, ndim))
    allocate(cmat(ndim, ndim))

    call set_random_seed(42)

    !call random_number(amat)
    !amat = 0.5_dp*(amat + transpose(amat))
    ! the random draws alternate between amat and bmat, element by element
    do j = 1, ndim
        do i = 1, ndim
            call random_number(x)
            amat(i,j) = x
            call random_number(x)
            bmat(i,j) = x
        end do
    end do

    ! the timed section is the whole loop of num_loops products
    call cpu_time(tstart)
    call system_clock(c1)

    do j = 1, num_loops
        ! playmat = amat

        call sqmatmul(amat, bmat, cmat, ndim)

    end do

    call cpu_time(tstop)
    call system_clock(c2)
    ! comparison of the two clocks; a single sample is accumulated here
    if ( (c2 - c1)/rate < (tstop - tstart) ) s = s + 1
    diff = (c2 - c1)/rate - (tstop - tstart) + diff
    a_diff = ABS((c2 - c1)/rate - (tstop - tstart)) + a_diff

    ! check the four corner elements of cmat
    call check_cmat_element(cmat, 1, 1, amat, bmat, ndim)
    call check_cmat_element(cmat, 1, ndim, amat, bmat, ndim)
    call check_cmat_element(cmat, ndim, 1, amat, bmat, ndim)
    call check_cmat_element(cmat, ndim, ndim, amat, bmat, ndim)

    ! write(6, *) 'amat = '
    ! call print_matrix(amat, ndim)

    ! write(6, *) 'bmat = '
    ! call print_matrix(bmat, ndim)

    ! write(6, *) 'cmat = '
    ! call print_matrix(cmat, ndim)

    ! 2 * ndim**3 floating point operations per product, times num_loops
    num_ops = real(ndim) * real(ndim) * real(ndim) * 2 * num_loops
    ! NOTE(review): the rate below is derived from the cpu_time samples, not
    ! from system_clock; with a multi-threaded BLAS the two may differ
    ! noticeably - confirm which one is intended.
    gflops = num_ops / (tstop-tstart) / 1.0e9


    write(6, '("Time taken by dgemm for matrix size ",i8," was ",f10.2," seconds")') ndim, tstop-tstart
    WRITE(*,*) "gflops (from cpu memory to cpu memory) : ", gflops

    WRITE(*,*) "system_clock : ",(c2 - c1)/rate
    WRITE(*,*) "cpu_time : ",(tstop - tstart)
    WRITE(*,*) "sc < ct : ",s
    WRITE(*,*) "mean diff : ",diff
    WRITE(*,*) "abs mean diff: ",a_diff

#if defined(USE_MAGMA_DGEMM_GPU)
    write(6,*) 'DEBUG: deinit magma'
    call magmaf_finalize()
#endif


    deallocate(amat, bmat, cmat)
end
|
|
@ -0,0 +1,25 @@
|
|||
import unittest
|
||||
import logging
|
||||
from pathlib import Path
|
||||
# from cocluto import ClusterController
|
||||
from starbench.main import starbench_cmake_app, ExistingDir
|
||||
|
||||
|
||||
class StarbenchTestCase(unittest.TestCase):
    """End-to-end usage example of starbench, written as a unit test."""

    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    def test_mamul1_benchmark(self):
        """Builds the bundled mamul1 benchmark with cmake and runs it on 2 cores.

        The test passes as long as starbench_cmake_app returns without raising.
        """
        logging.info('test_mamul1_benchmark')
        # The mamul1 sources are located relative to this test file instead of
        # through a path that only exists on one developer's machine.
        # Assumes they live in a 'mamul1' directory next to this file, which
        # matches the layout of the previously hard-coded path (test/mamul1).
        mamul1_src_dir = Path(__file__).resolve().parent / 'mamul1'
        source_code_provider = ExistingDir(mamul1_src_dir)
        # working directory of the benchmark, created under the current directory
        tmp_dir = Path('tmp').absolute()
        # arguments of mamul1: <NDIM> <NUM_LOOPS>
        benchmark_command = ['./mamul1', '3000', '10']
        starbench_cmake_app(source_code_provider=source_code_provider, tmp_dir=tmp_dir, num_cores=2, benchmark_command=benchmark_command)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
Loading…
Reference in New Issue