diff --git a/.gitignore b/.gitignore
index c15e89f..7ddddf1 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
-src/starbench/__pycache__/
+**/__pycache__/
 dist/
 src/starbench.egg-info/
+tmp/
diff --git a/README.md b/README.md
index 32afe9a..8de3fa4 100644
--- a/README.md
+++ b/README.md
@@ -45,3 +45,11 @@ Installing collected packages: starbench
 Successfully installed starbench-1.0.0
 bob@bob-ws2:~/work/starbench$ starbench --git-repos-url https://github.com/hibridon/hibridon --code-version a3bed1c3ccfbca572003020d3e3d3b1ff3934fad --git-user g-raffy --git-pass-file "$HOME/.github/personal_access_tokens/bench.hibridon.cluster.ipr.univ-rennes1.fr.pat" --num-cores 2 --output-dir=/tmp/hibench --cmake-path=/opt/cmake/cmake-3.23.0/bin/cmake --cmake-option=-DCMAKE_BUILD_TYPE=Release --cmake-option=-DBUILD_TESTING=ON --benchmark-command='ctest --output-on-failure -L ^arch4_quick$'
 ```
+
+## how to test
+
+Run the test suite from the root of the repository (the test writes its working files to `./tmp`):
+
+```sh
+(starbench.venv) bob@bob-ws2:~/work/starbench$ python3 -m unittest test.test_starbench
+```
diff --git a/src/starbench/__init__.py b/src/starbench/__init__.py
index e69de29..4c3ad0b 100644
--- a/src/starbench/__init__.py
+++ b/src/starbench/__init__.py
@@ -0,0 +1,2 @@
+# from .main import starbench_cmake_app
+# __all__ = ['starbench_cmake_app']
\ No newline at end of file
diff --git a/src/starbench/main.py b/src/starbench/main.py
index e0e8991..f8a66af 100755
--- a/src/starbench/main.py
+++ b/src/starbench/main.py
@@ -233,6 +233,7 @@ class CommandPerfEstimator():  # (false positive) pylint: disable=function-redef
     def _interpret_tags(tagged_string: str, tags_value: Dict[str, str]) -> str:
         untagged_string = tagged_string
         for tag_id, tag_value in tags_value.items():
+            assert isinstance(untagged_string, str)
             untagged_string = untagged_string.replace(tag_id, tag_value)
         return untagged_string
 
@@ -246,9 +247,12 @@ class CommandPerfEstimator():  # (false positive) pylint: disable=function-redef
         stdout_filepath = None
         if self.stdout_filepath is not None:
             stdout_filepath = CommandPerfEstimator._interpret_tags(str(self.stdout_filepath), tags_value)
+            Path(stdout_filepath).parent.mkdir(parents=True, exist_ok=True)  # the expanded path may need several new directory levels
         stderr_filepath = None
         if self.stderr_filepath is not None:
             stderr_filepath = CommandPerfEstimator._interpret_tags(str(self.stderr_filepath), tags_value)
+            Path(stderr_filepath).parent.mkdir(parents=True, exist_ok=True)  # the expanded path may need several new directory levels
+
         with self._runs_lock:
             run = Run(self._next_run_id, worker_id)
             self._next_run_id += 1
@@ -335,7 +339,7 @@ class GitRepos(IFileTreeProvider):
         return self.src_dir
 
 
-def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path, num_cores: int, benchmark_command: List[str], cmake_options: List[str] = None, cmake_exe_location: Path = None):
+def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path, num_cores: int, benchmark_command: List[str], cmake_options: List[str] = [], cmake_exe_location: Path = None):
     """
     tests_to_run : regular expression as understood by ctest's -L option. eg '^arch4_quick$'
     """
@@ -345,7 +349,7 @@ def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path,
     build_dir = worker_dir / 'build'
     print(f'creating build directory {worker_dir}')
     create_build_dir = CommandPerfEstimator(
-        run_command=['mkdir', '-p', build_dir],
+        run_command=['mkdir', '-p', str(build_dir)],
         num_cores_per_run=1,
         num_parallel_runs=num_cores,
         max_num_cores=num_cores,
@@ -360,7 +364,7 @@ def starbench_cmake_app(source_code_provider: IFileTreeProvider, tmp_dir: Path,
     if cmake_exe_location:
         cmake_prog = str(cmake_exe_location)
     configure = CommandPerfEstimator(
-        run_command=[cmake_prog] + cmake_options + [src_dir],
+        run_command=[cmake_prog] + list(cmake_options or []) + [str(src_dir)],  # tolerate cmake_options=None and never alias the caller's (or the default) list
         num_cores_per_run=1,
         num_parallel_runs=num_cores,
         max_num_cores=num_cores,
diff --git a/test/__init__.py b/test/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/test/mamul1/CMakeLists.txt b/test/mamul1/CMakeLists.txt
new file mode 100644
index 0000000..80095ea
--- /dev/null
+++ 
b/test/mamul1/CMakeLists.txt
@@ -0,0 +1,43 @@
+
+enable_language (Fortran)
+
+set(MAMUL1_USE_MAGMA "OFF" CACHE BOOL "if set, mamul1 build uses magma (matrix algebra on gpu)")
+
+set(MAMUL1_MAGMA_API "CPU_MEM_API" CACHE STRING "which magma API to use when building mamul1: CPU_MEM_API for BLAS compatible API (uses matrices stored on CPU memory) or GPU_MEM_API (use matrices stored on GPU memory)")
+
+add_executable(mamul1 mamul1.F90)
+
+if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
+  # Allow arbitrary long lines. Needed as preprocessing could generate long line lengths.
+  target_compile_options(mamul1 PUBLIC -ffree-line-length-none)
+elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
+  # Intel (ifort); tested through the compiler id because no Fortran_COMPILER_NAME variable is defined in this file
+  target_compile_options(mamul1 PUBLIC -no-wrap-margin)
+endif()
+
+
+if (MAMUL1_USE_MAGMA)
+  find_package( MAGMA REQUIRED )
+  if( MAMUL1_MAGMA_API STREQUAL "CPU_MEM_API" )
+    target_compile_definitions(mamul1 PUBLIC USE_MAGMA_DGEMM)
+  elseif( MAMUL1_MAGMA_API STREQUAL "GPU_MEM_API" )
+    target_compile_definitions(mamul1 PUBLIC USE_MAGMA_DGEMM_GPU)
+  else()
+    message(FATAL_ERROR "unexpected value for MAMUL1_MAGMA_API : ${MAMUL1_MAGMA_API}")
+  endif()
+  message(STATUS "MAGMA_INCLUDES=${MAGMA_INCLUDES}")
+  include_directories("${MAGMA_INCLUDES}")
+  target_link_libraries(mamul1 "${MAGMA_LIBRARIES}")
+else()
+  find_package( BLAS REQUIRED )
+  find_package( LAPACK REQUIRED )
+  # message("BLAS_LIBRARIES=${BLAS_LIBRARIES}")
+  # message("LAPACK_LIBRARIES=${LAPACK_LIBRARIES}")
+  target_compile_definitions(mamul1 PUBLIC USE_DGEMM)
+
+  # Link Blas and Lapack libraries
+  target_link_libraries(mamul1 "${LAPACK_LIBRARIES}")
+  target_link_libraries(mamul1 "${BLAS_LIBRARIES}")
+endif()
+
+install(TARGETS mamul1)
diff --git a/test/mamul1/mamul1.F90 b/test/mamul1/mamul1.F90
new file mode 100644
index 0000000..9f923de
--- /dev/null
+++ b/test/mamul1/mamul1.F90
@@ -0,0 +1,335 @@
+#define MAMUL1_VERSION "1.0.0"
+
+#define magma_devptr_t integer(kind=8)
+subroutine print_usage(prog_path)
+    character(len=*), intent(in) :: prog_path
+    character(len=80) :: build_variant
+#if defined(USE_MAGMA_DGEMM_GPU)
+    build_variant='gpu'
+#elif defined(USE_DGEMM)
+    build_variant='cpu'
+#else
+    build_variant='unknown'
+#endif
+    write(6,'("mamul1 v",a," (variant:",a,"): benchmark performs a square matrix multiplication in double precision")') MAMUL1_VERSION, trim(build_variant);
+    write(6,'()');
+    write(6,'("Usage: ",a," <NDIM> <NUM_LOOPS>")') trim(prog_path);
+    write(6,'("   <NDIM> positive integer representing the size of the square matrices to multiply ")');
+    write(6,'("   <NUM_LOOPS> positive integer representing the number of times the multiplication is performed")');
+end subroutine
+
+program mamul1
+
+implicit none
+
+
+integer :: argc, info, ndim, num_loops
+
+character(len=32) :: arg0, arg1, arg2
+
+
+call get_command_argument(0,arg0)
+
+argc = command_argument_count()
+if (argc /= 2) then
+    call print_usage(trim(arg0))
+    ! write(6,'("Usage: ",a," NDIM NUM_LOOPS, where NDIM is a positive integer")') trim(arg0);
+    stop 1  ! wrong number of arguments: report failure through the exit status
+end if
+
+call get_command_argument(1,arg1,status=info)
+if (info /= 0) then
+    write(6,'("Error reading argument: info = ",i2)') info
+    call print_usage(trim(arg0))
+stop 1
+end if
+
+call get_command_argument(2,arg2,status=info)
+if (info /= 0) then
+    write(6,'("Error reading argument: info = ",i2)') info
+    call print_usage(trim(arg0))
+stop 1
+end if
+
+read(arg1,*,iostat=info) ndim
+if (info /= 0) then
+    write(6,'("Error converting ndim argument to integer: info = ",i2)') info
+    call print_usage(trim(arg0))
+stop 1
+end if
+
+read(arg2,*,iostat=info) num_loops
+if (info /= 0) then
+    write(6,'("Error converting num_loops argument to integer: info = ",i2)') info
+    call print_usage(trim(arg0))
+stop 1
+end if
+
+
+if (ndim < 1 .or. num_loops < 1) then  ! with num_loops < 1 the product would never be computed before it is checked
+    call print_usage(trim(arg0))
+stop 1
+end if
+
+    call test_dgemm(ndim, num_loops)
+
+stop
+end program mamul1
+
+subroutine set_random_seed(seed)
+    integer :: seed
+    integer :: seed_array_size
+    INTEGER, ALLOCATABLE :: seed_array (:)
+    CALL RANDOM_SEED (SIZE = seed_array_size)  ! seed_array_size is set to the size of
+    !                                          ! the seed array
+    ALLOCATE (seed_array(seed_array_size))
+    seed_array = seed
+    CALL RANDOM_SEED (PUT=seed_array(1:seed_array_size))
+end subroutine
+
+subroutine print_matrix(mat, ndim)
+    implicit none
+    integer, parameter :: dp = kind(1.0d0)
+    integer, intent(in) :: ndim  ! declared before mat because mat's bounds use it (required under implicit none)
+    real(dp), intent(in) :: mat(ndim, ndim)
+    integer :: irow
+    do irow = 1, ndim
+        write(6, *) mat(irow,:)
+    end do
+end subroutine
+
+! square matrix multiplication
+subroutine sqmatmul(amat, bmat, cmat, ndim)
+#if defined(USE_MAGMA_DGEMM_GPU)
+    use magma, only: magmaf_init, magmaf_finalize
+    use magma, only: magmaf_queue_create, magmaf_queue_destroy
+    use magma, only: magmaf_dmalloc, magmaf_free
+    use magma, only: magmaf_dsetmatrix, magmaf_dgetmatrix
+    use magma, only: magmablasf_dgemm
+#endif
+    real*8, intent(in) :: amat(ndim,ndim)
+    real*8, intent(in) :: bmat(ndim,ndim)
+    real*8, intent(out) :: cmat(ndim,ndim)
+    integer :: lda, ldb, ldc
+    integer :: info
+
+    real :: time_before, time_after
+    integer(8) :: num_ops
+    real :: gflops
+
+#ifdef USE_MAGMA_DGEMM_GPU
+    magma_devptr_t :: d_amat
+    magma_devptr_t :: d_bmat
+    magma_devptr_t :: d_cmat
+    magma_devptr_t :: queue  !! really a CPU pointer
+#endif
+    lda = ceiling(real(ndim)/32)*32
+    ldb = ceiling(real(ndim)/32)*32
+    ldc = ceiling(real(ndim)/32)*32
+
+
+#if defined(USE_MAGMA_DGEMM_GPU)
+    !! 
allocate GPU memory for the three matrices (leading dimensions lda/ldb/ldc are ndim rounded up to a multiple of 32)
+    write(6,'("DEBUG: before matrix A gpu memory allocation (",i0," doubles)")') lda * ndim
+    info = magmaf_dmalloc( d_amat, lda*ndim )
+    if (d_amat == 0) then
+        print "(a)", "failed to allocate d_amat"
+        return
+    endif
+    write(6,'("DEBUG: before matrix B gpu memory allocation (",i0," doubles)")') ldb * ndim
+    info = magmaf_dmalloc( d_bmat, ldb*ndim )
+    if (d_bmat == 0) then
+        print "(a)", "failed to allocate d_bmat"
+        return
+    endif
+    write(6,'("DEBUG: before matrix C gpu memory allocation (",i0," doubles)")') ldc * ndim
+    info = magmaf_dmalloc( d_cmat, ldc*ndim )
+    if (d_cmat == 0) then
+        print "(a)", "failed to allocate d_cmat"
+        return
+    endif
+
+    ! create the queue used by the copies and by the multiplication, then copy A to dA and B to dB
+    call magmaf_queue_create( 0, queue )
+    write(6,'("DEBUG: queue = ",i0)') queue
+    if (queue == 0) then
+        print "(a)", "failed to create a queue"
+        return
+    endif
+
+    write(6,*) 'DEBUG: copying matrix A from CPU to GPU memory'
+    call magmaf_dsetmatrix( ndim, ndim, amat, ndim, d_amat, lda, queue )
+    write(6,*) 'DEBUG: copying matrix B from CPU to GPU memory'
+    call magmaf_dsetmatrix( ndim, ndim, bmat, ndim, d_bmat, ldb, queue )
+
+    call cpu_time(time_before)
+    write (6,*) 'before magmablasf_dgemm, time=', time_before
+
+    call magmablasf_dgemm ('N', 'N', ndim, ndim, ndim, 1.0d0, d_amat, lda, d_bmat, ldb, 0.0d0, d_cmat, ldc, queue)
+    call magmaf_queue_sync(queue)
+
+    call cpu_time(time_after)
+    num_ops = real(ndim) * real(ndim) * real(ndim) * 2
+    gflops = num_ops / (time_after - time_before) / 1.0e9
+    write (6,*) 'after magmablasf_dgemm, time=', time_after
+    write (6,*) 'magmablasf_dgemm (from gpu memory to gpu memory) duration :', (time_after - time_before), '(', gflops, ' gflops)'
+
+    write(6,*) 'DEBUG: copying matrix C from GPU to CPU memory'
+    call magmaf_dgetmatrix( ndim, ndim, d_cmat, ldc, cmat, ndim, queue )
+    call magmaf_queue_destroy( queue )
+
+    info = magmaf_free(d_cmat)
+    info = magmaf_free(d_bmat)
+    info = magmaf_free(d_amat)
+
+#endif
+
+#ifdef USE_DGEMM
+    ! subroutine dgemm ( character TRANSA,
+    ! character TRANSB,
+    ! integer M,
+    ! integer N,
+    ! integer K,
+    ! double precision ALPHA,
+    ! double precision, dimension(lda,*) A,
+    ! integer LDA,
+    ! double precision, dimension(ldb,*) B,
+    ! integer LDB,
+    ! double precision BETA,
+    ! double precision, dimension(ldc,*) C,
+    ! integer LDC
+    ! )
+    call dgemm('N', 'N', ndim, ndim, ndim, 1.0d0, amat, ndim, bmat, ndim, 0.0d0, cmat, ndim)
+#endif
+
+end subroutine
+
+subroutine check_cmat_element(cmat, row, col, amat, bmat, ndim)
+    real(8), intent(in) :: cmat(ndim, ndim)
+    integer, intent(in) :: row
+    integer, intent(in) :: col
+    real(8), intent(in) :: amat(ndim, ndim)
+    real(8), intent(in) :: bmat(ndim, ndim)
+    integer, intent(in) :: ndim
+
+    real(8) :: x
+    x = 0.0d0
+    do i = 1, ndim
+        x = x + amat(row, i) * bmat(i, col)
+    end do
+
+    write(6, '("expected cmat(", i0, ", ", i0, ")", e23.15e3)') row, col, x
+    write(6, '("computed cmat(", i0, ", ", i0, ")", e23.15e3)') row, col, cmat(row, col)
+    if (abs(cmat(row, col) - x) > 1.0e-8) then
+        error stop 'a computed element has a wrong value'  ! error stop: the failed check must yield a non-zero exit status
+    end if
+end subroutine
+
+
+subroutine test_dgemm(ndim, num_loops)
+#if defined(USE_MAGMA_DGEMM_GPU)
+    use magma, only: magmaf_init, magmaf_finalize
+    use magma, only: magmablasf_dgemm !, magmaf_dgemm_gpu
+#endif
+
+    implicit none
+    integer, intent(in) :: ndim
+    integer, intent(in) :: num_loops
+    integer, parameter :: dp = kind(1.0d0)
+    real :: tstart, tstop
+    integer(8) :: num_ops
+    real :: gflops
+
+    INTEGER :: c1,c2,cr,cm,s
+    REAL :: a_diff, diff, rate
+
+    real*8, allocatable :: amat(:,:)
+    real*8, allocatable :: bmat(:,:)
+    real*8, allocatable :: cmat(:,:)
+    real(dp) :: x
+    integer :: i, j
+
+#if defined(USE_MAGMA_DGEMM_GPU)
+    write(6,*) 'DEBUG: init magma'
+    call magmaf_init()
+#endif
+
+    ! 
First initialize the system_clock
+    CALL system_clock(count_rate=cr)
+    CALL system_clock(count_max=cm)
+    rate = REAL(cr)
+    WRITE(*,*) "system_clock rate ",rate
+
+    diff = 0.0
+    a_diff = 0.0
+    s = 0
+
+    allocate(amat(ndim, ndim))
+    allocate(bmat(ndim, ndim))
+    allocate(cmat(ndim, ndim))
+
+    call set_random_seed(42)
+
+    !call random_number(amat)
+    !amat = 0.5_dp*(amat + transpose(amat))
+    do j = 1, ndim
+        do i = 1, ndim
+            call random_number(x)
+            amat(i,j) = x
+            call random_number(x)
+            bmat(i,j) = x
+        end do
+    end do
+
+    call cpu_time(tstart)
+    call system_clock(c1)
+
+    do j = 1, num_loops
+        ! playmat = amat
+
+        call sqmatmul(amat, bmat, cmat, ndim)
+
+    end do
+
+    call cpu_time(tstop)
+    call system_clock(c2)
+    if ( (c2 - c1)/rate < (tstop - tstart) ) s = s + 1
+    diff = (c2 - c1)/rate - (tstop - tstart) + diff
+    a_diff = ABS((c2 - c1)/rate - (tstop - tstart)) + a_diff
+
+    ! check the four corner elements of cmat against a directly computed dot product
+    call check_cmat_element(cmat, 1, 1, amat, bmat, ndim)
+    call check_cmat_element(cmat, 1, ndim, amat, bmat, ndim)
+    call check_cmat_element(cmat, ndim, 1, amat, bmat, ndim)
+    call check_cmat_element(cmat, ndim, ndim, amat, bmat, ndim)
+
+    ! write(6, *) 'amat = '
+    ! call print_matrix(amat, ndim)
+
+    ! write(6, *) 'bmat = '
+    ! call print_matrix(bmat, ndim)
+
+    ! write(6, *) 'cmat = '
+    ! call print_matrix(cmat, ndim)
+
+    num_ops = real(ndim) * real(ndim) * real(ndim) * 2 * num_loops
+    gflops = num_ops / (tstop-tstart) / 1.0e9
+
+
+    write(6, '("Time taken by dgemm for matrix size ",i8," was ",f10.2," seconds")') ndim, tstop-tstart
+    WRITE(*,*) "gflops (from cpu memory to cpu memory) : ", gflops
+
+    WRITE(*,*) "system_clock : ",(c2 - c1)/rate
+    WRITE(*,*) "cpu_time : ",(tstop - tstart)
+    WRITE(*,*) "sc < ct : ",s
+    WRITE(*,*) "mean diff : ",diff
+    WRITE(*,*) "abs mean diff: ",a_diff
+
+#if defined(USE_MAGMA_DGEMM_GPU)
+    write(6,*) 'DEBUG: deinit magma'
+    call magmaf_finalize()
+#endif
+
+
+    deallocate(amat, bmat, cmat)
+    end
diff --git a/test/test_starbench.py b/test/test_starbench.py
new file mode 100644
index 0000000..a57739a
--- /dev/null
+++ b/test/test_starbench.py
@@ -0,0 +1,25 @@
+import unittest
+import logging
+from pathlib import Path
+# from cocluto import ClusterController
+from starbench.main import starbench_cmake_app, ExistingDir
+
+
+class StarbenchTestCase(unittest.TestCase):
+
+    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+    def setUp(self) -> None:
+        return super().setUp()
+
+    def test_mamul1_benchmark(self):
+        logging.info('test_mamul1_benchmark')
+        source_code_provider = ExistingDir(Path(__file__).resolve().parent / 'mamul1')  # the mamul1 sources live next to this test file
+        tmp_dir = Path('tmp').absolute()
+        benchmark_command = ['./mamul1', '3000', '10']
+        starbench_cmake_app(source_code_provider=source_code_provider, tmp_dir=tmp_dir, num_cores=2, benchmark_command=benchmark_command)
+        # self.assertIsInstance(job_state, JobsState)
+
+
+if __name__ == '__main__':
+    unittest.main()