refactored iprbench to separate ipr benchmark framework from the actual benchmarks
This decoupling allows writing benchmarks as modules that can be used in various situations (from a benchmark job or directly by a user), and this design will allow automatic registering of the benchmark results in a user-selectable form (sql database, stdout, etc.) - separated `hibenchonphysix.py` into `clusterbench.py` (tool to run a benchmark on a cluster) and `hibench.py` (hibridon benchmark module) so that `clusterbench.py` no longer has any knowledge about hibridon. - there are currently 2 ways to run a benchmark: 1. as a simple run through the `clusterbench-run` command (which will eventually be renamed iprbench-run since it might be completely independent from the concept of cluster) 2. as cluster jobs through the `clusterbench-submit` command - added a unit test - added another benchmark `mamul1` that is used as a unit test because it has 2 benefits over the `hibench` benchmark: 1. it's standalone (no external resources needed) 2. it's quicker to execute note: this refactoring work is not complete yet, but the proof of concept is complete (the 2 unit tests pass): - still need to provide the user a way to switch between IprCluster and DummyCluster (which is only intended to be used for testing clusterbench) - still need to run multiple configs of the same benchmark in one run (as hibenchonphysix did) work related to [https://bugzilla.ipr.univ-rennes.fr/show_bug.cgi?id=3958] and [https://bugzilla.ipr.univ-rennes.fr/show_bug.cgi?id=3372]
This commit is contained in:
parent
fe4a07a67e
commit
011d4eddf9
|
@ -1,3 +1,7 @@
|
|||
iprbench.venv/
|
||||
results/
|
||||
iprbench/__pycache__/__init__.cpython-38.pyc
|
||||
iprbench.egg-info/
|
||||
iprbench/benchmarks/__pycache__/
|
||||
iprbench/__pycache__/
|
||||
test/__pycache__/
|
||||
iprbench/resources/__pycache__/
|
||||
|
|
38
README.md
38
README.md
|
@ -90,6 +90,44 @@ Installing collected packages: pytz, tzdata, typing-extensions, starbench, six,
|
|||
Successfully installed contourpy-1.3.0 cycler-0.12.1 fonttools-4.54.1 greenlet-3.1.1 iprbench-0.0.1 kiwisolver-1.4.7 matplotlib-3.9.2 numpy-2.1.2 packaging-24.1 pandas-2.2.3 pillow-10.4.0 pyparsing-3.1.4 python-dateutil-2.9.0.post0 pytz-2024.2 six-1.16.0 sqlalchemy-2.0.35 starbench-1.0.0 typing-extensions-4.12.2 tzdata-2024.2
|
||||
```
|
||||
|
||||
## run unit tests
|
||||
|
||||
```sh
|
||||
20241018-16:56:05 graffy@graffy-ws2:~/work/starbench/iprbench.git$ python3 -m unittest test.test_iprbench
|
||||
2024-10-18 16:57:42,589 - INFO - test_iprbench_run
|
||||
creating build directory /tmp/mamul1_out/output/worker<worker_id>
|
||||
executing the following command in parallel (2 parallel runs) : '['mkdir', '-p', '/tmp/mamul1_out/output/worker<worker_id>/build']'
|
||||
mean duration : 0.004 s (2 runs)
|
||||
configuring /home/graffy/work/starbench/iprbench.git/test/mamul1 into /tmp/mamul1_out/output/worker<worker_id>/build ...
|
||||
executing the following command in parallel (2 parallel runs) : '['/usr/bin/cmake', '-DCMAKE_BUILD_TYPE=Release', '-DCMAKE_Fortran_COMPILER=gfortran', '/home/graffy/work/starbench/iprbench.git/test/mamul1']'
|
||||
mean duration : 0.098 s (2 runs)
|
||||
building /tmp/mamul1_out/output/worker<worker_id>/build ...
|
||||
executing the following command in parallel (2 parallel runs) : '['make']'
|
||||
mean duration : 0.073 s (2 runs)
|
||||
benchmarking /tmp/mamul1_out/output/worker<worker_id>/build ...
|
||||
executing the following command in parallel (2 parallel runs) : '['./mamul1', '1024', '10']'
|
||||
mean duration : 0.660 s (2 runs)
|
||||
duration : 0.660 s
|
||||
.
|
||||
----------------------------------------------------------------------
|
||||
Ran 1 test in 1.035s
|
||||
|
||||
OK
|
||||
last command status : [0]
|
||||
```
|
||||
|
||||
```sh
|
||||
20241018-16:56:05 graffy@graffy-ws2:~/work/starbench/iprbench.git$ python3 -m unittest test.test_clusterbench
|
||||
```
|
||||
|
||||
|
||||
|
||||
## launch a benchmark on the current system
|
||||
|
||||
```sh
|
||||
iprbench-run --benchmark-id 'mamul1' --config '{"compiler_id": "gfortran", "matrix_size": 1024, "num_loops":10, "num_cores":2}' --results-dir /tmp/mamul1_out
|
||||
```
|
||||
|
||||
## launch benchmark jobs on alambix cluster
|
||||
|
||||
```sh
|
||||
|
|
|
@ -0,0 +1,75 @@
|
|||
from ..core import IBenchmark, BenchParam, BenchmarkConfig
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
import os
|
||||
|
||||
|
||||
class HiBench(IBenchmark):
    """Hibridon benchmark.

    Clones the hibridon source tree and delegates configure/build/measure to the
    external `starbench` tool; the measured workload is one of hibridon's ctest
    test cases (selected by the `test_id` config parameter).
    """

    def __init__(self):
        bench_params = []
        bench_params.append(BenchParam('hibridon_version', BenchParam.Type.PARAM_TYPE_STRING, 'the version of hibridon, in the form of a commit id'))
        bench_params.append(BenchParam('compiler_id', BenchParam.Type.PARAM_TYPE_STRING, 'the id of the compiler used in the benchmark'))
        bench_params.append(BenchParam('compiler_version', BenchParam.Type.PARAM_TYPE_STRING, 'the version of the used compiler'))
        bench_params.append(BenchParam('blas_id', BenchParam.Type.PARAM_TYPE_STRING, 'the id of the blas library used in the benchmark'))
        bench_params.append(BenchParam('blas_version', BenchParam.Type.PARAM_TYPE_STRING, 'the version of the blas library used in the benchmark'))
        bench_params.append(BenchParam('test_id', BenchParam.Type.PARAM_TYPE_STRING, 'the name of the test to run (eg arch4_quick (about 2s on a core i5 8th generation) or nh3h2_qma_long (about 10min on a core i5 8th generation))'))

        super().__init__(bench_id='hibench', bench_params=bench_params)

    def get_ram_requirements(self, config: BenchmarkConfig) -> int:
        """Return the RAM needed per core for the given config, in bytes.

        Raises:
            ValueError: if config['test_id'] is not a known test.
        """
        GIBIBYTE_TO_BYTE = 1024 * 1024 * 1024
        benchmark_test = config['test_id']
        if benchmark_test == 'arch4_quick':
            ram_per_core = int(1 * GIBIBYTE_TO_BYTE)
        elif benchmark_test == 'nh3h2_qma_long':
            ram_per_core = int(2.8 * GIBIBYTE_TO_BYTE)  # this was enough on physix48, but maybe we can reduce more
        else:
            # bugfix: this used to be `assert f'unhandled ...'`, which asserts a
            # non-empty string and therefore never fails, silently returning a
            # 0-byte requirement for unknown tests. Fail loudly instead.
            raise ValueError(f'unhandled benchmark_test : {benchmark_test}')
        return ram_per_core

    def execute(self, config: BenchmarkConfig, benchmark_output_dir: Path):
        """Run the hibridon benchmark described by `config`.

        Builds a `starbench` command line (source tree provider, cmake options,
        ctest benchmark command) and executes it through bash.

        Raises:
            ValueError: if config['compiler_id'] is not a known compiler.
            subprocess.CalledProcessError: if the starbench command fails.
        """
        git_repos_url = 'https://github.com/hibridon/hibridon'
        git_user = 'g-raffy'  # os.environ['HIBRIDON_REPOS_USER']
        git_pass_file = f'{os.getenv("HOME")}/.github/personal_access_tokens/bench.hibridon.cluster.ipr.univ-rennes1.fr.pat'
        hibridon_version = config['hibridon_version']
        test_id = config['test_id']  # eg arch4_quick or nh3h2_qma_long
        compiler_id = config['compiler_id']

        src_dir = benchmark_output_dir / 'hibridon.git'
        output_dir = benchmark_output_dir / 'output'

        password_provider = f'{{"type": "password-file", "password-file-path": "{git_pass_file}"}}'
        source_tree_provider = f'{{"type": "git-cloner", "repos-url": "{git_repos_url}", "src-dir": "{src_dir}", "code-version": "{hibridon_version}", "git-user": "{git_user}", "password-provider": {password_provider}}}'
        benchmark_command = f'ctest --output-on-failure -L ^{test_id}$'

        cmake_options = [
            '-DCMAKE_BUILD_TYPE=Release',  # build in release mode for highest performance
            '-DBUILD_TESTING=ON'  # enable hibridon tests
        ]

        env_vars_bash_commands = ''
        if compiler_id == 'ifort':
            env_vars_bash_commands = 'module load compilers/ifort/latest'
            cmake_options.append('-DCMAKE_Fortran_COMPILER=ifort')  # use intel fortran compiler
            cmake_options.append('-DBLA_VENDOR=Intel10_64lp')  # use 64 bits intel mkl with multithreading
        elif compiler_id == 'gfortran':
            env_vars_bash_commands = ''
            cmake_options.append('-DCMAKE_Fortran_COMPILER=gfortran')  # use gfortran compiler
        else:
            # bugfix: was a no-op `assert f'...'` that let an unhandled compiler
            # fall through and run with incomplete cmake options
            raise ValueError(f'unhandled compiler_id : {compiler_id}')

        ur1_proxy_url = 'http://proxy-nt.univ-rennes1.fr:3128'
        proxy_env_vars = ''
        proxy_env_vars = f'{proxy_env_vars} HTTP_PROXY={ur1_proxy_url}'
        proxy_env_vars = f'{proxy_env_vars} HTTPS_PROXY={ur1_proxy_url}'
        proxy_env_vars = f'{proxy_env_vars} FTP_PROXY={ur1_proxy_url}'
        proxy_env_vars = f'{proxy_env_vars} http_proxy={ur1_proxy_url}'
        proxy_env_vars = f'{proxy_env_vars} https_proxy={ur1_proxy_url}'
        proxy_env_vars = f'{proxy_env_vars} ftp_proxy={ur1_proxy_url}'

        # bugfix: the `&& ` prefix was emitted unconditionally; with gfortran
        # env_vars_bash_commands is empty and `' && ...'` is a bash syntax
        # error. Guard the prefix, as MaMul1.execute already does.
        # NOTE(review): --num-cores is hardcoded to 2 here — confirm whether it
        # should come from the config instead.
        shell_command = ''
        if len(env_vars_bash_commands) > 0:
            shell_command += f'{env_vars_bash_commands} && '
        shell_command += f'{proxy_env_vars} starbench --source-tree-provider \'{source_tree_provider}\' --num-cores 2 --output-dir={output_dir} --cmake-path=/usr/bin/cmake {" ".join([f"--cmake-option={option}" for option in cmake_options])} --benchmark-command=\'{benchmark_command}\''
        # bugfix: subprocess.run's `encoding` parameter selects a text codec;
        # the intent was to run the command with bash, i.e. `executable`
        subprocess.run(shell_command, shell=True, check=True, executable='/bin/bash')
|
|
@ -0,0 +1,62 @@
|
|||
from ..core import IBenchmark, BenchParam, BenchmarkConfig
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
# import importlib.resources
|
||||
|
||||
|
||||
class MaMul1(IBenchmark):
    """Matrix multiplication benchmark.

    Builds the standalone `mamul1` fortran test program with cmake (through the
    external `starbench` tool) and measures repeated n*n matrix products.
    """

    def __init__(self):
        bench_params = []
        bench_params.append(BenchParam('compiler_id', BenchParam.Type.PARAM_TYPE_STRING, 'the id of the compiler used in the benchmark'))
        bench_params.append(BenchParam('num_cores', BenchParam.Type.PARAM_TYPE_INT, 'the number of cores to use by this benchmark'))
        # bugfix: description had a doubled word ("all the the")
        bench_params.append(BenchParam('matrix_size', BenchParam.Type.PARAM_TYPE_INT, 'the size n of all the n * n matrices'))
        bench_params.append(BenchParam('num_loops', BenchParam.Type.PARAM_TYPE_INT, 'the number of identical multiplications performed in sequence'))
        super().__init__(bench_id='mamul1', bench_params=bench_params)

    def get_ram_requirements(self, config: BenchmarkConfig) -> int:
        """Return the estimated RAM needed by this benchmark, in bytes.

        1 GiB baseline plus the three n*n double-precision matrices involved in
        the multiplication.
        """
        GIBIBYTE_TO_BYTE = 1024 * 1024 * 1024
        SIZE_OF_DOUBLE = 8  # in bytes
        matrix_size = config['matrix_size']
        matrix_ram_size = matrix_size * matrix_size * SIZE_OF_DOUBLE
        num_matrices = 3
        ram_requirements = int(1 * GIBIBYTE_TO_BYTE) + num_matrices * matrix_ram_size
        return ram_requirements

    def execute(self, config: BenchmarkConfig, benchmark_output_dir: Path):
        """Run the mamul1 benchmark described by `config` through starbench.

        Raises:
            ValueError: if config['compiler_id'] is not a known compiler.
            subprocess.CalledProcessError: if the starbench command fails.
        """
        compiler_id = config['compiler_id']
        num_cores = config['num_cores']
        matrix_size = config['matrix_size']
        num_loops = config['num_loops']

        # TODO(review): the mamul1 source location is hardcoded to a developer
        # machine; it should be resolved from the installed package (see the
        # importlib.resources attempt below).
        # src_dir = Path('test/mamul1').absolute()
        src_dir = Path('/home/graffy/work/starbench/iprbench.git/test/mamul1')
        # with importlib.resources.path('iprbench.resources', 'mamul1') as src_dir:
        output_dir = benchmark_output_dir / 'output'

        source_tree_provider = f'{{"type": "existing-dir", "dir-path": "{src_dir}"}}'
        benchmark_command = ['./mamul1', f'{matrix_size}', f'{num_loops}']

        cmake_options = [
            '-DCMAKE_BUILD_TYPE=Release',  # build in release mode for highest performance
        ]

        env_vars_bash_commands = ''
        if compiler_id == 'ifort':
            env_vars_bash_commands = 'module load compilers/ifort/latest'
            cmake_options.append('-DCMAKE_Fortran_COMPILER=ifort')  # use intel fortran compiler
            cmake_options.append('-DBLA_VENDOR=Intel10_64lp')  # use 64 bits intel mkl with multithreading
        elif compiler_id == 'gfortran':
            env_vars_bash_commands = ''
            cmake_options.append('-DCMAKE_Fortran_COMPILER=gfortran')  # use gfortran compiler
        else:
            # bugfix: was a no-op `assert f'...'` (a non-empty string is always
            # truthy); an unknown compiler must fail loudly
            raise ValueError(f'unhandled compiler_id : {compiler_id}')

        shell_command = ''
        if len(env_vars_bash_commands) > 0:
            shell_command += f'{env_vars_bash_commands} && '
        shell_command += f'starbench --source-tree-provider \'{source_tree_provider}\' --num-cores {num_cores} --output-dir={output_dir} --cmake-path=/usr/bin/cmake {" ".join([f"--cmake-option={option}" for option in cmake_options])} --benchmark-command=\'{" ".join(benchmark_command)}\''
        # bugfix: subprocess.run's `encoding` parameter selects a text codec;
        # the intent was to run the command with bash, i.e. `executable`
        subprocess.run(shell_command, shell=True, check=True, executable='/bin/bash')
|
|
@ -1,8 +1,10 @@
|
|||
#!/usr/bin/env python3
|
||||
# this script launches jobs to run hibridon benchmarks on physix cluster for the given version of hibridon (commit number)
|
||||
from typing import List, Tuple, Dict
|
||||
from argparse import ArgumentParser
|
||||
import argparse
|
||||
from os import getenv, makedirs
|
||||
from .core import IBenchmark, BenchmarkConfig, BenchmarkId
|
||||
from .main import BenchmarkFactory
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
import subprocess
|
||||
|
@ -10,6 +12,8 @@ import re
|
|||
import logging
|
||||
import importlib.resources
|
||||
import venv
|
||||
import json
|
||||
import abc
|
||||
|
||||
|
||||
HostFqdn = str # eg 'physix90.ipr.univ-rennes1.fr'
|
||||
|
@ -63,9 +67,8 @@ class ClusterNodeDb:
|
|||
cluster_nodes_defs: List[ClusterNodeDef]
|
||||
cpu_defs: Dict[str, int]
|
||||
|
||||
def __init__(self):
|
||||
def __init__(self, cluster_id='alambix'):
|
||||
self.cluster_nodes_defs = []
|
||||
cluster_id = 'alambix'
|
||||
include_multiqueue_nodes = False # at the moment hibench only works on nodes that have all their cores in the same queue
|
||||
if cluster_id == 'alambix':
|
||||
self.add_cluster_node_def(ClusterNodeDef('alambix50.ipr.univ-rennes.fr', 'intel_xeon_x5650', 2))
|
||||
|
@ -135,8 +138,13 @@ class ClusterNodeDb:
|
|||
self.add_cluster_node_def(ClusterNodeDef('physix100.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
|
||||
self.add_cluster_node_def(ClusterNodeDef('physix101.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
|
||||
self.add_cluster_node_def(ClusterNodeDef('physix102.ipr.univ-rennes1.fr', 'intel_xeon_gold_6248r', 2))
|
||||
elif cluster_id == 'dummy':
|
||||
self.add_cluster_node_def(ClusterNodeDef('graffy-ws2.ipr.univ-rennes.fr', 'intel_core_i5_8350u', 1))
|
||||
else:
|
||||
assert False
|
||||
|
||||
self.cpu_defs = {}
|
||||
self.add_cpu_def(CpuDef('intel_core_i5_8350u', 4))
|
||||
self.add_cpu_def(CpuDef('intel_xeon_x5550', 4))
|
||||
self.add_cpu_def(CpuDef('intel_xeon_x5650', 6))
|
||||
self.add_cpu_def(CpuDef('intel_xeon_e5-2660', 8))
|
||||
|
@ -169,6 +177,62 @@ class ClusterNodeDb:
|
|||
return (hosts, num_cores)
|
||||
|
||||
|
||||
class ICluster(abc.ABC):
    """Abstract interface to a compute cluster.

    Concrete implementations (real SGE cluster, local dummy cluster) wrap a
    ClusterNodeDb describing the nodes and provide job submission.
    """
    cluster_db: ClusterNodeDb  # description of the cluster's nodes and cpus

    def __init__(self, cluster_db: ClusterNodeDb):
        self.cluster_db = cluster_db

    @abc.abstractmethod
    def path_is_reachable_by_compute_nodes(self, path: Path):
        """Tell whether `path` is visible from the cluster's compute nodes."""

    @abc.abstractmethod
    def submit_job(self, qsub_args: List[str], exec_path: Path, exec_args: List[str], working_dir: Path):
        """Submit `exec_path` as a cluster job.

        qsub_args: the arguments sent to qsub, eg ['-pe', 'smp', '12', 'gaussian.job', 'h2o.gjf']
        """

    def get_cluster_db(self) -> ClusterNodeDb:
        """Return the node database describing this cluster."""
        return self.cluster_db
|
||||
|
||||
|
||||
class IprCluster(ICluster):
    """The real IPR 'alambix' cluster, driven through SGE's qsub."""

    def __init__(self):
        super().__init__(ClusterNodeDb('alambix'))

    def path_is_reachable_by_compute_nodes(self, path: Path):
        """Return True if `path` is under one of the cluster's shared disks."""
        shared_disk_roots = [Path('/opt/ipr/cluster/work.global')]
        for disk_root in shared_disk_roots:
            try:
                _ = path.relative_to(disk_root)
            except ValueError:
                continue  # not under this shared disk; try the next one
            return True
        return False

    def submit_job(self, qsub_args: List[str], exec_path: Path, exec_args: List[str], working_dir: Path):
        """Submit `exec_path` to SGE with qsub, from `working_dir`."""
        qsub_command = f'qsub {" ".join(qsub_args)} {exec_path} {" ".join(exec_args)}'
        logging.debug('qsub_command = %s, working_dir=%s', qsub_command, working_dir)
        subprocess.run(qsub_command, cwd=working_dir, check=True, shell=True)
|
||||
|
||||
|
||||
class DummyCluster(ICluster):
    """Fake cluster used for testing: jobs run locally instead of via qsub."""

    def __init__(self):
        super().__init__(ClusterNodeDb('dummy'))

    def path_is_reachable_by_compute_nodes(self, path: Path):
        """Every path is reachable since the 'cluster' is the local machine."""
        return True

    def submit_job(self, qsub_args: List[str], exec_path: Path, exec_args: List[str], working_dir: Path):
        """Execute `exec_path` locally, logging the qsub command it stands for."""
        qsub_command = f'qsub {" ".join(qsub_args)} {exec_path} {" ".join(exec_args)}'
        logging.info('executing %s as a replacement of qsub_command %s, working_dir=%s', exec_path, qsub_command, working_dir)
        subprocess.run(exec_path, check=True, cwd=working_dir)
|
||||
|
||||
|
||||
def duplicate_this_virtualenv_to(duplicate_virtualenv_path: Path):
|
||||
this_virtualenv_path = Path(getenv('VIRTUAL_ENV')) # eg /home/graffy/work/starbench/iprbench.git/iprbench.venv
|
||||
assert this_virtualenv_path.exists(), f'failed to find the root the virtual environment in use (VIRTUAL_ENV environment variable has the value {this_virtualenv_path})'
|
||||
|
@ -188,55 +252,22 @@ def archive_this_virtualenv_to(venv_archive_path: Path, venv_hardcoded_path: Pat
|
|||
subprocess.run(f'tar czvf {venv_archive_path} {venv_hardcoded_path.relative_to(venv_hardcoded_path.parent)}', shell=True, check=True, cwd=venv_hardcoded_path.parent, stdout=subprocess.DEVNULL)
|
||||
|
||||
|
||||
def launch_job_for_host_group(hibridon_version: GitCommitTag, host_group_id: HostGroupId, results_dir: Path, compiler_id: CompilerId, cmake_path: str):
|
||||
def launch_job_for_host_group(benchmark: IBenchmark, benchmark_config: BenchmarkConfig, host_group_id: HostGroupId, results_dir: Path, cluster: ICluster, cmake_path: str):
|
||||
|
||||
cluster_db = ClusterNodeDb()
|
||||
compiler_id: CompilerId = benchmark_config['compiler_id']
|
||||
|
||||
cluster_db = cluster.get_cluster_db()
|
||||
|
||||
(hosts, num_cores) = cluster_db.get_host_group_info(host_group_id)
|
||||
if len(hosts) == 0:
|
||||
logging.warning('skipping benchmarks with compiler %s on architecture %s because no hosts are available for it', compiler_id, host_group_id)
|
||||
return
|
||||
|
||||
quick_test = 'arch4_quick' # about 2s on a core i5 8th generation
|
||||
representative_test = 'nh3h2_qma_long' # about 10min on a core i5 8th generation
|
||||
use_test_mode = True
|
||||
if use_test_mode:
|
||||
benchmark_test = quick_test
|
||||
else:
|
||||
benchmark_test = representative_test
|
||||
|
||||
logging.info('using test %s for benchmarking', benchmark_test)
|
||||
if benchmark_test == 'arch4_quick':
|
||||
ram_per_core = '1G'
|
||||
elif benchmark_test == 'nh3h2_qma_long':
|
||||
ram_per_core = '2.8G' # this was enough on physix48, but maybe we can reduce more
|
||||
else:
|
||||
assert f'unhandled benchmark_test : {benchmark_test}'
|
||||
|
||||
git_repos_url = 'https://github.com/hibridon/hibridon'
|
||||
git_user = 'g-raffy' # os.environ['HIBRIDON_REPOS_USER']
|
||||
git_pass_file = f'{getenv("HOME")}/.github/personal_access_tokens/bench.hibridon.cluster.ipr.univ-rennes1.fr.pat'
|
||||
cmake_options = [
|
||||
'-DCMAKE_BUILD_TYPE=Release', # build in release mode for highest performance
|
||||
'-DBUILD_TESTING=ON' # enable hibridon tests
|
||||
]
|
||||
|
||||
benchmark_command = f'ctest --output-on-failure -L ^{benchmark_test}$'
|
||||
|
||||
env_vars_bash_commands = ''
|
||||
if compiler_id == 'ifort':
|
||||
env_vars_bash_commands = 'module load compilers/ifort/latest'
|
||||
cmake_options.append('-DCMAKE_Fortran_COMPILER=ifort') # use intel fortran compiler
|
||||
cmake_options.append('-DBLA_VENDOR=Intel10_64lp') # use 64 bits intel mkl with multithreading
|
||||
elif compiler_id == 'gfortran':
|
||||
env_vars_bash_commands = ''
|
||||
cmake_options.append('-DCMAKE_Fortran_COMPILER=gfortran') # use gfortran compiler
|
||||
else:
|
||||
assert f'unhandled compiler_id : {compiler_id}'
|
||||
benchmark_config['num_cores'] = num_cores # we expect the benchmark to have the parameter num_cores
|
||||
|
||||
makedirs(results_dir, exist_ok=True)
|
||||
|
||||
this_bench_dir = Path(f'{results_dir}/{hibridon_version}/{benchmark_test}/{host_group_id}/{compiler_id}')
|
||||
this_bench_dir = Path(f'{results_dir}/{host_group_id}')
|
||||
makedirs(this_bench_dir, exist_ok=True)
|
||||
|
||||
starbench_job_path = this_bench_dir / 'starbench.job'
|
||||
|
@ -253,44 +284,45 @@ def launch_job_for_host_group(hibridon_version: GitCommitTag, host_group_id: Hos
|
|||
# create the job file (which embeds starbench.py)
|
||||
tags_dict = {
|
||||
# '<include:starbench.py>': scripts_dir / 'starbench.py',
|
||||
'<benchmark_id>': str(benchmark.bench_id),
|
||||
'<starbench_job_path>': str(starbench_job_path),
|
||||
'<iprbench_venv_hardcoded_path>': str(iprbench_venv_hardcoded_path),
|
||||
'<iprbench_venv_archive_path>': str(job_venv_archive_path)
|
||||
'<iprbench_venv_archive_path>': str(job_venv_archive_path),
|
||||
'<benchmark_config>': json.dumps(benchmark_config).replace('"', r'\"'),
|
||||
'<results_dir>': str(results_dir)
|
||||
}
|
||||
with importlib.resources.path('iprbench.resources', 'starbench-template.job') as job_template_path:
|
||||
logging.debug('tags_dict = %s', str(tags_dict))
|
||||
with importlib.resources.path('iprbench.resources', 'clusterbench-template.job') as job_template_path:
|
||||
# job_template_path = importlib.resources..files('iprbench.resources') / 'hibench' / 'starbench-template.job'
|
||||
substitute_tags(input_file_path=job_template_path, tags_dict=tags_dict, output_file_path=starbench_job_path)
|
||||
subprocess.run(['chmod', 'a+x', starbench_job_path], check=True)
|
||||
|
||||
command = f'{starbench_job_path} "{git_repos_url}" "{git_user}" "{git_pass_file}" "{hibridon_version}" "{" ".join(cmake_options)}" "{benchmark_command}" "{env_vars_bash_commands}" "{cmake_path}"'
|
||||
logging.debug('command = %s', command)
|
||||
ram_requirements = benchmark.get_ram_requirements(benchmark_config)
|
||||
ram_per_core = f'{ram_requirements / num_cores / 1.e9}G'
|
||||
|
||||
qsub_command = 'qsub'
|
||||
qsub_command += f' -pe smp {num_cores}'
|
||||
qsub_command += f' -l "hostname={"|".join(hosts)}"'
|
||||
qsub_command += ' -S /bin/bash'
|
||||
qsub_command += ' -cwd'
|
||||
qsub_command += ' -m ae'
|
||||
qsub_command += f' -l mem_available={ram_per_core}'
|
||||
qsub_command += ' -j y' # merge stderr file into stdout file for easier reading of history of events
|
||||
qsub_command += f' -N hibench_{host_group_id}_{compiler_id}_{hibridon_version}'
|
||||
qsub_command += f' {command}'
|
||||
logging.debug('qsub_command = %s', qsub_command)
|
||||
qsub_args = []
|
||||
qsub_args += ['-pe', 'smp', f'{num_cores}']
|
||||
qsub_args += ['-l', f'"hostname={"|".join(hosts)}"']
|
||||
qsub_args += ['-S', '/bin/bash']
|
||||
qsub_args += ['-cwd']
|
||||
qsub_args += ['-m', 'ae']
|
||||
qsub_args += ['-l', f'mem_available={ram_per_core}']
|
||||
qsub_args += ['-j', 'y'] # merge stderr file into stdout file for easier reading of history of events
|
||||
qsub_args += ['-N', f'hibench_{host_group_id}']
|
||||
|
||||
subprocess.run(qsub_command, cwd=this_bench_dir, check=True, shell=True)
|
||||
logging.debug('qsub_args = %s', str(qsub_args))
|
||||
|
||||
exec_path = starbench_job_path
|
||||
exec_args = []
|
||||
|
||||
cluster.submit_job(qsub_args, exec_path, exec_args, this_bench_dir)
|
||||
|
||||
|
||||
def launch_perf_jobs(hibridon_version: GitCommitTag, results_dir: Path, arch_regexp: str, cmake_path: str):
|
||||
def launch_perf_jobs(benchmark: IBenchmark, benchmark_config: BenchmarkConfig, results_dir: Path, cluster: ICluster, arch_regexp: str, cmake_path: str):
|
||||
"""
|
||||
hibridon_version: the version of hibridon to test, in the form of a valid commit number eg 'a3bed1c3ccfbca572003020d3e3d3b1ff3934fad'
|
||||
results_dir: where the results of the benchmark are stored (eg $GLOBAL_WORK_DIR/graffy/benchmarks/hibench)
|
||||
"""
|
||||
|
||||
compilers = [
|
||||
'gfortran',
|
||||
'ifort'
|
||||
]
|
||||
|
||||
cluster_db = ClusterNodeDb()
|
||||
all_host_groups = cluster_db.cpu_defs.keys()
|
||||
|
||||
|
@ -298,43 +330,37 @@ def launch_perf_jobs(hibridon_version: GitCommitTag, results_dir: Path, arch_reg
|
|||
host_groups = [host_group for host_group in all_host_groups if re.match(arch_regexp, host_group) is not None]
|
||||
logging.info('requested host groups: %s', host_groups)
|
||||
|
||||
for compiler in compilers:
|
||||
for host_group in host_groups:
|
||||
launch_job_for_host_group(hibridon_version, host_group, results_dir, compiler, cmake_path)
|
||||
|
||||
|
||||
def path_is_reachable_by_compute_nodes(path: Path):
|
||||
path_is_reachable = False
|
||||
for shared_disk_path in [Path('/opt/ipr/cluster/work.global')]:
|
||||
try:
|
||||
_ = path.relative_to(shared_disk_path)
|
||||
except ValueError:
|
||||
continue
|
||||
path_is_reachable = True
|
||||
break
|
||||
return path_is_reachable
|
||||
for host_group in host_groups:
|
||||
launch_job_for_host_group(benchmark, benchmark_config, host_group, results_dir, cluster, cmake_path)
|
||||
|
||||
|
||||
def main():
|
||||
logging.basicConfig(level=logging.DEBUG)
|
||||
arg_parser = ArgumentParser(description='launches hibridon benchmark jobs on IPR\'s physix cluster', epilog='example:\n --commit-id a3bed1c3ccfbca572003020d3e3d3b1ff3934fad')
|
||||
arg_parser.add_argument('--commit-id', type=str, required=True, help='the commit id of the version of code to benchmark')
|
||||
example_text = '''example:
|
||||
|
||||
%(prog)s --benchmark-id 'mamul1' --config '{"matrix_size": 1024, "num_loops":10}' --results-dir /tmp/mamul1_out
|
||||
|
||||
'''
|
||||
|
||||
arg_parser = argparse.ArgumentParser(description='submits a benchmark on the compute cluster (assuming this is running from a sge cluster machine where qsub command is available)', epilog=example_text, formatter_class=argparse.RawDescriptionHelpFormatter)
|
||||
arg_parser.add_argument('--benchmark-id', type=BenchmarkId, required=True, help='the benchmark id of the benchmark to perform (eg mamul1)')
|
||||
arg_parser.add_argument('--results-dir', type=Path, required=True, help='the root directory of the tree where the results of the benchmarks are stored (eg $GLOBAL_WORK_DIR/graffy/benchmarks/hibench)')
|
||||
arg_parser.add_argument('--config', type=str, default='cmake', help='the benchmark configuration in json format, eg {"compiler_id": "gfortran", "matrix_size": 1024}')
|
||||
arg_parser.add_argument('--arch-regexp', type=str, default='.*', help='the regular expression for the architectures the benchmark is allowed to run on (eg "intel_xeon_.*"). By defauls, all available architectures are allowed.')
|
||||
arg_parser.add_argument('--cmake-path', type=str, default='cmake', help='the location of the cmake command to use (eg /opt/cmake/cmake-3.23.0/bin/cmake)')
|
||||
|
||||
args = arg_parser.parse_args()
|
||||
hibridon_version = args.commit_id
|
||||
|
||||
# the version of hibridon to test, in the form of a valid commit number eg 'a3bed1c3ccfbca572003020d3e3d3b1ff3934fad'
|
||||
# '53894da48505892bfa05693a52312bacb12c70c9' # latest from branch master as of 10/06/2022 00:30
|
||||
# code_version='dd0f413b85cf0f727a5a4e88b2b02d75a28b377f' # latest from branch graffy-issue51 as of 10/06/2022 00:30
|
||||
benchmark_id = BenchmarkId(args.benchmark_id)
|
||||
benchmark = BenchmarkFactory().create_benchmark(benchmark_id)
|
||||
|
||||
results_dir = Path(args.results_dir)
|
||||
arch_regexp = args.arch_regexp
|
||||
cmake_path = args.cmake_path
|
||||
benchmark_config = json.loads(args.config)
|
||||
|
||||
if not path_is_reachable_by_compute_nodes(results_dir):
|
||||
cluster = DummyCluster()
|
||||
|
||||
if not cluster.path_is_reachable_by_compute_nodes(results_dir):
|
||||
raise ValueError('the results path is expected to be on a disk that is accessible to all cluster nodes, and it doesn\'t seem to be the case for {results_dir}')
|
||||
|
||||
launch_perf_jobs(hibridon_version, results_dir, arch_regexp, cmake_path)
|
||||
launch_perf_jobs(benchmark, benchmark_config, results_dir, cluster, arch_regexp, cmake_path)
|
|
@ -0,0 +1,74 @@
|
|||
from typing import List, Dict, Union
|
||||
from enum import Enum
|
||||
import abc
|
||||
from pathlib import Path
|
||||
|
||||
BenchmarkId = str # a unique name for a benchmark, eg 'matmul1'
|
||||
BenchParamId = str
|
||||
BenchParamType = Union[int, str]
|
||||
BenchmarkConfig = Dict[BenchParamId, BenchParamType]
|
||||
|
||||
|
||||
class Singleton(type):
    """Metaclass that gives each class using it a single shared instance.

    The first call to the class creates and caches the instance; every later
    call returns the cached one.
    """
    _instances = {}  # maps each class to its unique instance

    def __call__(cls, *args, **kwargs):
        if cls not in cls._instances:
            # bugfix: the previous `super(type(cls), cls).__call__` resolved the
            # super relative to type(cls); for any metaclass derived from
            # Singleton that would re-enter Singleton.__call__ and recurse
            # forever. Zero-argument super() always binds to Singleton.
            cls._instances[cls] = super().__call__(*args, **kwargs)
        return cls._instances[cls]
|
||||
|
||||
|
||||
class BenchParam():
    '''A single parameter of a benchmark.

    Examples: the id of the compiler, the cpu id, the size of the matrix, etc.
    '''
    class Type(Enum):
        # the supported value types for a benchmark parameter
        PARAM_TYPE_STRING = 0
        PARAM_TYPE_INT = 1

    name: BenchParamId  # eg 'matrix_size'
    param_type: Type  # eg PARAM_TYPE_INT
    description: str  # human-readable meaning, eg 'the size n of the n*n matrix'

    def __init__(self, name: str, param_type: Type, description: str):
        self.description = description
        self.param_type = param_type
        self.name = name
|
||||
|
||||
|
||||
class IBenchmark(abc.ABC):
    """Abstract base class of all benchmarks.

    A benchmark declares the parameters it expects (bench_params) and knows how
    to estimate its RAM needs and execute itself for a given configuration.
    """

    bench_id: BenchmarkId  # a unique name for this benchmark, eg 'matmul1'
    bench_params: List[BenchParam]  # the parameters this benchmark's config must provide

    def __init__(self, bench_id: str, bench_params: List[BenchParam]):
        self.bench_id = bench_id
        self.bench_params = bench_params

    @abc.abstractmethod
    def get_ram_requirements(self, config: BenchmarkConfig) -> int:
        """returns the ram requirements for this benchmark, in bytes
        """

    @abc.abstractmethod
    def execute(self, config: BenchmarkConfig, benchmark_output_dir: Path):
        """execute the benchmark for the given config
        """

    def validate_config(self, config: BenchmarkConfig):
        """Check that `config` and this benchmark's declared parameters match exactly.

        Raises:
            AssertionError: if a declared parameter is missing from the config,
                or if the config contains a parameter unknown to this benchmark.
        """
        # check that all declared benchmark parameters are present in the config
        # (idiom fix: was a try/KeyError/assert-False dance per parameter)
        for bench_param in self.bench_params:
            assert bench_param.name in config, f'failed to find the benchmark parameter {bench_param.name} in the benchmark config'
        # check that all parameters in benchmark config exist as parameters for this benchmark
        # (perf/idiom fix: was an O(n*m) manual flag scan; a set lookup is O(1))
        declared_param_names = {bench_param.name for bench_param in self.bench_params}
        for param_name in config.keys():
            assert param_name in declared_param_names, f'parameter {param_name} doesn\'t exist for benchmark {self.bench_id}'
|
|
@ -1 +1,47 @@
|
|||
__version__ = '0.0.1'
|
||||
from .core import BenchmarkId, IBenchmark, Singleton
|
||||
from .benchmarks.hibench import HiBench
|
||||
from .benchmarks.mamul1 import MaMul1
|
||||
import logging
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
import json
|
||||
|
||||
__version__ = '0.0.1'
|
||||
|
||||
|
||||
class BenchmarkFactory(metaclass=Singleton):
    """Creates benchmark instances from their id (shared singleton factory)."""

    def __init__(self):
        pass

    def create_benchmark(self, bench_id: BenchmarkId) -> IBenchmark:
        """Return a fresh instance of the benchmark identified by `bench_id`.

        Raises:
            KeyError: if `bench_id` is not a known benchmark id.
        """
        # perf fix: map ids to classes rather than instances, so that only the
        # requested benchmark is constructed (the previous version instantiated
        # every known benchmark on each call)
        benchmark_classes = {
            'hibench': HiBench,
            'mamul1': MaMul1,
        }
        return benchmark_classes[bench_id]()
|
||||
|
||||
|
||||
def main():
    """Entry point of the benchmark runner command.

    Parses the command line, instantiates the requested benchmark, validates
    the json configuration against it and executes it.
    """
    logging.basicConfig(level=logging.DEBUG)

    example_text = '''example:

    %(prog)s --benchmark-id 'mamul1' --config '{"compiler_id": "gfortran", "matrix_size": 1024, "num_loops":10, "num_cores":2}' --results-dir /tmp/mamul1_out

    '''

    arg_parser = argparse.ArgumentParser(description='executes a benchmark in a cluster job environment', epilog=example_text, formatter_class=argparse.RawDescriptionHelpFormatter)
    arg_parser.add_argument('--benchmark-id', type=BenchmarkId, required=True, help='the benchmark id of the benchmark to perform (eg mamul1)')
    arg_parser.add_argument('--results-dir', type=Path, required=True, help='the root directory of the tree where the results of the benchmarks are stored (eg $GLOBAL_WORK_DIR/graffy/benchmarks/hibench)')
    # bugfix: this option used to have default='cmake' (copy-pasted from a
    # --cmake-path option); 'cmake' is not valid json and would crash
    # json.loads below. The config is mandatory, so require it explicitly.
    arg_parser.add_argument('--config', type=str, required=True, help='the benchmark configuration in json format, eg {"compiler_id": "gfortran", "matrix_size": 1024}')

    args = arg_parser.parse_args()
    benchmark_id = BenchmarkId(args.benchmark_id)
    benchmark = BenchmarkFactory().create_benchmark(benchmark_id)
    benchmark_config = json.loads(args.config)
    benchmark.validate_config(benchmark_config)
    benchmark.execute(benchmark_config, args.results_dir)
|
||||
|
|
|
@ -1,13 +1,5 @@
|
|||
#!/usr/bin/env bash
|
||||
# this job file is a template file for starbench jobs
|
||||
git_repos_url="$1" # eg "https://github.com/hibridon/hibridon"
|
||||
git_user="$2" # eg 'g-raffy'
|
||||
git_pass_file="$3" # eg "$HOME/.github/personal_access_tokens/bench.hibridon.cluster.ipr.univ-rennes1.fr.pat"
|
||||
code_version="$4" # git branch id or commit id eg : 'a3bed1c3ccfbca572003020d3e3d3b1ff3934fad'
|
||||
cmake_options="$5" # eg '-DCMAKE_BUILD_TYPE=Release -DBUILD_TESTING=ON'
|
||||
benchmark_command="$6" # eg 'ctest -L ^arch4_quick$'
|
||||
env_vars_bash_commands="$7" # defines extra environment variables prior to launch starbench. eg "export MKLROOT=/opt/intel/compilers_and_libraries_2020.1.217/linux/mkl"
|
||||
cmake_path="$8" # eg '/opt/cmake/cmake-3.23.0/bin/cmake'
|
||||
# this job file is a template file for clusterbench jobs
|
||||
executed_by_sge=''
|
||||
|
||||
if [ "${JOB_ID}" = '' ]
|
||||
|
@ -37,7 +29,7 @@ iprbench_venv_parent=$(dirname "$iprbench_venv_path")
|
|||
iprbench_venv_archive_path='<iprbench_venv_archive_path>'
|
||||
echo "unarchiving virtual environment ${iprbench_venv_archive_path} to ${iprbench_venv_parent}"
|
||||
pushd "${iprbench_venv_parent}"
|
||||
tar xzvf "${iprbench_venv_archive_path}"
|
||||
tar xzvf "${iprbench_venv_archive_path}" > /dev/null
|
||||
popd
|
||||
if [ ! -d "${iprbench_venv_path}" ]
|
||||
then
|
||||
|
@ -61,34 +53,9 @@ num_cores=${NSLOTS}
|
|||
|
||||
# set environment variables
|
||||
|
||||
echo "env_vars_bash_commands=$env_vars_bash_commands"
|
||||
eval $env_vars_bash_commands
|
||||
|
||||
# launch starbench
|
||||
|
||||
strUr1ProxyUrl='http://proxy-nt.univ-rennes1.fr:3128'
|
||||
strProxyVars=''
|
||||
strProxyVars="$strProxyVars HTTP_PROXY=$strUr1ProxyUrl"
|
||||
strProxyVars="$strProxyVars HTTPS_PROXY=$strUr1ProxyUrl"
|
||||
strProxyVars="$strProxyVars FTP_PROXY=$strUr1ProxyUrl"
|
||||
strProxyVars="$strProxyVars http_proxy=$strUr1ProxyUrl"
|
||||
strProxyVars="$strProxyVars https_proxy=$strUr1ProxyUrl"
|
||||
strProxyVars="$strProxyVars ftp_proxy=$strUr1ProxyUrl"
|
||||
|
||||
command="$strProxyVars starbench"
|
||||
command="${command} --git-repos-url ${git_repos_url}"
|
||||
command="${command} --git-user ${git_user}"
|
||||
command="${command} --git-pass-file ${git_pass_file}"
|
||||
command="${command} --num-cores ${num_cores}"
|
||||
command="${command} --output-dir ${output_dir}"
|
||||
command="${command} --code-version ${code_version}"
|
||||
command="${command} --cmake-path ${cmake_path}"
|
||||
# echo "cmake_options: @$cmake_options@"
|
||||
for cmake_option in ${cmake_options}
|
||||
do
|
||||
command="${command} --cmake-option=${cmake_option}"
|
||||
done
|
||||
command="${command} --benchmark-command=\"${benchmark_command}\""
|
||||
# launch the benchmark
|
||||
command="iprbench-run --benchmark-id '<benchmark_id>' --config '<benchmark_config>' --results-dir '${output_dir}'"
|
||||
|
||||
echo "command: ${command}"
|
||||
eval ${command}
|
|
@ -15,7 +15,8 @@ dependencies = [
|
|||
"sqlalchemy",
|
||||
# "cocluto >= 1.2"
|
||||
# "cocluto@git+https://git.ipr.univ-rennes.fr/cellinfo/cocluto"
|
||||
"starbench@git+https://github.com/g-raffy/starbench"
|
||||
"starbench >= 1.0.1"
|
||||
# "starbench@git+https://github.com/g-raffy/starbench"
|
||||
]
|
||||
requires-python = ">= 3.8"
|
||||
authors = [
|
||||
|
@ -23,14 +24,15 @@ authors = [
|
|||
]
|
||||
|
||||
[project.scripts]
|
||||
hibenchonphysix = "iprbench.hibench.hibenchonphysix:main"
|
||||
showresults = "iprbench.hibench.showresults:main"
|
||||
clusterbench-submit = "iprbench.clusterbench:main"
|
||||
iprbench-run = "iprbench.main:main"
|
||||
showresults = "iprbench.benchmarks.showresults:main"
|
||||
|
||||
[project.urls]
|
||||
Repository = "https://github.com/g-raffy/starbench"
|
||||
|
||||
[tool.setuptools]
|
||||
packages = ["iprbench", "iprbench.hibench"]
|
||||
packages = ["iprbench", "iprbench.benchmarks"]
|
||||
|
||||
[tool.setuptools.dynamic]
|
||||
version = {attr = "iprbench.main.__version__"}
|
||||
|
|
|
@ -0,0 +1,43 @@
|
|||
|
||||
enable_language (Fortran)

# build options controlling which matrix multiplication backend is compiled in
set(MAMUL1_USE_MAGMA "OFF" CACHE BOOL "if set, mamul1 build uses magma (matrix algebra on gpu)")

set(MAMUL1_MAGMA_API "CPU_MEM_API" CACHE STRING "which magma API to use when building mamul1: CPU_MEM_API for BLAS compatible API (uses matrices stored on CPU memory) or GPU_MEM_API (use matrices stored on GPU memory)")

add_executable(mamul1 mamul1.F90)

if (CMAKE_Fortran_COMPILER_ID STREQUAL "GNU")
	# Allow arbitrary long lines. Needed as preprocessing could generate long line lengths.
	target_compile_options(mamul1 PUBLIC -ffree-line-length-none)
elseif (CMAKE_Fortran_COMPILER_ID STREQUAL "Intel")
	# Intel (ifort)
	# fixed: the previous condition tested the undefined variable
	# Fortran_COMPILER_NAME, so this branch could never trigger
	target_compile_options(mamul1 PUBLIC -no-wrap-margin)
endif()


if (MAMUL1_USE_MAGMA)
	find_package( MAGMA REQUIRED )
	# select which magma entry points mamul1.F90 compiles against
	if( MAMUL1_MAGMA_API STREQUAL "CPU_MEM_API" )
		target_compile_definitions(mamul1 PUBLIC USE_MAGMA_DGEMM)
	elseif( MAMUL1_MAGMA_API STREQUAL "GPU_MEM_API" )
		target_compile_definitions(mamul1 PUBLIC USE_MAGMA_DGEMM_GPU)
	else()
		message(FATAL_ERROR "unexpected value for MAMUL1_MAGMA_API : ${MAMUL1_MAGMA_API}")
	endif()
	message(STATUS "MAGMA_INCLUDES=${MAGMA_INCLUDES}")
	include_directories("${MAGMA_INCLUDES}")
	target_link_libraries(mamul1 "${MAGMA_LIBRARIES}")
else()
	# cpu build: plain blas dgemm
	find_package( BLAS REQUIRED )
	find_package( LAPACK REQUIRED )
	# message("BLAS_LIBRARIES=${BLAS_LIBRARIES}")
	# message("LAPACK_LIBRARIES=${LAPACK_LIBRARIES}")
	target_compile_definitions(mamul1 PUBLIC USE_DGEMM)

	# Link Blas and Lapack libraries
	target_link_libraries(mamul1 "${LAPACK_LIBRARIES}")
	target_link_libraries(mamul1 "${BLAS_LIBRARIES}")
endif()

install(TARGETS mamul1)
|
|
@ -0,0 +1,339 @@
|
|||
#define MAMUL1_VERSION "1.0.0"
|
||||
|
||||
#define magma_devptr_t integer(kind=8)
|
||||
subroutine print_usage(prog_path)
   ! prints the program version, its build variant and the command line usage to stdout
   character(len=*), intent(in) :: prog_path  ! path of this executable, as typed on the command line
   character(len=80) :: build_variant  ! human readable build flavour, derived from compile-time defines
#if defined(USE_MAGMA_DGEMM_GPU)
   build_variant='gpu'
#elif defined(USE_DGEMM)
   build_variant='cpu'
#else
   build_variant='unknown'
#endif
   write(6,'("mamul1 v",a," (variant:",a,"): benchmark performs a square matrix multiplication in double precision")') MAMUL1_VERSION, trim(build_variant);
   write(6,'()');
   write(6,'("Usage: ",a," <NDIM> <NUM_LOOPS>")') trim(prog_path);
   write(6,'(" <NDIM> positive integer representing the size of the square matrices to multiply ")');
   write(6,'(" <NUM_LOOPS> positive integer representing the number of times the multiplication is performed")');
end subroutine
|
||||
|
||||
program mamul1
   ! command line benchmark: multiplies two random NDIM x NDIM matrices in
   ! double precision, NUM_LOOPS times (see test_dgemm), and prints timings

   implicit none


   integer :: argc, info, ndim, num_loops  ! argc: cli argument count; info: io/conversion status

   character(len=32) :: arg0, arg1, arg2  ! program path, <NDIM>, <NUM_LOOPS>


   call get_command_argument(0,arg0)

   ! exactly two arguments are expected: <NDIM> and <NUM_LOOPS>
   argc = command_argument_count()
   if (argc /= 2) then
      call print_usage(trim(arg0))
      ! write(6,'("Usage: ",a," NDIM NUM_LOOPS, where NDIM is a positive integer")') trim(arg0);
      stop
   end if

   call get_command_argument(1,arg1,status=info)
   if (info /= 0) then
      write(6,'("Error reading argument: info = ",i2)') info
      call print_usage(trim(arg0))
      stop
   end if

   call get_command_argument(2,arg2,status=info)
   if (info /= 0) then
      write(6,'("Error reading argument: info = ",i2)') info
      call print_usage(trim(arg0))
      stop
   end if

   ! convert the textual arguments to integers
   read(arg1,*,iostat=info) ndim
   if (info /= 0) then
      write(6,'("Error converting ndim argument to integer: info = ",i2)') info
      call print_usage(trim(arg0))
      stop
   end if

   read(arg2,*,iostat=info) num_loops
   if (info /= 0) then
      write(6,'("Error converting num_loops argument to integer: info = ",i2)') info
      call print_usage(trim(arg0))
      stop
   end if


   ! reject non-positive matrix sizes
   if (ndim < 1) then
      call print_usage(trim(arg0))
      stop
   end if

   call test_dgemm(ndim, num_loops)

   stop
end program mamul1
|
||||
|
||||
subroutine set_random_seed(seed)
   ! seeds the pseudo random number generator with a single integer so that
   ! successive runs generate the same sequence (reproducible benchmarks)
   integer :: seed  ! value replicated into every element of the generator's seed array
   integer :: num_seeds  ! how many integers the generator uses as its seed
   integer, allocatable :: seeds(:)

   ! ask the runtime for the size of its seed array
   call random_seed(size=num_seeds)
   allocate(seeds(num_seeds))
   seeds = seed
   call random_seed(put=seeds(1:num_seeds))
end subroutine
|
||||
|
||||
subroutine print_matrix(mat, ndim)
   ! writes the square matrix mat (ndim x ndim) to stdout, one row per line
   implicit none
   integer, parameter :: dp = kind(1.0d0)  ! double precision kind
   real(dp), intent(in) :: mat(ndim, ndim)
   integer, intent(in) :: ndim
   integer :: row_index

   do row_index = 1, ndim
      write(6, *) mat(row_index, :)
   end do
end subroutine
|
||||
|
||||
! square matrix multiplication
! computes cmat = amat * bmat (ndim x ndim, double precision) and prints timings.
! the backend is selected at compile time:
!  - USE_MAGMA_DGEMM_GPU: magma dgemm on matrices copied to/from gpu memory
!  - USE_DGEMM: plain blas dgemm on cpu memory
! note: there is no implicit none here, so ndim (the matrix order) is implicitly typed -- TODO confirm intended
subroutine sqmatmul(amat, bmat, cmat, ndim)
#if defined(USE_MAGMA_DGEMM_GPU)
   use magma, only: magmaf_init, magmaf_finalize
   use magma, only: magmaf_queue_create, magmaf_queue_destroy
   use magma, only: magmaf_dmalloc, magmaf_free
   use magma, only: magmaf_dsetmatrix, magmaf_dgetmatrix
   use magma, only: magmablasf_dgemm
#endif
   real*8, intent(in) :: amat(ndim,ndim)   ! left operand
   real*8, intent(in) :: bmat(ndim,ndim)   ! right operand
   real*8, intent(out) :: cmat(ndim,ndim)  ! result matrix
   integer :: lda, ldb, ldc  ! leading dimensions of the gpu copies of the matrices
   integer :: info

   real :: time_before, time_after
   integer(8) :: num_ops
   real :: gflops

#ifdef USE_MAGMA_DGEMM_GPU
   magma_devptr_t :: d_amat
   magma_devptr_t :: d_bmat
   magma_devptr_t :: d_cmat
   magma_devptr_t :: queue !! really a CPU pointer
#endif
   ! round the leading dimensions up to a multiple of 32 (gpu friendly alignment)
   lda = ceiling(real(ndim)/32)*32
   ldb = ceiling(real(ndim)/32)*32
   ldc = ceiling(real(ndim)/32)*32


#if defined(USE_MAGMA_DGEMM_GPU)
   !! allocate GPU memory
   write(6,'("DEBUG: before matrix A gpu memory allocation (",i0," doubles)")') lda * ndim
   info = magmaf_dmalloc( d_amat, lda*ndim )
   if (d_amat == 0) then
      print "(a)", "failed to allocate d_amat"
      return
   endif
   write(6,'("DEBUG: before matrix B gpu memory allocation (",i0," doubles)")') ldb * ndim
   info = magmaf_dmalloc( d_bmat, ldb*ndim )
   if (d_bmat == 0) then
      print "(a)", "failed to allocate d_bmat"
      return
   endif
   write(6,'("DEBUG: before matrix C gpu memory allocation (",i0," doubles)")') ldc * ndim
   info = magmaf_dmalloc( d_cmat, ldc*ndim )
   if (d_cmat == 0) then
      print "(a)", "failed to allocate d_cmat"
      return
   endif

   ! copy A to dA and B to dB
   call magmaf_queue_create( 0, queue )
   write(6,'("DEBUG: queue = ",i0)') queue
   if (queue == 0) then
      print "(a)", "failed to create a queue"
      return
   endif

   write(6,*) 'DEBUG: copying matrix A from CPU to GPU memory'
   call magmaf_dsetmatrix( ndim, ndim, amat, ndim, d_amat, lda, queue )
   write(6,*) 'DEBUG: copying matrix B from CPU to GPU memory'
   call magmaf_dsetmatrix( ndim, ndim, bmat, ndim, d_bmat, ldb, queue )

   call cpu_time(time_before)
   write (6,*) 'before magmablasf_dgemm, time=', time_before

   ! cmat = 1.0 * amat * bmat + 0.0 * cmat, entirely on gpu memory
   call magmablasf_dgemm ('N', 'N', ndim, ndim, ndim, 1.0d0, d_amat, lda, d_bmat, ldb, 0.0d0, d_cmat, ldc, queue)
   call magmaf_queue_sync(queue)

   call cpu_time(time_after)
   ! a ndim^3 matrix product costs 2*ndim^3 floating point operations (mul + add)
   num_ops = real(ndim) * real(ndim) * real(ndim) * 2
   gflops = num_ops / (time_after - time_before) / 1.0e9
   write (6,*) 'after magmablasf_dgemm, time=', time_after
   write (6,*) 'magmablasf_dgemm (from gpu memory to gpu memory) duration :', (time_after - time_before), '(', gflops, ' gflops)'

   write(6,*) 'DEBUG: copying matrix C from GPU to CPU memory'
   call magmaf_dgetmatrix( ndim, ndim, d_cmat, ldc, cmat, ndim, queue )
   call magmaf_queue_destroy( queue )

   ! release gpu memory in reverse allocation order
   info = magmaf_free(d_cmat)
   info = magmaf_free(d_bmat)
   info = magmaf_free(d_amat)

#endif

#ifdef USE_DGEMM
   ! subroutine dgemm ( character TRANSA,
   ! character TRANSB,
   ! integer M,
   ! integer N,
   ! integer K,
   ! double precision ALPHA,
   ! double precision, dimension(lda,*) A,
   ! integer LDA,
   ! double precision, dimension(ldb,*) B,
   ! integer LDB,
   ! double precision BETA,
   ! double precision, dimension(ldc,*) C,
   ! integer LDC
   ! )
   call dgemm('N', 'N', ndim, ndim, ndim, 1.0d0, amat, ndim, bmat, ndim, 0.0d0, cmat, ndim)
#endif

end subroutine
|
||||
|
||||
subroutine check_cmat_element(cmat, row, col, amat, bmat, ndim)
   ! recomputes a single element of amat * bmat with a scalar loop and stops
   ! the program if the value produced by the benchmark differs from it
   real(8), intent(in) :: cmat(ndim, ndim)  ! product matrix to verify
   integer, intent(in) :: row  ! row index of the element to check
   integer, intent(in) :: col  ! column index of the element to check
   real(8), intent(in) :: amat(ndim, ndim)
   real(8), intent(in) :: bmat(ndim, ndim)
   integer, intent(in) :: ndim

   real(8) :: x  ! independently computed reference value of cmat(row, col)
   ! note: loop index i is implicitly typed (there is no implicit none here)
   x = 0.0d0
   do i = 1, ndim
      x = x + amat(row, i) * bmat(i, col)
   end do

   write(6, '("expected cmat(", i0, ", ", i0, ")", e23.15e3)') row, col, x
   write(6, '("computed cmat(", i0, ", ", i0, ")", e23.15e3)') row, col, cmat(row, col)
   ! NOTE(review): 1.0e-8 is an absolute tolerance; for very large ndim the
   ! accumulated rounding error could exceed it -- a relative tolerance would be safer
   if (abs(cmat(row, col) - x) > 1.0e-8) then
      stop 'a computed element has a wrong value'
   end if
end subroutine
|
||||
|
||||
|
||||
subroutine test_dgemm(ndim, num_loops)
   ! benchmark driver: builds two random ndim x ndim double precision matrices,
   ! multiplies them num_loops times via sqmatmul, measures elapsed cpu and wall
   ! clock time, checks 4 elements of the result and prints gflops figures
#if defined(USE_MAGMA_DGEMM_GPU)
   use magma, only: magmaf_init, magmaf_finalize
   use magma, only: magmablasf_dgemm !, magmaf_dgemm_gpu
#endif

   implicit none
   integer, intent(in) :: ndim       ! order of the square matrices
   integer, intent(in) :: num_loops  ! number of multiplications to perform
   integer, parameter :: dp = kind(1.0d0)
   real :: ct_start, ct_stop ! elapsed cpu time relative to an arbitrary fixed time. Expressed in seconds with the granularity of 1 microsecond
   integer(8) :: num_ops
   real :: gflops

   integer :: sc_start, sc_stop ! system clock time of start and stop events, expressed in ticks
   integer :: sc_count_rate ! number of system clock ticks per second
   integer :: sc_count_max ! the max possible number of system clock ticks returned by system_clock
   integer :: s
   REAL :: a_diff, diff
   REAL :: num_sc_ticks_per_second ! the number of system clock ticks per second

   real*8, allocatable :: amat(:,:)
   real*8, allocatable :: bmat(:,:)
   real*8, allocatable :: cmat(:,:)
   real(dp) :: x
   integer :: i, j

#if defined(USE_MAGMA_DGEMM_GPU)
   write(6,*) 'DEBUG: init magma'
   call magmaf_init()
#endif

   ! First initialize the system_clock
   CALL system_clock(count_rate=sc_count_rate)
   CALL system_clock(count_max=sc_count_max)
   num_sc_ticks_per_second = REAL(sc_count_rate)
   WRITE(*,*) "system_clock rate : ", num_sc_ticks_per_second, " ticks per second"

   diff = 0.0
   a_diff = 0.0
   s = 0

   allocate(amat(ndim, ndim))
   allocate(bmat(ndim, ndim))
   allocate(cmat(ndim, ndim))

   ! fixed seed so that every run multiplies the same matrices
   call set_random_seed(42)

   !call random_number(amat)
   !amat = 0.5_dp*(amat + transpose(amat))
   ! fill both operands with uniform random values in [0,1)
   do j = 1, ndim
      do i = 1, ndim
         call random_number(x)
         amat(i,j) = x
         call random_number(x)
         bmat(i,j) = x
      end do
   end do

   call cpu_time(ct_start)
   call system_clock(sc_start)

   ! timed section: num_loops products of the same two matrices
   do j = 1, num_loops
      ! playmat = amat

      call sqmatmul(amat, bmat, cmat, ndim)

   end do

   call cpu_time(ct_stop)
   call system_clock(sc_stop)
   ! bookkeeping on the discrepancy between wall clock and cpu clock
   if ( (sc_stop - sc_start)/num_sc_ticks_per_second < (ct_stop - ct_start) ) s = s + 1
   diff = (sc_stop - sc_start)/num_sc_ticks_per_second - (ct_stop - ct_start) + diff
   a_diff = ABS((sc_stop - sc_start)/num_sc_ticks_per_second - (ct_stop - ct_start)) + a_diff

   ! check one of the elements of cmat (the last one here: cmat(ndim, ndim))
   call check_cmat_element(cmat, 1, 1, amat, bmat, ndim)
   call check_cmat_element(cmat, 1, ndim, amat, bmat, ndim)
   call check_cmat_element(cmat, ndim, 1, amat, bmat, ndim)
   call check_cmat_element(cmat, ndim, ndim, amat, bmat, ndim)

   ! write(6, *) 'amat = '
   ! call print_matrix(amat, ndim)

   ! write(6, *) 'bmat = '
   ! call print_matrix(bmat, ndim)

   ! write(6, *) 'cmat = '
   ! call print_matrix(cmat, ndim)

   ! 2*ndim^3 floating point operations per product, num_loops products
   num_ops = real(ndim) * real(ndim) * real(ndim) * 2 * num_loops
   gflops = num_ops / (ct_stop-ct_start) / 1.0e9


   write(6, '("Time taken by dgemm for matrix size ",i8," was ",f10.2," seconds")') ndim, ct_stop-ct_start
   WRITE(*,*) "gflops (including potential memory transfers) : ", gflops

   WRITE(*,*) "system_clock : ",(sc_stop - sc_start)/num_sc_ticks_per_second
   WRITE(*,*) "cpu_time : ",(ct_stop - ct_start)
   WRITE(*,*) "sys_clock < cpu_time : ",s
   WRITE(*,*) "mean diff : ",diff
   WRITE(*,*) "abs mean diff : ",a_diff

#if defined(USE_MAGMA_DGEMM_GPU)
   write(6,*) 'DEBUG: deinit magma'
   call magmaf_finalize()
#endif


   deallocate(amat, bmat, cmat)
end
|
|
@ -0,0 +1,30 @@
|
|||
import unittest
|
||||
import logging
|
||||
import subprocess
|
||||
# import importlib.resources
|
||||
|
||||
|
||||
class ClusterBenchTestCase(unittest.TestCase):
    """Integration tests for the ``clusterbench-submit`` command line tool."""

    # configures logging once, at class-definition (import) time
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    def setUp(self) -> None:  # pylint: disable=useless-parent-delegation
        return super().setUp()

    def test_clusterbench_submit(self):
        """Submits the mamul1 benchmark through clusterbench-submit and expects success.

        check=True makes the test fail if the command exits with a non-zero status.
        """
        logging.info('test_clusterbench_submit')
        # with importlib.resources.path('iprbench.resources', 'clusterbench-template.job') as job_template_path:
        # print(job_template_path)
        # assert False
        # subprocess.run('pip list', shell=True, check=True, executable='/bin/bash')
        command = 'clusterbench-submit --arch-regexp "intel_core.*" --benchmark-id \'mamul1\' --config \'{"compiler_id": "gfortran", "matrix_size": 1024, "num_loops":10}\' --results-dir /tmp/mamul1_out'
        subprocess.run(command, shell=True, check=True, executable='/bin/bash')

    # def test_clusterbench_hibench(self):
    #     logging.info('test_clusterbench_hibench')
    #     command = 'clusterbench-submit --benchmark-id \'hibench\' --config \'{"compiler_id": "gfortran", "test_id": "arch4_quick"}\' --results-dir /tmp/mamul1_out'
    #     subprocess.run(command, shell=True, check=True, executable='/bin/bash')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
|
@ -0,0 +1,27 @@
|
|||
import unittest
|
||||
import logging
|
||||
import subprocess
|
||||
# import importlib.resources
|
||||
|
||||
|
||||
class IprBenchTestCase(unittest.TestCase):
    """Integration tests for the ``iprbench-run`` command line tool."""

    # configures logging once, at class-definition (import) time
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    def setUp(self) -> None:  # pylint: disable=useless-parent-delegation
        return super().setUp()

    def test_iprbench_run(self):
        """Runs the mamul1 benchmark directly (without a cluster) and expects success.

        check=True makes the test fail if the command exits with a non-zero status.
        """
        logging.info('test_iprbench_run')
        # with importlib.resources.path('iprbench.resources', 'mamul1') as src_dir:
        # with open(src_dir / 'mamul1.F90', encoding='utf8') as f:
        # print(f.readlines())
        # with open(src_dir / 'CMakeLists.txt', encoding='utf8') as f:
        # print(f.readlines())
        # subprocess.run(f'cat {src_dir / "CMakeLists.txt"}', check=True)
        command = 'iprbench-run --benchmark-id \'mamul1\' --config \'{"compiler_id": "gfortran", "matrix_size": 1024, "num_loops":10, "num_cores":2}\' --results-dir /tmp/mamul1_out'
        subprocess.run(command, shell=True, check=True, executable='/bin/bash')
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
unittest.main()
|
Loading…
Reference in New Issue