Fixed bug: EPYC Zen 2 cores can perform 16 double-precision FLOPs per cycle, not 32

2021-04-08 13:10:08 +02:00 · 2021-04-08 13:10:08 +02:00 · 80bc27b17a
parent 0093829256
commit 80bc27b17a
1 changed files with 11 additions and 1 deletions
--- a/concho/config.py
+++ b/concho/config.py
@@ -111,8 +111,15 @@ class Cpu(Item):
        # - Full support for 256-bit AVX2 instructions with two 256-bit FMA units per CPU core. The previous “Naples” architecture split 256-bit instructions into two separate 128-bit operations
        # - Up to 16 double-precision FLOPS per cycle per core
        # - Double-precision floating point multiplies complete in 3 cycles (down from 4)
+        # Note: a Zen 2 (Rome) core has two 256-bit FMA units, which corresponds to AVX2-class hardware according to https://stackoverflow.com/questions/15655835/flops-per-cycle-for-sandy-bridge-and-haswell-sse2-avx-avx2:
+        # Intel Haswell/Broadwell/Skylake/Kaby Lake/Coffee/... (AVX+FMA3):
+        # - 16 DP FLOPs/cycle: two 4-wide FMA (fused multiply-add) instructions
+        # - 32 SP FLOPs/cycle: two 8-wide FMA (fused multiply-add) instructions
+        # - (Using 256-bit vector instructions can reduce max turbo clock speed on some CPUs.)
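+        # So a Rome core has one AVX2 SIMD block, which contains two 256-bit FMA units. Each 256-bit FMA unit can perform 4 doubles * 2 operations (multiply + add) = 8 DP FLOPs/cycle, giving 16 DP FLOPs per cycle per Rome core in total; this is consistent with the "up to 16 double-precision FLOPS per cycle per core" figure quoted above.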
+
        if proc_arch == 'rome':
-            num_simd_per_core = 2
+            num_simd_per_core = 1

        dp_flops_per_cycle = num_simd_per_core * simd_id_to_dp_flops_per_cycle(simd_id)
        # print(self.uid, dp_flops_per_cycle)
@@ -166,6 +173,9 @@ def simd_id_to_dp_flops_per_cycle(simd_id):
    #     16 DP FLOPs/cycle: two 4-wide FMA (fused multiply-add) instructions
    #     32 SP FLOPs/cycle: two 8-wide FMA (fused multiply-add) instructions

+    # https://www.dell.com/support/kbdoc/fr-fr/000137696/amd-rome-is-it-for-real-architecture-and-initial-hpc-performance
+    # The Rome micro-architecture can retire 16 DP FLOP/cycle, double that of Naples which was 8 FLOPS/cycle
+
    return {    
        'sse4.1':4,
        'sse4.2':4,