diff --git a/concho/config.py b/concho/config.py index f7e9225..a3698dd 100644 --- a/concho/config.py +++ b/concho/config.py @@ -111,8 +111,15 @@ class Cpu(Item): # - Full support for 256-bit AVX2 instructions with two 256-bit FMA units per CPU core. The previous “Naples” architecture split 256-bit instructions into two separate 128-bit operations # - Up to 16 double-precision FLOPS per cycle per core # - Double-precision floating point multiplies complete in 3 cycles (down from 4) + # note: a zen2 rome core has two 256-bit fma units per core, which corresponds to avx2 technology according to https://stackoverflow.com/questions/15655835/flops-per-cycle-for-sandy-bridge-and-haswell-sse2-avx-avx2: + # Intel Haswell/Broadwell/Skylake/Kaby Lake/Coffee/... (AVX+FMA3): + # - 16 DP FLOPs/cycle: two 4-wide FMA (fused multiply-add) instructions + # - 32 SP FLOPs/cycle: two 8-wide FMA (fused multiply-add) instructions + # - (Using 256-bit vector instructions can reduce max turbo clock speed on some CPUs.) + # so, a rome core has one avx2 simd unit, which has two 256-bit fmadd units. Each 256-bit fma unit is able to perform 4*2 = 8 dflops/cycle; in total we have 16 dflops per cycle per rome core, which is confirmed by online sources + if proc_arch == 'rome': - num_simd_per_core = 2 + num_simd_per_core = 1 dp_flops_per_cycle = num_simd_per_core * simd_id_to_dp_flops_per_cycle(simd_id) # print(self.uid, dp_flops_per_cycle) @@ -166,6 +173,9 @@ def simd_id_to_dp_flops_per_cycle(simd_id): # 16 DP FLOPs/cycle: two 4-wide FMA (fused multiply-add) instructions # 32 SP FLOPs/cycle: two 8-wide FMA (fused multiply-add) instructions + # https://www.dell.com/support/kbdoc/fr-fr/000137696/amd-rome-is-it-for-real-architecture-and-initial-hpc-performance + # The Rome micro-architecture can retire 16 DP FLOP/cycle, double that of Naples which was 8 FLOPS/cycle + return { 'sse4.1':4, 'sse4.2':4,