Fixed bug: EPYC Zen 2 cores can perform 16 double-precision FLOPs per cycle, not 32
This commit is contained in:
		
							parent
							
								
									0093829256
								
							
						
					
					
						commit
						80bc27b17a
					
				|  | @ -111,8 +111,15 @@ class Cpu(Item): | ||||||
|         # - Full support for 256-bit AVX2 instructions with two 256-bit FMA units per CPU core. The previous “Naples” architecture split 256-bit instructions into two separate 128-bit operations |         # - Full support for 256-bit AVX2 instructions with two 256-bit FMA units per CPU core. The previous “Naples” architecture split 256-bit instructions into two separate 128-bit operations | ||||||
|         # - Up to 16 double-precision FLOPS per cycle per core |         # - Up to 16 double-precision FLOPS per cycle per core | ||||||
|         # - Double-precision floating point multiplies complete in 3 cycles (down from 4) |         # - Double-precision floating point multiplies complete in 3 cycles (down from 4) | ||||||
|  |         # Note: each Zen 2 (Rome) core has two 256-bit FMA units, which corresponds to AVX2 technology. According to https://stackoverflow.com/questions/15655835/flops-per-cycle-for-sandy-bridge-and-haswell-sse2-avx-avx2: | ||||||
|  |         # Intel Haswell/Broadwell/Skylake/Kaby Lake/Coffee/... (AVX+FMA3): | ||||||
|  |         # - 16 DP FLOPs/cycle: two 4-wide FMA (fused multiply-add) instructions | ||||||
|  |         # - 32 SP FLOPs/cycle: two 8-wide FMA (fused multiply-add) instructions | ||||||
|  |         # - (Using 256-bit vector instructions can reduce max turbo clock speed on some CPUs.) | ||||||
|  |         # So a Rome core has one AVX2 SIMD unit, which contains two 256-bit FMA units. Each 256-bit FMA unit can perform 4 * 2 = 8 DP FLOPs/cycle (4 doubles per 256-bit vector, 2 operations per fused multiply-add), giving a total of 16 DP FLOPs per cycle per Rome core, which is consistent with published figures. | ||||||
|  | 
 | ||||||
|         if proc_arch == 'rome': |         if proc_arch == 'rome': | ||||||
|             num_simd_per_core = 2 |             num_simd_per_core = 1 | ||||||
| 
 | 
 | ||||||
|         dp_flops_per_cycle = num_simd_per_core * simd_id_to_dp_flops_per_cycle(simd_id) |         dp_flops_per_cycle = num_simd_per_core * simd_id_to_dp_flops_per_cycle(simd_id) | ||||||
|         # print(self.uid, dp_flops_per_cycle) |         # print(self.uid, dp_flops_per_cycle) | ||||||
|  | @ -166,6 +173,9 @@ def simd_id_to_dp_flops_per_cycle(simd_id): | ||||||
|     #     16 DP FLOPs/cycle: two 4-wide FMA (fused multiply-add) instructions |     #     16 DP FLOPs/cycle: two 4-wide FMA (fused multiply-add) instructions | ||||||
|     #     32 SP FLOPs/cycle: two 8-wide FMA (fused multiply-add) instructions |     #     32 SP FLOPs/cycle: two 8-wide FMA (fused multiply-add) instructions | ||||||
| 
 | 
 | ||||||
|  |     # https://www.dell.com/support/kbdoc/fr-fr/000137696/amd-rome-is-it-for-real-architecture-and-initial-hpc-performance | ||||||
|  |     # The Rome micro-architecture can retire 16 DP FLOP/cycle, double that of Naples which was 8 FLOPS/cycle | ||||||
|  | 
 | ||||||
|     return {     |     return {     | ||||||
|         'sse4.1':4, |         'sse4.1':4, | ||||||
|         'sse4.2':4, |         'sse4.2':4, | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue