From c9461b73139c742a7a86af045212e08dea83c72b Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Sun, 13 Nov 2022 01:39:24 +0000 Subject: [PATCH] * x86: Flags that relate to optimization hints rather than features have been moved to a separate set --- compiler/i386/cpuinfo.pas | 48 +++++++++++++++++++++++++++---------- compiler/i8086/cpuinfo.pas | 39 +++++++++++++++++++++++------- compiler/x86/aoptx86.pas | 2 +- compiler/x86_64/cpuinfo.pas | 36 +++++++++++++++++++++------- 4 files changed, 95 insertions(+), 30 deletions(-) diff --git a/compiler/i386/cpuinfo.pas b/compiler/i386/cpuinfo.pas index a2cef54856..d4e625d6ac 100644 --- a/compiler/i386/cpuinfo.pas +++ b/compiler/i386/cpuinfo.pas @@ -167,11 +167,7 @@ Const type tcpuflags = (CPUX86_HAS_BTX, { Bit-test instructions (BT, BTC, BTR and BTS) are available } - CPUX86_HAS_FAST_XCHG, { XCHG %reg,%reg executes in 2 cycles or less } CPUX86_HAS_CMOV, { CMOVcc instructions are available } - CPUX86_HAS_FAST_BTX, { BT/C/R/S instructions with register operands are at least as fast as logical instructions } - CPUX86_HAS_FAST_BT_MEM, { BT instructions with memory operands are at least as fast as logical instructions } - CPUX86_HAS_FAST_BTX_MEM, { BTC/R/S instructions with memory operands are at least as fast as logical instructions } CPUX86_HAS_SSEUNIT, { SSE instructions are available } CPUX86_HAS_SSE2, { SSE2 instructions are available } CPUX86_HAS_BMI1, { BMI1 instructions are available } @@ -190,20 +186,33 @@ type FPUX86_HAS_AVX512DQ ); + { Instruction optimisation hints } + TCPUOptimizeFlags = + (CPUX86_HINT_FAST_BT_REG_IMM, { BT instructions with register source and immediate indices are at least as fast as logical instructions } + CPUX86_HINT_FAST_BT_REG_REG, { BT instructions with register source and register indices are at least as fast as equivalent logical instructions } + CPUX86_HINT_FAST_BTX_REG_IMM, { BTC/R/S instructions with register source and immediate indices are at least as fast as 
logical instructions } + CPUX86_HINT_FAST_BTX_REG_REG, { BTC/R/S instructions with register source and register indices are at least as fast as equivalent logical instructions } + CPUX86_HINT_FAST_BT_MEM_IMM, { BT instructions with memory sources and immediate indices are at least as fast as logical instructions } + CPUX86_HINT_FAST_BT_MEM_REG, { BT instructions with memory sources and register indices are at least as fast as equivalent logical instructions } + CPUX86_HINT_FAST_BTX_MEM_IMM, { BTC/R/S instructions with memory sources and immediate indices are at least as fast as logical instructions } + CPUX86_HINT_FAST_BTX_MEM_REG, { BTC/R/S instructions with memory sources and register indices are at least as fast as equivalent logical instructions } + CPUX86_HINT_FAST_XCHG { XCHG %reg,%reg executes in 2 cycles or less } + ); + const cpu_capabilities : array[tcputype] of set of tcpuflags = ( { cpu_none } [], { cpu_386 } [CPUX86_HAS_BTX], { cpu_486 } [CPUX86_HAS_BTX], { cpu_Pentium } [CPUX86_HAS_BTX], - { cpu_Pentium2 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX], - { cpu_Pentium3 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT], - { cpu_Pentium4 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2], - { cpu_PentiumM } [CPUX86_HAS_BTX,CPUX86_HAS_FAST_XCHG,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2], - { cpu_core_i } [CPUX86_HAS_BTX,CPUX86_HAS_FAST_XCHG,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT], - { cpu_core_avx } [CPUX86_HAS_BTX,CPUX86_HAS_FAST_XCHG,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT], - { cpu_core_avx2 } [CPUX86_HAS_BTX,CPUX86_HAS_FAST_XCHG,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE], - { cpu_zen } 
[CPUX86_HAS_BTX,CPUX86_HAS_FAST_XCHG,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_FAST_BT_MEM,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE] + { cpu_Pentium2 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV], + { cpu_Pentium3 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT], + { cpu_Pentium4 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2], + { cpu_PentiumM } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2], + { cpu_core_i } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT], + { cpu_core_avx } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT], + { cpu_core_avx2 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE], + { cpu_zen } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE] ); fpu_capabilities : array[tfputype] of set of tfpuflags = ( @@ -220,6 +229,21 @@ type { fpu_avx512 } [FPUX86_HAS_AVXUNIT,FPUX86_HAS_FMA,FPUX86_HAS_AVX512F,FPUX86_HAS_AVX512VL,FPUX86_HAS_AVX512DQ] ); + cpu_optimization_hints : array[TCPUType] of set of TCPUOptimizeFlags = ( + { cpu_none } [], + { cpu_386 } [], + { cpu_486 } [], + { cpu_Pentium } [], + { cpu_Pentium2 } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM], + { cpu_Pentium3 } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM], + { cpu_Pentium4 } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM], + { cpu_PentiumM } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM,CPUX86_HINT_FAST_XCHG], + { cpu_core_i } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM,CPUX86_HINT_FAST_XCHG], + { cpu_core_avx } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM,CPUX86_HINT_FAST_XCHG], + { cpu_core_avx2 } 
[CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM,CPUX86_HINT_FAST_XCHG], + { cpu_zen } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM,CPUX86_HINT_FAST_BT_MEM_IMM,CPUX86_HINT_FAST_XCHG] + ); + Implementation end. diff --git a/compiler/i8086/cpuinfo.pas b/compiler/i8086/cpuinfo.pas index 61aad223a6..8127ae48d7 100644 --- a/compiler/i8086/cpuinfo.pas +++ b/compiler/i8086/cpuinfo.pas @@ -156,15 +156,24 @@ Const type tcpuflags = (CPUX86_HAS_BTX, { Bit-test instructions (BT, BTC, BTR and BTS) are available } - CPUX86_HAS_FAST_XCHG, { XCHG %reg,%reg executes in 2 cycles or less } CPUX86_HAS_CMOV, { CMOVcc instructions are available } - CPUX86_HAS_FAST_BTX, { BT/C/R/S instructions with register operands are at least as fast as logical instructions } - CPUX86_HAS_FAST_BT_MEM, { BT instructions with memory operands are at least as fast as logical instructions } - CPUX86_HAS_FAST_BTX_MEM, { BTC/R/S instructions with memory operands are at least as fast as logical instructions } CPUX86_HAS_SSEUNIT, { SSE instructions are available } CPUX86_HAS_SSE2 { SSE2 instructions are available } ); + { Instruction optimisation hints } + TCPUOptimizeFlags = + (CPUX86_HINT_FAST_BT_REG_IMM, { BT instructions with register source and immediate indices are at least as fast as logical instructions } + CPUX86_HINT_FAST_BT_REG_REG, { BT instructions with register source and register indices are at least as fast as equivalent logical instructions } + CPUX86_HINT_FAST_BTX_REG_IMM, { BTC/R/S instructions with register source and immediate indices are at least as fast as logical instructions } + CPUX86_HINT_FAST_BTX_REG_REG, { BTC/R/S instructions with register source and register indices are at least as fast as equivalent logical instructions } + CPUX86_HINT_FAST_BT_MEM_IMM, { BT instructions with memory sources and immediate indices are at least as fast as logical instructions } + CPUX86_HINT_FAST_BT_MEM_REG, { BT instructions with memory sources and register indices 
are at least as fast as equivalent logical instructions } + CPUX86_HINT_FAST_BTX_MEM_IMM, { BTC/R/S instructions with memory sources and immediate indices are at least as fast as logical instructions } + CPUX86_HINT_FAST_BTX_MEM_REG, { BTC/R/S instructions with memory sources and register indices are at least as fast as equivalent logical instructions } + CPUX86_HINT_FAST_XCHG { XCHG %reg,%reg executes in 2 cycles or less } + ); + const cpu_capabilities : array[tcputype] of set of tcpuflags = ( { cpu_none } [], @@ -174,10 +183,24 @@ type { cpu_386 } [CPUX86_HAS_BTX], { cpu_486 } [CPUX86_HAS_BTX], { cpu_Pentium } [CPUX86_HAS_BTX], - { cpu_Pentium2 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX], - { cpu_Pentium3 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT], - { cpu_Pentium4 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2], - { cpu_PentiumM } [CPUX86_HAS_BTX,CPUX86_HAS_FAST_XCHG,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2] + { cpu_Pentium2 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV], + { cpu_Pentium3 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT], + { cpu_Pentium4 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2], + { cpu_PentiumM } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2] + ); + + cpu_optimization_hints : array[TCPUType] of set of TCPUOptimizeFlags = ( + { cpu_none } [], + { cpu_8086 } [], + { cpu_186 } [], + { cpu_286 } [], + { cpu_386 } [], + { cpu_486 } [], + { cpu_Pentium } [], + { cpu_Pentium2 } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM], + { cpu_Pentium3 } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM], + { cpu_Pentium4 } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM], + { cpu_PentiumM } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM,CPUX86_HINT_FAST_XCHG] ); x86_near_code_models = [mm_tiny,mm_small,mm_compact]; diff --git a/compiler/x86/aoptx86.pas 
b/compiler/x86/aoptx86.pas index 6a4feb59f2..dfa943d5b6 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -8614,7 +8614,7 @@ unit aoptx86; { From the Pentium M onwards, XCHG only has a latency of 2 rather than 3, so it becomes a saving compared to three MOVs with two of them able to execute simultaneously. [Kit] } - (CPUX86_HAS_FAST_XCHG in cpu_capabilities[current_settings.optimizecputype]); + (CPUX86_HINT_FAST_XCHG in cpu_optimization_hints[current_settings.optimizecputype]); end; var diff --git a/compiler/x86_64/cpuinfo.pas b/compiler/x86_64/cpuinfo.pas index df47a19a8c..322cec5835 100644 --- a/compiler/x86_64/cpuinfo.pas +++ b/compiler/x86_64/cpuinfo.pas @@ -168,11 +168,7 @@ Const type tcpuflags = (CPUX86_HAS_BTX, { Bit-test instructions (BT, BTC, BTR and BTS) are available } - CPUX86_HAS_FAST_XCHG, { XCHG %reg,%reg executes in 2 cycles or less } CPUX86_HAS_CMOV, { CMOVcc instructions are available } - CPUX86_HAS_FAST_BTX, { BT/C/R/S instructions with register operands are at least as fast as logical instructions } - CPUX86_HAS_FAST_BT_MEM, { BT instructions with memory operands are at least as fast as logical instructions } - CPUX86_HAS_FAST_BTX_MEM, { BTC/R/S instructions with memory operands are at least as fast as logical instructions } CPUX86_HAS_SSEUNIT, { SSE instructions are available } CPUX86_HAS_SSE2, { SSE2 instructions are available } CPUX86_HAS_BMI1, { BMI1 instructions are available } @@ -192,14 +188,27 @@ type FPUX86_HAS_AVX512DQ ); + { Instruction optimisation hints } + TCPUOptimizeFlags = + (CPUX86_HINT_FAST_BT_REG_IMM, { BT instructions with register source and immediate indices are at least as fast as logical instructions } + CPUX86_HINT_FAST_BT_REG_REG, { BT instructions with register source and register indices are at least as fast as equivalent logical instructions } + CPUX86_HINT_FAST_BTX_REG_IMM, { BTC/R/S instructions with register source and immediate indices are at least as fast as logical instructions } + 
CPUX86_HINT_FAST_BTX_REG_REG, { BTC/R/S instructions with register source and register indices are at least as fast as equivalent logical instructions } + CPUX86_HINT_FAST_BT_MEM_IMM, { BT instructions with memory sources and immediate indices are at least as fast as logical instructions } + CPUX86_HINT_FAST_BT_MEM_REG, { BT instructions with memory sources and register indices are at least as fast as equivalent logical instructions } + CPUX86_HINT_FAST_BTX_MEM_IMM, { BTC/R/S instructions with memory sources and immediate indices are at least as fast as logical instructions } + CPUX86_HINT_FAST_BTX_MEM_REG, { BTC/R/S instructions with memory sources and register indices are at least as fast as equivalent logical instructions } + CPUX86_HINT_FAST_XCHG { XCHG %reg,%reg executes in 2 cycles or less } + ); + const cpu_capabilities : array[tcputype] of set of tcpuflags = ( { cpu_none } [], - { Athlon64 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2], - { cpu_core_i } [CPUX86_HAS_BTX,CPUX86_HAS_FAST_XCHG,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT], - { cpu_core_avx } [CPUX86_HAS_BTX,CPUX86_HAS_FAST_XCHG,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT], - { cpu_core_avx2 } [CPUX86_HAS_BTX,CPUX86_HAS_FAST_XCHG,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE], - { cpu_zen } [CPUX86_HAS_BTX,CPUX86_HAS_FAST_XCHG,CPUX86_HAS_CMOV,CPUX86_HAS_FAST_BTX,CPUX86_HAS_FAST_BT_MEM,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE] + { Athlon64 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2], + { cpu_core_i } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT], + { cpu_core_avx } 
[CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT], + { cpu_core_avx2 } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE], + { cpu_zen } [CPUX86_HAS_BTX,CPUX86_HAS_CMOV,CPUX86_HAS_SSEUNIT,CPUX86_HAS_SSE2,CPUX86_HAS_POPCNT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE] ); fpu_capabilities : array[tfputype] of set of tfpuflags = ( @@ -214,6 +223,15 @@ type { fpu_avx512 } [FPUX86_HAS_AVXUNIT,FPUX86_HAS_FMA,FPUX86_HAS_32MMREGS,FPUX86_HAS_AVX512F,FPUX86_HAS_AVX512VL,FPUX86_HAS_AVX512DQ] ); + cpu_optimization_hints : array[TCPUType] of set of TCPUOptimizeFlags = ( + { cpu_none } [], + { cpu_Athlon64 } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM], + { cpu_core_i } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM,CPUX86_HINT_FAST_XCHG], + { cpu_core_avx } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM,CPUX86_HINT_FAST_XCHG], + { cpu_core_avx2 } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM,CPUX86_HINT_FAST_XCHG], + { cpu_zen } [CPUX86_HINT_FAST_BT_REG_IMM,CPUX86_HINT_FAST_BTX_REG_IMM,CPUX86_HINT_FAST_BT_MEM_IMM,CPUX86_HINT_FAST_XCHG] + ); + Implementation end.