diff --git a/compiler/x86/nx86add.pas b/compiler/x86/nx86add.pas
index 810c6ecbb4..32fec77458 100644
--- a/compiler/x86/nx86add.pas
+++ b/compiler/x86/nx86add.pas
@@ -82,7 +82,7 @@ unit nx86add;
       tgobj,ncgutil,
       ncon,nset,ninl,ncnv,ncal,nmat,
       defutil,defcmp,constexp,
-      htypechk;
+      pass_2,htypechk;
 
 { Range check must be disabled explicitly as the code serves
   on three different architecture sizes }
@@ -1943,6 +1943,84 @@ unit nx86add;
 
         opsize:=def_cgsize(left.resultdef);
 
+{$ifndef i8086}
+        { Bit-manipulation optimisations.  ANDN is part of the BMI1 extension
+          (BMI2 adds BZHI, MULX, PDEP, PEXT, RORX, SARX, SHLX and SHRX), so
+          BMI1 is the capability that has to be present. }
+        if (cs_opt_level2 in current_settings.optimizerswitches) and
+          (CPUX86_HAS_BMI1 in cpu_capabilities[current_settings.cputype]) then
+          begin
+            { Can we turn "x and (not y)" into an ANDN instruction instead? }
+            if (nodetype = andn) and
+              (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
+              ((left.nodetype = notn) or (right.nodetype = notn)) and
+              (
+                { With "const and (not variable)", ANDN will produce larger
+                  code once everything is moved into registers (as a side-note,
+                  "const and (not const)" and "variable and (not const)" will
+                  have been simplified earlier to remove the NOT operation).
+                  The operands only get a valid location once secondpass has
+                  run on them (further down), so constants are detected here
+                  through their node type. }
+                not (cs_opt_size in current_settings.optimizerswitches) or
+                (
+                  (left.nodetype <> ordconstn) and
+                  (right.nodetype <> ordconstn)
+                )
+              ) then
+              begin
+                { ANDN only supports the second operand being inverted; however,
+                  since we're dealing with ordinals, there won't be any Boolean
+                  shortcutting, so we can safely swap the parameters }
+
+                if (right.nodetype <> notn) then
+                  swapleftright;
+
+                secondpass(left);
+                { Skip the not node completely }
+                secondpass(tnotnode(right).left);
+
+                { allocate registers }
+                hlcg.location_force_reg(
+                  current_asmdata.CurrAsmList,
+                  tnotnode(right).left.location,
+                  tnotnode(right).left.resultdef,
+                  tnotnode(right).left.resultdef,
+                  false
+                );
+
+                if left.location.loc = LOC_CONSTANT then
+                  { With "const and (not variable)", we can probably still make a
+                    saving when it comes to pipeline stalls (left.location.loc
+                    will become LOC_CREGISTER). }
+                  hlcg.location_force_reg(
+                    current_asmdata.CurrAsmList,
+                    left.location,
+                    left.resultdef,
+                    left.resultdef,
+                    true
+                  );
+
+                set_result_location_reg;
+
+                { result := left and (not <operand of the skipped NOT node>) }
+                case left.location.loc of
+                  LOC_REFERENCE,
+                  LOC_CREFERENCE:
+                    emit_ref_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.reference, tnotnode(right).left.location.register, location.register);
+                  LOC_REGISTER,
+                  LOC_CREGISTER:
+                    emit_reg_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.register, tnotnode(right).left.location.register, location.register)
+                  else
+                    InternalError(2022102101);
+                end;
+
+                { Overflow can't happen with and/andn }
+                Exit;
+              end;
+          end;
+{$endif not i8086}
+
         pass_left_right;
 
         { do we have to allocate a register? If yes, then three opcode instructions are better, however for sub three op code instructions