* x86: The code generator will now attempt to manipulate "x and (not y)" (where x and y are ordinals) to use ANDN.

This commit is contained in:
J. Gareth "Curious Kit" Moreton 2022-10-21 10:35:05 +01:00 committed by FPK
parent 567244007e
commit 626e23d89f

View File

@ -82,7 +82,7 @@ unit nx86add;
tgobj,ncgutil,
ncon,nset,ninl,ncnv,ncal,nmat,
defutil,defcmp,constexp,
htypechk;
pass_2,htypechk;
{ Range check must be disabled explicitly as the code serves
on three different architecture sizes }
@ -1943,6 +1943,78 @@ unit nx86add;
opsize:=def_cgsize(left.resultdef);
{$ifndef i8086}
{ Bit-manipulation optimisations }
if (cs_opt_level2 in current_settings.optimizerswitches) and
(CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) then
begin
{ Can we turn "x and (not y)" into an ANDN instruction instead? }
if (nodetype = andn) and
(opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
((left.nodetype = notn) or (right.nodetype = notn)) and
(
{ With "const and (not variable)", ANDN will produce larger
code once everything is moved into registers (as a side-note,
"const and (not const)" and "variable and (not const)" will
have been simplified earlier to remove the NOT operation). }
not (cs_opt_size in current_settings.optimizerswitches) or
(
(left.location.loc <> LOC_CONSTANT) and
(right.location.loc <> LOC_CONSTANT)
)
) then
begin
{ ANDN only supports the second operand being inverted; however,
since we're dealing with ordinals, there won't be any Boolean
shortcutting, so we can safely swap the parameters }
if (right.nodetype <> notn) then
swapleftright;
secondpass(left);
{ Skip the not node completely }
secondpass(tnotnode(right).left);
{ allocate registers }
hlcg.location_force_reg(
current_asmdata.CurrAsmList,
tnotnode(right).left.location,
tnotnode(right).left.resultdef,
tnotnode(right).left.resultdef,
false
);
if left.location.loc = LOC_CONSTANT then
{ With "const and (not variable)", we can probably still make a
saving when it comes to pipeline stalls (left.location.loc
will become LOC_CREGISTER). }
hlcg.location_force_reg(
current_asmdata.CurrAsmList,
left.location,
left.resultdef,
left.resultdef,
true
);
set_result_location_reg;
case left.location.loc of
LOC_REFERENCE,
LOC_CREFERENCE:
emit_ref_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.reference, tnotnode(right).left.location.register, location.register);
LOC_REGISTER,
LOC_CREGISTER:
emit_reg_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.register, tnotnode(right).left.location.register, location.register)
else
InternalError(2022102101);
end;
{ Overflow can't happen with and/andn }
Exit;
end;
end;
{$endif not i8086}
pass_left_right;
{ do we have to allocate a register? If yes, then three opcode instructions are better, however for sub three op code instructions