* x86: The code generator will now attempt to rewrite "x and ((1 shl y) - 1)" to use BZHI

J. Gareth "Curious Kit" Moreton 2022-11-02 13:48:54 +00:00 committed by FPK
parent e748e4c7ab
commit 7da9b4a988


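For illustration (not part of the commit): the Pascal shape this change recognises looks like the hypothetical function below; the name and types are mine, not taken from the commit. On a BMI2-capable target at optimisation level 2 or above, the masking expression becomes a candidate for a single BZHI instruction, which zeroes all bits of its source operand from a given bit index upward.

    function KeepLowBits(x, y: LongWord): LongWord;
      begin
        { Keep only the low y bits of x; this is the
          "x and ((1 shl y) - 1)" pattern matched by the code below }
        Result := x and ((1 shl y) - 1);
      end;
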
@@ -79,10 +79,10 @@ unit nx86add;
       aasmbase,aasmdata,aasmcpu,
       symconst,symdef,
       cgobj,hlcgobj,cgx86,cga,cgutils,
-      tgobj,ncgutil,
+      tgobj,ncgutil,nutils,
       ncon,nset,ninl,ncnv,ncal,nmat,
       defutil,defcmp,constexp,
-      pass_2,htypechk;
+      pass_1,pass_2,htypechk;
 
 { Range check must be disabled explicitly as the code serves
   on three different architecture sizes }
@@ -1892,6 +1892,7 @@ unit nx86add;
       checkoverflow : Boolean;
       ovloc : tlocation;
       tmpreg : TRegister;
+      indexnode : TNode;
     begin
       { determine if the comparison will be unsigned }
       unsigned:=not(is_signed(left.resultdef)) or
@@ -1944,73 +1945,156 @@ unit nx86add;
           opsize:=def_cgsize(left.resultdef);
 {$ifndef i8086}
-          { BMI1 optimisations }
-          if (cs_opt_level2 in current_settings.optimizerswitches) and
-            (CPUX86_HAS_BMI1 in cpu_capabilities[current_settings.cputype]) then
-            begin
-              { Can we turn "x and (not y)" into an ANDN instruction instead? }
-              if (nodetype = andn) and
-                (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
-                ((left.nodetype = notn) or (right.nodetype = notn)) and
-                (
-                  { With "const and (not variable)", ANDN will produce larger
-                    code once everything is moved into registers (as a side-note,
-                    "const and (not const)" and "variable and (not const)" will
-                    have been simplified earlier to remove the NOT operation). }
-                  not (cs_opt_size in current_settings.optimizerswitches) or
-                  (
-                    (left.location.loc <> LOC_CONSTANT) and
-                    (right.location.loc <> LOC_CONSTANT)
-                  )
-                ) then
-                begin
-                  { ANDN only supports the second operand being inverted; however,
-                    since we're dealing with ordinals, there won't be any Boolean
-                    shortcutting, so we can safely swap the parameters }
-                  if (right.nodetype <> notn) then
-                    swapleftright;
-                  secondpass(left);
-                  { Skip the not node completely }
-                  secondpass(tnotnode(right).left);
-                  { allocate registers }
-                  hlcg.location_force_reg(
-                    current_asmdata.CurrAsmList,
-                    tnotnode(right).left.location,
-                    tnotnode(right).left.resultdef,
-                    tnotnode(right).left.resultdef,
-                    false
-                  );
-                  if left.location.loc = LOC_CONSTANT then
-                    { With "const and (not variable)", we can probably still make a
-                      saving when it comes to pipeline stalls (left.location.loc
-                      will become LOC_CREGISTER). }
-                    hlcg.location_force_reg(
-                      current_asmdata.CurrAsmList,
-                      left.location,
-                      left.resultdef,
-                      left.resultdef,
-                      true
-                    );
-                  set_result_location_reg;
-                  case left.location.loc of
-                    LOC_REFERENCE,
-                    LOC_CREFERENCE:
-                      emit_ref_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.reference, tnotnode(right).left.location.register, location.register);
-                    LOC_REGISTER,
-                    LOC_CREGISTER:
-                      emit_reg_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.register, tnotnode(right).left.location.register, location.register)
-                    else
-                      InternalError(2022102101);
-                  end;
-                  { Overflow can't happen with and/andn }
-                  Exit;
-                end;
-            end;
+          if (cs_opt_level2 in current_settings.optimizerswitches) then
+            begin
+              { BMI1 optimisations }
+              if (CPUX86_HAS_BMI1 in cpu_capabilities[current_settings.cputype]) then
+                begin
+                  { Can we turn "x and (not y)" into an ANDN instruction instead? }
+                  if (nodetype = andn) and
+                    (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
+                    ((left.nodetype = notn) or (right.nodetype = notn)) and
+                    (
+                      { With "const and (not variable)", ANDN will produce larger
+                        code once everything is moved into registers (as a side-note,
+                        "const and (not const)" and "variable and (not const)" will
+                        have been simplified earlier to remove the NOT operation). }
+                      not (cs_opt_size in current_settings.optimizerswitches) or
+                      (
+                        (left.location.loc <> LOC_CONSTANT) and
+                        (right.location.loc <> LOC_CONSTANT)
+                      )
+                    ) then
+                    begin
+                      { ANDN only supports the second operand being inverted; however,
+                        since we're dealing with ordinals, there won't be any Boolean
+                        shortcutting, so we can safely swap the parameters }
+                      if (right.nodetype <> notn) then
+                        swapleftright;
+                      secondpass(left);
+                      { Skip the not node completely }
+                      secondpass(tnotnode(right).left);
+                      { allocate registers }
+                      hlcg.location_force_reg(
+                        current_asmdata.CurrAsmList,
+                        tnotnode(right).left.location,
+                        tnotnode(right).left.resultdef,
+                        tnotnode(right).left.resultdef,
+                        false
+                      );
+                      if left.location.loc = LOC_CONSTANT then
+                        { With "const and (not variable)", we can probably still make a
+                          saving when it comes to pipeline stalls (left.location.loc
+                          will become LOC_CREGISTER). }
+                        hlcg.location_force_reg(
+                          current_asmdata.CurrAsmList,
+                          left.location,
+                          left.resultdef,
+                          left.resultdef,
+                          true
+                        );
+                      set_result_location_reg;
+                      case left.location.loc of
+                        LOC_REFERENCE,
+                        LOC_CREFERENCE:
+                          emit_ref_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.reference, tnotnode(right).left.location.register, location.register);
+                        LOC_REGISTER,
+                        LOC_CREGISTER:
+                          emit_reg_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.register, tnotnode(right).left.location.register, location.register);
+                        else
+                          InternalError(2022102110);
+                      end;
+                      { Overflow can't happen with and/andn }
+                      Exit;
+                    end;
+                end;
+              { BMI2 optimisations }
+              if (CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) then
+                begin
+                  { Can we turn "x and ((1 shl y) - 1)" into a BZHI instruction instead? }
+                  if (nodetype = andn) and
+                    (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
+                    (
+                      (
+                        (right.nodetype = subn) and
+                        (taddnode(right).right.nodetype = ordconstn) and
+                        (tordconstnode(taddnode(right).right).value = 1) and
+                        (taddnode(right).left.nodetype = shln) and
+                        (tshlshrnode(taddnode(right).left).left.nodetype = ordconstn) and
+                        (tordconstnode(tshlshrnode(taddnode(right).left).left).value = 1)
+                      ) or
+                      (
+                        (left.nodetype = subn) and
+                        (taddnode(left).right.nodetype = ordconstn) and
+                        (tordconstnode(taddnode(left).right).value = 1) and
+                        (taddnode(left).left.nodetype = shln) and
+                        (tshlshrnode(taddnode(left).left).left.nodetype = ordconstn) and
+                        (tordconstnode(tshlshrnode(taddnode(left).left).left).value = 1)
+                      )
+                    ) then
+                    begin
+                      { Put the subtract node on the right }
+                      if (right.nodetype <> subn) then
+                        swapleftright;
+                      secondpass(left);
+                      { Skip the subtract and shift nodes completely }
+                      { Helps avoid all the awkward typecasts }
+                      indexnode := tshlshrnode(taddnode(right).left).right;
+{$ifdef x86_64}
+                      { The code generator sometimes extends the shift result to 64-bit unnecessarily }
+                      if (indexnode.nodetype = typeconvn) and (opsize in [OS_32, OS_S32]) and
+                        (def_cgsize(TTypeConvNode(indexnode).resultdef) in [OS_64, OS_S64]) then
+                        begin
+                          { Convert to the 32-bit type }
+                          indexnode.resultdef := resultdef;
+                          node_reset_flags(indexnode,[nf_pass1_done]);
+                          { We shouldn't be getting any new errors }
+                          if do_firstpass(indexnode) then
+                            InternalError(2022110201);
+                          { Keep things internally consistent in case indexnode changed }
+                          tshlshrnode(taddnode(right).left).right := indexnode;
+                        end;
+{$endif x86_64}
+                      secondpass(indexnode);
+                      { allocate registers }
+                      hlcg.location_force_reg(
+                        current_asmdata.CurrAsmList,
+                        indexnode.location,
+                        indexnode.resultdef,
+                        resultdef,
+                        false
+                      );
+                      set_result_location_reg;
+                      case left.location.loc of
+                        LOC_REFERENCE,
+                        LOC_CREFERENCE:
+                          emit_reg_ref_reg(A_BZHI, TCGSize2OpSize[opsize], indexnode.location.register, left.location.reference, location.register);
+                        LOC_REGISTER,
+                        LOC_CREGISTER:
+                          emit_reg_reg_reg(A_BZHI, TCGSize2OpSize[opsize], indexnode.location.register, left.location.register, location.register);
+                        else
+                          InternalError(2022102111);
+                      end;
+                      Exit;
+                    end;
+                end;
+            end;
 {$endif not i8086}
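
For comparison (again a hypothetical sketch, not taken from the commit): the pre-existing BMI1 branch, which this hunk only re-nests under the optimisation-level check without changing its logic, matches the analogous "x and (not y)" source shape:

    function AndNot(x, y: LongWord): LongWord;
      begin
        { "x and (not y)" can be emitted as a single ANDN
          instruction on BMI1-capable CPUs, avoiding a separate
          NOT followed by an AND }
        Result := x and (not y);
      end;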