* x86: The code generator will now attempt to manipulate "x and ((1 shl y) - 1)" to use BZHI

parent e748e4c7ab
commit 7da9b4a988
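
For context, the source pattern this commit targets looks like the following; a minimal sketch (the program, function name and constants are illustrative, not part of the commit):

    program bzhidemo;
    { With -O2 on a CPU type that includes BMI2, the mask below matches the
      "x and ((1 shl y) - 1)" pattern and can now be emitted as a single BZHI
      instruction instead of a shift, a decrement and an AND. }
    function LowBits(x, y: Cardinal): Cardinal;
      begin
        LowBits := x and ((1 shl y) - 1);  { keep only the low y bits of x }
      end;
    begin
      WriteLn(LowBits($DEADBEEF, 8));      { prints 239, i.e. $EF }
    end.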
@@ -79,10 +79,10 @@ unit nx86add;
       aasmbase,aasmdata,aasmcpu,
       symconst,symdef,
       cgobj,hlcgobj,cgx86,cga,cgutils,
-      tgobj,ncgutil,
+      tgobj,ncgutil,nutils,
       ncon,nset,ninl,ncnv,ncal,nmat,
       defutil,defcmp,constexp,
-      pass_2,htypechk;
+      pass_1,pass_2,htypechk;
 
     { Range check must be disabled explicitly as the code serves
       on three different architecture sizes }
@@ -1892,6 +1892,7 @@ unit nx86add;
         checkoverflow : Boolean;
         ovloc : tlocation;
         tmpreg : TRegister;
+        indexnode : TNode;
       begin
         { determine if the comparison will be unsigned }
         unsigned:=not(is_signed(left.resultdef)) or
@@ -1944,73 +1945,156 @@ unit nx86add;
         opsize:=def_cgsize(left.resultdef);
 
 {$ifndef i8086}
-        { BMI1 optimisations }
-        if (cs_opt_level2 in current_settings.optimizerswitches) and
-          (CPUX86_HAS_BMI1 in cpu_capabilities[current_settings.cputype]) then
+        if (cs_opt_level2 in current_settings.optimizerswitches) then
           begin
-            { Can we turn "x and (not y)" into an ANDN instruction instead? }
-            if (nodetype = andn) and
-              (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
-              ((left.nodetype = notn) or (right.nodetype = notn)) and
-              (
-                { With "const and (not variable)", ANDN will produce larger
-                  code once everything is moved into registers (as a side-note,
-                  "const and (not const)" and "variable and (not const)" will
-                  have been simplified earlier to remove the NOT operation). }
-                not (cs_opt_size in current_settings.optimizerswitches) or
-                (
-                  (left.location.loc <> LOC_CONSTANT) and
-                  (right.location.loc <> LOC_CONSTANT)
-                )
-              ) then
+            { BMI1 optimisations }
+            if (CPUX86_HAS_BMI1 in cpu_capabilities[current_settings.cputype]) then
               begin
-                { ANDN only supports the second operand being inverted; however,
-                  since we're dealing with ordinals, there won't be any Boolean
-                  shortcutting, so we can safely swap the parameters }
+                { Can we turn "x and (not y)" into an ANDN instruction instead? }
+                if (nodetype = andn) and
+                  (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
+                  ((left.nodetype = notn) or (right.nodetype = notn)) and
+                  (
+                    { With "const and (not variable)", ANDN will produce larger
+                      code once everything is moved into registers (as a side-note,
+                      "const and (not const)" and "variable and (not const)" will
+                      have been simplified earlier to remove the NOT operation). }
+                    not (cs_opt_size in current_settings.optimizerswitches) or
+                    (
+                      (left.location.loc <> LOC_CONSTANT) and
+                      (right.location.loc <> LOC_CONSTANT)
+                    )
+                  ) then
+                  begin
+                    { ANDN only supports the second operand being inverted; however,
+                      since we're dealing with ordinals, there won't be any Boolean
+                      shortcutting, so we can safely swap the parameters }
 
                     if (right.nodetype <> notn) then
                       swapleftright;
 
                     secondpass(left);
                     { Skip the not node completely }
                     secondpass(tnotnode(right).left);
 
                     { allocate registers }
                     hlcg.location_force_reg(
                       current_asmdata.CurrAsmList,
                       tnotnode(right).left.location,
                       tnotnode(right).left.resultdef,
                       tnotnode(right).left.resultdef,
                       false
                     );
 
                     if left.location.loc = LOC_CONSTANT then
                       { With "const and (not variable)", we can probably still make a
                         saving when it comes to pipeline stalls (left.location.loc
                         will become LOC_CREGISTER). }
                       hlcg.location_force_reg(
                         current_asmdata.CurrAsmList,
                         left.location,
                         left.resultdef,
                         left.resultdef,
                         true
                       );
 
                     set_result_location_reg;
 
                     case left.location.loc of
                       LOC_REFERENCE,
                       LOC_CREFERENCE:
                         emit_ref_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.reference, tnotnode(right).left.location.register, location.register);
                       LOC_REGISTER,
                       LOC_CREGISTER:
-                        emit_reg_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.register, tnotnode(right).left.location.register, location.register)
+                        emit_reg_reg_reg(A_ANDN, TCGSize2OpSize[opsize], left.location.register, tnotnode(right).left.location.register, location.register);
                       else
-                        InternalError(2022102101);
+                        InternalError(2022102110);
                     end;
 
                     { Overflow can't happen with and/andn }
                     Exit;
+                  end;
+              end;
+
+            { BMI2 optimisations }
+            if (CPUX86_HAS_BMI2 in cpu_capabilities[current_settings.cputype]) then
+              begin
+                { Can we turn "x and ((1 shl y) - 1)" into a BZHI instruction instead? }
+                if (nodetype = andn) and
+                  (opsize in [OS_32, OS_S32{$ifdef x86_64}, OS_64, OS_S64{$endif x86_64}]) and
+                  (
+                    (
+                      (right.nodetype = subn) and
+                      (taddnode(right).right.nodetype = ordconstn) and
+                      (tordconstnode(taddnode(right).right).value = 1) and
+                      (taddnode(right).left.nodetype = shln) and
+                      (tshlshrnode(taddnode(right).left).left.nodetype = ordconstn) and
+                      (tordconstnode(tshlshrnode(taddnode(right).left).left).value = 1)
+                    ) or
+                    (
+                      (left.nodetype = subn) and
+                      (taddnode(left).right.nodetype = ordconstn) and
+                      (tordconstnode(taddnode(left).right).value = 1) and
+                      (taddnode(left).left.nodetype = shln) and
+                      (tshlshrnode(taddnode(left).left).left.nodetype = ordconstn) and
+                      (tordconstnode(tshlshrnode(taddnode(left).left).left).value = 1)
+                    )
+                  ) then
+                  begin
+
+                    { Put the subtract node on the right }
+                    if (right.nodetype <> subn) then
+                      swapleftright;
+
+                    secondpass(left);
+
+                    { Skip the subtract and shift nodes completely }
+
+                    { Helps avoid all the awkward typecasts }
+                    indexnode := tshlshrnode(taddnode(right).left).right;
+{$ifdef x86_64}
+                    { The code generator sometimes extends the shift result to 64-bit unnecessarily }
+                    if (indexnode.nodetype = typeconvn) and (opsize in [OS_32, OS_S32]) and
+                      (def_cgsize(TTypeConvNode(indexnode).resultdef) in [OS_64, OS_S64]) then
+                      begin
+                        { Convert to the 32-bit type }
+                        indexnode.resultdef := resultdef;
+                        node_reset_flags(indexnode,[nf_pass1_done]);
+
+                        { We should't be getting any new errors }
+                        if do_firstpass(indexnode) then
+                          InternalError(2022110201);
+
+                        { Keep things internally consistent in case indexnode changed }
+                        tshlshrnode(taddnode(right).left).right := indexnode;
+                      end;
+{$endif x86_64}
+                    secondpass(indexnode);
+
+                    { allocate registers }
+                    hlcg.location_force_reg(
+                      current_asmdata.CurrAsmList,
+                      indexnode.location,
+                      indexnode.resultdef,
+                      resultdef,
+                      false
+                    );
+
+                    set_result_location_reg;
+
+                    case left.location.loc of
+                      LOC_REFERENCE,
+                      LOC_CREFERENCE:
+                        emit_reg_ref_reg(A_BZHI, TCGSize2OpSize[opsize], indexnode.location.register, left.location.reference, location.register);
+                      LOC_REGISTER,
+                      LOC_CREGISTER:
+                        emit_reg_reg_reg(A_BZHI, TCGSize2OpSize[opsize], indexnode.location.register, left.location.register, location.register);
+                      else
+                        InternalError(2022102111);
+                    end;
+
+                    Exit;
+                  end;
+              end;
           end;
 {$endif not i8086}
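
As a reference for the two instructions the diff emits (my summary of the BMI1/BMI2 semantics, not part of the commit): in Intel syntax, ANDN dest, src1, src2 computes (not src1) and src2, while BZHI dest, src, index copies src and zeroes every bit at position index[7:0] and above; an index of 32 or more, for 32-bit operands, leaves the value unchanged. (The diff's "second operand being inverted" comment presumably counts operands in the compiler's AT&T-style ordering, where the inverted source appears second.) A pure-Pascal model, with assumed names:

    program bmimodel;
    { Illustrative models of the instruction semantics; not code from the commit. }
    function ModelBZHI(src, index: Cardinal): Cardinal;
      begin
        if (index and $FF) >= 32 then
          ModelBZHI := src  { index saturates: the whole value is kept }
        else
          ModelBZHI := src and ((Cardinal(1) shl (index and $FF)) - 1);
      end;

    function ModelANDN(src1, src2: Cardinal): Cardinal;
      begin
        ModelANDN := (not src1) and src2;  { only src1 is inverted }
      end;

    begin
      WriteLn(ModelBZHI($DEADBEEF, 8) = $EF);               { TRUE }
      WriteLn(ModelANDN($F0F0F0F0, $FFFFFFFF) = $0F0F0F0F); { TRUE }
    end.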