+ patch by J. Gareth Moreton: SHL-centric peephole optimisations, resolves #37389

git-svn-id: trunk@45811 -
This commit is contained in:
florian 2020-07-19 20:50:13 +00:00
parent d6d92e3666
commit 09125e834f

View File

@ -3396,10 +3396,15 @@ unit aoptx86;
TmpBool1,TmpBool2 : Boolean;
tmpref : treference;
hp1,hp2: tai;
mask: tcgint;
begin
Result:=false;
if MatchOpType(taicpu(p),top_const,top_reg) and
(taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
{ All these optimisations work on "shl/sal const,%reg" }
if not MatchOpType(taicpu(p),top_const,top_reg) then
Exit;
if (taicpu(p).opsize in [S_L{$ifdef x86_64},S_Q{$endif x86_64}]) and
(taicpu(p).oper[0]^.val <= 3) then
{ Changes "shl const, %reg32; add const/reg, %reg32" to one lea statement }
begin
@ -3511,8 +3516,7 @@ unit aoptx86;
end;
end
{$ifndef x86_64}
else if (current_settings.optimizecputype < cpu_Pentium2) and
MatchOpType(taicpu(p),top_const,top_reg) then
else if (current_settings.optimizecputype < cpu_Pentium2) then
begin
{ changes "shl $1, %reg" to "add %reg, %reg", which is the same on a 386,
but faster on a 486, and pairable in both U and V pipes on the Pentium
@ -3540,7 +3544,130 @@ unit aoptx86;
end;
end
{$endif x86_64}
;
else if
GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) and MatchOpType(taicpu(hp1), top_const, top_reg) and
(
(
MatchInstruction(hp1, A_AND, [taicpu(p).opsize]) and
SetAndTest(hp1, hp2)
{$ifdef x86_64}
) or
(
MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2, A_AND, [taicpu(p).opsize]) and
MatchOpType(taicpu(hp2), top_reg, top_reg) and
(taicpu(hp1).oper[1]^.reg = taicpu(hp2).oper[0]^.reg)
{$endif x86_64}
)
) and
(taicpu(p).oper[1]^.reg = taicpu(hp2).oper[1]^.reg) then
begin
{ Change:
shl x, %reg1
mov -(1<<x), %reg2
and %reg2, %reg1
Or:
shl x, %reg1
and -(1<<x), %reg1
To just:
shl x, %reg1
Since the and operation only zeroes bits that are already zero from the shl operation
}
case taicpu(p).oper[0]^.val of
8:
mask:=$FFFFFFFFFFFFFF00;
16:
mask:=$FFFFFFFFFFFF0000;
32:
mask:=$FFFFFFFF00000000;
63:
{ Constant pre-calculated to prevent overflow errors with Int64 }
mask:=$8000000000000000;
else
begin
if taicpu(p).oper[0]^.val >= 64 then
{ Shouldn't happen realistically, since the register
is guaranteed to be set to zero at this point }
mask := 0
else
mask := -(Int64(1 shl taicpu(p).oper[0]^.val));
end;
end;
if taicpu(hp1).oper[0]^.val = mask then
begin
{ Everything checks out, perform the optimisation, as long as
the FLAGS register isn't being used }
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.next));
{$ifdef x86_64}
if (hp1 <> hp2) then
begin
{ "shl/mov/and" version }
UpdateUsedRegs(TmpUsedRegs, tai(hp1.next));
{ Don't do the optimisation if the FLAGS register is in use }
if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp2, TmpUsedRegs)) then
begin
DebugMsg(SPeepholeOptimization + 'ShlMovAnd2Shl', p);
{ Don't remove the 'mov' instruction if its register is used elsewhere }
if not(RegUsedAfterInstruction(taicpu(hp1).oper[1]^.reg, hp2, TmpUsedRegs)) then
begin
asml.Remove(hp1);
hp1.Free;
Result := True;
end;
{ Only set Result to True if the 'mov' instruction was removed }
asml.Remove(hp2);
hp2.Free;
end;
end
else
{$endif x86_64}
begin
{ "shl/and" version }
{ Don't do the optimisation if the FLAGS register is in use }
if not(RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs)) then
begin
DebugMsg(SPeepholeOptimization + 'ShlAnd2Shl', p);
asml.Remove(hp1);
hp1.Free;
Result := True;
end;
end;
Exit;
end
else {$ifdef x86_64}if (hp1 = hp2) then{$endif x86_64}
begin
{ Even if the mask doesn't allow for its removal, we might be
able to optimise the mask for the "shl/and" version, which
may permit other peephole optimisations }
{$ifdef DEBUG_AOPTCPU}
mask := taicpu(hp1).oper[0]^.val and mask;
if taicpu(hp1).oper[0]^.val <> mask then
begin
DebugMsg(
SPeepholeOptimization +
'Changed mask from $' + debug_tostr(taicpu(hp1).oper[0]^.val) +
' to $' + debug_tostr(mask) +
'based on previous instruction (ShlAnd2ShlAnd)', hp1);
taicpu(hp1).oper[0]^.val := mask;
end;
{$else DEBUG_AOPTCPU}
{ If debugging is off, just set the operand even if it's the same }
taicpu(hp1).oper[0]^.val := taicpu(hp1).oper[0]^.val and mask;
{$endif DEBUG_AOPTCPU}
end;
end;
end;
@ -5357,6 +5484,35 @@ unit aoptx86;
hp1.Free;
end;
end
else if reg_and_hp1_is_instr and
(taicpu(p).oper[0]^.typ = top_reg) and
(
(taicpu(hp1).opcode = A_SHL) or (taicpu(hp1).opcode = A_SAL)
) and
(taicpu(hp1).oper[0]^.typ = top_const) and
SuperRegistersEqual(taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg) and
MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) and
{ Minimum shift value allowed is the bit difference between the sizes }
(taicpu(hp1).oper[0]^.val >=
{ Multiply by 8 because tcgsize2size returns bytes, not bits }
8 * (
tcgsize2size[reg_cgsize(taicpu(p).oper[1]^.reg)] -
tcgsize2size[reg_cgsize(taicpu(p).oper[0]^.reg)]
)
) then
begin
{ For:
movsx/movzx %reg1,%reg1 (same register, just different sizes)
shl/sal ##, %reg1
Remove the movsx/movzx instruction if the shift overwrites the
extended bits of the register (e.g. movslq %eax,%rax; shlq $32,%rax)
}
DebugMsg(SPeepholeOptimization + 'MovxShl2Shl',p);
RemoveCurrentP(p, hp1);
Result := True;
Exit;
end
else if taicpu(p).opcode=A_MOVZX then
begin
{ removes superfluous And's after movzx's }