* x86: New SHR-based optimisations
commit 6757bf3832
parent e21186cac0
@@ -191,6 +191,8 @@ unit aoptcpu;
              end;
            A_SHL, A_SAL:
              Result:=OptPass1SHLSAL(p);
            A_SHR:
              Result:=OptPass1SHR(p);
            A_SUB:
              Result:=OptPass1Sub(p);
            A_Jcc:
@@ -135,6 +135,7 @@ unit aoptx86;

        class function IsExitCode(p : tai) : boolean; static;
        class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static;
        class function IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean; static;
        procedure RemoveLastDeallocForFuncRes(p : tai);

        function DoSubAddOpt(var p : tai) : Boolean;
@@ -157,6 +158,7 @@ unit aoptx86;
        function OptPass1LEA(var p : tai) : boolean;
        function OptPass1Sub(var p : tai) : boolean;
        function OptPass1SHLSAL(var p : tai) : boolean;
        function OptPass1SHR(var p : tai) : boolean;
        function OptPass1FSTP(var p : tai) : boolean;
        function OptPass1FLD(var p : tai) : boolean;
        function OptPass1Cmp(var p : tai) : boolean;
@@ -6448,6 +6450,146 @@ unit aoptx86;
      end;


    class function TX86AsmOptimizer.IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean;
      begin
        case shr_size of
          S_B:
            { No valid combinations }
            Result := False;

          S_W:
            Result := (Shift >= 8) and (movz_size = S_BW);

          S_L:
            Result :=
              (Shift >= 24) { Any opsize is valid for this shift } or
              ((Shift >= 16) and (movz_size = S_WL));
{$ifdef x86_64}
          S_Q:
            Result :=
              (Shift >= 56) { Any opsize is valid for this shift } or
              ((Shift >= 48) and (movz_size = S_WL));
{$endif x86_64}
          else
            InternalError(2022081510);
        end;
      end;
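
The helper above encodes a simple invariant: once a register has been shifted right by a large enough constant, its upper bits are already zero, so a following zero-extension (MOVZX) cannot change the value. A minimal Free Pascal sketch of that invariant for the 32-bit case, using illustrative names (Value, Shift) rather than compiler symbols:

program ShrMovzFold;
{$mode objfpc}
var
  Value, Shifted: DWord;
  Shift: Integer;
begin
  Value := $DEADBEEF;
  for Shift := 16 to 31 do
    begin
      Shifted := Value shr Shift;
      { A shift of 16 or more leaves at most 16 significant bits, so a
        word-sized zero-extension (movzwl) is an identity operation }
      if (Shifted and $FFFF) <> Shifted then
        Writeln('movzwl would change the value at shift ', Shift);
      { A shift of 24 or more leaves at most 8 bits, so movzbl is one too }
      if (Shift >= 24) and ((Shifted and $FF) <> Shifted) then
        Writeln('movzbl would change the value at shift ', Shift);
    end;
  Writeln('No mismatches reported: the zero-extensions are redundant.');
end.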

    function TX86AsmOptimizer.OptPass1SHR(var p : tai) : boolean;
      var
        hp1, hp2: tai;
        Shift: TCGInt;
        LimitSize: Topsize;
        DoNotMerge: Boolean;
      begin
        Result := False;

        { All these optimisations work on "shr const,%reg" }
        if not MatchOpType(taicpu(p), top_const, top_reg) then
          Exit;

        DoNotMerge := False;
        Shift := taicpu(p).oper[0]^.val;
        LimitSize := taicpu(p).opsize;

        hp1 := p;
        repeat
          if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then
            Exit;

          case taicpu(hp1).opcode of
            A_TEST, A_CMP, A_Jcc:
              { Skip over conditional jumps and relevant comparisons }
              Continue;

            A_MOVZX:
              if MatchOpType(taicpu(hp1), top_reg, top_reg) and
                SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then
                begin
                  { Since the original register is being read as is, subsequent
                    SHRs must not be merged at this point }
                  DoNotMerge := True;

                  if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
                    begin
                      if not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then { Different register target }
                        begin
                          DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 1)', hp1);
                          taicpu(hp1).opcode := A_MOV;
                          setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg));
                          case taicpu(hp1).opsize of
                            S_BW:
                              taicpu(hp1).opsize := S_W;
                            S_BL, S_WL:
                              taicpu(hp1).opsize := S_L;
                            else
                              InternalError(2022081503);
                          end;

                          { p itself hasn't changed, so no need to set Result to True }
                          Include(OptsToCheck, aoc_ForceNewIteration);

                          { See if there's anything afterwards that can be
                            optimised, since the input register hasn't changed }
                          Continue;
                        end;

                      { NOTE: If the MOVZX instruction reads and writes the same
                        register, defer this to the post-peephole optimisation stage }
                      Exit;
                    end;
                end;

            A_SHL, A_SAL, A_SHR:
              if (taicpu(hp1).opsize <= LimitSize) and
                MatchOpType(taicpu(hp1), top_const, top_reg) and
                SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
                begin
                  { Make sure the sizes don't exceed the register size limit
                    (measured by the shift value falling below the limit) }

                  if taicpu(hp1).opsize < LimitSize then
                    LimitSize := taicpu(hp1).opsize;

                  if taicpu(hp1).opcode = A_SHR then
                    Inc(Shift, taicpu(hp1).oper[0]^.val)
                  else
                    begin
                      Dec(Shift, taicpu(hp1).oper[0]^.val);
                      DoNotMerge := True;
                    end;

                  if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then
                    Exit;

                  { Since we've established that the combined shift is within
                    limits, we can actually combine the adjacent SHR
                    instructions even if they're different sizes }
                  if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then
                    begin
                      hp2 := tai(hp1.Previous);
                      DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 1', p);
                      Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val);
                      RemoveInstruction(hp1);
                      hp1 := hp2;

                      { Though p has changed, only the constant has, and its
                        effects can still be detected on the next iteration of
                        the repeat..until loop }
                      Include(OptsToCheck, aoc_ForceNewIteration);
                    end;

                  { Move onto the next instruction }
                  Continue;
                end;
            else
              ;
          end;

          Break;
        until False;
      end;
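
The ShrShr2Shr branch above rests on the identity (x shr a) shr b = x shr (a + b), which holds as long as the combined count stays within the operand width; that is what the running Shift/LimitSize bookkeeping checks, and why DoNotMerge blocks the merge once an intervening SHL/SAL or MOVZX has been seen. A small Free Pascal sketch of the identity with illustrative names only:

program ShrShrMerge;
{$mode objfpc}
var
  x: DWord;
  a, b: Integer;
begin
  x := $CAFEBABE;
  { Two logical right shifts by constants compose additively as long as the
    combined count stays below the operand width (32 bits here), mirroring
    the limit check done before the peephole merges two SHR instructions }
  for a := 0 to 15 do
    for b := 0 to 15 do
      if ((x shr a) shr b) <> (x shr (a + b)) then
        Writeln('Mismatch at a=', a, ' b=', b);
  Writeln('All shift pairs within the width limit merge correctly.');
end.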

    function TX86AsmOptimizer.CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean;
      var
        CurrentRef: TReference;
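
When IsShrMovZFoldable holds and the MOVZX writes a different register, OptPass1SHR rewrites it as a plain MOV: the source's upper bits are already clear, so copying the whole register gives the same result as zero-extending a sub-register. A sketch of that equivalence, with illustrative variable names standing in for register contents:

program ShrMovzToMov;
{$mode objfpc}
var
  RegValue, ViaMovzwl, ViaMovl: DWord;
begin
  { After "shrl $16,%eax" the upper 16 bits of the register are guaranteed
    to be zero }
  RegValue := DWord($12345678) shr 16;
  ViaMovzwl := RegValue and $FFFF;  { movzwl %ax,%edx: zero-extend the low word }
  ViaMovl   := RegValue;            { movl %eax,%edx: plain 32-bit copy }
  if ViaMovzwl = ViaMovl then
    Writeln('movzwl and movl agree once the high bits are already clear.')
  else
    Writeln('Unexpected mismatch.');
end.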
@@ -12930,36 +13072,193 @@ unit aoptx86;

    function TX86AsmOptimizer.PostPeepholeOptShr(var p : tai) : boolean;
      var
        hp1: tai;
        hp1, hp2: tai;
        IdentityMask, Shift: TCGInt;
        LimitSize: Topsize;
        DoNotMerge: Boolean;
      begin
        { Detect:
            shr    x,  %ax (x > 0)
            ...
            movzwl %ax,%eax

          Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
        }

        Result := False;
        if MatchOpType(taicpu(p), top_const, top_reg) and
          (taicpu(p).oper[1]^.reg = NR_AX) and { This is also enough to determine that opsize = S_W }
          (taicpu(p).oper[0]^.val > 0) and
          GetNextInstructionUsingReg(p, hp1, NR_EAX) and
          MatchInstruction(hp1, A_MOVZX, [S_WL]) and
          MatchOperand(taicpu(hp1).oper[0]^, NR_AX) and
          MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) then
          begin
            DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
            taicpu(hp1).opcode := A_CWDE;
            taicpu(hp1).clearop(0);
            taicpu(hp1).clearop(1);
            taicpu(hp1).ops := 0;

            { A change was made, but not with p, so move forward 1 }
            p := tai(p.Next);
            Result := True;
        { All these optimisations work on "shr const,%reg" }
        if not MatchOpType(taicpu(p), top_const, top_reg) then
          Exit;

        DoNotMerge := False;
        Shift := taicpu(p).oper[0]^.val;
        LimitSize := taicpu(p).opsize;

        hp1 := p;
        repeat
          if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then
            Exit;

          { Detect:
              shr x, %reg
              and y, %reg

            If "and y, %reg" doesn't actually change the value of %reg (e.g. with
            "shrl $24,%reg; andl $255,%reg"), remove the AND instruction.
          }

          case taicpu(hp1).opcode of
            A_AND:
              if (taicpu(hp1).opsize = taicpu(p).opsize) and
                MatchOpType(taicpu(hp1), top_const, top_reg) and
                (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then
                begin
                  { Make sure the FLAGS register isn't in use }
                  TransferUsedRegs(TmpUsedRegs);
                  hp2 := p;
                  repeat
                    UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
                  until not GetNextInstruction(hp2, hp2) or (hp2 = hp1);

                  if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then
                    begin
                      { Generate the identity mask }
                      case taicpu(p).opsize of
                        S_B:
                          IdentityMask := $FF shr Shift;
                        S_W:
                          IdentityMask := $FFFF shr Shift;
                        S_L:
                          IdentityMask := $FFFFFFFF shr Shift;
{$ifdef x86_64}
                        S_Q:
                          { We need to force the operands to be unsigned 64-bit
                            integers otherwise the wrong value is generated }
                          IdentityMask := TCGInt(QWord($FFFFFFFFFFFFFFFF) shr QWord(Shift));
{$endif x86_64}
                        else
                          InternalError(2022081501);
                      end;

                      if (taicpu(hp1).oper[0]^.val and IdentityMask) = IdentityMask then
                        begin
                          DebugMsg(SPeepholeOptimization + 'Removed AND instruction since previous SHR makes this an identity operation (ShrAnd2Shr)', hp1);
                          { All the possible 1 bits are covered, so we can remove the AND }
                          hp2 := tai(hp1.Previous);
                          RemoveInstruction(hp1);

                          { p wasn't actually changed, so don't set Result to True,
                            but a change was nonetheless made elsewhere }
                          Include(OptsToCheck, aoc_ForceNewIteration);

                          { Do another pass in case other AND or MOVZX instructions
                            follow }
                          hp1 := hp2;
                          Continue;
                        end;

                    end;
                end;

            A_TEST, A_CMP, A_Jcc:
              { Skip over conditional jumps and relevant comparisons }
              Continue;

            A_MOVZX:
              if MatchOpType(taicpu(hp1), top_reg, top_reg) and
                SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then
                begin
                  { Since the original register is being read as is, subsequent
                    SHRs must not be merged at this point }
                  DoNotMerge := True;

                  if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then
                    begin
                      if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then
                        begin
                          DebugMsg(SPeepholeOptimization + 'Removed MOVZX instruction since previous SHR makes it unnecessary (ShrMovz2Shr)', hp1);
                          { The upper bits are already zero, so we can remove the MOVZX }
                          hp2 := tai(hp1.Previous);
                          RemoveInstruction(hp1);

                          hp1 := hp2;
                        end
                      else { Different register target }
                        begin
                          DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 2)', hp1);
                          taicpu(hp1).opcode := A_MOV;
                          setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg));
                          case taicpu(hp1).opsize of
                            S_BW:
                              taicpu(hp1).opsize := S_W;
                            S_BL, S_WL:
                              taicpu(hp1).opsize := S_L;
                            else
                              InternalError(2022081503);
                          end;
                        end;
                    end
                  else if (Shift > 0) and
                    (taicpu(p).opsize = S_W) and
                    (taicpu(hp1).opsize = S_WL) and
                    (taicpu(hp1).oper[0]^.reg = NR_AX) and
                    (taicpu(hp1).oper[1]^.reg = NR_EAX) then
                    begin
                      { Detect:
                          shr    x,  %ax (x > 0)
                          ...
                          movzwl %ax,%eax

                        Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax)
                      }
                      DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1);
                      taicpu(hp1).opcode := A_CWDE;
                      taicpu(hp1).clearop(0);
                      taicpu(hp1).clearop(1);
                      taicpu(hp1).ops := 0;
                    end;

                  { Move onto the next instruction }
                  Continue;
                end;

            A_SHL, A_SAL, A_SHR:
              if (taicpu(hp1).opsize <= LimitSize) and
                MatchOpType(taicpu(hp1), top_const, top_reg) and
                SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then
                begin
                  { Make sure the sizes don't exceed the register size limit
                    (measured by the shift value falling below the limit) }

                  if taicpu(hp1).opsize < LimitSize then
                    LimitSize := taicpu(hp1).opsize;

                  if taicpu(hp1).opcode = A_SHR then
                    Inc(Shift, taicpu(hp1).oper[0]^.val)
                  else
                    begin
                      Dec(Shift, taicpu(hp1).oper[0]^.val);
                      DoNotMerge := True;
                    end;

                  if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then
                    Exit;

                  { Since we've established that the combined shift is within
                    limits, we can actually combine the adjacent SHR
                    instructions even if they're different sizes }
                  if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then
                    begin
                      hp2 := tai(hp1.Previous);
                      DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 2', p);
                      Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val);
                      RemoveInstruction(hp1);
                      hp1 := hp2;
                    end;

                  { Move onto the next instruction }
                  Continue;
                end;
            else
              ;
          end;

          Break;
        until False;

      end;
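
The ShrAnd2Shr case above works because shr by Shift confines the value to the identity mask ($FF, $FFFF, $FFFFFFFF or the 64-bit mask, shifted right by Shift); any AND whose constant covers that mask is a no-op and can be dropped once the FLAGS register is known to be dead. A Free Pascal sketch of the mask test for the 32-bit case, with illustrative names:

program ShrAndIdentity;
{$mode objfpc}
var
  Value, IdentityMask, AndConst: DWord;
  Shift: Integer;
begin
  Value := $89ABCDEF;
  Shift := 24;
  AndConst := $FF;                      { as in "shrl $24,%reg; andl $255,%reg" }
  IdentityMask := $FFFFFFFF shr Shift;  { every bit the shifted value can still have set }

  { If the AND constant covers the whole identity mask, the AND cannot
    change the shifted value, which is the condition the peephole tests
    before removing it }
  if (AndConst and IdentityMask) = IdentityMask then
    if ((Value shr Shift) and AndConst) = (Value shr Shift) then
      Writeln('andl $255 after shrl $24 is an identity operation.');
end.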
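The ShrMovz2ShrCwtl rewrite uses the same idea in sign-extension form: after shr x,%ax with x > 0, bit 15 of AX is clear, so sign-extending AX into EAX (cwtl, a shorter encoding than movzwl %ax,%eax) yields the same result as zero-extending it. A sketch of that equivalence with illustrative names:

program ShrCwtl;
{$mode objfpc}
var
  AxValue: Word;
  ZeroExtended, SignExtended: LongInt;
begin
  { Any shr with a count greater than zero clears bit 15 of AX }
  AxValue := Word($FEDC) shr 1;
  ZeroExtended := LongInt(LongWord(AxValue));  { what movzwl %ax,%eax produces }
  SignExtended := LongInt(SmallInt(AxValue));  { what cwtl produces }
  if ZeroExtended = SignExtended then
    Writeln('cwtl matches movzwl once the top bit of AX is clear.');
end.
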
@@ -130,6 +130,8 @@ uses
            result:=OptPass1Sub(p);
          A_SHL,A_SAL:
            result:=OptPass1SHLSAL(p);
          A_SHR:
            result:=OptPass1SHR(p);
          A_FSTP,A_FISTP:
            result:=OptPass1FSTP(p);
          A_FLD: