* x86: Introduced TrySwapMovOp method, and redesigned TrySwapMovCmp

to use it while also trying to move one more instruction back
This commit is contained in:
J. Gareth "Curious Kit" Moreton 2022-04-17 05:40:40 +01:00 committed by FPK
parent 6af886c2b9
commit 5f3749dc49

View File

@ -211,6 +211,7 @@ unit aoptx86;
procedure ConvertJumpToRET(const p: tai; const ret_p: tai);
function CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean;
function TrySwapMovOp(var p, hp1: tai): Boolean;
function TrySwapMovCmp(var p, hp1: tai): Boolean;
{ Processor-dependent reference optimisation }
@ -8453,10 +8454,10 @@ unit aoptx86;
Break;
case taicpu(hp2).opcode of
A_MOVSS:
A_MOVSD:
begin
if taicpu(hp2).ops = 0 then
{ Wrong MOVSS }
{ Wrong MOVSD }
Break;
Inc(Count);
if Count >= 5 then
@ -8475,7 +8476,7 @@ unit aoptx86;
A_MOVZX,
A_MOVAPS,
A_MOVUPS,
A_MOVSD,
A_MOVSS,
A_MOVAPD,
A_MOVUPD,
A_MOVDQA,
@ -8626,41 +8627,38 @@ unit aoptx86;
end;
function TX86AsmOptimizer.TrySwapMovCmp(var p, hp1: tai): Boolean;
const
WriteOp: array[0..3] of set of TInsChange = (
[Ch_Wop1, Ch_RWop1, Ch_Mop1],
[Ch_Wop2, Ch_RWop2, Ch_Mop2],
[Ch_Wop3, Ch_RWop3, Ch_Mop3],
[Ch_Wop4, Ch_RWop4, Ch_Mop4]);
RegWriteFlags: array[0..7] of set of TInsChange = (
{ The order is important: EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP }
[Ch_WEAX, Ch_RWEAX, Ch_MEAX{$ifdef x86_64}, Ch_WRAX, Ch_RWRAX, Ch_MRAX{$endif x86_64}],
[Ch_WECX, Ch_RWECX, Ch_MECX{$ifdef x86_64}, Ch_WRCX, Ch_RWRCX, Ch_MRCX{$endif x86_64}],
[Ch_WEDX, Ch_RWEDX, Ch_MEDX{$ifdef x86_64}, Ch_WRDX, Ch_RWRDX, Ch_MRDX{$endif x86_64}],
[Ch_WEBX, Ch_RWEBX, Ch_MEBX{$ifdef x86_64}, Ch_WRBX, Ch_RWRBX, Ch_MRBX{$endif x86_64}],
[Ch_WESI, Ch_RWESI, Ch_MESI{$ifdef x86_64}, Ch_WRSI, Ch_RWRSI, Ch_MRSI{$endif x86_64}],
[Ch_WEDI, Ch_RWEDI, Ch_MEDI{$ifdef x86_64}, Ch_WRDI, Ch_RWRDI, Ch_MRDI{$endif x86_64}],
[Ch_WEBP, Ch_RWEBP, Ch_MEBP{$ifdef x86_64}, Ch_WRBP, Ch_RWRBP, Ch_MRBP{$endif x86_64}],
[Ch_WESP, Ch_RWESP, Ch_MESP{$ifdef x86_64}, Ch_WRSP, Ch_RWRSP, Ch_MRSP{$endif x86_64}]);
function TX86AsmOptimizer.TrySwapMovOp(var p, hp1: tai): Boolean;
var
hp2: tai;
X: Integer;
const
WriteOp: array[0..3] of set of TInsChange = (
[Ch_Wop1, Ch_RWop1, Ch_Mop1],
[Ch_Wop2, Ch_RWop2, Ch_Mop2],
[Ch_Wop3, Ch_RWop3, Ch_Mop3],
[Ch_Wop4, Ch_RWop4, Ch_Mop4]);
RegWriteFlags: array[0..7] of set of TInsChange = (
{ The order is important: EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP }
[Ch_WEAX, Ch_RWEAX, Ch_MEAX{$ifdef x86_64}, Ch_WRAX, Ch_RWRAX, Ch_MRAX{$endif x86_64}],
[Ch_WECX, Ch_RWECX, Ch_MECX{$ifdef x86_64}, Ch_WRCX, Ch_RWRCX, Ch_MRCX{$endif x86_64}],
[Ch_WEDX, Ch_RWEDX, Ch_MEDX{$ifdef x86_64}, Ch_WRDX, Ch_RWRDX, Ch_MRDX{$endif x86_64}],
[Ch_WEBX, Ch_RWEBX, Ch_MEBX{$ifdef x86_64}, Ch_WRBX, Ch_RWRBX, Ch_MRBX{$endif x86_64}],
[Ch_WESI, Ch_RWESI, Ch_MESI{$ifdef x86_64}, Ch_WRSI, Ch_RWRSI, Ch_MRSI{$endif x86_64}],
[Ch_WEDI, Ch_RWEDI, Ch_MEDI{$ifdef x86_64}, Ch_WRDI, Ch_RWRDI, Ch_MRDI{$endif x86_64}],
[Ch_WEBP, Ch_RWEBP, Ch_MEBP{$ifdef x86_64}, Ch_WRBP, Ch_RWRBP, Ch_MRBP{$endif x86_64}],
[Ch_WESP, Ch_RWESP, Ch_MESP{$ifdef x86_64}, Ch_WRSP, Ch_RWRSP, Ch_MRSP{$endif x86_64}]);
begin
{ If we have something like:
cmp ###,%reg1
mov 0,%reg2
op ###,###
mov ###,###
And no modified registers are shared, move the instruction to before
the comparison as this means it can be optimised without worrying
about the FLAGS register. (CMP/MOV is generated by
"J(c)Mov1JmpMov0 -> Set(~c)", among other things).
Try to move the MOV instruction to before OP as long as OP and MOV don't
interfere in regards to what they write to.
As long as the second instruction doesn't use the flags or one of the
registers used by CMP or TEST (also check any references that use the
registers), then it can be moved prior to the comparison.
NOTE: p must be a 2-operand instruction
}
Result := False;
@ -8672,12 +8670,12 @@ unit aoptx86;
{ NOP is a pipeline fence, likely marking the beginning of the function
epilogue, so drop out. Similarly, drop out if POP or RET are
encountered }
if MatchInstruction(hp1, A_NOP, A_POP, []) then
if MatchInstruction(hp1, A_NOP, A_POP, A_RET, []) then
Exit;
if (taicpu(hp1).opcode = A_MOVSS) and
if (taicpu(hp1).opcode = A_MOVSD) and
(taicpu(hp1).ops = 0) then
{ Wrong MOVSS }
{ Wrong MOVSD }
Exit;
{ Check for writes to specific registers first }
@ -8705,6 +8703,25 @@ unit aoptx86;
Exit;
end;
{ Check p to make sure it doesn't write to something that affects hp1 }
{ Check for writes to specific registers first }
{ EAX, ECX, EDX, EBX, ESI, EDI, EBP, ESP in that order }
for X := 0 to 7 do
if (RegWriteFlags[X] * InsProp[taicpu(p).opcode].Ch <> [])
and RegInInstruction(newreg(R_INTREGISTER, TSuperRegister(X), R_SUBWHOLE), hp1) then
Exit;
for X := 0 to taicpu(p).ops - 1 do
begin
{ Check to see if this operand writes to something }
if ((WriteOp[X] * InsProp[taicpu(p).opcode].Ch) <> []) and
{ And matches something in hp1 }
(taicpu(p).oper[X]^.typ = top_reg) and
RegInInstruction(taicpu(p).oper[X]^.reg, hp1) then
Exit;
end;
{ The instruction can be safely moved }
asml.Remove(hp1);
@ -8712,6 +8729,17 @@ unit aoptx86;
can be optimised into "xor %reg,%reg" later }
if SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p.Previous)), hp2) then
asml.InsertBefore(hp1, hp2)
{ Failing that, try to insert after the last instruction where the
FLAGS register is not yet in use }
else if GetLastInstruction(p, hp2) and
(
(hp2.typ <> ait_instruction) or
{ Don't insert after an instruction that uses the flags when p doesn't use them }
RegInInstruction(NR_DEFAULTFLAGS, p) or
not RegInInstruction(NR_DEFAULTFLAGS, hp2)
) then
asml.InsertAfter(hp1, hp2)
else
{ Note, if p.Previous is nil (even if it should logically never be the
case), FindRegAllocBackward immediately exits with False and so we
@ -8721,26 +8749,90 @@ unit aoptx86;
DebugMsg(SPeepholeOptimization + 'Swapped ' + debug_op2str(taicpu(p).opcode) + ' and ' + debug_op2str(taicpu(hp1).opcode) + ' instructions to improve optimisation potential', hp1);
{ We can't trust UsedRegs because we're looking backwards, although we
know the registers are allocated after p at the very least, so manually
create tai_regalloc objects if needed }
for X := 0 to taicpu(hp1).ops - 1 do
case taicpu(hp1).oper[X]^.typ of
top_reg:
AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs);
begin
asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.reg, nil), hp1);
IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.reg, UsedRegs);
AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs);
end;
top_ref:
begin
if taicpu(hp1).oper[X]^.ref^.base <> NR_NO then
AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs);
begin
asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.ref^.base, nil), hp1);
IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.ref^.base, UsedRegs);
AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs);
end;
if taicpu(hp1).oper[X]^.ref^.index <> NR_NO then
AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs);
begin
asml.InsertBefore(tai_regalloc.alloc(taicpu(hp1).oper[X]^.ref^.index, nil), hp1);
IncludeRegInUsedRegs(taicpu(hp1).oper[X]^.ref^.index, UsedRegs);
AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs);
end;
end;
else
;
end;
Result := True;
end;
function TX86AsmOptimizer.TrySwapMovCmp(var p, hp1: tai): Boolean;
  var
    hp2: tai;
  begin
    { If we have something like:

          cmp ###,%reg1
          mov 0,%reg2

      And no modified registers are shared, move the instruction to before
      the comparison as this means it can be optimised without worrying
      about the FLAGS register. (CMP/MOV is generated by
      "J(c)Mov1JmpMov0 -> Set(~c)", among other things).

      As long as the second instruction doesn't use the flags or one of the
      registers used by CMP or TEST (also check any references that use the
      registers), then it can be moved prior to the comparison.

      Parameters:
        p   - the comparison instruction (expected to be CMP or TEST)
        hp1 - the instruction following p that we try to hoist above it

      Returns True if hp1 was moved to before p.  The actual safety checks
      and the relocation itself are delegated to TrySwapMovOp.
    }
    Result := False;
    if not TrySwapMovOp(p, hp1) then
      Exit;

    if taicpu(hp1).opcode = A_LEA then
      { The flags will be overwritten by the CMP/TEST instruction }
      ConvertLEA(taicpu(hp1));

    Result := True;

    { Can we move it one further back?  hp2 becomes the instruction that now
      immediately precedes the relocated hp1 }
    if GetLastInstruction(hp1, hp2) and (hp2.typ = ait_instruction) and
      { Check to see if CMP/TEST is a comparison against zero }
      (
        (
          (taicpu(p).opcode = A_CMP) and
          MatchOperand(taicpu(p).oper[0]^, 0)
        ) or
        (
          (taicpu(p).opcode = A_TEST) and
          (
            OpsEqual(taicpu(p).oper[0]^, taicpu(p).oper[1]^) or
            MatchOperand(taicpu(p).oper[0]^, -1)
          )
        )
      ) and
      { These instructions set the zero flag if the result is zero }
      MatchInstruction(hp2, [A_ADD, A_SUB, A_OR, A_XOR, A_AND, A_POPCNT, A_LZCNT], []) and
      { ... and hp2 must write to the same operand that p tests }
      OpsEqual(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) then
      { Looks like we can - if successful, this benefits PostPeepholeOptTestOr.
        The return value is deliberately ignored: Result is already True
        because the first swap succeeded, and a failure here simply leaves
        hp1 where it is }
      TrySwapMovOp(hp2, hp1);
  end;