* x86: New CMP optimisation that mimics some of the CMOV optimisations and removes unnecessary conditions.

This commit is contained in:
J. Gareth "Curious Kit" Moreton 2022-04-15 03:01:26 +01:00 committed by FPK
parent bab60d819f
commit b108608b29

View File

@ -7518,109 +7518,140 @@ unit aoptx86;
GetNextInstruction(p_jump, p_jump); GetNextInstruction(p_jump, p_jump);
end; end;
{ if (
Try to optimise the following: { Don't call GetNextInstruction again if we already have it }
cmp $x,### ($x and $y can be registers or constants)
je @lbl1 (only reference)
cmp $y,### (### are identical)
@Lbl:
sete %reg1
Change to:
cmp $x,###
sete %reg2 (allocate new %reg2)
cmp $y,###
sete %reg1
orb %reg2,%reg1
(dealloc %reg2)
This adds an instruction (so don't perform under -Os), but it removes
a conditional branch.
}
if not (cs_opt_size in current_settings.optimizerswitches) and
(
(hp1 = p_jump) or (hp1 = p_jump) or
GetNextInstruction(p, hp1) GetNextInstruction(p, hp1)
) and ) and
MatchInstruction(hp1, A_Jcc, []) and MatchInstruction(hp1, A_Jcc, []) and
IsJumpToLabel(taicpu(hp1)) and IsJumpToLabel(taicpu(hp1)) and
(taicpu(hp1).condition in [C_E, C_Z]) and (taicpu(hp1).condition in [C_E, C_Z, C_NE, C_NZ]) and
GetNextInstruction(hp1, hp2) and GetNextInstruction(hp1, hp2) then
MatchInstruction(hp2, A_CMP, A_TEST, [taicpu(p).opsize]) and
MatchOperand(taicpu(p).oper[1]^, taicpu(hp2).oper[1]^) and
{ The first operand of CMP instructions can only be a register or
immediate anyway, so no need to check }
GetNextInstruction(hp2, p_label) and
(
(p_label.typ = ait_label) or
(
{ Sometimes there's a zero-distance jump before the label, so deal with it here
to potentially cut down on the iterations of Pass 1 }
MatchInstruction(p_label, A_Jcc, []) and
IsJumpToLabel(taicpu(p_label)) and
{ Use p_dist to hold the jump briefly }
SetAndTest(p_label, p_dist) and
GetNextInstruction(p_dist, p_label) and
(p_label.typ = ait_label) and
(tai_label(p_label).labsym.getrefs >= 2) and
(JumpTargetOp(taicpu(p_dist))^.ref^.symbol = tai_label(p_label).labsym) and
{ We might as well collapse the jump now }
CollapseZeroDistJump(p_dist, tai_label(p_label).labsym)
)
) and
(tai_label(p_label).labsym.getrefs = 1) and
(JumpTargetOp(taicpu(hp1))^.ref^.symbol = tai_label(p_label).labsym) and
GetNextInstruction(p_label, p_dist) and
MatchInstruction(p_dist, A_SETcc, []) and
(taicpu(p_dist).condition in [C_E, C_Z]) and
(taicpu(p_dist).oper[0]^.typ = top_reg) and
{ Get the instruction after the SETcc instruction so we can
allocate a new register over the entire range }
GetNextInstruction(p_dist, hp1_dist) then
begin begin
TransferUsedRegs(TmpUsedRegs); {
UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next)); cmp x, y (or "cmp y, x")
UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next)); je @lbl
UpdateUsedRegs(TmpUsedRegs, tai(p_label.Next)); mov x, y
// UpdateUsedRegs(TmpUsedRegs, tai(p_dist.Next)); @lbl:
(x and y can be constants, registers or references)
{ RegUsedAfterInstruction modifies TmpUsedRegs } Change to:
if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, p_dist, TmpUsedRegs) then mov x, y (x and y will always be equal in the end)
@lbl: (may become a dead label)
Also:
cmp x, y (or "cmp y, x")
jne @lbl
mov x, y
@lbl:
(x and y can be constants, registers or references)
Change to:
Absolutely nothing! (Except @lbl if it's still live)
}
if MatchInstruction(hp2, A_MOV, [taicpu(p).opsize]) and
(
(
MatchOperand(taicpu(p).oper[0]^, taicpu(hp2).oper[0]^) and
MatchOperand(taicpu(p).oper[1]^, taicpu(hp2).oper[1]^)
) or (
MatchOperand(taicpu(p).oper[0]^, taicpu(hp2).oper[1]^) and
MatchOperand(taicpu(p).oper[1]^, taicpu(hp2).oper[0]^)
)
) and
GetNextInstruction(hp2, hp1_label) and
SkipAligns(hp1_label, hp1_label) and
(hp1_label.typ = ait_label) and
(tai_label(hp1_label).labsym = taicpu(hp1).oper[0]^.ref^.symbol) then
begin begin
{ Register can appear in p if it's not used afterwards, so only tai_label(hp1_label).labsym.DecRefs;
allocate between hp1 and hp1_dist } if (taicpu(hp1).condition in [C_NE, C_NZ]) then
NewReg := GetIntRegisterBetween(R_SUBL, TmpUsedRegs, hp1, p_dist);
if NewReg <> NR_NO then
begin begin
DebugMsg(SPeepholeOptimization + 'CMP/JE/CMP/@Lbl/SETE -> CMP/SETE/CMP/SETE/OR, removing conditional branch', p); DebugMsg(SPeepholeOptimization + 'CMP/JNE/MOV/@Lbl -> NOP, since the MOV is only executed if the operands are equal (CmpJneMov2Nop)', p);
RemoveInstruction(hp2);
hp2 := hp1_label; { So RemoveCurrentp below can be set to something valid }
end
else
DebugMsg(SPeepholeOptimization + 'CMP/JE/MOV/@Lbl -> MOV, since the MOV is only executed if the operands aren''t equal (CmpJeMov2Mov)', p);
{ Change the jump instruction into a SETcc instruction } RemoveInstruction(hp1);
taicpu(hp1).opcode := A_SETcc; RemoveCurrentp(p, hp2);
taicpu(hp1).opsize := S_B; Result := True;
taicpu(hp1).loadreg(0, NewReg); Exit;
end;
{ This is now a dead label } {
tai_label(p_label).labsym.decrefs; Try to optimise the following:
cmp $x,### ($x and $y can be registers or constants)
je @lbl1 (only reference)
cmp $y,### (### are identical)
@Lbl:
sete %reg1
{ Prefer adding before the next instruction so the FLAGS Change to:
register is deallocated first } cmp $x,###
hp2 := taicpu.op_reg_reg(A_OR, S_B, NewReg, taicpu(p_dist).oper[0]^.reg); sete %reg2 (allocate new %reg2)
taicpu(hp2).fileinfo := taicpu(p_dist).fileinfo; cmp $y,###
sete %reg1
orb %reg2,%reg1
(dealloc %reg2)
AsmL.InsertBefore( This adds an instruction (so don't perform under -Os), but it removes
hp2, a conditional branch.
hp1_dist }
); if not (cs_opt_size in current_settings.optimizerswitches) and
MatchInstruction(hp2, A_CMP, A_TEST, [taicpu(p).opsize]) and
MatchOperand(taicpu(p).oper[1]^, taicpu(hp2).oper[1]^) and
{ The first operand of CMP instructions can only be a register or
immediate anyway, so no need to check }
GetNextInstruction(hp2, p_label) and
(p_label.typ = ait_label) and
(tai_label(p_label).labsym.getrefs = 1) and
(JumpTargetOp(taicpu(hp1))^.ref^.symbol = tai_label(p_label).labsym) and
GetNextInstruction(p_label, p_dist) and
MatchInstruction(p_dist, A_SETcc, []) and
(taicpu(p_dist).condition in [C_E, C_Z]) and
(taicpu(p_dist).oper[0]^.typ = top_reg) then
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
UpdateUsedRegs(TmpUsedRegs, tai(p_label.Next));
UpdateUsedRegs(TmpUsedRegs, tai(p_dist.Next));
{ Make sure the new register is in use over the new instruction if not RegInUsedRegs(NR_DEFAULTFLAGS, TmpUsedRegs) and
(long-winded, but things work best when the FLAGS register { Get the instruction after the SETcc instruction so we can
is not allocated here) } allocate a new register over the entire range }
AllocRegBetween(NewReg, p_dist, hp2, TmpUsedRegs); GetNextInstruction(p_dist, hp1_dist) then
begin
{ Register can appear in p if it's not used afterwards, so only
allocate between hp1 and hp1_dist }
NewReg := GetIntRegisterBetween(R_SUBL, TmpUsedRegs, hp1, hp1_dist);
if NewReg <> NR_NO then
begin
DebugMsg(SPeepholeOptimization + 'CMP/JE/CMP/@Lbl/SETE -> CMP/SETE/CMP/SETE/OR, removing conditional branch', p);
Result := True; { Change the jump instruction into a SETcc instruction }
{ Don't exit yet, as p wasn't changed and hp1, while taicpu(hp1).opcode := A_SETcc;
modified, is still intact and might be optimised by the taicpu(hp1).opsize := S_B;
SETcc optimisation below } taicpu(hp1).loadreg(0, NewReg);
{ This is now a dead label }
tai_label(p_label).labsym.decrefs;
{ Prefer adding before the next instruction so the FLAGS
register is deallocated first }
AsmL.InsertBefore(
taicpu.op_reg_reg(A_OR, S_B, NewReg, taicpu(p_dist).oper[0]^.reg),
hp1_dist
);
Result := True;
{ Don't exit yet, as p wasn't changed and hp1, while
modified, is still intact and might be optimised by the
SETcc optimisation below }
end;
end; end;
end; end;
end; end;
@ -8733,8 +8764,8 @@ unit aoptx86;
{ The instruction can be safely moved } { The instruction can be safely moved }
asml.Remove(hp1); asml.Remove(hp1);
{ Try to insert before the FLAGS register is allocated, so "mov $0,%reg" { Try to insert after the last instruction where the FLAGS register is not
can be optimised into "xor %reg,%reg" later } yet in use, so "mov $0,%reg" can be optimised into "xor %reg,%reg" later }
if SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p.Previous)), hp2) then if SetAndTest(FindRegAllocBackward(NR_DEFAULTFLAGS, tai(p.Previous)), hp2) then
asml.InsertBefore(hp1, hp2) asml.InsertBefore(hp1, hp2)
@ -8750,9 +8781,9 @@ unit aoptx86;
asml.InsertAfter(hp1, hp2) asml.InsertAfter(hp1, hp2)
else else
{ Note, if p.Previous is nil (even if it should logically never be the { Note, if p.Previous is nil (even if it should logically never be the
case), FindRegAllocBackward immediately exits with False and so we case), FindRegAllocBackward immediately exits with False and so we
safely land here (we can't just pass p because FindRegAllocBackward safely land here (we can't just pass p because FindRegAllocBackward
immediately exits on an instruction). [Kit] } immediately exits on an instruction). [Kit] }
asml.InsertBefore(hp1, p); asml.InsertBefore(hp1, p);
DebugMsg(SPeepholeOptimization + 'Swapped ' + debug_op2str(taicpu(p).opcode) + ' and ' + debug_op2str(taicpu(hp1).opcode) + ' instructions to improve optimisation potential', hp1); DebugMsg(SPeepholeOptimization + 'Swapped ' + debug_op2str(taicpu(p).opcode) + ' and ' + debug_op2str(taicpu(hp1).opcode) + ' instructions to improve optimisation potential', hp1);