* x86: Additional optimisation in OptPass2MOV to help with sub-optimal MOV/CMP/MOV triplets (and a minor refactor).

This commit is contained in:
J. Gareth "Curious Kit" Moreton 2024-03-08 06:43:53 +00:00 committed by FPK
parent f4e955d04f
commit 755d221230

View File

@ -10003,13 +10003,64 @@ unit aoptx86;
if not GetNextInstruction(p, hp1) then if not GetNextInstruction(p, hp1) then
Exit; Exit;
if MatchInstruction(hp1, A_CMP, A_TEST, [taicpu(p).opsize]) if MatchInstruction(hp1, A_CMP, A_TEST, []) then
and DoMovCmpMemOpt(p, hp1) then begin
if (taicpu(hp1).opsize = taicpu(p).opsize) and DoMovCmpMemOpt(p, hp1) then
begin begin
Result := True; Result := True;
Exit; Exit;
end end;
else if MatchInstruction(hp1, A_JMP, [S_NO]) then
{ This optimisation is only effective on a second run of Pass 2,
hence -O3 or above.
Change:
mov %reg1,%reg2
cmp/test (contains %reg1)
mov x, %reg1
(another mov or a j(c))
To:
mov %reg1,%reg2
mov x, %reg1
cmp (%reg1 replaced with %reg2)
(another mov or a j(c))
The requirement of an additional MOV or a jump ensures there
isn't performance loss, since a j(c) will permit macro-fusion
with the cmp instruction, while another MOV likely means it's
not all being executed in a single cycle due to parallelisation.
}
if (cs_opt_level3 in current_settings.optimizerswitches) and
MatchOpType(taicpu(p), top_reg, top_reg) and
RegInInstruction(taicpu(p).oper[0]^.reg, taicpu(hp1)) and
GetNextInstruction(hp1, hp2) and
MatchInstruction(hp2, A_MOV, []) and
(taicpu(hp2).oper[1]^.typ = top_reg) and
{ Registers don't have to be the same size in this case }
SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[0]^.reg) and
GetNextInstruction(hp2, hp3) and
MatchInstruction(hp3, A_MOV, A_Jcc, []) and
{ Make sure the operands in the camparison can be safely replaced }
(
not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[0]^) or
ReplaceRegisterInOper(taicpu(hp1), 0, taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg)
) and
(
not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[1]^) or
ReplaceRegisterInOper(taicpu(hp1), 1, taicpu(p).oper[0]^.reg, taicpu(p).oper[1]^.reg)
) then
begin
DebugMsg(SPeepholeOptimization + 'MOV/CMP/MOV -> MOV/MOV/CMP', p);
AsmL.Remove(hp2);
AsmL.InsertAfter(hp2, p);
Result := True;
Exit;
end;
end;
if MatchInstruction(hp1, A_JMP, [S_NO]) then
begin begin
{ Sometimes the MOVs that OptPass2JMP produces can be improved { Sometimes the MOVs that OptPass2JMP produces can be improved
further, but we can't just put this jump optimisation in pass 1 further, but we can't just put this jump optimisation in pass 1
@ -10019,21 +10070,30 @@ unit aoptx86;
UpdateUsedRegs(tai(p.Next)); UpdateUsedRegs(tai(p.Next));
if OptPass2JMP(hp1) then if OptPass2JMP(hp1) then
begin
{ Restore register state }
RestoreUsedRegs(TempTracking);
ReleaseUsedRegs(TempTracking);
{ call OptPass1MOV once to potentially merge any MOVs that were created } { call OptPass1MOV once to potentially merge any MOVs that were created }
Result := OptPass1MOV(p); OptPass1MOV(p);
{ OptPass2MOV will now exit but will be called again if OptPass1MOV Result := True;
returned True and the instruction is still a MOV, thus checking Exit;
the optimisations below } end;
{ If OptPass2JMP returned False, no optimisations were done to { If OptPass2JMP returned False, no optimisations were done to
the jump and there are no further optimisations that can be done the jump and there are no further optimisations that can be done
to the MOV instruction on this pass } to the MOV instruction on this pass other than FuncMov2Func }
{ Restore register state } { Restore register state }
RestoreUsedRegs(TempTracking); RestoreUsedRegs(TempTracking);
ReleaseUsedRegs(TempTracking); ReleaseUsedRegs(TempTracking);
end
else if MatchOpType(taicpu(p),top_reg,top_reg) and Result := FuncMov2Func(p, hp1);
Exit;
end;
if MatchOpType(taicpu(p),top_reg,top_reg) and
(taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
(taicpu(hp1).oper[1]^.typ = top_reg) and (taicpu(hp1).oper[1]^.typ = top_reg) and
@ -10076,8 +10136,9 @@ unit aoptx86;
Exit; Exit;
end; end;
end; end;
end end;
else if MatchOpType(taicpu(p),top_reg,top_reg) and
if MatchOpType(taicpu(p),top_reg,top_reg) and
{$ifdef x86_64} {$ifdef x86_64}
MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and
{$else x86_64} {$else x86_64}
@ -10105,11 +10166,12 @@ unit aoptx86;
Result:=true; Result:=true;
end; end;
exit; Exit;
end end;
else if MatchOpType(taicpu(p),top_reg,top_reg) and
if MatchOpType(taicpu(p),top_reg,top_reg) and
IsXCHGAcceptable and IsXCHGAcceptable and
{ XCHG doesn't support 8-byte registers } { XCHG doesn't support 8-bit registers }
(taicpu(p).opsize <> S_B) and (taicpu(p).opsize <> S_B) and
MatchInstruction(hp1, A_MOV, []) and MatchInstruction(hp1, A_MOV, []) and
MatchOpType(taicpu(hp1),top_reg,top_reg) and MatchOpType(taicpu(hp1),top_reg,top_reg) and
@ -10146,8 +10208,9 @@ unit aoptx86;
Result := True; Result := True;
Exit; Exit;
end; end;
end end;
else if MatchOpType(taicpu(p),top_reg,top_reg) and
if MatchOpType(taicpu(p),top_reg,top_reg) and
MatchInstruction(hp1, A_SAR, []) then MatchInstruction(hp1, A_SAR, []) then
begin begin
if MatchOperand(taicpu(hp1).oper[0]^, 31) then if MatchOperand(taicpu(hp1).oper[0]^, 31) then
@ -10172,7 +10235,9 @@ unit aoptx86;
taicpu(p).clearop(1); taicpu(p).clearop(1);
taicpu(p).clearop(0); taicpu(p).clearop(0);
taicpu(p).ops:=0; taicpu(p).ops:=0;
Result := True; Result := True;
Exit;
end end
else if (cs_opt_size in current_settings.optimizerswitches) and else if (cs_opt_size in current_settings.optimizerswitches) and
(taicpu(p).oper[0]^.reg = NR_EDX) and (taicpu(p).oper[0]^.reg = NR_EDX) and
@ -10194,6 +10259,9 @@ unit aoptx86;
taicpu(hp1).clearop(1); taicpu(hp1).clearop(1);
taicpu(hp1).clearop(0); taicpu(hp1).clearop(0);
taicpu(hp1).ops:=0; taicpu(hp1).ops:=0;
Include(OptsToCheck, aoc_ForceNewIteration);
Exit;
end; end;
{$ifndef x86_64} {$ifndef x86_64}
end end
@ -10273,6 +10341,9 @@ unit aoptx86;
else else
; ;
end; end;
Result := True;
Exit;
end; end;
end; end;
{$else x86_64} {$else x86_64}
@ -10299,7 +10370,9 @@ unit aoptx86;
taicpu(p).clearop(1); taicpu(p).clearop(1);
taicpu(p).clearop(0); taicpu(p).clearop(0);
taicpu(p).ops:=0; taicpu(p).ops:=0;
Result := True; Result := True;
Exit;
end end
else if (cs_opt_size in current_settings.optimizerswitches) and else if (cs_opt_size in current_settings.optimizerswitches) and
(taicpu(p).oper[0]^.reg = NR_RDX) and (taicpu(p).oper[0]^.reg = NR_RDX) and
@ -10321,11 +10394,15 @@ unit aoptx86;
taicpu(hp1).clearop(1); taicpu(hp1).clearop(1);
taicpu(hp1).clearop(0); taicpu(hp1).clearop(0);
taicpu(hp1).ops:=0; taicpu(hp1).ops:=0;
Include(OptsToCheck, aoc_ForceNewIteration);
Exit;
{$endif x86_64} {$endif x86_64}
end; end;
end; end;
end end;
else if MatchInstruction(hp1, A_MOV, []) and
if MatchInstruction(hp1, A_MOV, []) and
(taicpu(hp1).oper[1]^.typ = top_reg) then (taicpu(hp1).oper[1]^.typ = top_reg) then
{ Though "GetNextInstruction" could be factored out, along with { Though "GetNextInstruction" could be factored out, along with
the instructions that depend on hp2, it is an expensive call that the instructions that depend on hp2, it is an expensive call that
@ -10376,6 +10453,8 @@ unit aoptx86;
taicpu(hp1).ops:=0; taicpu(hp1).ops:=0;
RemoveInstruction(hp2); RemoveInstruction(hp2);
Include(OptsToCheck, aoc_ForceNewIteration);
(* (*
{$ifdef x86_64} {$ifdef x86_64}
end end
@ -10423,13 +10502,16 @@ unit aoptx86;
taicpu(hp1).ops:=0; taicpu(hp1).ops:=0;
RemoveInstruction(hp2); RemoveInstruction(hp2);
Include(OptsToCheck, aoc_ForceNewIteration);
{$endif x86_64} {$endif x86_64}
*) *)
end; end;
end; end;
{$ifdef x86_64} {$ifdef x86_64}
end end;
else if (taicpu(p).opsize = S_L) and
if (taicpu(p).opsize = S_L) and
(taicpu(p).oper[1]^.typ = top_reg) and (taicpu(p).oper[1]^.typ = top_reg) and
( (
MatchInstruction(hp1, A_MOV,[]) and MatchInstruction(hp1, A_MOV,[]) and
@ -10502,10 +10584,17 @@ unit aoptx86;
DebugMsg(SPeepholeOptimization + 'MovMov*Shr2MovMov*Rcr', p); DebugMsg(SPeepholeOptimization + 'MovMov*Shr2MovMov*Rcr', p);
if (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then if (getsupreg(taicpu(hp2).oper[1]^.reg) = getsupreg(taicpu(hp1).oper[1]^.reg)) then
begin
{ Change first MOV command to have the same register as the final output } { Change first MOV command to have the same register as the final output }
taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg taicpu(p).oper[1]^.reg := taicpu(hp1).oper[1]^.reg;
AllocRegBetween(taicpu(hp1).oper[1]^.reg, p, hp1, UsedRegs);
Result := True;
end
else else
begin
taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg; taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg;
Include(OptsToCheck, aoc_ForceNewIteration);
end;
{ Change second MOV command to an ADD command. This is easier than { Change second MOV command to an ADD command. This is easier than
converting the existing command because it means we don't have to converting the existing command because it means we don't have to
@ -10520,6 +10609,8 @@ unit aoptx86;
taicpu(hp3).opcode := A_RCR; taicpu(hp3).opcode := A_RCR;
taicpu(hp3).changeopsize(S_L); taicpu(hp3).changeopsize(S_L);
setsubreg(taicpu(hp3).oper[1]^.reg, R_SUBD); setsubreg(taicpu(hp3).oper[1]^.reg, R_SUBD);
{ Don't need to Exit yet as p is still a MOV and hp1 hasn't been
called, so FuncMov2Func below is safe to call }
{$endif x86_64} {$endif x86_64}
end; end;