* x86: New peephole optimisation for improving newly inserted (V)MOVD/(V)MOVQ instructions

This commit is contained in:
J. Gareth "Curious Kit" Moreton 2025-04-20 09:29:56 +01:00 committed by FPK
parent 67ea121250
commit e7b6a08eae
3 changed files with 123 additions and 1 deletions

View File

@ -185,8 +185,11 @@ unit aoptcpu;
Result:=OptPass1LEA(p);
A_MOV:
Result:=OptPass1MOV(p);
A_MOVD,
A_VMOVD:
Result:=OptPass1MOVD(p);
A_MOVSX,
A_MOVZX :
A_MOVZX:
Result:=OptPass1Movx(p);
A_TEST:
Result:=OptPass1Test(p);

View File

@ -177,6 +177,7 @@ unit aoptx86;
function OptPass1_V_MOVAP(var p : tai) : boolean;
function OptPass1VOP(var p : tai) : boolean;
function OptPass1MOV(var p : tai) : boolean;
function OptPass1MOVD(var p : tai) : boolean;
function OptPass1Movx(var p : tai) : boolean;
function OptPass1MOVXX(var p : tai) : boolean;
function OptPass1OP(var p : tai) : boolean;
@ -4480,6 +4481,38 @@ unit aoptx86;
if not GetNextInstruction_p or (hp1.typ <> ait_instruction) then
Exit;
{ Change:
movl/q (ref), %reg
movd/q %reg, %xmm0
(dealloc %reg)
To:
movd/q (ref), %xmm0
}
if MatchOpType(taicpu(p),top_ref,top_reg) and
MatchInstruction(hp1,[A_MOVD,A_VMOVD{$ifdef x86_64},A_MOVQ,A_VMOVQ{$endif x86_64}],[]) and
MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^.reg) and
(taicpu(hp1).oper[1]^.typ=top_reg) and
(GetRegType(taicpu(hp1).oper[1]^.reg)=R_MMREGISTER) then
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegs(TmpUsedRegs, tai(p.Next));
if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs) then
begin
taicpu(hp1).loadref(0,taicpu(p).oper[0]^.ref^);
{ loadref increases the reference count, so decrement it again }
if Assigned(taicpu(p).oper[0]^.ref^.symbol) then
taicpu(p).oper[0]^.ref^.symbol.decrefs;
if Assigned(taicpu(p).oper[0]^.ref^.relsymbol) then
taicpu(p).oper[0]^.ref^.relsymbol.decrefs;
DebugMsg(SPeepholeOptimization+'Merged MOV and (V)MOVD/(V)MOVQ to eliminate intermediate register (MovMovD/Q2MovD/Q)',p);
RemoveCurrentP(p,hp1);
Result:=True;
Exit;
end;
end;
{ Next instruction is also a MOV ? }
if MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) then
begin
@ -5568,6 +5601,87 @@ unit aoptx86;
end;
function TX86AsmOptimizer.OptPass1MOVD(var p : tai) : boolean;
{ This function also handles the 64-bit version, MOVQ }
var
hp1: tai;
begin
Result:=false;
{ Change:
movd/q %xmm0, %reg
...
movl/q %reg, (ref)
(dealloc %reg)
To:
movd/q %xmm0, (ref)
}
if MatchOpType(taicpu(p),top_reg,top_reg) and
(GetRegType(taicpu(p).oper[0]^.reg)=R_MMREGISTER) and
(GetRegType(taicpu(p).oper[1]^.reg)=R_INTREGISTER) and
GetNextInstructionUsingReg(p,hp1,taicpu(p).oper[1]^.reg) and
MatchInstruction(hp1, A_MOV, []) and
MatchOperand(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^.reg) and
(taicpu(hp1).oper[1]^.typ=top_ref) and
not RegInRef(taicpu(p).oper[1]^.reg,taicpu(hp1).oper[1]^.ref^) then
begin
TransferUsedRegs(TmpUsedRegs);
UpdateUsedRegsBetween(TmpUsedRegs,p,hp1);
if not RegUsedAfterInstruction(taicpu(p).oper[1]^.reg,hp1,TmpUsedRegs) then
begin
if (
{ Instructions are always adjacent under -O2 and under }
not(cs_opt_level3 in current_settings.optimizerswitches) or
(
(
(taicpu(hp1).oper[1]^.ref^.base=NR_NO) or
not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.base,p,hp1)
) and
(
(taicpu(hp1).oper[1]^.ref^.index=NR_NO) or
not RegModifiedBetween(taicpu(hp1).oper[1]^.ref^.index,p,hp1)
)
)
) then
begin
DebugMsg(SPeepholeOptimization+'Merged (V)MOVD/(V)MOVQ and MOV to eliminate intermediate register (MovD/QMov2MovD/Q 1a)',p);
taicpu(p).loadref(1,taicpu(hp1).oper[1]^.ref^);
{ loadref increases the reference count, so decrement it again }
if Assigned(taicpu(hp1).oper[1]^.ref^.symbol) then
taicpu(hp1).oper[1]^.ref^.symbol.decrefs;
if Assigned(taicpu(hp1).oper[1]^.ref^.relsymbol) then
taicpu(hp1).oper[1]^.ref^.relsymbol.decrefs;
RemoveInstruction(hp1);
Include(OptsToCheck, aoc_ForceNewIteration);
end
else if not RegModifiedBetween(taicpu(p).oper[0]^.reg,p,hp1) then
begin
{ Still possible to optimise if hp1 is converted instead }
DebugMsg(SPeepholeOptimization+'Merged (V)MOVD/(V)MOVQ and MOV to eliminate intermediate register (MovD/QMov2MovD/Q 1b)',hp1);
{ Decrement the reference prior to replacing it }
if Assigned(taicpu(hp1).oper[1]^.ref^.symbol) then
taicpu(hp1).oper[1]^.ref^.symbol.decrefs;
if Assigned(taicpu(hp1).oper[1]^.ref^.relsymbol) then
taicpu(hp1).oper[1]^.ref^.relsymbol.decrefs;
taicpu(hp1).opcode:=taicpu(p).opcode;
taicpu(hp1).opsize:=taicpu(p).opsize;
taicpu(hp1).loadreg(0,taicpu(p).oper[0]^.reg);
TransferUsedRegs(TmpUsedRegs);
AllocRegBetween(taicpu(p).oper[0]^.reg,p,hp1,TmpUsedRegs);
RemoveCurrentP(p);
Result:=True;
Exit;
end;
end;
end;
end;
function TX86AsmOptimizer.OptPass1MOVXX(var p : tai) : boolean;
var
hp1 : tai;

View File

@ -100,6 +100,11 @@ uses
Result:=OptPass1Imul(p);
A_MOV:
Result:=OptPass1MOV(p);
A_MOVD,
A_MOVQ,
A_VMOVD,
A_VMOVQ:
Result:=OptPass1MOVD(p);
A_MOVSX,
A_MOVSXD,
A_MOVZX: