mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-08-19 05:29:18 +02:00
o manually applied merge request 46 by J. Gareth "Kit" Moreton:
+ x86_64: XMM-based block move optimisation
This commit is contained in:
parent
860e4487c3
commit
a3666aa8c9
@ -84,6 +84,15 @@ unit aoptx86;
|
||||
protected
|
||||
class function IsMOVZXAcceptable: Boolean; static; inline;
|
||||
|
||||
{ Attempts to allocate a volatile integer register for use between p and hp,
|
||||
using AUsedRegs for the current register usage information. Returns NR_NO
|
||||
if no free register could be found }
|
||||
function GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
|
||||
{ Attempts to allocate a volatile MM register for use between p and hp,
|
||||
using AUsedRegs for the current register usage information. Returns NR_NO
|
||||
if no free register could be found }
|
||||
function GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
|
||||
|
||||
{ checks whether loading a new value in reg1 overwrites the entirety of reg2 }
|
||||
function Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
|
||||
{ checks whether reading the value in reg1 depends on the value of reg2. This
|
||||
@ -1003,6 +1012,121 @@ unit aoptx86;
|
||||
{$endif x86_64}
|
||||
end;
|
||||
|
||||
|
||||
{ Attempts to allocate a volatile integer register for use between p and hp,
|
||||
using AUsedRegs for the current register usage information. Returns NR_NO
|
||||
if no free register could be found }
|
||||
function TX86AsmOptimizer.GetIntRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
|
||||
var
|
||||
RegSet: TCPURegisterSet;
|
||||
CurrentSuperReg: Integer;
|
||||
CurrentReg: TRegister;
|
||||
Currentp: tai;
|
||||
Breakout: Boolean;
|
||||
begin
|
||||
{ TODO: Currently, only the volatile registers are checked - can this be extended to use any register the procedure has preserved? }
|
||||
Result := NR_NO;
|
||||
RegSet := paramanager.get_volatile_registers_int(current_procinfo.procdef.proccalloption);
|
||||
for CurrentSuperReg := Low(RegSet) to High(RegSet) do
|
||||
begin
|
||||
CurrentReg := newreg(R_INTREGISTER, TSuperRegister(CurrentSuperReg), RegSize);
|
||||
if not AUsedRegs[R_INTREGISTER].IsUsed(CurrentReg) then
|
||||
begin
|
||||
Currentp := p;
|
||||
Breakout := False;
|
||||
while not Breakout and GetNextInstruction(Currentp, Currentp) and (Currentp <> hp) do
|
||||
begin
|
||||
case hp.typ of
|
||||
ait_instruction:
|
||||
begin
|
||||
if RegInInstruction(CurrentReg, Currentp) then
|
||||
begin
|
||||
Breakout := True;
|
||||
Break;
|
||||
|
||||
end;
|
||||
{ Cannot allocate across an unconditional jump }
|
||||
if is_calljmpuncondret(taicpu(Currentp).opcode) then
|
||||
Exit;
|
||||
end;
|
||||
ait_marker:
|
||||
{ Don't try anything more if a marker is hit }
|
||||
Exit;
|
||||
else
|
||||
;
|
||||
end;
|
||||
end;
|
||||
|
||||
if Breakout then
|
||||
{ Try the next register }
|
||||
Continue;
|
||||
|
||||
{ We have a free register available }
|
||||
Result := CurrentReg;
|
||||
AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
|
||||
Exit;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
{ Attempts to allocate a volatile MM register for use between p and hp,
|
||||
using AUsedRegs for the current register usage information. Returns NR_NO
|
||||
if no free register could be found }
|
||||
function TX86AsmOptimizer.GetMMRegisterBetween(RegSize: TSubRegister; var AUsedRegs: TAllUsedRegs; p, hp: tai): TRegister;
|
||||
var
|
||||
RegSet: TCPURegisterSet;
|
||||
CurrentSuperReg: Integer;
|
||||
CurrentReg: TRegister;
|
||||
Currentp: tai;
|
||||
Breakout: Boolean;
|
||||
begin
|
||||
{ TODO: Currently, only the volatile registers are checked - can this be extended to use any register the procedure has preserved? }
|
||||
Result := NR_NO;
|
||||
RegSet := paramanager.get_volatile_registers_mm(current_procinfo.procdef.proccalloption);
|
||||
for CurrentSuperReg := Low(RegSet) to High(RegSet) do
|
||||
begin
|
||||
CurrentReg := newreg(R_MMREGISTER, TSuperRegister(CurrentSuperReg), RegSize);
|
||||
if not AUsedRegs[R_MMREGISTER].IsUsed(CurrentReg) then
|
||||
begin
|
||||
Currentp := p;
|
||||
Breakout := False;
|
||||
while not Breakout and GetNextInstruction(Currentp, Currentp) and (Currentp <> hp) do
|
||||
begin
|
||||
case hp.typ of
|
||||
ait_instruction:
|
||||
begin
|
||||
if RegInInstruction(CurrentReg, Currentp) then
|
||||
begin
|
||||
Breakout := True;
|
||||
Break;
|
||||
|
||||
end;
|
||||
{ Cannot allocate across an unconditional jump }
|
||||
if is_calljmpuncondret(taicpu(Currentp).opcode) then
|
||||
Exit;
|
||||
end;
|
||||
ait_marker:
|
||||
{ Don't try anything more if a marker is hit }
|
||||
Exit;
|
||||
else
|
||||
;
|
||||
end;
|
||||
end;
|
||||
|
||||
if Breakout then
|
||||
{ Try the next register }
|
||||
Continue;
|
||||
|
||||
{ We have a free register available }
|
||||
Result := CurrentReg;
|
||||
AllocRegBetween(CurrentReg, p, hp, AUsedRegs);
|
||||
Exit;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
|
||||
|
||||
function TX86AsmOptimizer.Reg1WriteOverwritesReg2Entirely(reg1, reg2: tregister): boolean;
|
||||
begin
|
||||
if not SuperRegistersEqual(reg1,reg2) then
|
||||
@ -2212,6 +2336,8 @@ unit aoptx86;
|
||||
PreMessage, RegName1, RegName2, InputVal, MaskNum: string;
|
||||
NewSize: topsize;
|
||||
CurrentReg, ActiveReg: TRegister;
|
||||
SourceRef, TargetRef: TReference;
|
||||
MovAligned, MovUnaligned: TAsmOp;
|
||||
begin
|
||||
Result:=false;
|
||||
|
||||
@ -2802,23 +2928,175 @@ unit aoptx86;
|
||||
end;
|
||||
end;
|
||||
top_ref:
|
||||
if (taicpu(hp1).oper[1]^.typ = top_reg) then
|
||||
begin
|
||||
{ change
|
||||
mov mem, %treg
|
||||
mov %treg, %reg
|
||||
case taicpu(hp1).oper[1]^.typ of
|
||||
top_reg:
|
||||
begin
|
||||
{ change
|
||||
mov mem, %treg
|
||||
mov %treg, %reg
|
||||
|
||||
to
|
||||
to
|
||||
|
||||
mov mem, %reg"
|
||||
}
|
||||
AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
|
||||
taicpu(p).loadreg(1, taicpu(hp1).oper[1]^.reg);
|
||||
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 3 done',p);
|
||||
RemoveInstruction(hp1);
|
||||
Result:=true;
|
||||
Exit;
|
||||
end;
|
||||
mov mem, %reg"
|
||||
}
|
||||
AllocRegBetween(taicpu(hp1).oper[1]^.reg,p,hp1,usedregs);
|
||||
taicpu(p).loadreg(1, taicpu(hp1).oper[1]^.reg);
|
||||
DebugMsg(SPeepholeOptimization + 'MovMov2Mov 3 done',p);
|
||||
RemoveInstruction(hp1);
|
||||
Result:=true;
|
||||
Exit;
|
||||
end;
|
||||
top_ref:
|
||||
begin
|
||||
{$ifdef x86_64}
|
||||
{ Look for the following to simplify:
|
||||
|
||||
mov x(mem1), %reg
|
||||
mov %reg, y(mem2)
|
||||
mov x+8(mem1), %reg
|
||||
mov %reg, y+8(mem2)
|
||||
|
||||
Change to:
|
||||
movdqu x(mem1), %xmmreg
|
||||
movdqu %xmmreg, y(mem2)
|
||||
}
|
||||
SourceRef := taicpu(p).oper[0]^.ref^;
|
||||
TargetRef := taicpu(hp1).oper[1]^.ref^;
|
||||
if (taicpu(p).opsize = S_Q) and
|
||||
GetNextInstruction(hp1, hp2) and
|
||||
MatchInstruction(hp2, A_MOV, [taicpu(p).opsize]) and
|
||||
MatchOpType(taicpu(hp2), top_ref, top_reg) then
|
||||
begin
|
||||
{ Delay calling GetNextInstruction(hp2, hp3) for as long as possible }
|
||||
|
||||
UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next));
|
||||
|
||||
Inc(SourceRef.offset, 8);
|
||||
|
||||
if UseAVX then
|
||||
begin
|
||||
MovAligned := A_VMOVDQA;
|
||||
MovUnaligned := A_VMOVDQU;
|
||||
end
|
||||
else
|
||||
begin
|
||||
MovAligned := A_MOVDQA;
|
||||
MovUnaligned := A_MOVDQU;
|
||||
end;
|
||||
|
||||
if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
|
||||
begin
|
||||
UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
|
||||
Inc(TargetRef.offset, 8);
|
||||
if GetNextInstruction(hp2, hp3) and
|
||||
MatchInstruction(hp3, A_MOV, [taicpu(p).opsize]) and
|
||||
MatchOpType(taicpu(hp3), top_reg, top_ref) and
|
||||
(taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
|
||||
RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
|
||||
not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
|
||||
begin
|
||||
CurrentReg := GetMMRegisterBetween(R_SUBMMX, UsedRegs, p, hp3);
|
||||
if CurrentReg <> NR_NO then
|
||||
begin
|
||||
{ Remember that the offsets are 8 ahead }
|
||||
if ((SourceRef.offset mod 16) = 8) and
|
||||
(
|
||||
{ Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
|
||||
(SourceRef.base = current_procinfo.framepointer) or
|
||||
((SourceRef.alignment >= 16) and ((SourceRef.alignment mod 16) = 0))
|
||||
) then
|
||||
taicpu(p).opcode := MovAligned
|
||||
else
|
||||
taicpu(p).opcode := MovUnaligned;
|
||||
|
||||
taicpu(p).opsize := S_XMM;
|
||||
taicpu(p).oper[1]^.reg := CurrentReg;
|
||||
|
||||
if ((TargetRef.offset mod 16) = 8) and
|
||||
(
|
||||
{ Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
|
||||
(TargetRef.base = current_procinfo.framepointer) or
|
||||
((TargetRef.alignment >= 16) and ((TargetRef.alignment mod 16) = 0))
|
||||
) then
|
||||
taicpu(hp1).opcode := MovAligned
|
||||
else
|
||||
taicpu(hp1).opcode := MovUnaligned;
|
||||
|
||||
taicpu(hp1).opsize := S_XMM;
|
||||
taicpu(hp1).oper[0]^.reg := CurrentReg;
|
||||
|
||||
DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 1)', p);
|
||||
|
||||
RemoveInstruction(hp2);
|
||||
RemoveInstruction(hp3);
|
||||
Result := True;
|
||||
Exit;
|
||||
end;
|
||||
end;
|
||||
end
|
||||
else
|
||||
begin
|
||||
{ See if the next references are 8 less rather than 8 greater }
|
||||
|
||||
Dec(SourceRef.offset, 16); { -8 the other way }
|
||||
if RefsEqual(SourceRef, taicpu(hp2).oper[0]^.ref^) then
|
||||
begin
|
||||
UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next));
|
||||
Dec(TargetRef.offset, 8); { Only 8, not 16, as it wasn't incremented unlike SourceRef }
|
||||
if GetNextInstruction(hp2, hp3) and
|
||||
MatchInstruction(hp3, A_MOV, [taicpu(p).opsize]) and
|
||||
MatchOpType(taicpu(hp3), top_reg, top_ref) and
|
||||
(taicpu(hp2).oper[1]^.reg = taicpu(hp3).oper[0]^.reg) and
|
||||
RefsEqual(TargetRef, taicpu(hp3).oper[1]^.ref^) and
|
||||
not RegUsedAfterInstruction(taicpu(hp2).oper[1]^.reg, hp3, TmpUsedRegs) then
|
||||
begin
|
||||
CurrentReg := GetMMRegisterBetween(R_SUBMMX, UsedRegs, p, hp3);
|
||||
if CurrentReg <> NR_NO then
|
||||
begin
|
||||
{ hp2 and hp3 are the starting offsets, so mod 0 this time }
|
||||
if ((SourceRef.offset mod 16) = 0) and
|
||||
(
|
||||
{ Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
|
||||
(SourceRef.base = current_procinfo.framepointer) or
|
||||
((SourceRef.alignment >= 16) and ((SourceRef.alignment mod 16) = 0))
|
||||
) then
|
||||
taicpu(hp2).opcode := MovAligned
|
||||
else
|
||||
taicpu(hp2).opcode := MovUnaligned;
|
||||
|
||||
taicpu(hp2).opsize := S_XMM;
|
||||
taicpu(hp2).oper[1]^.reg := CurrentReg;
|
||||
|
||||
if ((TargetRef.offset mod 16) = 0) and
|
||||
(
|
||||
{ Base pointer is always aligned (stack pointer won't be if there's no stack frame) }
|
||||
(TargetRef.base = current_procinfo.framepointer) or
|
||||
((TargetRef.alignment >= 16) and ((TargetRef.alignment mod 16) = 0))
|
||||
) then
|
||||
taicpu(hp3).opcode := MovAligned
|
||||
else
|
||||
taicpu(hp3).opcode := MovUnaligned;
|
||||
|
||||
taicpu(hp3).opsize := S_XMM;
|
||||
taicpu(hp3).oper[0]^.reg := CurrentReg;
|
||||
|
||||
DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (MovMovMovMov2MovdqMovdq 2)', p);
|
||||
|
||||
RemoveInstruction(hp1);
|
||||
RemoveCurrentP(p, hp2);
|
||||
Result := True;
|
||||
Exit;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
end;
|
||||
{$endif x86_64}
|
||||
end;
|
||||
else
|
||||
{ The write target should be a reg or a ref }
|
||||
InternalError(2021091601);
|
||||
end;
|
||||
else
|
||||
;
|
||||
end
|
||||
@ -2921,6 +3199,7 @@ unit aoptx86;
|
||||
taicpu(p).loadoper(1,taicpu(hp2).oper[1]^);
|
||||
taicpu(hp1).loadoper(0,taicpu(hp2).oper[1]^);
|
||||
RemoveInstruction(hp2);
|
||||
Result := True;
|
||||
end
|
||||
{$ifdef i386}
|
||||
{ this is enabled for i386 only, as the rules to create the reg sets below
|
||||
|
@ -184,8 +184,8 @@ unit cgcpu;
|
||||
begin
|
||||
inc(regsize,sizeof(aint));
|
||||
inc(stackmisalignment,sizeof(aint));
|
||||
push_one_reg(newreg(R_INTREGISTER,regs_to_save_int[r],R_SUBWHOLE));
|
||||
hreg:=newreg(R_INTREGISTER,regs_to_save_int[r],R_SUBWHOLE);
|
||||
push_one_reg(hreg);
|
||||
if current_procinfo.framepointer<>NR_STACK_POINTER_REG then
|
||||
current_asmdata.asmcfi.cfa_offset(list,hreg,-(regsize+sizeof(pint)*2+localsize))
|
||||
else
|
||||
|
Loading…
Reference in New Issue
Block a user