Better handling of zeroing upper parts of registers

Better handling of zeroing upper parts of registers
This commit is contained in:
J. Gareth "Curious Kit" Moreton 2021-10-10 03:42:03 +01:00 committed by florian
parent 674ed4069a
commit fd28cc0db0
3 changed files with 67 additions and 14 deletions

View File

@ -365,6 +365,8 @@ unit aoptcpu;
Result:=PostPeepholeOptMOVSX(p);
A_SHR:
Result:=PostPeepholeOptShr(p);
A_VPXOR:
Result:=PostPeepholeOptVPXOR(p);
else
;
end;

View File

@ -188,6 +188,7 @@ unit aoptx86;
function PostPeepholeOptLea(var p : tai) : Boolean;
function PostPeepholeOptPush(var p: tai): Boolean;
function PostPeepholeOptShr(var p : tai) : boolean;
function PostPeepholeOptVPXOR(var p: tai): Boolean;
procedure ConvertJumpToRET(const p: tai; const ret_p: tai);
@ -6187,12 +6188,18 @@ unit aoptx86;
DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 1)', p);
taicpu(hp2).opcode := A_VPXOR;
taicpu(hp2).opsize := S_YMM;
taicpu(hp2).loadreg(0, CurrentReg);
taicpu(hp2).loadreg(1, CurrentReg);
taicpu(hp2).loadreg(2, CurrentReg);
taicpu(hp2).ops := 3;
{ If pi_uses_ymm is set, VZEROUPPER is present to do this for us }
if (pi_uses_ymm in current_procinfo.flags) then
RemoveInstruction(hp2)
else
begin
taicpu(hp2).opcode := A_VPXOR;
taicpu(hp2).opsize := S_YMM;
taicpu(hp2).loadreg(0, CurrentReg);
taicpu(hp2).loadreg(1, CurrentReg);
taicpu(hp2).loadreg(2, CurrentReg);
taicpu(hp2).ops := 3;
end;
RemoveInstruction(hp3);
Result := True;
@ -6241,15 +6248,21 @@ unit aoptx86;
DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 2)', p);
taicpu(hp1).opcode := A_VPXOR;
taicpu(hp1).opsize := S_YMM;
taicpu(hp1).loadreg(0, CurrentReg);
taicpu(hp1).loadreg(1, CurrentReg);
taicpu(hp1).loadreg(2, CurrentReg);
taicpu(hp1).ops := 3;
{ If pi_uses_ymm is set, VZEROUPPER is present to do this for us }
if (pi_uses_ymm in current_procinfo.flags) then
RemoveInstruction(hp1)
else
begin
taicpu(hp1).opcode := A_VPXOR;
taicpu(hp1).opsize := S_YMM;
taicpu(hp1).loadreg(0, CurrentReg);
taicpu(hp1).loadreg(1, CurrentReg);
taicpu(hp1).loadreg(2, CurrentReg);
taicpu(hp1).ops := 3;
Asml.Remove(hp1);
Asml.InsertAfter(hp1, hp3); { Register deallocations will be after hp3 }
Asml.Remove(hp1);
Asml.InsertAfter(hp1, hp3); { Register deallocations will be after hp3 }
end;
RemoveCurrentP(p, hp2);
Result := True;
@ -10569,6 +10582,42 @@ unit aoptx86;
end;
{$endif}
function TX86AsmOptimizer.PostPeepholeOptVPXOR(var p : tai) : Boolean;
var
XReg: TRegister;
begin
Result := False;
{ Turn "vpxor %ymmreg2,%ymmreg2,%ymmreg1" to "vpxor %xmmreg2,%xmmreg2,%xmmreg1"
Smaller encoding and slightly faster on some platforms (also works for
ZMM-sized registers) }
if (taicpu(p).opsize in [S_YMM, S_ZMM]) and
MatchOpType(taicpu(p), top_reg, top_reg, top_reg) then
begin
XReg := taicpu(p).oper[0]^.reg;
if (taicpu(p).oper[1]^.reg = XReg) then
begin
taicpu(p).changeopsize(S_XMM);
setsubreg(taicpu(p).oper[2]^.reg, R_SUBMMX);
if (cs_opt_size in current_settings.optimizerswitches) then
begin
{ Change input registers to %xmm0 to reduce size. Note that
there's a risk of a false dependency doing this, so only
optimise for size here }
XReg := NR_XMM0;
DebugMsg(SPeepholeOptimization + 'Changed zero-setting vpxor from Y/ZMM to XMM and changed input registers to %xmm0 to reduce size', p);
end
else
begin
setsubreg(XReg, R_SUBMMX);
DebugMsg(SPeepholeOptimization + 'Changed zero-setting vpxor from Y/ZMM to XMM to reduce size and increase efficiency', p);
end;
taicpu(p).oper[0]^.reg := XReg;
taicpu(p).oper[1]^.reg := XReg;
Result := True;
end;
end;
end;
class procedure TX86AsmOptimizer.OptimizeRefs(var p: taicpu);
var

View File

@ -234,6 +234,8 @@ uses
Result:=PostPeepholeOptPush(p);
A_SHR:
Result:=PostPeepholeOptShr(p);
A_VPXOR:
Result:=PostPeepholeOptVPXOR(p);
else
;
end;