diff --git a/compiler/i386/aoptcpu.pas b/compiler/i386/aoptcpu.pas index d3f49547af..2e12498367 100644 --- a/compiler/i386/aoptcpu.pas +++ b/compiler/i386/aoptcpu.pas @@ -365,6 +365,8 @@ unit aoptcpu; Result:=PostPeepholeOptMOVSX(p); A_SHR: Result:=PostPeepholeOptShr(p); + A_VPXOR: + Result:=PostPeepholeOptVPXOR(p); else ; end; diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index 26d288d18d..ac169c31d9 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -188,6 +188,7 @@ unit aoptx86; function PostPeepholeOptLea(var p : tai) : Boolean; function PostPeepholeOptPush(var p: tai): Boolean; function PostPeepholeOptShr(var p : tai) : boolean; + function PostPeepholeOptVPXOR(var p: tai): Boolean; procedure ConvertJumpToRET(const p: tai; const ret_p: tai); @@ -6187,12 +6188,18 @@ unit aoptx86; DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 1)', p); - taicpu(hp2).opcode := A_VPXOR; - taicpu(hp2).opsize := S_YMM; - taicpu(hp2).loadreg(0, CurrentReg); - taicpu(hp2).loadreg(1, CurrentReg); - taicpu(hp2).loadreg(2, CurrentReg); - taicpu(hp2).ops := 3; + { If pi_uses_ymm is set, VZEROUPPER is present to do this for us } + if (pi_uses_ymm in current_procinfo.flags) then + RemoveInstruction(hp2) + else + begin + taicpu(hp2).opcode := A_VPXOR; + taicpu(hp2).opsize := S_YMM; + taicpu(hp2).loadreg(0, CurrentReg); + taicpu(hp2).loadreg(1, CurrentReg); + taicpu(hp2).loadreg(2, CurrentReg); + taicpu(hp2).ops := 3; + end; RemoveInstruction(hp3); Result := True; @@ -6241,15 +6248,21 @@ unit aoptx86; DebugMsg(SPeepholeOptimization + 'Used ' + debug_regname(CurrentReg) + ' to merge a pair of memory moves (VmovdqxVmovdqxVmovdqxVmovdqx2VmovdqyVmovdqy 2)', p); - taicpu(hp1).opcode := A_VPXOR; - taicpu(hp1).opsize := S_YMM; - taicpu(hp1).loadreg(0, CurrentReg); - taicpu(hp1).loadreg(1, CurrentReg); - taicpu(hp1).loadreg(2, CurrentReg); - taicpu(hp1).ops := 3; + { If pi_uses_ymm is set, VZEROUPPER is present to do this for us } + if (pi_uses_ymm in current_procinfo.flags) then + RemoveInstruction(hp1) + else + begin + taicpu(hp1).opcode := A_VPXOR; + taicpu(hp1).opsize := S_YMM; + taicpu(hp1).loadreg(0, CurrentReg); + taicpu(hp1).loadreg(1, CurrentReg); + taicpu(hp1).loadreg(2, CurrentReg); + taicpu(hp1).ops := 3; - Asml.Remove(hp1); - Asml.InsertAfter(hp1, hp3); { Register deallocations will be after hp3 } + Asml.Remove(hp1); + Asml.InsertAfter(hp1, hp3); { Register deallocations will be after hp3 } + end; RemoveCurrentP(p, hp2); Result := True; @@ -10569,6 +10582,42 @@ unit aoptx86; end; {$endif} + function TX86AsmOptimizer.PostPeepholeOptVPXOR(var p : tai) : Boolean; + var + XReg: TRegister; + begin + Result := False; + { Turn "vpxor %ymmreg2,%ymmreg2,%ymmreg1" to "vpxor %xmmreg2,%xmmreg2,%xmmreg1" + Smaller encoding and slightly faster on some platforms (also works for + ZMM-sized registers) } + if (taicpu(p).opsize in [S_YMM, S_ZMM]) and + MatchOpType(taicpu(p), top_reg, top_reg, top_reg) then + begin + XReg := taicpu(p).oper[0]^.reg; + if (taicpu(p).oper[1]^.reg = XReg) then + begin + taicpu(p).changeopsize(S_XMM); + setsubreg(taicpu(p).oper[2]^.reg, R_SUBMMX); + if (cs_opt_size in current_settings.optimizerswitches) then + begin + { Change input registers to %xmm0 to reduce size. Note that + there's a risk of a false dependency doing this, so only + optimise for size here } + XReg := NR_XMM0; + DebugMsg(SPeepholeOptimization + 'Changed zero-setting vpxor from Y/ZMM to XMM and changed input registers to %xmm0 to reduce size', p); + end + else + begin + setsubreg(XReg, R_SUBMMX); + DebugMsg(SPeepholeOptimization + 'Changed zero-setting vpxor from Y/ZMM to XMM to reduce size and increase efficiency', p); + end; + taicpu(p).oper[0]^.reg := XReg; + taicpu(p).oper[1]^.reg := XReg; + Result := True; + end; + end; + end; + class procedure TX86AsmOptimizer.OptimizeRefs(var p: taicpu); var diff --git a/compiler/x86_64/aoptcpu.pas b/compiler/x86_64/aoptcpu.pas index d60fe79b0f..c4810c8903 100644 --- a/compiler/x86_64/aoptcpu.pas +++ b/compiler/x86_64/aoptcpu.pas @@ -234,6 +234,8 @@ uses Result:=PostPeepholeOptPush(p); A_SHR: Result:=PostPeepholeOptShr(p); + A_VPXOR: + Result:=PostPeepholeOptVPXOR(p); else ; end;