diff --git a/compiler/i386/aoptcpu.pas b/compiler/i386/aoptcpu.pas index 44343d45c3..3866b7517e 100644 --- a/compiler/i386/aoptcpu.pas +++ b/compiler/i386/aoptcpu.pas @@ -191,6 +191,8 @@ unit aoptcpu; end; A_SHL, A_SAL: Result:=OptPass1SHLSAL(p); + A_SHR: + Result:=OptPass1SHR(p); A_SUB: Result:=OptPass1Sub(p); A_Jcc: diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index d46d9ca7e8..a1aa0e3573 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -135,6 +135,7 @@ unit aoptx86; class function IsExitCode(p : tai) : boolean; static; class function isFoldableArithOp(hp1 : taicpu; reg : tregister) : boolean; static; + class function IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean; static; procedure RemoveLastDeallocForFuncRes(p : tai); function DoSubAddOpt(var p : tai) : Boolean; @@ -157,6 +158,7 @@ unit aoptx86; function OptPass1LEA(var p : tai) : boolean; function OptPass1Sub(var p : tai) : boolean; function OptPass1SHLSAL(var p : tai) : boolean; + function OptPass1SHR(var p : tai) : boolean; function OptPass1FSTP(var p : tai) : boolean; function OptPass1FLD(var p : tai) : boolean; function OptPass1Cmp(var p : tai) : boolean; @@ -6448,6 +6450,152 @@ unit aoptx86; end; + { Returns True if a MOVZX of size movz_size has nothing to extend when + its source was shifted right by Shift bits at operand size shr_size, + because the bits it would clear are already zero } + class function TX86AsmOptimizer.IsShrMovZFoldable(shr_size, movz_size: topsize; Shift: TCGInt): Boolean; + begin + case shr_size of + S_B: + { No valid combinations } + Result := False; + + S_W: + Result := (Shift >= 8) and (movz_size = S_BW); + + S_L: + Result := + (Shift >= 24) { Any opsize is valid for this shift } or + ((Shift >= 16) and (movz_size = S_WL)); +{$ifdef x86_64} + S_Q: + Result := + (Shift >= 56) { Any opsize is valid for this shift } or + ((Shift >= 48) and (movz_size = S_WL)); +{$endif x86_64} + else + InternalError(2022081510); + end; + end; + + { Pass 1 optimisations for "shr const,%reg": folds a following MOVZX + into a MOV and merges adjacent constant SHRs. Result stays False; + changes are signalled through aoc_ForceNewIteration instead } + function TX86AsmOptimizer.OptPass1SHR(var p : tai) : boolean; + var + hp1, hp2: tai; + Shift: TCGInt; + LimitSize: Topsize; + DoNotMerge: Boolean; + begin + Result := False; + + { 
All these optimisations work on "shr const,%reg" } + if not MatchOpType(taicpu(p), top_const, top_reg) then + Exit; + + DoNotMerge := False; + Shift := taicpu(p).oper[0]^.val; + LimitSize := taicpu(p).opsize; + + hp1 := p; + repeat + if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then + Exit; + + case taicpu(hp1).opcode of + A_TEST, A_CMP, A_Jcc: + { Skip over conditional jumps and relevant comparisons } + Continue; + + A_MOVZX: + if MatchOpType(taicpu(hp1), top_reg, top_reg) and + SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then + begin + { Since the original register is being read as is, subsequent + SHRs must not be merged at this point } + DoNotMerge := True; + + if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, Shift) then + begin + if not SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then { Different register target } + begin + DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 1)', hp1); + taicpu(hp1).opcode := A_MOV; + setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg)); + case taicpu(hp1).opsize of + S_BW: + taicpu(hp1).opsize := S_W; + S_BL, S_WL: + taicpu(hp1).opsize := S_L; + else + InternalError(2022081503); + end; + + { p itself hasn't changed, so no need to set Result to True } + Include(OptsToCheck, aoc_ForceNewIteration); + + { See if there's anything afterwards that can be + optimised, since the input register hasn't changed } + Continue; + end; + + { NOTE: If the MOVZX instruction reads and writes the same + register, defer this to the post-peephole optimisation stage } + Exit; + end; + end; + A_SHL, A_SAL, A_SHR: + if (taicpu(hp1).opsize <= LimitSize) and + MatchOpType(taicpu(hp1), top_const, top_reg) and + SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then + begin + { Make sure the sizes don't exceed the 
register size limit + (measured by the shift value falling below the limit) } + + if taicpu(hp1).opsize < LimitSize then + LimitSize := taicpu(hp1).opsize; + + if taicpu(hp1).opcode = A_SHR then + Inc(Shift, taicpu(hp1).oper[0]^.val) + else + begin + Dec(Shift, taicpu(hp1).oper[0]^.val); + DoNotMerge := True; + end; + + if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then + Exit; + + { Since we've established that the combined shift is within + limits, we can actually combine the adjacent SHR + instructions even if they're different sizes } + if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then + begin + hp2 := tai(hp1.Previous); + DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 1', p); + Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val); + RemoveInstruction(hp1); + hp1 := hp2; + + { Though p has changed, only the constant has, and its + effects can still be detected on the next iteration of + the repeat..until loop } + Include(OptsToCheck, aoc_ForceNewIteration); + end; + + { Move onto the next instruction } + Continue; + end; + else + ; + end; + + Break; + until False; + end; + + function TX86AsmOptimizer.CheckMemoryWrite(var first_mov, second_mov: taicpu): Boolean; var CurrentRef: TReference; @@ -12930,36 +13078,198 @@ unit aoptx86; function TX86AsmOptimizer.PostPeepholeOptShr(var p : tai) : boolean; var - hp1: tai; + hp1, hp2: tai; + IdentityMask, Shift: TCGInt; + LimitSize: Topsize; + DoNotMerge: Boolean; begin - { Detect: - shr x, %ax (x > 0) - ... 
- movzwl %ax,%eax - - Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax) - } - Result := False; - if MatchOpType(taicpu(p), top_const, top_reg) and - (taicpu(p).oper[1]^.reg = NR_AX) and { This is also enough to determine that opsize = S_W } - (taicpu(p).oper[0]^.val > 0) and - GetNextInstructionUsingReg(p, hp1, NR_EAX) and - MatchInstruction(hp1, A_MOVZX, [S_WL]) and - MatchOperand(taicpu(hp1).oper[0]^, NR_AX) and - MatchOperand(taicpu(hp1).oper[1]^, NR_EAX) then - begin - DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1); - taicpu(hp1).opcode := A_CWDE; - taicpu(hp1).clearop(0); - taicpu(hp1).clearop(1); - taicpu(hp1).ops := 0; - { A change was made, but not with p, so move forward 1 } - p := tai(p.Next); - Result := True; + { Post-peephole optimisations for "shr const,%reg": removes AND and + MOVZX instructions made redundant by the shift, converts + "movzwl %ax,%eax" to CWDE and merges adjacent constant SHRs } + Result := False; { No path below assigns Result, so it must be initialised here } + + { All these optimisations work on "shr const,%reg" } + if not MatchOpType(taicpu(p), top_const, top_reg) then + Exit; + + DoNotMerge := False; + Shift := taicpu(p).oper[0]^.val; + LimitSize := taicpu(p).opsize; + + hp1 := p; + repeat + if not GetNextInstructionUsingReg(hp1, hp1, taicpu(p).oper[1]^.reg) or (hp1.typ <> ait_instruction) then + Exit; + + { Detect: + shr x, %reg + and y, %reg + + If and y, %reg doesn't actually change the value of %reg (e.g. with + "shrl $24,%reg; andl $255,%reg"), remove the AND instruction. 
+ } + + case taicpu(hp1).opcode of + A_AND: + if (taicpu(hp1).opsize = taicpu(p).opsize) and + MatchOpType(taicpu(hp1), top_const, top_reg) and + (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) then + begin + { Make sure the FLAGS register isn't in use } + TransferUsedRegs(TmpUsedRegs); + hp2 := p; + repeat + UpdateUsedRegs(TmpUsedRegs, tai(hp2.Next)); + until not GetNextInstruction(hp2, hp2) or (hp2 = hp1); + + if not RegUsedAfterInstruction(NR_DEFAULTFLAGS, hp1, TmpUsedRegs) then + begin + { Generate the identity mask } + case taicpu(p).opsize of + S_B: + IdentityMask := $FF shr Shift; + S_W: + IdentityMask := $FFFF shr Shift; + S_L: + IdentityMask := $FFFFFFFF shr Shift; +{$ifdef x86_64} + S_Q: + { We need to force the operands to be unsigned 64-bit + integers otherwise the wrong value is generated } + IdentityMask := TCGInt(QWord($FFFFFFFFFFFFFFFF) shr QWord(Shift)); +{$endif x86_64} + else + InternalError(2022081501); + end; + + if (taicpu(hp1).oper[0]^.val and IdentityMask) = IdentityMask then + begin + DebugMsg(SPeepholeOptimization + 'Removed AND instruction since previous SHR makes this an identity operation (ShrAnd2Shr)', hp1); + { All the possible 1 bits are covered, so we can remove the AND } + hp2 := tai(hp1.Previous); + RemoveInstruction(hp1); + + { p wasn't actually changed, so don't set Result to True, + but a change was nonetheless made elsewhere } + Include(OptsToCheck, aoc_ForceNewIteration); + + { Do another pass in case other AND or MOVZX instructions + follow } + hp1 := hp2; + Continue; + end; + + end; + end; + + A_TEST, A_CMP, A_Jcc: + { Skip over conditional jumps and relevant comparisons } + Continue; + + A_MOVZX: + if MatchOpType(taicpu(hp1), top_reg, top_reg) and + SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(p).oper[1]^.reg) then + begin + { Since the original register is being read as is, subsequent + SHRs must not be merged at this point } + DoNotMerge := True; + + if IsShrMovZFoldable(taicpu(p).opsize, taicpu(hp1).opsize, 
Shift) then + begin + if SuperRegistersEqual(taicpu(hp1).oper[0]^.reg, taicpu(hp1).oper[1]^.reg) then + begin + DebugMsg(SPeepholeOptimization + 'Removed MOVZX instruction since previous SHR makes it unnecessary (ShrMovz2Shr)', hp1); + { The bits the MOVZX would clear are already zero, so we can remove the MOVZX } + hp2 := tai(hp1.Previous); + RemoveInstruction(hp1); + + hp1 := hp2; + end + else { Different register target } + begin + DebugMsg(SPeepholeOptimization + 'Converted MOVZX instruction to MOV since previous SHR makes zero-extension unnecessary (ShrMovz2ShrMov 2)', hp1); + taicpu(hp1).opcode := A_MOV; + setsubreg(taicpu(hp1).oper[0]^.reg, getsubreg(taicpu(hp1).oper[1]^.reg)); + case taicpu(hp1).opsize of + S_BW: + taicpu(hp1).opsize := S_W; + S_BL, S_WL: + taicpu(hp1).opsize := S_L; + else + InternalError(2022081503); + end; + end; + end + else if (Shift > 0) and + (taicpu(p).opsize = S_W) and + (taicpu(hp1).opsize = S_WL) and + (taicpu(hp1).oper[0]^.reg = NR_AX) and + (taicpu(hp1).oper[1]^.reg = NR_EAX) then + begin + { Detect: + shr x, %ax (x > 0) + ... 
+ movzwl %ax,%eax + + Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax) + } + DebugMsg(SPeepholeOptimization + 'Converted movzwl %ax,%eax to cwtl (via ShrMovz2ShrCwtl)', hp1); + taicpu(hp1).opcode := A_CWDE; + taicpu(hp1).clearop(0); + taicpu(hp1).clearop(1); + taicpu(hp1).ops := 0; + end; + + { Move onto the next instruction } + Continue; + end; + + A_SHL, A_SAL, A_SHR: + if (taicpu(hp1).opsize <= LimitSize) and + MatchOpType(taicpu(hp1), top_const, top_reg) and + SuperRegistersEqual(taicpu(hp1).oper[1]^.reg, taicpu(p).oper[1]^.reg) then + begin + { Make sure the sizes don't exceed the register size limit + (measured by the shift value falling below the limit) } + + if taicpu(hp1).opsize < LimitSize then + LimitSize := taicpu(hp1).opsize; + + if taicpu(hp1).opcode = A_SHR then + Inc(Shift, taicpu(hp1).oper[0]^.val) + else + begin + Dec(Shift, taicpu(hp1).oper[0]^.val); + DoNotMerge := True; + end; + + if Shift < topsize2memsize[taicpu(p).opsize] - topsize2memsize[LimitSize] then + Exit; + + { Since we've established that the combined shift is within + limits, we can actually combine the adjacent SHR + instructions even if they're different sizes } + if not DoNotMerge and (taicpu(hp1).opcode = A_SHR) then + begin + hp2 := tai(hp1.Previous); + DebugMsg(SPeepholeOptimization + 'ShrShr2Shr 2', p); + Inc(taicpu(p).oper[0]^.val, taicpu(hp1).oper[0]^.val); + RemoveInstruction(hp1); + hp1 := hp2; + end; + + { Move onto the next instruction } + Continue; + end; + else + ; end; + Break; + until False; + end; diff --git a/compiler/x86_64/aoptcpu.pas b/compiler/x86_64/aoptcpu.pas index f4939258d8..4c9b52908c 100644 --- a/compiler/x86_64/aoptcpu.pas +++ b/compiler/x86_64/aoptcpu.pas @@ -130,6 +130,8 @@ uses result:=OptPass1Sub(p); A_SHL,A_SAL: result:=OptPass1SHLSAL(p); + A_SHR: + result:=OptPass1SHR(p); A_FSTP,A_FISTP: result:=OptPass1FSTP(p); A_FLD: