From bae583118a0e685032f2712943d2ec5bcdbb900b Mon Sep 17 00:00:00 2001 From: florian Date: Wed, 26 May 2021 21:03:56 +0000 Subject: [PATCH] * patch by J. Gareth Moreton: x86 JccMovJmpMov2MovSetcc improvement, resolves #38761 git-svn-id: trunk@49402 - --- compiler/aoptutils.pas | 19 + compiler/i386/aoptcpu.pas | 2 + compiler/x86/aoptx86.pas | 1236 +++++++++++++++++++++++++++-------- compiler/x86/cpubase.pas | 15 + compiler/x86_64/aoptcpu.pas | 2 + 5 files changed, 1003 insertions(+), 271 deletions(-) diff --git a/compiler/aoptutils.pas b/compiler/aoptutils.pas index a2fdada729..e72d75d50d 100644 --- a/compiler/aoptutils.pas +++ b/compiler/aoptutils.pas @@ -35,6 +35,10 @@ unit aoptutils; function MatchOpType(const p : taicpu; type0,type1,type2 : toptype) : Boolean; {$endif max_operands>2} + { Skips alignment fields, entries in SkipInstr and unused labels, and returns the next used label (or other entry) in hp2. + Returns immediately with True if hp is a used label } + function SkipAligns(hp: tai; out hp2: tai): boolean; + { skips all labels and returns the next "real" instruction } function SkipLabels(hp: tai; out hp2: tai): boolean; @@ -67,6 +71,21 @@ unit aoptutils; {$endif max_operands>2} + { Skips alignment fields, entries in SkipInstr and unused labels, and returns the next used label (or other entry) in hp2.
+ Returns immediately with True if hp is a used label } + function SkipAligns(hp: tai; out hp2: tai): boolean; + begin + while assigned(hp) and + (hp.typ in SkipInstr + [ait_label,ait_align]) Do + begin + { Stop on a label that is actually in use; unused labels are stepped over like aligns } + if (hp.typ = ait_label) and tai_label(hp).labsym.is_used then + Break; + hp := tai(hp.next); + end; + SkipAligns := SetAndTest(hp, hp2); + end; + { skips all labels and returns the next "real" instruction } function SkipLabels(hp: tai; out hp2: tai): boolean; begin diff --git a/compiler/i386/aoptcpu.pas b/compiler/i386/aoptcpu.pas index d1ee73f105..1fb2caa032 100644 --- a/compiler/i386/aoptcpu.pas +++ b/compiler/i386/aoptcpu.pas @@ -189,6 +189,8 @@ unit aoptcpu; Result:=OptPass1SHLSAL(p); A_SUB: Result:=OptPass1Sub(p); + A_Jcc: + Result:=OptPass1Jcc(p); A_MOVAPD, A_MOVAPS, A_MOVUPD, diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas index a4aab212c9..2c86bc5654 100644 --- a/compiler/x86/aoptx86.pas +++ b/compiler/x86/aoptx86.pas @@ -49,6 +49,12 @@ unit aoptx86; function RegReadByInstruction(reg : TRegister; hp : tai) : boolean; function RegInInstruction(Reg: TRegister; p1: tai): Boolean;override; function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean; + + { This version of GetNextInstructionUsingReg will look across conditional jumps, + potentially allowing further optimisation (although the caller might need to know if + it crossed a conditional jump, which is reported by setting CrossJump to True). } + function GetNextInstructionUsingRegCond(Current: tai; out Next: tai; reg: TRegister; var CrossJump: Boolean): Boolean; + { In comparison with GetNextInstructionUsingReg, GetNextInstructionUsingRegTrackingUse tracks the use of a register by allocs/dealloc, so it can ignore calls.
@@ -140,6 +146,7 @@ unit aoptx86; function OptPass1PXor(var p : tai) : boolean; function OptPass1VPXor(var p: tai): boolean; function OptPass1Imul(var p : tai) : boolean; + function OptPass1Jcc(var p : tai) : boolean; function OptPass2Movx(var p : tai): Boolean; function OptPass2MOV(var p : tai) : boolean; @@ -169,6 +176,9 @@ unit aoptx86; procedure ConvertJumpToRET(const p: tai; const ret_p: tai); + function CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean; + procedure SwapMovCmp(var p, hp1: tai); + { Processor-dependent reference optimisation } class procedure OptimizeRefs(var p: taicpu); static; end; @@ -392,6 +402,27 @@ unit aoptx86; end; + function TX86AsmOptimizer.GetNextInstructionUsingRegCond(Current: tai; out Next: tai; reg: TRegister; var CrossJump: Boolean): Boolean; + begin + { Steps forward from Current via GetNextInstruction until an instruction using reg is found, a non-instruction entry is reached, the search fails, or (when cs_opt_level3 is not set) a single step has been made. An opcode for which is_calljmpuncond is True aborts the search with False; any other is_calljmp opcode only sets CrossJump to True. Note, CrossJump keeps its input value if a conditional jump is not found - it doesn't get set to False, so the caller must initialise it } + Next := Current; + repeat + Result := GetNextInstruction(Next,Next); + if Result and (Next.typ=ait_instruction) and is_calljmp(taicpu(Next).opcode) then + if is_calljmpuncond(taicpu(Next).opcode) then + begin + Result := False; + Exit; + end + else + CrossJump := True; + until not Result or + not (cs_opt_level3 in current_settings.optimizerswitches) or + (Next.typ <> ait_instruction) or + RegInInstruction(reg,Next); + end; + + function TX86AsmOptimizer.GetNextInstructionUsingRegTrackingUse(Current: tai; out Next: tai; reg: TRegister): Boolean; begin if not(cs_opt_level3 in current_settings.optimizerswitches) then @@ -1925,7 +1956,7 @@ unit aoptx86; end; var - GetNextInstruction_p, TempRegUsed: Boolean; + GetNextInstruction_p, TempRegUsed, CrossJump: Boolean; PreMessage, RegName1, RegName2, InputVal, MaskNum: string; NewSize: topsize; CurrentReg: TRegister; @@ -1975,9 +2006,7 @@ unit aoptx86; DebugMsg(SPeepholeOptimization + 'Mov2Nop 3 done',p); RemoveCurrentp(p, hp1); - { TmpUsedRegs contains the results of
"UpdateUsedRegs(tai(p.Next))" already, - so just restore it to UsedRegs instead of calculating it again } - RestoreUsedRegs(TmpUsedRegs); + { UsedRegs got updated by RemoveCurrentp } Result := True; Exit; end; @@ -2593,176 +2622,232 @@ unit aoptx86; { search further than the next instruction for a mov } if - { check as much as possible before the expensive GetNextInstructionUsingReg call } + { check as much as possible before the expensive GetNextInstructionUsingRegCond call } (taicpu(p).oper[1]^.typ = top_reg) and (taicpu(p).oper[0]^.typ in [top_reg,top_const]) and - not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) and - { we work with hp2 here, so hp1 can be still used later on when - checking for GetNextInstruction_p } - { GetNextInstructionUsingReg only searches one instruction ahead unless -O3 is specified } - GetNextInstructionUsingReg(hp1,hp2,taicpu(p).oper[1]^.reg) and - (hp2.typ=ait_instruction) then + not RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then begin - case taicpu(hp2).opcode of - A_MOV: - if MatchOperand(taicpu(hp2).oper[0]^,taicpu(p).oper[1]^.reg) and - ((taicpu(p).oper[0]^.typ=top_const) or - ((taicpu(p).oper[0]^.typ=top_reg) and - not(RegUsedBetween(taicpu(p).oper[0]^.reg, p, hp2)) - ) - ) then - begin - { we have - mov x, %treg - mov %treg, y - } + { we work with hp2 here, so hp1 can be still used later on when + checking for GetNextInstruction_p } + hp3 := hp1; - TransferUsedRegs(TmpUsedRegs); - TmpUsedRegs[R_INTREGISTER].Update(tai(p.Next)); + { Initialise CrossJump (if it becomes True at any point, it will remain True) } + CrossJump := False; - { We don't need to call UpdateUsedRegs for every instruction between - p and hp2 because the register we're concerned about will not - become deallocated (otherwise GetNextInstructionUsingReg would - have stopped at an earlier instruction). 
[Kit] } + while GetNextInstructionUsingRegCond(hp3,hp2,taicpu(p).oper[1]^.reg,CrossJump) and + { GetNextInstructionUsingRegCond only searches one instruction ahead unless -O3 is specified } + (hp2.typ=ait_instruction) do + begin + case taicpu(hp2).opcode of + A_MOV: + if MatchOperand(taicpu(hp2).oper[0]^,taicpu(p).oper[1]^.reg) and + ((taicpu(p).oper[0]^.typ=top_const) or + ((taicpu(p).oper[0]^.typ=top_reg) and + not(RegModifiedBetween(taicpu(p).oper[0]^.reg, p, hp2)) + ) + ) then + begin + { we have + mov x, %treg + mov %treg, y + } - TempRegUsed := - RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) or - RegReadByInstruction(taicpu(p).oper[1]^.reg, hp1); + TransferUsedRegs(TmpUsedRegs); + TmpUsedRegs[R_INTREGISTER].Update(tai(p.Next)); - case taicpu(p).oper[0]^.typ Of - top_reg: - begin - { change - mov %reg, %treg - mov %treg, y + { We don't need to call UpdateUsedRegs for every instruction between + p and hp2 because the register we're concerned about will not + become deallocated (otherwise GetNextInstructionUsingReg would + have stopped at an earlier instruction). 
[Kit] } - to + TempRegUsed := + CrossJump { Assume the register is in use if it crossed a conditional jump } or + RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp2, TmpUsedRegs) or + RegReadByInstruction(taicpu(p).oper[1]^.reg, hp1); - mov %reg, y - } - CurrentReg := taicpu(p).oper[0]^.reg; { Saves on a handful of pointer dereferences } - RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg); - if taicpu(hp2).oper[1]^.reg = CurrentReg then + case taicpu(p).oper[0]^.typ Of + top_reg: begin - { %reg = y - remove hp2 completely (doing it here instead of relying on - the "mov %reg,%reg" optimisation might cut down on a pass iteration) } + { change + mov %reg, %treg + mov %treg, y - if TempRegUsed then + to + + mov %reg, y + } + CurrentReg := taicpu(p).oper[0]^.reg; { Saves on a handful of pointer dereferences } + RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg); + if taicpu(hp2).oper[1]^.reg = CurrentReg then + begin + { %reg = y - remove hp2 completely (doing it here instead of relying on + the "mov %reg,%reg" optimisation might cut down on a pass iteration) } + + if TempRegUsed then + begin + DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + RegName1 + '; removed unnecessary instruction (MovMov2MovNop 6b}',hp2); + AllocRegBetween(CurrentReg, p, hp2, UsedRegs); + { Set the start of the next GetNextInstructionUsingRegCond search + to start at the entry right before hp2 (which is about to be removed) } + hp3 := tai(hp2.Previous); + RemoveInstruction(hp2); + + { See if there's more we can optimise } + Continue; + end + else + begin + RemoveInstruction(hp2); + + { We can remove the original MOV too } + DebugMsg(SPeepholeOptimization + 'MovMov2NopNop 6b done',p); + RemoveCurrentP(p, hp1); + Result:=true; + Exit; + end; + end + else begin - DebugMsg(SPeepholeOptimization + debug_regname(CurrentReg) + ' = ' + RegName1 + '; removed unnecessary instruction (MovMov2MovNop 6b}',hp2); AllocRegBetween(CurrentReg, p, hp2, UsedRegs); - RemoveInstruction(hp2); - 
end - else - begin - RemoveInstruction(hp2); + taicpu(hp2).loadReg(0, CurrentReg); + if TempRegUsed then + begin + { Don't remove the first instruction if the temporary register is in use } + DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_regname(CurrentReg) + '; changed to minimise pipeline stall (MovMov2Mov 6a}',hp2); - { We can remove the original MOV too } - DebugMsg(SPeepholeOptimization + 'MovMov2NopNop 6b done',p); - RemoveCurrentP(p, hp1); - Result:=true; - Exit; - end; - end - else - begin - AllocRegBetween(CurrentReg, p, hp2, UsedRegs); - taicpu(hp2).loadReg(0, CurrentReg); - if TempRegUsed then - begin - { Don't remove the first instruction if the temporary register is in use } - DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_regname(CurrentReg) + '; changed to minimise pipeline stall (MovMov2Mov 6a}',hp2); - - { No need to set Result to True. If there's another instruction later on - that can be optimised, it will be detected when the main Pass 1 loop - reaches what is now hp2 and passes it through OptPass1MOV. [Kit] }; - end - else - begin - DebugMsg(SPeepholeOptimization + 'MovMov2Mov 6 done',p); - RemoveCurrentP(p, hp1); - Result:=true; - Exit; + { No need to set Result to True. If there's another instruction later on + that can be optimised, it will be detected when the main Pass 1 loop + reaches what is now hp2 and passes it through OptPass1MOV. 
[Kit] }; + end + else + begin + DebugMsg(SPeepholeOptimization + 'MovMov2Mov 6 done',p); + RemoveCurrentP(p, hp1); + Result:=true; + Exit; + end; end; end; - end; - top_const: - if not (cs_opt_size in current_settings.optimizerswitches) or (taicpu(hp2).opsize = S_B) then - begin - { change - mov const, %treg - mov %treg, y - - to - - mov const, y - } - if (taicpu(hp2).oper[1]^.typ=top_reg) or - ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then + top_const: + if not (cs_opt_size in current_settings.optimizerswitches) or (taicpu(hp2).opsize = S_B) then begin - RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg); - taicpu(hp2).loadOper(0,taicpu(p).oper[0]^); + { change + mov const, %treg + mov %treg, y - if TempRegUsed then + to + + mov const, y + } + if (taicpu(hp2).oper[1]^.typ=top_reg) or + ((taicpu(p).oper[0]^.val>=low(longint)) and (taicpu(p).oper[0]^.val<=high(longint))) then begin - { Don't remove the first instruction if the temporary register is in use } - DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 7a)',hp2); + RegName1 := debug_regname(taicpu(hp2).oper[0]^.reg); + taicpu(hp2).loadOper(0,taicpu(p).oper[0]^); - { No need to set Result to True. If there's another instruction later on - that can be optimised, it will be detected when the main Pass 1 loop - reaches what is now hp2 and passes it through OptPass1MOV. [Kit] }; + if TempRegUsed then + begin + { Don't remove the first instruction if the temporary register is in use } + DebugMsg(SPeepholeOptimization + RegName1 + ' = ' + debug_tostr(taicpu(p).oper[0]^.val) + '; changed to minimise pipeline stall (MovMov2Mov 7a)',hp2); + + { No need to set Result to True. If there's another instruction later on + that can be optimised, it will be detected when the main Pass 1 loop + reaches what is now hp2 and passes it through OptPass1MOV. 
[Kit] }; + end + else + begin + DebugMsg(SPeepholeOptimization + 'MovMov2Mov 7 done',p); + RemoveCurrentP(p, hp1); + Result:=true; + Exit; + end; + end; + end; + else + Internalerror(2019103001); + end; + end; + A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}: + if MatchOpType(taicpu(hp2), top_reg, top_reg) and + MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and + SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[1]^.reg) then + begin + { + Change from: + mov ###, %reg + ... + movs/z %reg,%reg (Same register, just different sizes) + + To: + movs/z ###, %reg (Longer version) + ... + (remove) + } + DebugMsg(SPeepholeOptimization + 'MovMovs/z2Mov/s/z done', p); + taicpu(p).oper[1]^.reg := taicpu(hp2).oper[1]^.reg; + + { Keep the first instruction as mov if ### is a constant } + if taicpu(p).oper[0]^.typ = top_const then + taicpu(p).opsize := reg2opsize(taicpu(hp2).oper[1]^.reg) + else + begin + taicpu(p).opcode := taicpu(hp2).opcode; + taicpu(p).opsize := taicpu(hp2).opsize; + end; + + DebugMsg(SPeepholeOptimization + 'Removed movs/z instruction and extended earlier write (MovMovs/z2Mov/s/z)', hp2); + AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp2, UsedRegs); + RemoveInstruction(hp2); + + Result := True; + Exit; + end; + else + if MatchOpType(taicpu(p), top_reg, top_reg) then + begin + CurrentReg := taicpu(p).oper[1]^.reg; + TransferUsedRegs(TmpUsedRegs); + TmpUsedRegs[R_INTREGISTER].Update(tai(p.Next)); + if + not RegModifiedByInstruction(taicpu(p).oper[0]^.reg, hp1) and + not RegModifiedBetween(taicpu(p).oper[0]^.reg, hp1, hp2) and + DeepMovOpt(taicpu(p), taicpu(hp2)) then + begin + { Just in case something didn't get modified (e.g. 
an + implicit register) } + if not RegReadByInstruction(CurrentReg, hp2) and + { If a conditional jump was crossed, do not delete + the original MOV no matter what } + not CrossJump then + begin + TransferUsedRegs(TmpUsedRegs); + UpdateUsedRegs(TmpUsedRegs, tai(p.Next)); + UpdateUsedRegs(TmpUsedRegs, tai(hp1.Next)); + + if not RegUsedAfterInstruction(CurrentReg, hp2, TmpUsedRegs) then + begin + { We can remove the original MOV } + DebugMsg(SPeepholeOptimization + 'Mov2Nop 3b done',p); + RemoveCurrentp(p, hp1); + Result := True; + Exit; end else begin - DebugMsg(SPeepholeOptimization + 'MovMov2Mov 7 done',p); - RemoveCurrentP(p, hp1); - Result:=true; - Exit; + { See if there's more we can optimise } + hp3 := hp2; + Continue; end; - end; + + end; end; - else - Internalerror(2019103001); end; - end; - A_MOVZX, A_MOVSX{$ifdef x86_64}, A_MOVSXD{$endif x86_64}: - if MatchOpType(taicpu(hp2), top_reg, top_reg) and - MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and - SuperRegistersEqual(taicpu(hp2).oper[1]^.reg, taicpu(p).oper[1]^.reg) then - begin - { - Change from: - mov ###, %reg - ... - movs/z %reg,%reg (Same register, just different sizes) + end; - To: - movs/z ###, %reg (Longer version) - ... 
- (remove) - } - DebugMsg(SPeepholeOptimization + 'MovMovs/z2Mov/s/z done', p); - taicpu(p).oper[1]^.reg := taicpu(hp2).oper[1]^.reg; + { Break out of the while loop under normal circumstances } + Break; + end; - { Keep the first instruction as mov if ### is a constant } - if taicpu(p).oper[0]^.typ = top_const then - taicpu(p).opsize := reg2opsize(taicpu(hp2).oper[1]^.reg) - else - begin - taicpu(p).opcode := taicpu(hp2).opcode; - taicpu(p).opsize := taicpu(hp2).opsize; - end; - - DebugMsg(SPeepholeOptimization + 'Removed movs/z instruction and extended earlier write (MovMovs/z2Mov/s/z)', hp2); - AllocRegBetween(taicpu(hp2).oper[1]^.reg, p, hp2, UsedRegs); - RemoveInstruction(hp2); - - Result := True; - Exit; - end; - else - ; - end; end; if (aoc_MovAnd2Mov_3 in OptsToCheck) and @@ -3277,65 +3362,90 @@ unit aoptx86; JumpLabel, JumpLabel_dist: TAsmLabel; begin Result := False; - { Search for: - test %reg,%reg - j(c1) @lbl1 - ... - @lbl: - test %reg,%reg (same register) - j(c2) @lbl2 - If c2 is a subset of c1, change to: - test %reg,%reg - j(c1) @lbl2 - (@lbl1 may become a dead label as a result) - } - - if MatchOpType(taicpu(p), top_reg, top_reg) and - (taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg) and - GetNextInstruction(p, hp1) and - MatchInstruction(hp1, A_JCC, []) and - (taicpu(hp1).oper[0]^.typ = top_ref) then + if (taicpu(p).oper[1]^.typ = top_reg) then begin - JumpLabel := TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol); - p_label := nil; - if Assigned(JumpLabel) then - p_label := getlabelwithsym(JumpLabel); - - if Assigned(p_label) and - GetNextInstruction(p_label, p_dist) and - MatchInstruction(p_dist, A_TEST, []) and - { It's fine if the second test uses smaller sub-registers } - (taicpu(p_dist).opsize <= taicpu(p).opsize) and - MatchOpType(taicpu(p_dist), top_reg, top_reg) and - SuperRegistersEqual(taicpu(p_dist).oper[0]^.reg, taicpu(p).oper[0]^.reg) and - SuperRegistersEqual(taicpu(p_dist).oper[1]^.reg, taicpu(p).oper[1]^.reg) and - 
GetNextInstruction(p_dist, hp1_dist) and - MatchInstruction(hp1_dist, A_JCC, []) then + if GetNextInstruction(p, hp1) and + MatchInstruction(hp1,A_MOV,[]) and + not RegInInstruction(taicpu(p).oper[1]^.reg, hp1) and + ( + (taicpu(p).oper[0]^.typ <> top_reg) or + not RegInInstruction(taicpu(p).oper[0]^.reg, hp1) + ) then begin - JumpLabel_dist := TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol); + { If we have something like: + test %reg1,%reg1 + mov 0,%reg2 - if JumpLabel = JumpLabel_dist then - { This is an infinite loop } - Exit; + And no registers are shared (the two %reg1's can be different, as + long as neither of them are also %reg2), move the MOV command to + before the comparison as this means it can be optimised without + worrying about the FLAGS register. (This combination is generated + by "J(c)Mov1JmpMov0 -> Set(~c)", among other things). + } + SwapMovCmp(p, hp1); + Result := True; + Exit; + end; - { Best optimisation when the second condition is a subset (or equal) to the first } - if condition_in(taicpu(hp1_dist).condition, taicpu(hp1).condition) then + { Search for: + test %reg,%reg + j(c1) @lbl1 + ... 
+ @lbl: + test %reg,%reg (same register) + j(c2) @lbl2 + + If c2 is a subset of c1, change to: + test %reg,%reg + j(c1) @lbl2 + (@lbl1 may become a dead label as a result) + } + + if (taicpu(p).oper[0]^.typ = top_reg) and + (taicpu(p).oper[0]^.reg = taicpu(p).oper[1]^.reg) and + MatchInstruction(hp1, A_JCC, []) and + (taicpu(hp1).oper[0]^.typ = top_ref) then + begin + JumpLabel := TAsmLabel(taicpu(hp1).oper[0]^.ref^.symbol); + p_label := nil; + if Assigned(JumpLabel) then + p_label := getlabelwithsym(JumpLabel); + + if Assigned(p_label) and + GetNextInstruction(p_label, p_dist) and + MatchInstruction(p_dist, A_TEST, []) and + { It's fine if the second test uses smaller sub-registers } + (taicpu(p_dist).opsize <= taicpu(p).opsize) and + MatchOpType(taicpu(p_dist), top_reg, top_reg) and + SuperRegistersEqual(taicpu(p_dist).oper[0]^.reg, taicpu(p).oper[0]^.reg) and + SuperRegistersEqual(taicpu(p_dist).oper[1]^.reg, taicpu(p).oper[1]^.reg) and + GetNextInstruction(p_dist, hp1_dist) and + MatchInstruction(hp1_dist, A_JCC, []) then begin - if Assigned(JumpLabel_dist) then - JumpLabel_dist.IncRefs; + JumpLabel_dist := TAsmLabel(taicpu(hp1_dist).oper[0]^.ref^.symbol); - if Assigned(JumpLabel) then - JumpLabel.DecRefs; + if JumpLabel = JumpLabel_dist then + { This is an infinite loop } + Exit; - DebugMsg(SPeepholeOptimization + 'TEST/Jcc/@Lbl/TEST/Jcc -> TEST/Jcc, redirecting first jump', hp1); - taicpu(hp1).loadref(0, taicpu(hp1_dist).oper[0]^.ref^); - Result := True; - Exit; + { Best optimisation when the second condition is a subset (or equal) to the first } + if condition_in(taicpu(hp1_dist).condition, taicpu(hp1).condition) then + begin + if Assigned(JumpLabel_dist) then + JumpLabel_dist.IncRefs; + + if Assigned(JumpLabel) then + JumpLabel.DecRefs; + + DebugMsg(SPeepholeOptimization + 'TEST/Jcc/@Lbl/TEST/Jcc -> TEST/Jcc, redirecting first jump', hp1); + taicpu(hp1).loadref(0, taicpu(hp1_dist).oper[0]^.ref^); + Result := True; + Exit; + end; end; end; end; - end; @@ 
-4626,6 +4736,29 @@ unit aoptx86; end; end; end; + + if (taicpu(p).oper[1]^.typ = top_reg) and + GetNextInstruction(p, hp1) and + MatchInstruction(hp1,A_MOV,[]) and + not RegInInstruction(taicpu(p).oper[1]^.reg, hp1) and + ( + (taicpu(p).oper[0]^.typ <> top_reg) or + not RegInInstruction(taicpu(p).oper[0]^.reg, hp1) + ) then + begin + { If we have something like: + cmp ###,%reg1 + mov 0,%reg2 + + And no registers are shared, move the MOV command to before the + comparison as this means it can be optimised without worrying + about the FLAGS register. (This combination is generated by + "J(c)Mov1JmpMov0 -> Set(~c)", among other things). + } + SwapMovCmp(p, hp1); + Result := True; + Exit; + end; end; @@ -4747,8 +4880,428 @@ unit aoptx86; end; + function TX86AsmOptimizer.OptPass1Jcc(var p : tai) : boolean; + var + hp1, hp2, hp3, hp4, hp5: tai; + ThisReg: TRegister; + begin + Result := False; + if not GetNextInstruction(p,hp1) or (hp1.typ <> ait_instruction) then + Exit; - function TX86AsmOptimizer.OptPass2MOV(var p : tai) : boolean; + { Pass 1 optimisation for the conditional jump p; returns True if p was changed into a SETcc. + convert + j(c) .L1 + mov 1,reg + jmp .L2 + .L1 + mov 0,reg + .L2 + + into + mov 0,reg + set(~c) reg (set(c) if the two constants are the other way around) + + take care of alignment and that the mov 0,reg is not converted into a xor as this + would destroy the flag contents + + Use MOVZX if size is preferred, since while mov 0,reg is bigger, it can be + executed at the same time as a previous comparison.
+ set(~c) reg + movzx reg, reg + } + + if MatchInstruction(hp1,A_MOV,[]) and + (taicpu(hp1).oper[0]^.typ = top_const) and + ( + ( + (taicpu(hp1).oper[1]^.typ = top_reg) +{$ifdef i386} + { Under i386, ESI, EDI, EBP and ESP + don't have an 8-bit representation } + and not (getsupreg(taicpu(hp1).oper[1]^.reg) in [RS_ESI, RS_EDI, RS_EBP, RS_ESP]) + +{$endif i386} + ) or ( +{$ifdef i386} + (taicpu(hp1).oper[1]^.typ <> top_reg) and +{$endif i386} + (taicpu(hp1).opsize = S_B) + ) + ) and + GetNextInstruction(hp1,hp2) and + { The operand type must be checked before ref^ is read: an indirect jump through a register has no reference } MatchInstruction(hp2,A_JMP,[]) and (taicpu(hp2).oper[0]^.typ=top_ref) and (taicpu(hp2).oper[0]^.ref^.refaddr=addr_full) and + GetNextInstruction(hp2,hp3) and + SkipAligns(hp3, hp3) and + (hp3.typ=ait_label) and + (tasmlabel(taicpu(p).oper[0]^.ref^.symbol)=tai_label(hp3).labsym) and + GetNextInstruction(hp3,hp4) and + MatchInstruction(hp4,A_MOV,[taicpu(hp1).opsize]) and + (taicpu(hp4).oper[0]^.typ = top_const) and + ( + ((taicpu(hp1).oper[0]^.val = 0) and (taicpu(hp4).oper[0]^.val = 1)) or + ((taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0)) + ) and + MatchOperand(taicpu(hp1).oper[1]^,taicpu(hp4).oper[1]^) and + GetNextInstruction(hp4,hp5) and + SkipAligns(hp5, hp5) and + (hp5.typ=ait_label) and + (tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol)=tai_label(hp5).labsym) then + begin + if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then + taicpu(p).condition := inverse_cond(taicpu(p).condition); + + tai_label(hp3).labsym.DecRefs; + + { If this isn't the only reference to the middle label, we can + still make a saving - only that the first jump and everything + that follows will remain.
} + if (tai_label(hp3).labsym.getrefs = 0) then + begin + if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then + DebugMsg(SPeepholeOptimization + 'J(c)Mov1JmpMov0 -> Set(~c)',p) + else + DebugMsg(SPeepholeOptimization + 'J(c)Mov0JmpMov1 -> Set(c)',p); + + { remove jump, first label and second MOV (also catching any aligns) } + repeat + if not GetNextInstruction(hp2, hp3) then + InternalError(2021040810); + + RemoveInstruction(hp2); + + hp2 := hp3; + until hp2 = hp5; + + { Don't decrement reference count before the removal loop + above, otherwise GetNextInstruction won't stop on + the label } + tai_label(hp5).labsym.DecRefs; + end + else + begin + if (taicpu(hp1).oper[0]^.val = 1) and (taicpu(hp4).oper[0]^.val = 0) then + DebugMsg(SPeepholeOptimization + 'J(c)Mov1JmpMov0 -> Set(~c) (partial)',p) + else + DebugMsg(SPeepholeOptimization + 'J(c)Mov0JmpMov1 -> Set(c) (partial)',p); + end; + + taicpu(p).opcode:=A_SETcc; + taicpu(p).opsize:=S_B; + taicpu(p).is_jmp:=False; + + if taicpu(hp1).opsize=S_B then + begin + taicpu(p).loadoper(0, taicpu(hp1).oper[1]^); + RemoveInstruction(hp1); + end + else + begin + { Will be a register because the size can't be S_B otherwise } + ThisReg := newreg(R_INTREGISTER,getsupreg(taicpu(hp1).oper[1]^.reg), R_SUBL); + taicpu(p).loadreg(0, ThisReg); + + if (cs_opt_size in current_settings.optimizerswitches) and IsMOVZXAcceptable then + begin + case taicpu(hp1).opsize of + S_W: + taicpu(hp1).opsize := S_BW; + S_L: + taicpu(hp1).opsize := S_BL; +{$ifdef x86_64} + S_Q: + begin + taicpu(hp1).opsize := S_BL; + { Change the destination register to 32-bit } + taicpu(hp1).loadreg(1, newreg(R_INTREGISTER,getsupreg(ThisReg), R_SUBD)); + end; +{$endif x86_64} + else + InternalError(2021040820); + end; + + taicpu(hp1).opcode := A_MOVZX; + taicpu(hp1).loadreg(0, ThisReg); + end + else + begin + AllocRegBetween(NR_FLAGS,p,hp1,UsedRegs); + + { hp1 is already a MOV instruction with the correct register } + taicpu(hp1).loadconst(0,
0); + + { Inserting it right before p will guarantee that the flags are also tracked } + asml.Remove(hp1); + asml.InsertBefore(hp1, p); + end; + end; + + Result:=true; + exit; + end + end; + + + function TX86AsmOptimizer.CheckJumpMovTransferOpt(var p: tai; hp1: tai; LoopCount: Integer; out Count: Integer): Boolean; + var + hp2, hp3, first_assignment: tai; + IncCount, OperIdx: Integer; + OrigLabel: TAsmLabel; + begin + Count := 0; + Result := False; + first_assignment := nil; + if (LoopCount >= 20) then + begin + { Guard against infinite loops } + Exit; + end; + if (taicpu(p).oper[0]^.typ <> top_ref) or + (taicpu(p).oper[0]^.ref^.refaddr <> addr_full) or + (taicpu(p).oper[0]^.ref^.base <> NR_NO) or + (taicpu(p).oper[0]^.ref^.index <> NR_NO) or + not (taicpu(p).oper[0]^.ref^.symbol is TAsmLabel) then + Exit; + + OrigLabel := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol); + + { p is the jump being analysed (on success it is set to the first duplicated assignment); hp1 is the first instruction at the jump destination, or nil to look it up from the label; LoopCount is the recursion depth (the search gives up at 20); Count returns the number of assignments found. Returns True if the assignments were duplicated in front of p and p was redirected or converted to a RET. + change + jmp .L1 + ... + .L1: + mov ##, ## ( multiple movs possible ) + jmp/ret + into + mov ##, ## + jmp/ret + } + + if not Assigned(hp1) then + begin + hp1 := GetLabelWithSym(OrigLabel); + if not Assigned(hp1) or not SkipLabels(hp1, hp1) then + Exit; + + end; + + hp2 := hp1; + + while Assigned(hp2) do + begin + if Assigned(hp2) and (hp2.typ in [ait_label, ait_align]) then + SkipLabels(hp2,hp2); + + if not Assigned(hp2) or (hp2.typ <> ait_instruction) then + Break; + + case taicpu(hp2).opcode of + A_MOVSS: + begin + if taicpu(hp2).ops = 0 then + { Wrong MOVSS } + Break; + Inc(Count); + if Count >= 5 then + { Too many to be worthwhile } + Break; + GetNextInstruction(hp2, hp2); +
Continue; + end; + A_MOV, + A_MOVD, + A_MOVQ, + A_MOVSX, +{$ifdef x86_64} + A_MOVSXD, +{$endif x86_64} + A_MOVZX, + A_MOVAPS, + A_MOVUPS, + A_MOVSD, + A_MOVAPD, + A_MOVUPD, + A_MOVDQA, + A_MOVDQU, + A_VMOVSS, + A_VMOVAPS, + A_VMOVUPS, + A_VMOVSD, + A_VMOVAPD, + A_VMOVUPD, + A_VMOVDQA, + A_VMOVDQU: + begin + Inc(Count); + if Count >= 5 then + { Too many to be worthwhile } + Break; + GetNextInstruction(hp2, hp2); + Continue; + end; + A_JMP: + begin + { Guard against jumps without a symbol reference (e.g. through a register), whose symbol could not be re-referenced below, and against infinite loops } + if (taicpu(hp2).oper[0]^.typ <> top_ref) or not Assigned(taicpu(hp2).oper[0]^.ref^.symbol) or (taicpu(hp2).oper[0]^.ref^.symbol = OrigLabel) then + Exit; + + { Analyse this jump first in case it also duplicates assignments } + if CheckJumpMovTransferOpt(hp2, nil, LoopCount + 1, IncCount) then + begin + { Something did change! } + Result := True; + + Inc(Count, IncCount); + if Count >= 5 then + begin + { Too many to be worthwhile } + Exit; + end; + + if MatchInstruction(hp2, [A_JMP, A_RET], []) then + Break; + end; + + Result := True; + Break; + end; + A_RET: + begin + Result := True; + Break; + end; + else + Break; + end; + end; + + if Result then + begin + { A count of zero can happen when CheckJumpMovTransferOpt is called recursively } + if Count = 0 then + begin + Result := False; + Exit; + end; + + hp3 := p; + DebugMsg(SPeepholeOptimization + 'Duplicated ' + debug_tostr(Count) + ' assignment(s) and redirected jump', p); + while True do + begin + if Assigned(hp1) and (hp1.typ in [ait_label, ait_align]) then + SkipLabels(hp1,hp1); + + if (hp1.typ <> ait_instruction) then + InternalError(2021040720); + + case taicpu(hp1).opcode of + A_JMP: + begin + { Change the original jump to the new destination } + OrigLabel.decrefs; + taicpu(hp1).oper[0]^.ref^.symbol.increfs; + taicpu(p).loadref(0, taicpu(hp1).oper[0]^.ref^); + + { Set p to the first duplicated assignment so it can get optimised if needs be } + if not Assigned(first_assignment) then + InternalError(2021040810) { NOTE(review): the same number is also raised in OptPass1Jcc - confirm whether it is meant to be shared } + else + p := first_assignment; + + Exit; + end; + A_RET: + begin + { Now change the jump into a RET instruction } + ConvertJumpToRET(p, hp1); + + { Set p to the first duplicated assignment so it can get optimised if needs be } + if not Assigned(first_assignment) then + InternalError(2021040811) + else + p := first_assignment; + + Exit; + end; + else + begin + { Duplicate the MOV instruction } + hp3:=tai(hp1.getcopy); + if first_assignment = nil then + first_assignment := hp3; + + asml.InsertBefore(hp3, p); + + { Make sure
the compiler knows about any final registers written here } + for OperIdx := 0 to taicpu(hp3).ops - 1 do + with taicpu(hp3).oper[OperIdx]^ do + begin + case typ of + top_ref: + begin + if (ref^.base <> NR_NO) and + (getsupreg(ref^.base) <> RS_ESP) and + (getsupreg(ref^.base) <> RS_EBP) + {$ifdef x86_64} and (ref^.base <> NR_RIP) {$endif x86_64} + then + AllocRegBetween(ref^.base, hp3, tai(p.Next), UsedRegs); + if (ref^.index <> NR_NO) and + (getsupreg(ref^.index) <> RS_ESP) and + (getsupreg(ref^.index) <> RS_EBP) + {$ifdef x86_64} and (ref^.index <> NR_RIP) {$endif x86_64} and + (ref^.index <> ref^.base) then + AllocRegBetween(ref^.index, hp3, tai(p.Next), UsedRegs); + end; + top_reg: + AllocRegBetween(reg, hp3, tai(p.Next), UsedRegs); + else + ; + end; + end; + end; + end; + + if not GetNextInstruction(hp1, hp1) then + { Should have dropped out earlier } + InternalError(2021040710); + end; + end; + end; + + + procedure TX86AsmOptimizer.SwapMovCmp(var p, hp1: tai); + var + hp2: tai; + X: Integer; + begin + asml.Remove(hp1); + + { Moves the MOV at hp1 in front of the comparison p. Try to insert it after the instruction that GetLastInstruction finds before p, where the FLAGS register is not yet in use; otherwise insert it directly before p } + if not GetLastInstruction(p, hp2) then + asml.InsertBefore(hp1, p) + else + asml.InsertAfter(hp1, hp2); + + DebugMsg(SPeepholeOptimization + 'Swapped ' + debug_op2str(taicpu(p).opcode) + ' and mov instructions to improve optimisation potential', hp1); + + for X := 0 to 1 do + case taicpu(hp1).oper[X]^.typ of + top_reg: + AllocRegBetween(taicpu(hp1).oper[X]^.reg, hp1, p, UsedRegs); + top_ref: + begin + if taicpu(hp1).oper[X]^.ref^.base <> NR_NO then + AllocRegBetween(taicpu(hp1).oper[X]^.ref^.base, hp1, p, UsedRegs); + if taicpu(hp1).oper[X]^.ref^.index <> NR_NO then + AllocRegBetween(taicpu(hp1).oper[X]^.ref^.index, hp1, p, UsedRegs); + end; + else + ; + end; + end; + + + function TX86AsmOptimizer.OptPass2MOV(var p : tai) : boolean; function IsXCHGAcceptable: Boolean; inline; begin @@ -4769,13 +5322,156 @@ unit aoptx86; var NewRef: TReference; -
hp1,hp2,hp3: tai; + hp1, hp2, hp3, hp4: Tai; {$ifndef x86_64} - hp4: tai; - OperIdx: Integer; + OperIdx: Integer; {$endif x86_64} - begin + NewInstr : Taicpu; + NewAligh : Tai_align; { NOTE(review): NewAligh is not referenced anywhere in the visible part of this hunk and the name looks like a typo for NewAlign - confirm } + DestLabel: TAsmLabel; + begin Result:=false; + + { This optimisation adds an instruction, so only do it for speed } + if not (cs_opt_size in current_settings.optimizerswitches) and + MatchOpType(taicpu(p), top_const, top_reg) and + (taicpu(p).oper[0]^.val = 0) then + begin + + { To avoid compiler warning } + DestLabel := nil; + + if (p.typ <> ait_instruction) or (taicpu(p).oper[1]^.typ <> top_reg) then + InternalError(2021040750); { NOTE(review): the enclosing condition has already passed MatchOpType(.., top_const, top_reg), so this looks like a pure sanity check - confirm } + + if not GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[1]^.reg) then + Exit; + + case hp1.typ of + ait_label: + begin + { Change: + mov $0,%reg mov $0,%reg + @Lbl1: @Lbl1: + test %reg,%reg / cmp $0,%reg test %reg,%reg / cmp $0,%reg + je @Lbl2 jne @Lbl2 + + To: To: + mov $0,%reg mov $0,%reg + jmp @Lbl2 jmp @Lbl3 + (align) (align) + @Lbl1: @Lbl1: + test %reg,%reg / cmp $0,%reg test %reg,%reg / cmp $0,%reg + je @Lbl2 jne @Lbl2 + @Lbl3: <-- Only if label exists + + In the right-hand case @Lbl3 is the destination of a jmp (or of an always-taken jcc) found directly after the jne. + + (Not if it's optimised for size) + } + if not GetNextInstruction(hp1, hp2) then + Exit; + + if not (cs_opt_size in current_settings.optimizerswitches) and { same test as in the enclosing if, so it cannot fail at this point } + (hp2.typ = ait_instruction) and + ( + { Register sizes must exactly match } + ( + (taicpu(hp2).opcode = A_CMP) and + MatchOperand(taicpu(hp2).oper[0]^, 0) and + MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^.reg) + ) or ( + (taicpu(hp2).opcode = A_TEST) and + MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[1]^.reg) and + MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^.reg) + ) + ) and GetNextInstruction(hp2, hp3) and + (hp3.typ = ait_instruction) and + (taicpu(hp3).opcode = A_JCC) and + (taicpu(hp3).oper[0]^.typ=top_ref) and (taicpu(hp3).oper[0]^.ref^.refaddr=addr_full) and (taicpu(hp3).oper[0]^.ref^.base=NR_NO) and + (taicpu(hp3).oper[0]^.ref^.index=NR_NO) and (taicpu(hp3).oper[0]^.ref^.symbol is tasmlabel) then + begin + { 
Check condition of jump } + + { Always true? } + if condition_in(C_E, taicpu(hp3).condition) then + begin + { Copy label symbol and obtain matching label entry for the + conditional jump, as this will be our destination } + DestLabel := tasmlabel(taicpu(hp3).oper[0]^.ref^.symbol); + DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Je -> Mov0JmpLblCmp0Je', p); + Result := True; + end + + { Always false? } + else if condition_in(C_NE, taicpu(hp3).condition) and GetNextInstruction(hp3, hp2) then + begin + { This is only worth it if there's a jump to take } + + case hp2.typ of + ait_instruction: + begin + if taicpu(hp2).opcode = A_JMP then + begin + DestLabel := tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol); { NOTE(review): unlike hp3, hp2 is not checked for a top_ref operand whose symbol is a tasmlabel before this cast - confirm that an indirect jump cannot reach this point } + { An unconditional jump follows the conditional jump which will always be false, + so use this jump's destination for the new jump } + DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Jne -> Mov0JmpLblCmp0Jne (with JMP)', p); + Result := True; + end + else if taicpu(hp2).opcode = A_JCC then + begin + DestLabel := tasmlabel(taicpu(hp2).oper[0]^.ref^.symbol); { NOTE(review): same unchecked operand access as in the JMP branch; DestLabel is assigned before the condition test, but it is only read further down when Result is True } + if condition_in(C_E, taicpu(hp2).condition) then + begin + { A second conditional jump follows the conditional jump which will always be false, + while the second jump is always True, so use this jump's destination for the new jump } + DebugMsg(SPeepholeOptimization + 'Mov0LblCmp0Jne -> Mov0JmpLblCmp0Jne (with second Jcc)', p); + Result := True; + end; + + { Don't risk it if the jump isn't always true (Result remains False) } + end; + end; + else + { If anything else don't optimise }; + end; + end; + + if Result then + begin + { Just so we have something to insert as a parameter } + reference_reset(NewRef, 1, []); + NewInstr := taicpu.op_ref(A_JMP, S_NO, NewRef); + + { Now actually load the correct parameter } + NewInstr.loadsymbol(0, DestLabel, 0); + + { Get instruction before original label (may not be p under -O3) } + if not GetLastInstruction(hp1, hp2) then + { Shouldn't fail here } + InternalError(2021040701); + + 
DestLabel.increfs; + + AsmL.InsertAfter(NewInstr, hp2); + { Add new alignment field (currently disabled: the insertion below is commented out, so only the jump itself is inserted) } + (* AsmL.InsertAfter( + cai_align.create_max( + current_settings.alignment.jumpalign, + current_settings.alignment.jumpalignskipmax + ), + NewInstr + ); *) + end; + + Exit; + end; + end; + else + ; + end; + + end; + if not GetNextInstruction(p, hp1) then Exit; @@ -6352,14 +7048,28 @@ unit aoptx86; function TX86AsmOptimizer.OptPass2Jmp(var p : tai) : boolean; var - hp1, hp2, hp3: tai; - OperIdx: Integer; + hp1: tai; + Count: Integer; + OrigLabel: TAsmLabel; begin - result:=false; + result := False; + + { Sometimes, the optimisations below can permit this (the call to RemoveDeadCodeAfterJump) } + RemoveDeadCodeAfterJump(p); + if (taicpu(p).oper[0]^.typ=top_ref) and (taicpu(p).oper[0]^.ref^.refaddr=addr_full) and (taicpu(p).oper[0]^.ref^.base=NR_NO) and (taicpu(p).oper[0]^.ref^.index=NR_NO) and (taicpu(p).oper[0]^.ref^.symbol is tasmlabel) then begin - hp1:=getlabelwithsym(tasmlabel(taicpu(p).oper[0]^.ref^.symbol)); + OrigLabel := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol); + + { Also a side-effect of optimisations } + if CollapseZeroDistJump(p, OrigLabel) then + begin + Result := True; + Exit; + end; + + hp1 := GetLabelWithSym(OrigLabel); if (taicpu(p).condition=C_None) and assigned(hp1) and SkipLabels(hp1,hp1) and (hp1.typ = ait_instruction) then begin case taicpu(hp1).opcode of @@ -6377,58 +7087,35 @@ unit aoptx86; ConvertJumpToRET(p, hp1); result:=true; end; - A_MOV: - { - change - jmp .L1 - ... - .L1: - mov ##, ## - ret - into - mov ##, ## - ret - } - { This optimisation tends to increase code size if the pass 1 MOV optimisations aren't - re-run, so only do this particular optimisation if optimising for speed or when - optimisations are very in-depth. 
[Kit] } - if (current_settings.optimizerswitches * [cs_opt_level3, cs_opt_size]) <> [cs_opt_size] then + { Check any kind of direct assignment instruction. The transfer is attempted unless cs_opt_size is set without cs_opt_level3 (see the set comparison on the branch below). NOTE(review): A_MOVSS is not in the list below although A_MOVSD and A_VMOVSS are - confirm whether that is intentional } + A_MOV, + A_MOVD, + A_MOVQ, + A_MOVSX, +{$ifdef x86_64} + A_MOVSXD, +{$endif x86_64} + A_MOVZX, + A_MOVAPS, + A_MOVUPS, + A_MOVSD, + A_MOVAPD, + A_MOVUPD, + A_MOVDQA, + A_MOVDQU, + A_VMOVSS, + A_VMOVAPS, + A_VMOVUPS, + A_VMOVSD, + A_VMOVAPD, + A_VMOVUPD, + A_VMOVDQA, + A_VMOVDQU: + if ((current_settings.optimizerswitches * [cs_opt_level3, cs_opt_size]) <> [cs_opt_size]) and + CheckJumpMovTransferOpt(p, hp1, 0, Count) then begin - GetNextInstruction(hp1, hp2); - if not Assigned(hp2) then - Exit; - - if (hp2.typ in [ait_label, ait_align]) then - SkipLabels(hp2,hp2); - if Assigned(hp2) and MatchInstruction(hp2, A_RET, [S_NO]) then - begin - { Duplicate the MOV instruction } - hp3:=tai(hp1.getcopy); - asml.InsertBefore(hp3, p); - - { Make sure the compiler knows about any final registers written here } - for OperIdx := 0 to 1 do - with taicpu(hp3).oper[OperIdx]^ do - begin - case typ of - top_ref: - begin - if (ref^.base <> NR_NO) {$ifdef x86_64} and (ref^.base <> NR_RIP) {$endif x86_64} then - AllocRegBetween(ref^.base, hp3, tai(p.Next), UsedRegs); - if (ref^.index <> NR_NO) {$ifdef x86_64} and (ref^.index <> NR_RIP) {$endif x86_64} then - AllocRegBetween(ref^.index, hp3, tai(p.Next), UsedRegs); - end; - top_reg: - AllocRegBetween(reg, hp3, tai(p.Next), UsedRegs); - else - ; - end; - end; - - { Now change the jump into a RET instruction } - ConvertJumpToRET(p, hp2); - result:=true; - end; + Result := True; + Exit; end; else ; @@ -6478,22 +7165,18 @@ unit aoptx86; if GetNextInstruction(p,hp1) and (hp1.typ=ait_instruction) then begin symbol := TAsmLabel(taicpu(p).oper[0]^.ref^.symbol); - - if GetNextInstruction(hp1,hp2) and - ( - (hp2.typ=ait_label) or - { trick to skip align } - ((hp2.typ=ait_align) and GetNextInstruction(hp2,hp2) and (hp2.typ=ait_label)) - ) and - (Tasmlabel(symbol) = Tai_label(hp2).labsym) and - 
( + if ( ( ((Taicpu(hp1).opcode=A_ADD) or (Taicpu(hp1).opcode=A_SUB)) and MatchOptype(Taicpu(hp1),top_const,top_reg) and (Taicpu(hp1).oper[0]^.val=1) ) or ((Taicpu(hp1).opcode=A_INC) or (Taicpu(hp1).opcode=A_DEC)) - ) then + ) and + GetNextInstruction(hp1,hp2) and + SkipAligns(hp2, hp2) and + (hp2.typ = ait_label) and + (Tasmlabel(symbol) = Tai_label(hp2).labsym) then { jb @@1 cmc inc/dec operand --> adc/sbb operand,0 @@1: @@ -8135,8 +8818,7 @@ unit aoptx86; Change movzwl %ax,%eax to cwtl (shorter encoding for movswl %ax,%eax) } - Result := False; - if MatchOpType(taicpu(p), top_const, top_reg) and + Result := False; if MatchOpType(taicpu(p), top_const, top_reg) and (taicpu(p).oper[1]^.reg = NR_AX) and { This is also enough to determine that opsize = S_W } ((taicpu(p).oper[0]^.val and $7FFF) = taicpu(p).oper[0]^.val) and GetNextInstructionUsingReg(p, hp1, NR_EAX) and @@ -8282,6 +8964,7 @@ unit aoptx86; begin RemoveCurrentP(p, hp2); Result:=true; + Exit; end; end; A_SHL, A_SAL, A_SHR, A_SAR: @@ -8298,6 +8981,7 @@ unit aoptx86; begin RemoveCurrentP(p, hp2); Result:=true; + Exit; end; end; A_DEC, A_INC, A_NEG: @@ -8326,16 +9010,26 @@ unit aoptx86; end; RemoveCurrentP(p, hp2); Result:=true; + Exit; end; end else - { change "test $-1,%reg" into "test %reg,%reg" } - if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then - taicpu(p).loadoper(0,taicpu(p).oper[1]^); - end { case } + ; + end; { case } + { change "test $-1,%reg" into "test %reg,%reg" } - else if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then + if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then taicpu(p).loadoper(0,taicpu(p).oper[1]^); + + { Change "or %reg,%reg" to "test %reg,%reg" as OR generates a false dependency } + if MatchInstruction(p, A_OR, []) and + { Can only match if they're both registers } + MatchOperand(taicpu(p).oper[0]^, taicpu(p).oper[1]^) then + begin + DebugMsg(SPeepholeOptimization + 'or %reg,%reg -> test %reg,%reg to remove false dependency (Or2Test)', p); + 
taicpu(p).opcode := A_TEST; + { No need to set Result to True, as we've done all the optimisations we can } + end; end; diff --git a/compiler/x86/cpubase.pas b/compiler/x86/cpubase.pas index 7f80c4ea50..70b13d0fc1 100644 --- a/compiler/x86/cpubase.pas +++ b/compiler/x86/cpubase.pas @@ -340,6 +340,7 @@ topsize2memsize: array[topsize] of integer = function reg2opsize(r:Tregister):topsize; function reg_cgsize(const reg: tregister): tcgsize; function is_calljmp(o:tasmop):boolean; + function is_calljmpuncond(o:tasmop):boolean; {$ifdef USEINLINE}inline;{$endif USEINLINE} procedure inverse_flags(var f: TResFlags); function flags_to_cond(const f: TResFlags) : TAsmCond; function is_segment_reg(r:tregister):boolean; @@ -577,6 +578,20 @@ implementation end; + function is_calljmpuncond(o:tasmop):boolean; {$ifdef USEINLINE}inline;{$endif USEINLINE} { Returns true only when o is one of A_CALL, A_JMP, A_LCALL or A_LJMP, and false for every other opcode } + begin + case o of + A_CALL, + A_JMP, + A_LCALL, + A_LJMP: + is_calljmpuncond:=true; + else + is_calljmpuncond:=false; + end; + end; + + procedure inverse_flags(var f: TResFlags); const inv_flags: array[TResFlags] of TResFlags = diff --git a/compiler/x86_64/aoptcpu.pas b/compiler/x86_64/aoptcpu.pas index 6957d69b41..b5c0894bf6 100644 --- a/compiler/x86_64/aoptcpu.pas +++ b/compiler/x86_64/aoptcpu.pas @@ -145,6 +145,8 @@ uses Result:=OptPass1PXor(p); A_TEST: Result:=OptPass1Test(p); + A_Jcc: + Result:=OptPass1Jcc(p); else ; end;