diff --git a/compiler/arm/aoptcpu.pas b/compiler/arm/aoptcpu.pas index e5f8f93aee..12160f85bc 100644 --- a/compiler/arm/aoptcpu.pas +++ b/compiler/arm/aoptcpu.pas @@ -1284,504 +1284,512 @@ Implementation { All the optimisations from this point on require GetNextInstructionUsingReg to return True } - if not ( + while ( GetNextInstructionUsingReg(p, hpfar1, taicpu(p).oper[0]^.reg) and (hpfar1.typ = ait_instruction) - ) then - Exit; - - { Change the common - mov r0, r0, lsr #xxx - and r0, r0, #yyy/bic r0, r0, #xxx - - and remove the superfluous and/bic if possible - - This could be extended to handle more cases. - } - - { Change - mov rx, ry, lsr/ror #xxx - uxtb/uxth rz,rx/and rz,rx,0xFF - dealloc rx - - to - - uxtb/uxth rz,ry,ror #xxx - } - if (GenerateThumb2Code) and - (taicpu(p).ops=3) and - (taicpu(p).oper[2]^.typ = top_shifterop) and - (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and - (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ROR]) and - RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then - begin - if MatchInstruction(hpfar1, A_UXTB, [C_None], [PF_None]) and - (taicpu(hpfar1).ops = 2) and - (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and - MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then - begin - taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg; - taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^); - taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR; - taicpu(hpfar1).ops := 3; - - if not Assigned(hp1) then - GetNextInstruction(p,hp1); - - RemoveCurrentP(p, hp1); - - result:=true; - exit; - end - else if MatchInstruction(hpfar1, A_UXTH, [C_None], [PF_None]) and - (taicpu(hpfar1).ops=2) and - (taicpu(p).oper[2]^.shifterop^.shiftimm in [16]) and - MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then - begin - taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg; - taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^); - taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR; - taicpu(hpfar1).ops := 3; - - if not Assigned(hp1) then - GetNextInstruction(p,hp1); - - RemoveCurrentP(p, hp1); - - result:=true; - exit; - end - else if MatchInstruction(hpfar1, A_AND, [C_None], [PF_None]) and - (taicpu(hpfar1).ops = 3) and - (taicpu(hpfar1).oper[2]^.typ = top_const) and - (taicpu(hpfar1).oper[2]^.val = $FF) and - (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and - MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then - begin - taicpu(hpfar1).ops := 3; - taicpu(hpfar1).opcode := A_UXTB; - taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg; - taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^); - taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR; - - if not Assigned(hp1) then - GetNextInstruction(p,hp1); - - RemoveCurrentP(p, hp1); - - result:=true; - exit; - end; - end; - - { 2-operald mov optimisations } - if (taicpu(p).ops = 2) then + ) do begin - { - This removes the mul from - mov rX,0 - ... - mul ...,rX,... + + { Change the common + mov r0, r0, lsr #xxx + and r0, r0, #yyy/bic r0, r0, #xxx + + and remove the superfluous and/bic if possible + + This could be extended to handle more cases. } - if (taicpu(p).oper[1]^.typ = top_const) then - begin -(* if false and - (taicpu(p).oper[1]^.val=0) and - MatchInstruction(hpfar1, [A_MUL,A_MLA], [taicpu(p).condition], [taicpu(p).oppostfix]) and - (((taicpu(hpfar1).oper[1]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^)) or - ((taicpu(hpfar1).oper[2]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[2]^))) then - begin - TransferUsedRegs(TmpUsedRegs); - UpdateUsedRegs(TmpUsedRegs, tai(p.next)); - UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next)); - DebugMsg('Peephole Optimization: MovMUL/MLA2Mov0 done', p); - if taicpu(hpfar1).opcode=A_MUL then - taicpu(hpfar1).loadconst(1,0) - else - taicpu(hpfar1).loadreg(1,taicpu(hpfar1).oper[3]^.reg); - taicpu(hpfar1).ops:=2; - taicpu(hpfar1).opcode:=A_MOV; - if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then - RemoveCurrentP(p); - Result:=true; - exit; - end - else*) if (taicpu(p).oper[1]^.val=0) and - MatchInstruction(hpfar1, A_MLA, [taicpu(p).condition], [taicpu(p).oppostfix]) and - MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[3]^) then - begin - TransferUsedRegs(TmpUsedRegs); - UpdateUsedRegs(TmpUsedRegs, tai(p.next)); - UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next)); - DebugMsg('Peephole Optimization: MovMLA2MUL 1 done', p); - taicpu(hpfar1).ops:=3; - taicpu(hpfar1).opcode:=A_MUL; - if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then - begin - RemoveCurrentP(p); - Result:=true; - end; - exit; - end - { - This changes the very common - mov r0, #0 - str r0, [...] - mov r0, #0 - str r0, [...] - and removes all superfluous mov instructions - } - else if (taicpu(hpfar1).opcode=A_STR) then - begin - hp1 := hpfar1; - while MatchInstruction(hp1, A_STR, [taicpu(p).condition], []) and - MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^) and - GetNextInstruction(hp1, hp2) and - MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and - (taicpu(hp2).ops = 2) and - MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and - MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) do - begin - DebugMsg('Peephole Optimization: MovStrMov done', hp2); - GetNextInstruction(hp2,hp1); - asml.remove(hp2); - hp2.free; - result:=true; - if not assigned(hp1) then break; - end; + { Change + mov rx, ry, lsr/ror #xxx + uxtb/uxth rz,rx/and rz,rx,0xFF + dealloc rx - if Result then - Exit; - end; - end; - { - This removes the first mov from - mov rX,... - mov rX,... - } - if taicpu(hpfar1).opcode=A_MOV then - begin - hp1 := p; - while MatchInstruction(hpfar1, A_MOV, [taicpu(hp1).condition], [taicpu(hp1).oppostfix]) and - (taicpu(hpfar1).ops = 2) and - MatchOperand(taicpu(hp1).oper[0]^, taicpu(hpfar1).oper[0]^) and - { don't remove the first mov if the second is a mov rX,rX } - not(MatchOperand(taicpu(hpfar1).oper[0]^, taicpu(hpfar1).oper[1]^)) do - begin - { Defer removing the first p until after the while loop } - if p <> hp1 then - begin - DebugMsg('Peephole Optimization: MovMov done', hp1); - asml.remove(hp1); - hp1.free; - end; - hp1:=hpfar1; - GetNextInstruction(hpfar1,hpfar1); - result:=true; - if not assigned(hpfar1) then - Break; - end; - - if Result then - begin - DebugMsg('Peephole Optimization: MovMov done', p); - RemoveCurrentp(p); - Exit; - end; - end; - - if RedundantMovProcess(p,hpfar1) then - begin - Result:=true; - { p might not point at a mov anymore } - exit; - end; - - { Fold the very common sequence - mov regA, regB - ldr* regA, [regA] to - ldr* regA, [regB] - CAUTION! If this one is successful p might not be a mov instruction anymore! + + uxtb/uxth rz,ry,ror #xxx } - if - // Make sure that Thumb code doesn't propagate a high register into a reference - ( - ( - GenerateThumbCode and - (getsupreg(taicpu(p).oper[1]^.reg) < RS_R8) - ) or (not GenerateThumbCode) - ) and - (taicpu(p).oper[1]^.typ = top_reg) and - (taicpu(p).oppostfix = PF_NONE) and - MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], []) and - (taicpu(hpfar1).oper[1]^.typ = top_ref) and - { We can change the base register only when the instruction uses AM_OFFSET } - ((taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) or - ((taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and - (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg)) - ) and - not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and + if (GenerateThumb2Code) and + (taicpu(p).ops=3) and + (taicpu(p).oper[2]^.typ = top_shifterop) and + (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and + (taicpu(p).oper[2]^.shifterop^.shiftmode in [SM_LSR,SM_ROR]) and RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then + begin + if MatchInstruction(hpfar1, A_UXTB, [C_None], [PF_None]) and + (taicpu(hpfar1).ops = 2) and + (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and + MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then + begin + taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg; + taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^); + taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR; + taicpu(hpfar1).ops := 3; + + if not Assigned(hp1) then + GetNextInstruction(p,hp1); + + RemoveCurrentP(p, hp1); + + result:=true; + exit; + end + else if MatchInstruction(hpfar1, A_UXTH, [C_None], [PF_None]) and + (taicpu(hpfar1).ops=2) and + (taicpu(p).oper[2]^.shifterop^.shiftimm in [16]) and + MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then + begin + taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg; + taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^); + taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR; + taicpu(hpfar1).ops := 3; + + if not Assigned(hp1) then + GetNextInstruction(p,hp1); + + RemoveCurrentP(p, hp1); + + result:=true; + exit; + end + else if MatchInstruction(hpfar1, A_AND, [C_None], [PF_None]) and + (taicpu(hpfar1).ops = 3) and + (taicpu(hpfar1).oper[2]^.typ = top_const) and + (taicpu(hpfar1).oper[2]^.val = $FF) and + (taicpu(p).oper[2]^.shifterop^.shiftimm in [8,16,24]) and + MatchOperand(taicpu(hpfar1).oper[1]^, taicpu(p).oper[0]^.reg) then + begin + taicpu(hpfar1).ops := 3; + taicpu(hpfar1).opcode := A_UXTB; + taicpu(hpfar1).oper[1]^.reg := taicpu(p).oper[1]^.reg; + taicpu(hpfar1).loadshifterop(2,taicpu(p).oper[2]^.shifterop^); + taicpu(hpfar1).oper[2]^.shifterop^.shiftmode:=SM_ROR; + + if not Assigned(hp1) then + GetNextInstruction(p,hp1); + + RemoveCurrentP(p, hp1); + + result:=true; + exit; + end; + end; + + { 2-operald mov optimisations } + if (taicpu(p).ops = 2) then begin - DebugMsg('Peephole Optimization: MovLdr2Ldr done', hpfar1); - if (taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and - (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then - taicpu(hpfar1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg; - - if taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then - taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg; - - dealloc:=FindRegDeAlloc(taicpu(p).oper[1]^.reg, tai(p.Next)); - if Assigned(dealloc) then + { + This removes the mul from + mov rX,0 + ... + mul ...,rX,... + } + if (taicpu(p).oper[1]^.typ = top_const) then begin - asml.remove(dealloc); - asml.InsertAfter(dealloc,hpfar1); - end; - - if not Assigned(hp1) then - GetNextInstruction(p, hp1); - - RemoveCurrentP(p, hp1); - - result:=true; - Exit; - end - end - - { 3-operald mov optimisations } - else if (taicpu(p).ops = 3) then - begin - - if (taicpu(p).oper[2]^.typ = top_shifterop) and - (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and - (taicpu(p).oper[2]^.shifterop^.shiftmode = SM_LSR) and - (taicpu(hpfar1).ops>=1) and - (taicpu(hpfar1).oper[0]^.typ=top_reg) and - (not RegModifiedBetween(taicpu(hpfar1).oper[0]^.reg, p, hpfar1)) and - RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then - begin - if (taicpu(p).oper[2]^.shifterop^.shiftimm >= 24 ) and - MatchInstruction(hpfar1, A_AND, [taicpu(p).condition], [taicpu(p).oppostfix]) and - (taicpu(hpfar1).ops=3) and - MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and - (taicpu(hpfar1).oper[2]^.typ = top_const) and - { Check if the AND actually would only mask out bits being already zero because of the shift - } - ((($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm) and taicpu(hpfar1).oper[2]^.val) = - ($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm)) then - begin - DebugMsg('Peephole Optimization: LsrAnd2Lsr done', hpfar1); - taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg; - asml.remove(hpfar1); - hpfar1.free; - result:=true; - Exit; - end - else if MatchInstruction(hpfar1, A_BIC, [taicpu(p).condition], [taicpu(p).oppostfix]) and - (taicpu(hpfar1).ops=3) and - MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and - (taicpu(hpfar1).oper[2]^.typ = top_const) and - { Check if the BIC actually would only mask out bits beeing already zero because of the shift } - (taicpu(hpfar1).oper[2]^.val<>0) and - (BsfDWord(taicpu(hpfar1).oper[2]^.val)>=32-taicpu(p).oper[2]^.shifterop^.shiftimm) then - begin - DebugMsg('Peephole Optimization: LsrBic2Lsr done', hpfar1); - taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg; - asml.remove(hpfar1); - hpfar1.free; - result:=true; - Exit; - end; - end; - { This folds shifterops into following instructions - mov r0, r1, lsl #8 - add r2, r3, r0 - - to - - add r2, r3, r1, lsl #8 - CAUTION! If this one is successful p might not be a mov instruction anymore! - } - if (taicpu(p).oper[1]^.typ = top_reg) and - (taicpu(p).oper[2]^.typ = top_shifterop) and - (taicpu(p).oppostfix = PF_NONE) and - MatchInstruction(hpfar1, [A_ADD, A_ADC, A_RSB, A_RSC, A_SUB, A_SBC, - A_AND, A_BIC, A_EOR, A_ORR, A_TEQ, A_TST, - A_CMP, A_CMN], - [taicpu(p).condition], [PF_None]) and - (not ((GenerateThumb2Code) and - (taicpu(hpfar1).opcode in [A_SBC]) and - (((taicpu(hpfar1).ops=3) and - MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^.reg)) or - ((taicpu(hpfar1).ops=2) and - MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^.reg))))) and - RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) and - (taicpu(hpfar1).ops >= 2) and - {Currently we can't fold into another shifterop} - (taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^.typ = top_reg) and - {Folding rrx is problematic because of the C-Flag, as we currently can't check - NR_DEFAULTFLAGS for modification} - ( - {Everything is fine if we don't use RRX} - (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) or - ( - {If it is RRX, then check if we're just accessing the next instruction} - Assigned(hp1) and - (hpfar1 = hp1) - ) - ) and - { reg1 might not be modified inbetween } - not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and - { The shifterop can contain a register, might not be modified} - ( - (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) or - not(RegModifiedBetween(taicpu(p).oper[2]^.shifterop^.rs, p, hpfar1)) - ) and - ( - {Only ONE of the two src operands is allowed to match} - MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-2]^) xor - MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^) - ) then - begin - if taicpu(hpfar1).opcode in [A_TST, A_TEQ, A_CMN] then - I2:=0 - else - I2:=1; - for I:=I2 to taicpu(hpfar1).ops-1 do - if MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[I]^.reg) then - begin - { If the parameter matched on the second op from the RIGHT - we have to switch the parameters, this will not happen for CMP - were we're only evaluating the most right parameter - } - if I <> taicpu(hpfar1).ops-1 then - begin - {The SUB operators need to be changed when we swap parameters} - case taicpu(hpfar1).opcode of - A_SUB: tempop:=A_RSB; - A_SBC: tempop:=A_RSC; - A_RSB: tempop:=A_SUB; - A_RSC: tempop:=A_SBC; - else tempop:=taicpu(hpfar1).opcode; - end; - if taicpu(hpfar1).ops = 3 then - hp2:=taicpu.op_reg_reg_reg_shifterop(tempop, - taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[2]^.reg, - taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^) - else - hp2:=taicpu.op_reg_reg_shifterop(tempop, - taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg, - taicpu(p).oper[2]^.shifterop^); - end - else - if taicpu(hpfar1).ops = 3 then - hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hpfar1).opcode, - taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[1]^.reg, - taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^) + (* if false and + (taicpu(p).oper[1]^.val=0) and + MatchInstruction(hpfar1, [A_MUL,A_MLA], [taicpu(p).condition], [taicpu(p).oppostfix]) and + (((taicpu(hpfar1).oper[1]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^)) or + ((taicpu(hpfar1).oper[2]^.typ=top_reg) and MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[2]^))) then + begin + TransferUsedRegs(TmpUsedRegs); + UpdateUsedRegs(TmpUsedRegs, tai(p.next)); + UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next)); + DebugMsg('Peephole Optimization: MovMUL/MLA2Mov0 done', p); + if taicpu(hpfar1).opcode=A_MUL then + taicpu(hpfar1).loadconst(1,0) else - hp2:=taicpu.op_reg_reg_shifterop(taicpu(hpfar1).opcode, - taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg, - taicpu(p).oper[2]^.shifterop^); - if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then - AllocRegBetween(taicpu(p).oper[2]^.shifterop^.rs,p,hpfar1,UsedRegs); - AllocRegBetween(taicpu(p).oper[1]^.reg,p,hpfar1,UsedRegs); - asml.insertbefore(hp2, hpfar1); - asml.remove(hpfar1); - hpfar1.free; - DebugMsg('Peephole Optimization: FoldShiftProcess done', hp2); + taicpu(hpfar1).loadreg(1,taicpu(hpfar1).oper[3]^.reg); + taicpu(hpfar1).ops:=2; + taicpu(hpfar1).opcode:=A_MOV; + if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then + RemoveCurrentP(p); + Result:=true; + exit; + end + else*) if (taicpu(p).oper[1]^.val=0) and + MatchInstruction(hpfar1, A_MLA, [taicpu(p).condition], [taicpu(p).oppostfix]) and + MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[3]^) then + begin + TransferUsedRegs(TmpUsedRegs); + UpdateUsedRegs(TmpUsedRegs, tai(p.next)); + UpdateUsedRegs(TmpUsedRegs, tai(hpfar1.next)); + DebugMsg('Peephole Optimization: MovMLA2MUL 1 done', p); + taicpu(hpfar1).ops:=3; + taicpu(hpfar1).opcode:=A_MUL; + if not(RegUsedAfterInstruction(taicpu(p).oper[0]^.reg,hpfar1,TmpUsedRegs)) then + begin + RemoveCurrentP(p); + Result:=true; + end; + exit; + end + { + This changes the very common + mov r0, #0 + str r0, [...] + mov r0, #0 + str r0, [...] + + and removes all superfluous mov instructions + } + else if (taicpu(hpfar1).opcode=A_STR) then + begin + hp1 := hpfar1; + while MatchInstruction(hp1, A_STR, [taicpu(p).condition], []) and + MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^) and + GetNextInstruction(hp1, hp2) and + MatchInstruction(hp2, A_MOV, [taicpu(p).condition], [PF_None]) and + (taicpu(hp2).ops = 2) and + MatchOperand(taicpu(hp2).oper[0]^, taicpu(p).oper[0]^) and + MatchOperand(taicpu(hp2).oper[1]^, taicpu(p).oper[1]^) do + begin + DebugMsg('Peephole Optimization: MovStrMov done', hp2); + GetNextInstruction(hp2,hp1); + asml.remove(hp2); + hp2.free; + result:=true; + if not assigned(hp1) then break; + end; + + if Result then + Exit; + end; + end; + { + This removes the first mov from + mov rX,... + mov rX,... + } + if taicpu(hpfar1).opcode=A_MOV then + begin + hp1 := p; + while MatchInstruction(hpfar1, A_MOV, [taicpu(hp1).condition], [taicpu(hp1).oppostfix]) and + (taicpu(hpfar1).ops = 2) and + MatchOperand(taicpu(hp1).oper[0]^, taicpu(hpfar1).oper[0]^) and + { don't remove the first mov if the second is a mov rX,rX } + not(MatchOperand(taicpu(hpfar1).oper[0]^, taicpu(hpfar1).oper[1]^)) do + begin + { Defer removing the first p until after the while loop } + if p <> hp1 then + begin + DebugMsg('Peephole Optimization: MovMov done', hp1); + asml.remove(hp1); + hp1.free; + end; + hp1:=hpfar1; + GetNextInstruction(hpfar1,hpfar1); + result:=true; + if not assigned(hpfar1) then + Break; + end; + + if Result then + begin + DebugMsg('Peephole Optimization: MovMov done', p); + RemoveCurrentp(p); + Exit; + end; + end; + + if RedundantMovProcess(p,hpfar1) then + begin + Result:=true; + { p might not point at a mov anymore } + exit; + end; + + { If hpfar1 is nil after the call to RedundantMovProcess, it is + because it would have become a dangling pointer, so reinitialise it. } + if not Assigned(hpfar1) then + Continue; + + { Fold the very common sequence + mov regA, regB + ldr* regA, [regA] + to + ldr* regA, [regB] + CAUTION! If this one is successful p might not be a mov instruction anymore! + } + if + // Make sure that Thumb code doesn't propagate a high register into a reference + ( + ( + GenerateThumbCode and + (getsupreg(taicpu(p).oper[1]^.reg) < RS_R8) + ) or (not GenerateThumbCode) + ) and + (taicpu(p).oper[1]^.typ = top_reg) and + (taicpu(p).oppostfix = PF_NONE) and + MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], []) and + (taicpu(hpfar1).oper[1]^.typ = top_ref) and + { We can change the base register only when the instruction uses AM_OFFSET } + ((taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) or + ((taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and + (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg)) + ) and + not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and + RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then + begin + DebugMsg('Peephole Optimization: MovLdr2Ldr done', hpfar1); + if (taicpu(hpfar1).oper[1]^.ref^.addressmode = AM_OFFSET) and + (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then + taicpu(hpfar1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg; + + if taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then + taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg; + + dealloc:=FindRegDeAlloc(taicpu(p).oper[1]^.reg, tai(p.Next)); + if Assigned(dealloc) then + begin + asml.remove(dealloc); + asml.InsertAfter(dealloc,hpfar1); + end; if not Assigned(hp1) then - GetNextInstruction(p, hp1) - else if hp1 = hpfar1 then - { If hp1 = hpfar1, then it's a dangling pointer } - hp1 := hp2; + GetNextInstruction(p, hp1); RemoveCurrentP(p, hp1); - Result:=true; - Exit; - end; - end; - { - Fold - mov r1, r1, lsl #2 - ldr/ldrb r0, [r0, r1] - to - ldr/ldrb r0, [r0, r1, lsl #2] - XXX: This still needs some work, as we quite often encounter something like - mov r1, r2, lsl #2 - add r2, r3, #imm - ldr r0, [r2, r1] - which can't be folded because r2 is overwritten between the shift and the ldr. - We could try to shuffle the registers around and fold it into. - add r1, r3, #imm - ldr r0, [r1, r2, lsl #2] - } - if (not(GenerateThumbCode)) and - { thumb2 allows only lsl #0..#3 } - (not(GenerateThumb2Code) or - ((taicpu(p).oper[2]^.shifterop^.shiftimm in [0..3]) and - (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL) - ) - ) and - (taicpu(p).oper[1]^.typ = top_reg) and - (taicpu(p).oper[2]^.typ = top_shifterop) and - { RRX is tough to handle, because it requires tracking the C-Flag, - it is also extremly unlikely to be emitted this way} - (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) and - (taicpu(p).oper[2]^.shifterop^.shiftimm <> 0) and - (taicpu(p).oppostfix = PF_NONE) and - {Only LDR, LDRB, STR, STRB can handle scaled register indexing} - (MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B]) or - (GenerateThumb2Code and - MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B, PF_SB, PF_H, PF_SH])) - ) and - ( - {If this is address by offset, one of the two registers can be used} - ((taicpu(hpfar1).oper[1]^.ref^.addressmode=AM_OFFSET) and + result:=true; + Exit; + end + end + + { 3-operald mov optimisations } + else if (taicpu(p).ops = 3) then + begin + + if (taicpu(p).oper[2]^.typ = top_shifterop) and + (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) and + (taicpu(p).oper[2]^.shifterop^.shiftmode = SM_LSR) and + (taicpu(hpfar1).ops>=1) and + (taicpu(hpfar1).oper[0]^.typ=top_reg) and + (not RegModifiedBetween(taicpu(hpfar1).oper[0]^.reg, p, hpfar1)) and + RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then + begin + if (taicpu(p).oper[2]^.shifterop^.shiftimm >= 24 ) and + MatchInstruction(hpfar1, A_AND, [taicpu(p).condition], [taicpu(p).oppostfix]) and + (taicpu(hpfar1).ops=3) and + MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and + (taicpu(hpfar1).oper[2]^.typ = top_const) and + { Check if the AND actually would only mask out bits being already zero because of the shift + } + ((($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm) and taicpu(hpfar1).oper[2]^.val) = + ($ffffffff shr taicpu(p).oper[2]^.shifterop^.shiftimm)) then + begin + DebugMsg('Peephole Optimization: LsrAnd2Lsr done', hpfar1); + taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg; + asml.remove(hpfar1); + hpfar1.free; + result:=true; + Exit; + end + else if MatchInstruction(hpfar1, A_BIC, [taicpu(p).condition], [taicpu(p).oppostfix]) and + (taicpu(hpfar1).ops=3) and + MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^) and + (taicpu(hpfar1).oper[2]^.typ = top_const) and + { Check if the BIC actually would only mask out bits beeing already zero because of the shift } + (taicpu(hpfar1).oper[2]^.val<>0) and + (BsfDWord(taicpu(hpfar1).oper[2]^.val)>=32-taicpu(p).oper[2]^.shifterop^.shiftimm) then + begin + DebugMsg('Peephole Optimization: LsrBic2Lsr done', hpfar1); + taicpu(p).oper[0]^.reg:=taicpu(hpfar1).oper[0]^.reg; + asml.remove(hpfar1); + hpfar1.free; + result:=true; + Exit; + end; + end; + { This folds shifterops into following instructions + mov r0, r1, lsl #8 + add r2, r3, r0 + + to + + add r2, r3, r1, lsl #8 + CAUTION! If this one is successful p might not be a mov instruction anymore! + } + if (taicpu(p).oper[1]^.typ = top_reg) and + (taicpu(p).oper[2]^.typ = top_shifterop) and + (taicpu(p).oppostfix = PF_NONE) and + MatchInstruction(hpfar1, [A_ADD, A_ADC, A_RSB, A_RSC, A_SUB, A_SBC, + A_AND, A_BIC, A_EOR, A_ORR, A_TEQ, A_TST, + A_CMP, A_CMN], + [taicpu(p).condition], [PF_None]) and + (not ((GenerateThumb2Code) and + (taicpu(hpfar1).opcode in [A_SBC]) and + (((taicpu(hpfar1).ops=3) and + MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[1]^.reg)) or + ((taicpu(hpfar1).ops=2) and + MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[0]^.reg))))) and + RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) and + (taicpu(hpfar1).ops >= 2) and + {Currently we can't fold into another shifterop} + (taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^.typ = top_reg) and + {Folding rrx is problematic because of the C-Flag, as we currently can't check + NR_DEFAULTFLAGS for modification} ( - (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) xor - (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) - ) - ) or - {For post and preindexed only the index register can be used} - ((taicpu(hpfar1).oper[1]^.ref^.addressmode in [AM_POSTINDEXED, AM_PREINDEXED]) and - ( - (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) and - (taicpu(hpfar1).oper[1]^.ref^.base <> taicpu(p).oper[0]^.reg) + {Everything is fine if we don't use RRX} + (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) or + ( + {If it is RRX, then check if we're just accessing the next instruction} + Assigned(hp1) and + (hpfar1 = hp1) + ) ) and - (not GenerateThumb2Code) - ) - ) and - { Only fold if both registers are used. Otherwise we are folding p with itself } - (taicpu(hpfar1).oper[1]^.ref^.index<>NR_NO) and - (taicpu(hpfar1).oper[1]^.ref^.base<>NR_NO) and - { Only fold if there isn't another shifterop already, and offset is zero. } - (taicpu(hpfar1).oper[1]^.ref^.offset = 0) and - (taicpu(hpfar1).oper[1]^.ref^.shiftmode = SM_None) and - not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and - RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then - begin - { If the register we want to do the shift for resides in base, we need to swap that} - if (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then - taicpu(hpfar1).oper[1]^.ref^.base := taicpu(hpfar1).oper[1]^.ref^.index; - taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg; - taicpu(hpfar1).oper[1]^.ref^.shiftmode := taicpu(p).oper[2]^.shifterop^.shiftmode; - taicpu(hpfar1).oper[1]^.ref^.shiftimm := taicpu(p).oper[2]^.shifterop^.shiftimm; - DebugMsg('Peephole Optimization: FoldShiftLdrStr done', hpfar1); - RemoveCurrentP(p); - Result:=true; - Exit; - end; + { reg1 might not be modified inbetween } + not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and + { The shifterop can contain a register, might not be modified} + ( + (taicpu(p).oper[2]^.shifterop^.rs = NR_NO) or + not(RegModifiedBetween(taicpu(p).oper[2]^.shifterop^.rs, p, hpfar1)) + ) and + ( + {Only ONE of the two src operands is allowed to match} + MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-2]^) xor + MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[taicpu(hpfar1).ops-1]^) + ) then + begin + if taicpu(hpfar1).opcode in [A_TST, A_TEQ, A_CMN] then + I2:=0 + else + I2:=1; + for I:=I2 to taicpu(hpfar1).ops-1 do + if MatchOperand(taicpu(p).oper[0]^, taicpu(hpfar1).oper[I]^.reg) then + begin + { If the parameter matched on the second op from the RIGHT + we have to switch the parameters, this will not happen for CMP + were we're only evaluating the most right parameter + } + if I <> taicpu(hpfar1).ops-1 then + begin + {The SUB operators need to be changed when we swap parameters} + case taicpu(hpfar1).opcode of + A_SUB: tempop:=A_RSB; + A_SBC: tempop:=A_RSC; + A_RSB: tempop:=A_SUB; + A_RSC: tempop:=A_SBC; + else tempop:=taicpu(hpfar1).opcode; + end; + if taicpu(hpfar1).ops = 3 then + hp2:=taicpu.op_reg_reg_reg_shifterop(tempop, + taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[2]^.reg, + taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^) + else + hp2:=taicpu.op_reg_reg_shifterop(tempop, + taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg, + taicpu(p).oper[2]^.shifterop^); + end + else + if taicpu(hpfar1).ops = 3 then + hp2:=taicpu.op_reg_reg_reg_shifterop(taicpu(hpfar1).opcode, + taicpu(hpfar1).oper[0]^.reg, taicpu(hpfar1).oper[1]^.reg, + taicpu(p).oper[1]^.reg, taicpu(p).oper[2]^.shifterop^) + else + hp2:=taicpu.op_reg_reg_shifterop(taicpu(hpfar1).opcode, + taicpu(hpfar1).oper[0]^.reg, taicpu(p).oper[1]^.reg, + taicpu(p).oper[2]^.shifterop^); + if taicpu(p).oper[2]^.shifterop^.rs<>NR_NO then + AllocRegBetween(taicpu(p).oper[2]^.shifterop^.rs,p,hpfar1,UsedRegs); + AllocRegBetween(taicpu(p).oper[1]^.reg,p,hpfar1,UsedRegs); + asml.insertbefore(hp2, hpfar1); + asml.remove(hpfar1); + hpfar1.free; + DebugMsg('Peephole Optimization: FoldShiftProcess done', hp2); + + if not Assigned(hp1) then + GetNextInstruction(p, hp1) + else if hp1 = hpfar1 then + { If hp1 = hpfar1, then it's a dangling pointer } + hp1 := hp2; + + RemoveCurrentP(p, hp1); + Result:=true; + Exit; + end; + end; + { + Fold + mov r1, r1, lsl #2 + ldr/ldrb r0, [r0, r1] + to + ldr/ldrb r0, [r0, r1, lsl #2] + + XXX: This still needs some work, as we quite often encounter something like + mov r1, r2, lsl #2 + add r2, r3, #imm + ldr r0, [r2, r1] + which can't be folded because r2 is overwritten between the shift and the ldr. + We could try to shuffle the registers around and fold it into. + add r1, r3, #imm + ldr r0, [r1, r2, lsl #2] + } + if (not(GenerateThumbCode)) and + { thumb2 allows only lsl #0..#3 } + (not(GenerateThumb2Code) or + ((taicpu(p).oper[2]^.shifterop^.shiftimm in [0..3]) and + (taicpu(p).oper[2]^.shifterop^.shiftmode=SM_LSL) + ) + ) and + (taicpu(p).oper[1]^.typ = top_reg) and + (taicpu(p).oper[2]^.typ = top_shifterop) and + { RRX is tough to handle, because it requires tracking the C-Flag, + it is also extremly unlikely to be emitted this way} + (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) and + (taicpu(p).oper[2]^.shifterop^.shiftimm <> 0) and + (taicpu(p).oppostfix = PF_NONE) and + {Only LDR, LDRB, STR, STRB can handle scaled register indexing} + (MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B]) or + (GenerateThumb2Code and + MatchInstruction(hpfar1, [A_LDR, A_STR], [taicpu(p).condition], [PF_None, PF_B, PF_SB, PF_H, PF_SH])) + ) and + ( + {If this is address by offset, one of the two registers can be used} + ((taicpu(hpfar1).oper[1]^.ref^.addressmode=AM_OFFSET) and + ( + (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) xor + (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) + ) + ) or + {For post and preindexed only the index register can be used} + ((taicpu(hpfar1).oper[1]^.ref^.addressmode in [AM_POSTINDEXED, AM_PREINDEXED]) and + ( + (taicpu(hpfar1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) and + (taicpu(hpfar1).oper[1]^.ref^.base <> taicpu(p).oper[0]^.reg) + ) and + (not GenerateThumb2Code) + ) + ) and + { Only fold if both registers are used. Otherwise we are folding p with itself } + (taicpu(hpfar1).oper[1]^.ref^.index<>NR_NO) and + (taicpu(hpfar1).oper[1]^.ref^.base<>NR_NO) and + { Only fold if there isn't another shifterop already, and offset is zero. } + (taicpu(hpfar1).oper[1]^.ref^.offset = 0) and + (taicpu(hpfar1).oper[1]^.ref^.shiftmode = SM_None) and + not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hpfar1)) and + RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hpfar1)) then + begin + { If the register we want to do the shift for resides in base, we need to swap that} + if (taicpu(hpfar1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) then + taicpu(hpfar1).oper[1]^.ref^.base := taicpu(hpfar1).oper[1]^.ref^.index; + taicpu(hpfar1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg; + taicpu(hpfar1).oper[1]^.ref^.shiftmode := taicpu(p).oper[2]^.shifterop^.shiftmode; + taicpu(hpfar1).oper[1]^.ref^.shiftimm := taicpu(p).oper[2]^.shifterop^.shiftimm; + DebugMsg('Peephole Optimization: FoldShiftLdrStr done', hpfar1); + RemoveCurrentP(p); + Result:=true; + Exit; + end; + end; + { + Often we see shifts and then a superfluous mov to another register + In the future this might be handled in RedundantMovProcess when it uses RegisterTracking + } + if RemoveSuperfluousMove(p, hpfar1, 'MovMov2Mov') then + Result:=true; + + Exit; end; - { - Often we see shifts and then a superfluous mov to another register - In the future this might be handled in RedundantMovProcess when it uses RegisterTracking - } - if RemoveSuperfluousMove(p, hpfar1, 'MovMov2Mov') then - Result:=true; end; diff --git a/compiler/armgen/aoptarm.pas b/compiler/armgen/aoptarm.pas index 99d6512805..2581f34a5b 100644 --- a/compiler/armgen/aoptarm.pas +++ b/compiler/armgen/aoptarm.pas @@ -40,7 +40,7 @@ Type procedure DebugMsg(const s : string; p : tai); function RemoveSuperfluousMove(const p: tai; movp: tai; const optimizer: string): boolean; - function RedundantMovProcess(var p: tai; hp1: tai): boolean; + function RedundantMovProcess(var p: tai; var hp1: tai): boolean; function GetNextInstructionUsingReg(Current: tai; out Next: tai; reg: TRegister): Boolean; function OptPass1UXTB(var p: tai): Boolean; @@ -292,10 +292,10 @@ Implementation end; - function TARMAsmOptimizer.RedundantMovProcess(var p: tai;hp1: tai):boolean; + function TARMAsmOptimizer.RedundantMovProcess(var p: tai; var hp1: tai):boolean; var I: Integer; - current_hp: tai; + current_hp, next_hp: tai; LDRChange: Boolean; begin Result:=false; @@ -390,80 +390,80 @@ Implementation TransferUsedRegs(TmpUsedRegs); { Search local instruction block } - while GetNextInstruction(current_hp, hp1) and (hp1 <> BlockEnd) and (hp1.typ = ait_instruction) do + while GetNextInstruction(current_hp, next_hp) and (next_hp <> BlockEnd) and (next_hp.typ = ait_instruction) do begin UpdateUsedRegs(TmpUsedRegs, tai(current_hp.Next)); LDRChange := False; - if (taicpu(hp1).opcode in [A_LDR,A_STR]) and (taicpu(hp1).ops = 2) then + if (taicpu(next_hp).opcode in [A_LDR,A_STR]) and (taicpu(next_hp).ops = 2) then begin { Change the registers from r1 to r0 } - if (taicpu(hp1).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) and + if (taicpu(next_hp).oper[1]^.ref^.base = taicpu(p).oper[0]^.reg) and {$ifdef ARM} { This optimisation conflicts with something and raises an access violation - needs further investigation. [Kit] } - (taicpu(hp1).opcode <> A_LDR) and + (taicpu(next_hp).opcode <> A_LDR) and {$endif ARM} { Don't mess around with the base register if the reference is pre- or post-indexed } - (taicpu(hp1).oper[1]^.ref^.addressmode = AM_OFFSET) then + (taicpu(next_hp).oper[1]^.ref^.addressmode = AM_OFFSET) then begin - taicpu(hp1).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg; + taicpu(next_hp).oper[1]^.ref^.base := taicpu(p).oper[1]^.reg; LDRChange := True; end; - if taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then + if taicpu(next_hp).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg then begin - taicpu(hp1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg; + taicpu(next_hp).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg; LDRChange := True; end; if LDRChange then - DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 1)', hp1); + DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 1)', next_hp); { Drop out if we're dealing with pre-indexed references } - if (taicpu(hp1).oper[1]^.ref^.addressmode = AM_PREINDEXED) and + if (taicpu(next_hp).oper[1]^.ref^.addressmode = AM_PREINDEXED) and ( - RegInRef(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[1]^.ref^) or - RegInRef(taicpu(p).oper[1]^.reg, taicpu(hp1).oper[1]^.ref^) + RegInRef(taicpu(p).oper[0]^.reg, taicpu(next_hp).oper[1]^.ref^) or + RegInRef(taicpu(p).oper[1]^.reg, taicpu(next_hp).oper[1]^.ref^) ) then begin { Remember to update register allocations } if LDRChange then - AllocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, UsedRegs); + AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs); Break; end; { The register being stored can be potentially changed (as long as it's not the stack pointer) } - if (taicpu(hp1).opcode = A_STR) and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) and - MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg) then + if (taicpu(next_hp).opcode = A_STR) and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) and + MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[0]^.reg) then begin - DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 2)', hp1); - taicpu(hp1).oper[0]^.reg := taicpu(p).oper[1]^.reg; + DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovLdr2Ldr 2)', next_hp); + taicpu(next_hp).oper[0]^.reg := taicpu(p).oper[1]^.reg; LDRChange := True; end; if LDRChange and (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) then begin - AllocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, UsedRegs); + AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs); if (taicpu(p).oppostfix = PF_None) and ( ( - (taicpu(hp1).opcode = A_LDR) and - MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg) + (taicpu(next_hp).opcode = A_LDR) and + MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[0]^.reg) ) or - not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs) + not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, next_hp, TmpUsedRegs) ) and { Double-check to see if the old registers were actually changed (e.g. if the super registers matched, but not the sizes, they won't be changed). } ( - (taicpu(hp1).opcode = A_LDR) or - not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[0]^) + (taicpu(next_hp).opcode = A_LDR) or + not RegInOp(taicpu(p).oper[0]^.reg, taicpu(next_hp).oper[0]^) ) and - not RegInRef(taicpu(p).oper[0]^.reg, taicpu(hp1).oper[1]^.ref^) then + not RegInRef(taicpu(p).oper[0]^.reg, taicpu(next_hp).oper[1]^.ref^) then begin DebugMsg('Peephole Optimization: RedundantMovProcess 2a done', p); RemoveCurrentP(p); @@ -472,23 +472,28 @@ Implementation end; end; end - else if (taicpu(hp1).opcode = A_MOV) and (taicpu(hp1).oppostfix = PF_None) and - (taicpu(hp1).ops = 2) then + else if (taicpu(next_hp).opcode = A_MOV) and (taicpu(next_hp).oppostfix = PF_None) and + (taicpu(next_hp).ops = 2) then begin - if MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[0]^.reg) then + if MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[0]^.reg) then begin { Found another mov that writes entirely to the register } - if RegUsedBetween(taicpu(p).oper[0]^.reg, p, hp1) then + if RegUsedBetween(taicpu(p).oper[0]^.reg, p, next_hp) then begin { Register was used beforehand } - if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[1]^.reg) then + if MatchOperand(taicpu(next_hp).oper[1]^, taicpu(p).oper[1]^.reg) then begin { This MOV is exactly the same as the first one. Since none of the registers have changed value at this point, we can remove it. } - DebugMsg('Peephole Optimization: RedundantMovProcess 3a done', hp1); - asml.Remove(hp1); - hp1.Free; + DebugMsg('Peephole Optimization: RedundantMovProcess 3a done', next_hp); + + if (next_hp = hp1) then + { Don't let hp1 become a dangling pointer } + hp1 := nil; + + asml.Remove(next_hp); + next_hp.Free; { We still have the original p, so we can continue optimising; if it was -O2 or below, this instruction appeared immediately @@ -504,7 +509,7 @@ Implementation { We can delete the first MOV (only if the second MOV is unconditional) } {$ifdef ARM} if (taicpu(p).oppostfix = PF_None) and - (taicpu(hp1).condition = C_None) then + (taicpu(next_hp).condition = C_None) then {$endif ARM} begin DebugMsg('Peephole Optimization: RedundantMovProcess 2b done', p); @@ -513,9 +518,9 @@ Implementation end; Exit; end - else if MatchOperand(taicpu(hp1).oper[1]^, taicpu(p).oper[0]^.reg) then + else if MatchOperand(taicpu(next_hp).oper[1]^, taicpu(p).oper[0]^.reg) then begin - if MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) + if MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[1]^.reg) { Be careful - if the entire register is not used, removing this instruction will leave the unused part uninitialised } {$ifdef AARCH64} @@ -524,9 +529,14 @@ Implementation then begin { Instruction will become mov r1,r1 } - DebugMsg('Peephole Optimization: Mov2None 2 done', hp1); - asml.Remove(hp1); - hp1.Free; + DebugMsg('Peephole Optimization: Mov2None 2 done', next_hp); + + if (next_hp = hp1) then + { Don't let hp1 become a dangling pointer } + hp1 := nil; + + asml.Remove(next_hp); + next_hp.Free; Continue; end; @@ -534,12 +544,12 @@ Implementation forces it to be left alone if the full register is not used, lest mov w1,w1 gets optimised out by mistake. [Kit] } {$ifdef AARCH64} - if not MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) then + if not MatchOperand(taicpu(next_hp).oper[0]^, taicpu(p).oper[1]^.reg) then {$endif AARCH64} begin - DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovMov2Mov 2)', hp1); - taicpu(hp1).oper[1]^.reg := taicpu(p).oper[1]^.reg; - AllocRegBetween(taicpu(p).oper[1]^.reg, p, hp1, UsedRegs); + DebugMsg('Peephole Optimization: ' + std_regname(taicpu(p).oper[0]^.reg) + ' = ' + std_regname(taicpu(p).oper[1]^.reg) + ' (MovMov2Mov 2)', next_hp); + taicpu(next_hp).oper[1]^.reg := taicpu(p).oper[1]^.reg; + AllocRegBetween(taicpu(p).oper[1]^.reg, p, next_hp, UsedRegs); { If this was the only reference to the old register, then we can remove the original MOV now } @@ -551,7 +561,7 @@ Implementation register). [Kit] } (getsupreg(taicpu(p).oper[1]^.reg) <> RS_STACK_POINTER_REG) and RegInUsedRegs(taicpu(p).oper[0]^.reg, UsedRegs) and - not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, hp1, TmpUsedRegs) then + not RegUsedAfterInstruction(taicpu(p).oper[0]^.reg, next_hp, TmpUsedRegs) then begin DebugMsg('Peephole Optimization: RedundantMovProcess 2c done', p); RemoveCurrentP(p); @@ -565,14 +575,14 @@ Implementation { On low optimisation settions, don't search more than one instruction ahead } if not(cs_opt_level3 in current_settings.optimizerswitches) or { Stop at procedure calls and jumps } - is_calljmp(taicpu(hp1).opcode) or + is_calljmp(taicpu(next_hp).opcode) or { If the read register has changed value, or the MOV destination register has been used, drop out } - RegInInstruction(taicpu(p).oper[0]^.reg, hp1) or - RegModifiedByInstruction(taicpu(p).oper[1]^.reg, hp1) then + RegInInstruction(taicpu(p).oper[0]^.reg, next_hp) or + RegModifiedByInstruction(taicpu(p).oper[1]^.reg, next_hp) then Break; - current_hp := hp1; + current_hp := next_hp; end; end; end;