From 096d5f50b21f8d4fab941c73982959109fb92631 Mon Sep 17 00:00:00 2001 From: "J. Gareth \"Curious Kit\" Moreton" Date: Tue, 14 May 2024 02:22:00 +0100 Subject: [PATCH] * arm / a64: New optimisation for removing the number of necessary S/UXTB/H instructions --- compiler/armgen/aoptarm.pas | 128 ++++++++++++++++++++++++++++++++++++ 1 file changed, 128 insertions(+) diff --git a/compiler/armgen/aoptarm.pas b/compiler/armgen/aoptarm.pas index eecdb5152d..a67ff08543 100644 --- a/compiler/armgen/aoptarm.pas +++ b/compiler/armgen/aoptarm.pas @@ -60,6 +60,9 @@ Type function OptPass2Bitwise(var p: tai): Boolean; function OptPass2TST(var p: tai): Boolean; + + protected + function DoXTArithOp(var p: tai; hp1: tai): Boolean; End; function MatchInstruction(const instr: tai; const op: TCommonAsmOps; const cond: TAsmConds; const postfix: TOpPostfixes): boolean; @@ -675,6 +678,123 @@ Implementation end; + function TARMAsmOptimizer.DoXTArithOp(var p: tai; hp1: tai): Boolean; + var + hp2: tai; + ConstLimit: TCGInt; + ValidPostFixes: TOpPostFixes; + FirstCode, SecondCode, ThirdCode, FourthCode: TAsmOp; + begin + Result := False; + { Change: + uxtb/h reg1,reg1 + (operation on reg1 with immediate operand where the upper 24/56 + bits don't affect the state of the first 8 bits ) + uxtb/h reg1,reg1 + + Remove first uxtb/h + } + case taicpu(p).opcode of + A_UXTB, + A_SXTB: + begin + ConstLimit := $FF; + ValidPostFixes := [PF_B]; + FirstCode := A_UXTB; + SecondCode := A_SXTB; + ThirdCode := A_UXTB; { Used to indicate no other valid codes } + FourthCode := A_SXTB; + end; + A_UXTH, + A_SXTH: + begin + ConstLimit := $FFFF; + ValidPostFixes := [PF_B, PF_H]; + FirstCode := A_UXTH; + SecondCode := A_SXTH; + ThirdCode := A_UXTB; + FourthCode := A_SXTB; + end; + else + InternalError(2024051401); + end; + +{$ifndef AARCH64} + { Regular ARM doesn't have the multi-instruction MatchInstruction available } + if (hp1.typ = ait_instruction) and (taicpu(hp1).oppostfix = PF_None) then + case taicpu(hp1).opcode of + A_ADD, A_SUB, A_MUL, A_LSL, A_AND, A_ORR, A_EOR, A_BIC, A_ORN: +{$endif AARCH64} + + if + (taicpu(p).oper[1]^.reg = taicpu(p).oper[0]^.reg) and +{$ifdef AARCH64} + MatchInstruction(hp1, [A_ADD, A_SUB, A_MUL, A_LSL, A_AND, A_ORR, A_EOR, A_BIC, A_ORN, A_EON], [PF_None]) and +{$endif AARCH64} + (taicpu(hp1).condition = C_None) and + (taicpu(hp1).ops = 3) and + (taicpu(hp1).oper[0]^.reg = taicpu(p).oper[0]^.reg) and + (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[0]^.reg) and + (taicpu(hp1).oper[2]^.typ = top_const) and + ( + ( + { If the AND immediate is 8-bit, then this essentially performs + the functionality of the second UXTB and so its presence is + not required } + (taicpu(hp1).opcode = A_AND) and + (taicpu(hp1).oper[2]^.val >= 0) and + (taicpu(hp1).oper[2]^.val <= ConstLimit) + ) or + ( + GetNextInstructionUsingReg(hp1,hp2,taicpu(p).oper[0]^.reg) and + (hp2.typ = ait_instruction) and + (taicpu(hp2).ops = 2) and + (taicpu(hp2).condition = C_None) and + ( + ( + (taicpu(hp2).opcode in [FirstCode, SecondCode, ThirdCode, FourthCode]) and + (taicpu(hp2).oppostfix = PF_None) and + (taicpu(hp2).oper[1]^.reg = taicpu(p).oper[0]^.reg) + { Destination is allowed to be different in this case, but + only if the source is no longer in use (it being the same as + the source is covered by RegEndOfLife as well) } + ) or + ( + { STRB essentially fills the same role as the second UXTB + as long as the register is deallocated afterwards } + MatchInstruction(hp2, A_STR, [C_None], ValidPostFixes) and + (taicpu(hp2).oper[0]^.reg = taicpu(p).oper[0]^.reg) and + not RegInOp(taicpu(p).oper[0]^.reg, taicpu(hp2).oper[1]^) + ) + ) and + RegEndOfLife(taicpu(p).oper[0]^.reg, taicpu(hp2)) + ) + ) then + begin + DebugMsg(SPeepholeOptimization + 'S/Uxtb/hArithUxtb/h2ArithS/Uxtb/h done', p); + Result := RemoveCurrentP(p); + + { Simplify bitwise constants if able } +{$ifdef AARCH64} + if (taicpu(hp1).opcode in [A_AND, A_ORR, A_EOR, A_BIC, A_ORN, A_EON]) and + is_shifter_const(taicpu(hp1).oper[2]^.val and ConstLimit, OS_32) then +{$else AARCH64} + if ( + (ConstLimit = $FF) or + (taicpu(hp1).oper[2]^.val <= $100) + ) and + (taicpu(hp1).opcode in [A_AND, A_ORR, A_EOR, A_BIC, A_ORN]) then +{$endif AARCH64} + taicpu(hp1).oper[2]^.val := taicpu(hp1).oper[2]^.val and ConstLimit; + end; +{$ifndef AARCH64} + else + ; + end; +{$endif not AARCH64} + end; + + function TARMAsmOptimizer.OptPass1UXTB(var p : tai) : Boolean; var hp1, hp2: tai; @@ -773,6 +893,8 @@ Implementation taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg); result:=RemoveCurrentP(p); end + else if DoXTArithOp(p, hp1) then + Result:=true {$ifdef AARCH64} else if USxtOp2Op(p,hp1,SM_UXTB) then Result:=true @@ -862,6 +984,8 @@ Implementation taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg); result:=RemoveCurrentP(p); end + else if DoXTArithOp(p, hp1) then + Result:=true {$ifdef AARCH64} else if USxtOp2Op(p,hp1,SM_UXTH) then Result:=true @@ -974,6 +1098,8 @@ Implementation taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg); result:=RemoveCurrentP(p); end + else if DoXTArithOp(p, hp1) then + Result:=true {$ifdef AARCH64} else if USxtOp2Op(p,hp1,SM_SXTB) then Result:=true @@ -1094,6 +1220,8 @@ Implementation taicpu(hp1).loadReg(1,taicpu(p).oper[1]^.reg); result:=RemoveCurrentP(p); end + else if DoXTArithOp(p, hp1) then + Result:=true {$ifdef AARCH64} else if USxtOp2Op(p,hp1,SM_SXTH) then Result:=true