New ARM Peephole optimizer FoldShiftLdrStr

This one folds `mov r1, r2, lsl #2; ldr/ldrb r0, [r0, r1]` into `ldr/ldrb r0, [r0, r2, lsl #2]`. There is still some room for improvement: it might be better to do this before the register allocator runs, as we currently waste a register (r1 in the above example) in many cases. That would also allow us to fold more operations, because currently, if r2 gets reused between the mov and the ldr, we are not able to do the optimization. git-svn-id: trunk@23408 -
2025-08-08 14:25:57 +02:00 · 2013-01-16 14:37:28 +00:00 · 2013-01-16 14:37:28 +00:00 · fe520c215b
commit fe520c215b
parent e9615716c1
1 changed files with 44 additions and 0 deletions
--- a/compiler/arm/aoptcpu.pas
+++ b/compiler/arm/aoptcpu.pas
@ -1064,7 +1064,51 @@ Implementation
                              break;
                            end;
                      end;
+                    {
+                      Fold
+                        mov r1, r1, lsl #2
+                        ldr/ldrb r0, [r0, r1]
+                      to
+                        ldr/ldrb r0, [r0, r1, lsl #2]

+                      XXX: This still needs some work, as we quite often encounter something like
+                             mov r1, r2, lsl #2
+                             add r2, r3, #imm
+                             ldr r0, [r2, r1]
+                           which can't be folded because r2 is overwritten between the shift and the ldr.
+                           We could try to shuffle the registers around and fold it into:
+                             add r1, r3, #imm
+                             ldr r0, [r1, r2, lsl #2]
+                    }
+                    if (taicpu(p).opcode = A_MOV) and
+                       (taicpu(p).ops = 3) and
+                       (taicpu(p).oper[1]^.typ = top_reg) and
+                       (taicpu(p).oper[2]^.typ = top_shifterop) and
+                       { RRX is tough to handle, because it requires tracking the C-Flag,
+                         it is also extremely unlikely to be emitted this way }
+                       (taicpu(p).oper[2]^.shifterop^.shiftmode <> SM_RRX) and
+                       (taicpu(p).oper[2]^.shifterop^.shiftimm <> 0) and
+                       (taicpu(p).oppostfix = PF_NONE) and
+                       GetNextInstructionUsingReg(p, hp1, taicpu(p).oper[0]^.reg) and
+                       {Only LDR, LDRB, STR, STRB can handle scaled register indexing}
+                       MatchInstruction(hp1, [A_LDR, A_STR], [taicpu(p).condition],
+                                             [PF_None, PF_B]) and
+                       (taicpu(hp1).oper[1]^.ref^.index = taicpu(p).oper[0]^.reg) and
+                       (taicpu(hp1).oper[1]^.ref^.base <> taicpu(p).oper[0]^.reg) and
+                       { Only fold if there isn't another shifterop already. }
+                       (taicpu(hp1).oper[1]^.ref^.shiftmode = SM_None) and
+                       not(RegModifiedBetween(taicpu(p).oper[1]^.reg,p,hp1)) and
+                       (assigned(FindRegDealloc(taicpu(p).oper[0]^.reg,tai(hp1.Next))) or
+                         regLoadedWithNewValue(taicpu(p).oper[0]^.reg, hp1)) then
+                       begin
+                         DebugMsg('Peephole FoldShiftLdrStr done', hp1);
+                         taicpu(hp1).oper[1]^.ref^.index := taicpu(p).oper[1]^.reg;
+                         taicpu(hp1).oper[1]^.ref^.shiftmode := taicpu(p).oper[2]^.shifterop^.shiftmode;
+                         taicpu(hp1).oper[1]^.ref^.shiftimm := taicpu(p).oper[2]^.shifterop^.shiftimm;
+                         asml.remove(p);
+                         p.free;
+                         p:=hp1;
+                       end;
                    {
                      Often we see shifts and then a superfluous mov to another register
                      In the future this might be handled in RedundantMovProcess when it uses RegisterTracking