From 993144b91b468ebfe0cb0b2cfdbe93b8022e748b Mon Sep 17 00:00:00 2001
From: florian <florian@freepascal.org>
Date: Fri, 24 Jan 2020 21:10:17 +0000
Subject: [PATCH]   * patch by J. Gareth Moreton: x86 SUB and LEA
 optimisations, resolves #36622

git-svn-id: trunk@44030 -
---
 compiler/i386/aoptcpu.pas   |  2 +
 compiler/x86/aoptx86.pas    | 90 +++++++++++++++++++++++++++++++++++++
 compiler/x86_64/aoptcpu.pas |  2 +
 3 files changed, 94 insertions(+)

diff --git a/compiler/i386/aoptcpu.pas b/compiler/i386/aoptcpu.pas
index 35aacee47f..825c527c8d 100644
--- a/compiler/i386/aoptcpu.pas
+++ b/compiler/i386/aoptcpu.pas
@@ -242,6 +242,8 @@ unit aoptcpu;
                   Result:=OptPass2Jmp(p);
                 A_MOV:
                   Result:=OptPass2MOV(p);
+                A_SUB:
+                  Result:=OptPass2SUB(p);
                 else
                   ;
               end;
diff --git a/compiler/x86/aoptx86.pas b/compiler/x86/aoptx86.pas
index 9d4bea764e..5b14e8d6d1 100644
--- a/compiler/x86/aoptx86.pas
+++ b/compiler/x86/aoptx86.pas
@@ -82,6 +82,7 @@ unit aoptx86;
         function OptPass2Jmp(var p : tai) : boolean;
         function OptPass2Jcc(var p : tai) : boolean;
         function OptPass2Lea(var p: tai): Boolean;
+        function OptPass2SUB(var p: tai): Boolean;
 
         function PostPeepholeOptMov(var p : tai) : Boolean;
 {$ifdef x86_64} { These post-peephole optimisations only affect 64-bit registers. [Kit] }
@@ -3507,6 +3508,7 @@ unit aoptx86;
        end;
 
       var
+        NewRef: TReference;
        hp1,hp2,hp3: tai;
 {$ifndef x86_64}
        hp4: tai;
@@ -3535,6 +3537,50 @@ unit aoptx86;
               to the MOV instruction on this pass }
           end
         else if MatchOpType(taicpu(p),top_reg,top_reg) and
+          (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
+          MatchInstruction(hp1,A_ADD,A_SUB,[taicpu(p).opsize]) and
+          MatchOpType(taicpu(hp1),top_const,top_reg) and
+          (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
+          { be lazy, checking separately for sub would be slightly better }
+          (abs(taicpu(hp1).oper[0]^.val)<=$7fffffff) then
+          begin
+            { Change:
+                movl/q %reg1,%reg2      movl/q %reg1,%reg2
+                addl/q $x,%reg2         subl/q $x,%reg2
+              To:
+                leal/q x(%reg1),%reg2   leal/q -x(%reg1),%reg2
+            }
+            if not GetNextInstruction(hp1, hp2) or
+              { The FLAGS register isn't always tracked properly, so do not
+                perform this optimisation if a conditional statement follows }
+              not MatchInstruction(hp2, [A_Jcc, A_SETcc, A_CMOVcc], []) then
+              begin
+                reference_reset(NewRef, 1, []);
+                NewRef.base := taicpu(p).oper[0]^.reg;
+                NewRef.scalefactor := 1;
+
+                if taicpu(hp1).opcode = A_ADD then
+                  begin
+                    DebugMsg(SPeepholeOptimization + 'MovAdd2Lea', p);
+                    NewRef.offset := taicpu(hp1).oper[0]^.val;
+                  end
+                else
+                  begin
+                    DebugMsg(SPeepholeOptimization + 'MovSub2Lea', p);
+                    NewRef.offset := -taicpu(hp1).oper[0]^.val;
+                  end;
+
+                taicpu(p).opcode := A_LEA;
+                taicpu(p).loadref(0, NewRef);
+
+                Asml.Remove(hp1);
+                hp1.Free;
+
+                Result := True;
+                Exit;
+              end;
+          end
+        else if MatchOpType(taicpu(p),top_reg,top_reg) and
 {$ifdef x86_64}
           MatchInstruction(hp1,A_MOVZX,A_MOVSX,A_MOVSXD,[]) and
 {$else x86_64}
@@ -5039,6 +5085,50 @@ unit aoptx86;
       end;
 
 
+    function TX86AsmOptimizer.OptPass2SUB(var p: tai): Boolean;
+      var
+        hp1, hp2: tai; NewRef: TReference;
+      begin
+        { Change:
+            subl/q $x,%reg1
+            movl/q %reg1,%reg2
+          To:
+            leal/q $-x(%reg1),%reg2
+            subl/q $x,%reg1
+
+          Breaks the dependency chain and potentially permits the removal of
+          a CMP instruction if one follows.
+        }
+        Result := False;
+        if not (cs_opt_size in current_settings.optimizerswitches) and
+          (taicpu(p).opsize in [S_L{$ifdef x86_64}, S_Q{$endif x86_64}]) and
+          MatchOpType(taicpu(p),top_const,top_reg) and
+          GetNextInstruction(p, hp1) and
+          MatchInstruction(hp1, A_MOV, [taicpu(p).opsize]) and
+          (taicpu(hp1).oper[1]^.typ = top_reg) and
+          MatchOperand(taicpu(hp1).oper[0]^, taicpu(p).oper[1]^.reg) then
+          begin
+            { Change the MOV instruction to a LEA instruction, and update the
+              first operand }
+            reference_reset(NewRef, 1, []);
+            NewRef.base := taicpu(p).oper[1]^.reg;
+            NewRef.scalefactor := 1;
+            NewRef.offset := -taicpu(p).oper[0]^.val;
+
+            taicpu(hp1).opcode := A_LEA;
+            taicpu(hp1).loadref(0, NewRef);
+
+            { Move what is now the LEA instruction to before the SUB instruction }
+            Asml.Remove(hp1);
+            Asml.InsertBefore(hp1, p);
+            AllocRegBetween(taicpu(hp1).oper[1]^.reg, hp1, p, UsedRegs);
+
+            DebugMsg(SPeepholeOptimization + 'SubMov2LeaSub', p);
+            Result := True;
+          end;
+      end;
+
+
     function TX86AsmOptimizer.PostPeepholeOptLea(var p : tai) : Boolean;
 
       function SkipSimpleInstructions(var hp1 : tai) : Boolean;
diff --git a/compiler/x86_64/aoptcpu.pas b/compiler/x86_64/aoptcpu.pas
index bd1bda2705..48c686c3b3 100644
--- a/compiler/x86_64/aoptcpu.pas
+++ b/compiler/x86_64/aoptcpu.pas
@@ -154,6 +154,8 @@ uses
                   Result:=OptPass2Jcc(p);
                 A_Lea:
                   Result:=OptPass2Lea(p);
+                A_SUB:
+                  Result:=OptPass2SUB(p);
                 else
                   ;
               end;