From 41fba0c4f7ec96f526e70b22bb349e733e06c642 Mon Sep 17 00:00:00 2001
From: Jonas Maebe <jonas@freepascal.org>
Date: Mon, 23 Feb 2015 22:54:03 +0000
Subject: [PATCH]   * switched to using the stack pointer as base register for
 the temp allocator     instead of the frame pointer register:       1) we
 exactly know the offsets of the temps from the stack pointer          after
 pass 1 (based on the required parameter stack size for called         
 routines), while we don't know it for the frame pointer (it depends         
 on the number of saved registers)       2) temp offsets from the stack
 pointer are positive while those from          the frame pointer are
 negative, and we can directly encode much          bigger positive offsets in
 the instructions    o move the stack pointer register to a virtual register
 in      loadparentfpn, because many instructions cannot directly operate     
 on/with the stack pointer    o add the necessary register interference edges
 for the stack pointer      register

git-svn-id: trunk@29938 -
---
 .gitattributes               |  2 ++
 compiler/aarch64/cgcpu.pas   | 67 ++++++++++++++++++++++++++++++++----
 compiler/aarch64/cpunode.pas |  2 +-
 compiler/aarch64/cpupara.pas | 11 +++---
 compiler/aarch64/cpupi.pas   | 30 ++++++++++++++--
 compiler/aarch64/ncpuinl.pas | 12 +++++++
 compiler/aarch64/ncpumem.pas | 66 +++++++++++++++++++++++++++++++++++
 compiler/aarch64/rgcpu.pas   | 62 +++++++++++++++++++++++++++++++++
 compiler/fpcdefs.inc         |  1 +
 compiler/tgobj.pas           |  2 +-
 rtl/aarch64/aarch64.inc      |  6 ----
 tests/test/tnest1.pp         | 38 ++++++++++++++++++++
 12 files changed, 278 insertions(+), 21 deletions(-)
 create mode 100644 compiler/aarch64/ncpumem.pas
 create mode 100644 tests/test/tnest1.pp

diff --git a/.gitattributes b/.gitattributes
index 3e24b8af1d..a98fed7650 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -31,6 +31,7 @@ compiler/aarch64/ncpuadd.pas svneol=native#text/plain
 compiler/aarch64/ncpucnv.pas svneol=native#text/plain
 compiler/aarch64/ncpuinl.pas svneol=native#text/plain
 compiler/aarch64/ncpumat.pas svneol=native#text/plain
+compiler/aarch64/ncpumem.pas svneol=native#text/plain
 compiler/aarch64/ra64con.inc svneol=native#text/plain
 compiler/aarch64/ra64dwa.inc svneol=native#text/plain
 compiler/aarch64/ra64nor.inc svneol=native#text/plain
@@ -11945,6 +11946,7 @@ tests/test/tmsg3.pp svneol=native#text/plain
 tests/test/tmsg4.pp svneol=native#text/plain
 tests/test/tmt1.pp svneol=native#text/plain
 tests/test/tmul1.pp svneol=native#text/pascal
+tests/test/tnest1.pp svneol=native#text/plain
 tests/test/tnoext1.pp svneol=native#text/plain
 tests/test/tnoext2.pp svneol=native#text/plain
 tests/test/tnoext3.pp svneol=native#text/plain
diff --git a/compiler/aarch64/cgcpu.pas b/compiler/aarch64/cgcpu.pas
index 5b58749ac7..309edeece9 100644
--- a/compiler/aarch64/cgcpu.pas
+++ b/compiler/aarch64/cgcpu.pas
@@ -95,7 +95,7 @@ interface
         procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override;
         procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override;
        private
-        procedure save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
+        function save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
         procedure load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
       end;
 
@@ -519,7 +519,7 @@ implementation
       begin
         inherited init_register_allocators;
 
-        rg[R_INTREGISTER]:=Trgcpu.create(R_INTREGISTER,R_SUBWHOLE,
+        rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,
             [RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8,
              RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17,
              RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28
@@ -1399,12 +1399,13 @@ implementation
 
   { *********** entry/exit code and address loading ************ }
 
-    procedure tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
+    function tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
       var
         ref: treference;
         sr: tsuperregister;
         pairreg: tregister;
       begin
+        result:=0;
         reference_reset_base(ref,NR_SP,-16,16);
         ref.addressmode:=AM_PREINDEXED;
         pairreg:=NR_NO;
@@ -1415,18 +1416,38 @@ implementation
               pairreg:=newreg(rt,sr,sub)
             else
               begin
+                inc(result,16);
                 list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
                 pairreg:=NR_NO
               end;
         { one left -> store twice (stack must be 16 bytes aligned) }
         if pairreg<>NR_NO then
-          list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
+          begin
+            list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
+            inc(result,16);
+          end;
       end;
 
 
+    procedure FixupOffsets(p:TObject;arg:pointer); { symbol list callback; arg points to a longint }
+      var
+        sym: tabstractnormalvarsym absolute p;
+      begin { turn SP-relative parameter/local locations into FP-relative ones }
+        if (tsym(p).typ in [paravarsym,localvarsym]) and
+          (sym.localloc.loc=LOC_REFERENCE) and
+          (sym.localloc.reference.base=NR_STACK_POINTER_REG) then
+          begin
+            sym.localloc.reference.base:=NR_FRAME_POINTER_REG;
+            dec(sym.localloc.reference.offset,PLongint(arg)^);
+          end;
+      end;
+
+
+
     procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
       var
         ref: treference;
+        totalstackframesize: longint;
       begin
         if nostackframe then
           exit;
@@ -1440,12 +1461,15 @@ implementation
         { initialise frame pointer }
         a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);
 
+        totalstackframesize:=localsize;
         { save modified integer registers }
-        save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE);
+        inc(totalstackframesize,
+          save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE));
         { only the lower 64 bits of the modified vector registers need to be
           saved; if the caller needs the upper 64 bits, it has to save them
           itself }
-        save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD);
+        inc(totalstackframesize,
+          save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD));
 
         { allocate stack space }
         if localsize<>0 then
@@ -1454,6 +1478,37 @@ implementation
             current_procinfo.final_localsize:=localsize;
             handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true);
           end;
+        { By default, we use the frame pointer to access parameters passed via
+          the stack and the stack pointer to address local variables and temps
+          because
+           a) we can use bigger positive than negative offsets (so accessing
+              locals via negative offsets from the frame pointer would be less
+              efficient)
+           b) we don't know the local size while generating the code, so
+              accessing the parameters via the stack pointer is not possible
+              without copying them
+          The problem with this is the get_frame() intrinsic:
+           a) it must return the same value as what we pass as parentfp
+              parameter, since that's how it's used in the TP-style objects unit
+           b) its return value must be usable to access all local data from a
+              routine (locals and parameters), since it is all that the nested
+              routines have access to
+           c) its return value must be usable to construct a backtrace, as it's
+              also used by the exception handling routines
+
+          The solution we use here, based on something similar that's done in
+          the MIPS port, is to generate all accesses to locals in the routine
+          itself SP-relative, and then after the code is generated and the local
+          size is known (namely, here), we change all SP-relative variables/
+          parameters into FP-relative ones. This means that they'll be accessed
+          less efficiently from nested routines, but those accesses are indirect
+          anyway and at least this way they can be accessed at all
+        }
+        if current_procinfo.has_nestedprocs then
+          begin
+            current_procinfo.procdef.localst.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
+            current_procinfo.procdef.parast.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
+          end;
       end;
 
 
diff --git a/compiler/aarch64/cpunode.pas b/compiler/aarch64/cpunode.pas
index c155bc88ae..7456746a70 100644
--- a/compiler/aarch64/cpunode.pas
+++ b/compiler/aarch64/cpunode.pas
@@ -31,7 +31,7 @@ implementation
 
   uses
     ncgbas,ncgflw,ncgcal,ncgcnv,ncgld,ncgmem,ncgcon,ncgset,ncgobjc,
-    ncpuadd,ncpumat,ncpuinl,ncpucnv,{ncpuset,}
+    ncpuadd,ncpumat,ncpumem,ncpuinl,ncpucnv,{ncpuset,}
     { this not really a node }
     rgcpu,
     { symtable }
diff --git a/compiler/aarch64/cpupara.pas b/compiler/aarch64/cpupara.pas
index 41a5f72862..463f852dec 100644
--- a/compiler/aarch64/cpupara.pas
+++ b/compiler/aarch64/cpupara.pas
@@ -535,10 +535,6 @@ unit cpupara;
                begin
                   paraloc^.size:=paracgsize;
                   paraloc^.loc:=LOC_REFERENCE;
-                  if side=callerside then
-                    paraloc^.reference.index:=NR_STACK_POINTER_REG
-                  else
-                    paraloc^.reference.index:=NR_FRAME_POINTER_REG;
 
                   { the current stack offset may not be properly aligned in
                     case we're on Darwin have allocated a non-variadic argument
@@ -563,6 +559,13 @@ unit cpupara;
                     paraloc^.reference.offset:=curstackoffset
                   else
                     paraloc^.reference.offset:=curstackoffset+stackslotlen-paralen;
+                  if side=callerside then
+                    paraloc^.reference.index:=NR_STACK_POINTER_REG
+                  else
+                    begin
+                      paraloc^.reference.index:=NR_FRAME_POINTER_REG;
+                      inc(paraloc^.reference.offset,16);
+                    end;
                   inc(curstackoffset,stackslotlen);
                   paralen:=0
                end;
diff --git a/compiler/aarch64/cpupi.pas b/compiler/aarch64/cpupi.pas
index 0a8c1e7d16..ff6505c4fd 100644
--- a/compiler/aarch64/cpupi.pas
+++ b/compiler/aarch64/cpupi.pas
@@ -26,18 +26,42 @@ unit cpupi;
 interface
 
   uses
+    procinfo,
     psub;
 
   type
     taarch64procinfo=class(tcgprocinfo)
-      { no need to override anything, as the ABI requires us to use a frame
-        pointer at all times }
+      constructor create(aparent: tprocinfo); override;
+      procedure set_first_temp_offset; override;
     end;
 
 implementation
 
   uses
-    procinfo;
+    tgobj,
+    cpubase;
+
+  constructor taarch64procinfo.create(aparent: tprocinfo);
+    begin
+      inherited;
+      { use the stack pointer as base register for temps (framepointer), because
+         1) we exactly know the offsets of the temps from the stack pointer
+            after pass 1 (based on the required parameter stack size for called
+            routines), while we don't know it for the frame pointer (it depends
+            on the number of saved registers)
+         2) temp offsets from the stack pointer are positive while those from
+            the frame pointer are negative, and we can directly encode much
+            bigger positive offsets in the instructions
+      }
+      framepointer:=NR_STACK_POINTER_REG;
+    end;
+
+  procedure taarch64procinfo.set_first_temp_offset;
+    begin
+     { leave room for the stack parameters of called routines, 16-byte aligned }
+     tg.setfirsttemp(align(maxpushedparasize,16));
+    end;
+
 
 begin
   cprocinfo:=taarch64procinfo;
diff --git a/compiler/aarch64/ncpuinl.pas b/compiler/aarch64/ncpuinl.pas
index 596eefe6c4..93f3456d69 100644
--- a/compiler/aarch64/ncpuinl.pas
+++ b/compiler/aarch64/ncpuinl.pas
@@ -41,6 +41,7 @@ interface
         procedure second_abs_long; override;
         procedure second_round_real; override;
         procedure second_trunc_real; override;
+        procedure second_get_frame; override;
       private
         procedure load_fpu_location;
       end;
@@ -167,6 +168,17 @@ implementation
         current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FCVTZS,location.register,left.location.register));
       end;
 
+
+    procedure taarch64inlinenode.second_get_frame;
+      begin
+        location_reset(location,LOC_CREGISTER,OS_ADDR);
+        { get_frame is used to get the frame pointer for backtracing
+          purposes. current_procinfo.framepointer is set to SP because that one
+          is used to access temps. On most platforms these two frame pointers
+          are the same, but not on AArch64, so return the FP register itself. }
+        location.register:=NR_FRAME_POINTER_REG;
+      end;
+
 begin
   cinlinenode:=taarch64inlinenode;
 end.
diff --git a/compiler/aarch64/ncpumem.pas b/compiler/aarch64/ncpumem.pas
new file mode 100644
index 0000000000..2547334b55
--- /dev/null
+++ b/compiler/aarch64/ncpumem.pas
@@ -0,0 +1,66 @@
+{
+    Copyright (c) 2014 by Jonas Maebe
+
+    Generate AArch64 code for memory related nodes
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ ****************************************************************************
+}
+unit ncpumem;
+
+{$i fpcdefs.inc}
+
+interface
+
+  uses
+    globtype,
+    node,nmem,ncgmem;
+
+  type
+    taarch64loadparentfpnode = class(tcgloadparentfpnode)
+      procedure pass_generate_code; override;
+    end;
+
+implementation
+
+  uses
+    aasmdata,cgbase,cpubase,
+    cgobj;
+
+  { taarch64loadparentfpnode: when the inherited code yields SP itself as the
+    result register, replace it by FP (lpf_forpara) or by a copy of SP }
+  procedure taarch64loadparentfpnode.pass_generate_code;
+    begin
+      inherited pass_generate_code;
+      { see the comments in tcgaarch64.g_proc_entry }
+      if (location.loc in [LOC_REGISTER,LOC_CREGISTER]) and
+         (location.register=NR_STACK_POINTER_REG) then
+        if (kind=lpf_forpara) then
+          location.register:=NR_FRAME_POINTER_REG
+        else
+          begin
+            { load stack pointer in a different register, as many instructions
+              cannot directly work with the stack pointer. The register
+              allocator can merge them if possible }
+            location.register:=cg.getaddressregister(current_asmdata.CurrAsmList);
+            cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_ADDR,OS_ADDR,NR_STACK_POINTER_REG,location.register);
+            location.loc:=LOC_REGISTER;
+          end;
+    end;
+
+begin
+  cloadparentfpnode:=taarch64loadparentfpnode;
+end.
diff --git a/compiler/aarch64/rgcpu.pas b/compiler/aarch64/rgcpu.pas
index 7590597416..0a55c9af72 100644
--- a/compiler/aarch64/rgcpu.pas
+++ b/compiler/aarch64/rgcpu.pas
@@ -40,6 +40,10 @@ unit rgcpu;
         procedure do_spill_op(list: tasmlist; op: tasmop; pos: tai; const spilltemp: treference; tempreg: tregister);
       end;
 
+      trgintcpu=class(trgcpu)
+        procedure add_cpu_interferences(p: tai); override;
+      end;
+
 
 implementation
 
@@ -97,4 +101,62 @@ implementation
       end;
 
 
+    procedure trgintcpu.add_cpu_interferences(p: tai);
+     var
+       i: longint;
+     begin { adds interference edges with SP for the operands of instruction p }
+       if p.typ=ait_instruction then
+         begin
+           { special-case the instructions that accept SP as a register
+             operand; every other case falls through to the generic loop below }
+           case taicpu(p).opcode of
+             A_MOV:
+               { all operands can be SP }
+               exit;
+             A_ADD,
+             A_SUB,
+             A_CMP,
+             A_CMN:
+               { ok as destination or first source in immediate or extended
+                 register form: only the last register operand gets an edge }
+               if (taicpu(p).oper[taicpu(p).ops-1]^.typ<>top_shifterop) or
+                  valid_shifter_operand(taicpu(p).opcode,false,true,
+                    reg_cgsize(taicpu(p).oper[0]^.reg) in [OS_64,OS_S64],
+                    taicpu(p).oper[taicpu(p).ops-1]^.shifterop^.shiftmode,
+                    taicpu(p).oper[taicpu(p).ops-1]^.shifterop^.shiftimm) then
+                 begin
+                   if taicpu(p).oper[taicpu(p).ops-1]^.typ=top_shifterop then
+                     i:=taicpu(p).ops-2
+                   else
+                     i:=taicpu(p).ops-1;
+                   if (taicpu(p).oper[i]^.typ=top_reg) then
+                     add_edge(getsupreg(taicpu(p).oper[i]^.reg),RS_SP);
+                   exit;
+                 end;
+             A_AND,
+             A_EOR,
+             A_ORR,
+             A_TST:
+               { ok in immediate form }
+               if taicpu(p).oper[taicpu(p).ops-1]^.typ=top_const then
+                 exit;
+           end;
+           { generic case: every integer register operand interferes with SP }
+           for i:=0 to taicpu(p).ops-1 do
+             begin
+               case taicpu(p).oper[i]^.typ of
+                 top_reg:
+                   if getregtype(taicpu(p).oper[i]^.reg)=R_INTREGISTER then
+                     add_edge(getsupreg(taicpu(p).oper[i]^.reg),RS_SP);
+                 top_ref:
+                   begin
+                     { sp can always be base, never be index }
+                     if taicpu(p).oper[i]^.ref^.index<>NR_NO then
+                       add_edge(getsupreg(taicpu(p).oper[i]^.ref^.index),RS_SP);
+                   end;
+               end;
+             end;
+         end;
+     end;
+
 end.
diff --git a/compiler/fpcdefs.inc b/compiler/fpcdefs.inc
index cf24905b7b..83e7c7f54e 100644
--- a/compiler/fpcdefs.inc
+++ b/compiler/fpcdefs.inc
@@ -242,6 +242,7 @@
   {$define cpurox}
   {$define cputargethasfixedstack}
   {$define cpurefshaveindexreg}
+  {$define SUPPORT_GET_FRAME}
 {$endif aarch64}
 
 {$IFDEF MACOS}
diff --git a/compiler/tgobj.pas b/compiler/tgobj.pas
index 6c47ba2ad0..de49cb6f63 100644
--- a/compiler/tgobj.pas
+++ b/compiler/tgobj.pas
@@ -179,7 +179,7 @@ implementation
        tempfreelist:=nil;
        templist:=nil;
        { we could create a new child class for this but I don't if it is worth the effort (FK) }
-{$if defined(powerpc) or defined(powerpc64) or defined(avr) or defined(jvm)}
+{$if defined(powerpc) or defined(powerpc64) or defined(avr) or defined(jvm) or defined(aarch64)}
        direction:=1;
 {$else}
        direction:=-1;
diff --git a/rtl/aarch64/aarch64.inc b/rtl/aarch64/aarch64.inc
index 6e5dae0579..38b51ff09d 100644
--- a/rtl/aarch64/aarch64.inc
+++ b/rtl/aarch64/aarch64.inc
@@ -82,12 +82,6 @@ procedure fpc_cpuinit;
                                  String
 ****************************************************************************}
 
-{$define FPC_SYSTEM_HAS_GET_FRAME}
-function get_frame:pointer;assembler; nostackframe;
-  asm
-    mov x0, x29
-  end;
-
 {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
 function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;assembler; nostackframe;
   asm
diff --git a/tests/test/tnest1.pp b/tests/test/tnest1.pp
new file mode 100644
index 0000000000..21f32feda6
--- /dev/null
+++ b/tests/test/tnest1.pp
@@ -0,0 +1,38 @@
+{$inline on}
+{ checks that a nested routine can write its parent's parameters and locals }
+procedure test(l1, l2: longint);
+
+var
+  a1: cardinal;    { sentinel: must still hold its value after nested ran }
+  d1, d2: double;
+  a2: cardinal;    { sentinel: must still hold its value after nested ran }
+
+  procedure nested; inline;
+    begin
+      l1:=1;
+      l2:=2;
+      d1:=3.0;
+      d2:=4.0;
+    end;
+
+begin
+  a1:=$deadbeef;
+  a2:=$cafe0000;
+  nested;
+  if a1<>$deadbeef then
+    halt(1);
+  if a2<>$cafe0000 then
+    halt(2);
+  if l1<>1 then
+    halt(3);
+  if l2<>2 then
+    halt(4);
+  if d1<>3.0 then
+    halt(5);
+  if d2<>4.0 then
+    halt(6);
+end;
+
+begin
+  test(5,6);
+end.