* switched to using the stack pointer as base register for the temp allocator

instead of the frame pointer register: 1) we exactly know the offsets of the temps from the stack pointer after pass 1 (based on the required parameter stack size for called routines), while we don't know them for the frame pointer (it depends on the number of saved registers) 2) temp offsets from the stack pointer are positive while those from the frame pointer are negative, and we can directly encode much bigger positive offsets in the instructions o move the stack pointer register to a virtual register in loadparentfpn, because many instructions cannot directly operate on/with the stack pointer o add the necessary register interference edges for the stack pointer register git-svn-id: trunk@29938 -
2025-04-07 01:48:00 +02:00 · 2015-02-23 22:54:03 +00:00 · 2015-02-23 22:54:03 +00:00 · 41fba0c4f7
commit 41fba0c4f7
parent 5bb89cc2f0
12 changed files with 278 additions and 21 deletions
--- a/.gitattributes
+++ b/.gitattributes
@ -31,6 +31,7 @@ compiler/aarch64/ncpuadd.pas svneol=native#text/plain
 compiler/aarch64/ncpucnv.pas svneol=native#text/plain
 compiler/aarch64/ncpuinl.pas svneol=native#text/plain
 compiler/aarch64/ncpumat.pas svneol=native#text/plain
+compiler/aarch64/ncpumem.pas svneol=native#text/plain
 compiler/aarch64/ra64con.inc svneol=native#text/plain
 compiler/aarch64/ra64dwa.inc svneol=native#text/plain
 compiler/aarch64/ra64nor.inc svneol=native#text/plain
@ -11945,6 +11946,7 @@ tests/test/tmsg3.pp svneol=native#text/plain
 tests/test/tmsg4.pp svneol=native#text/plain
 tests/test/tmt1.pp svneol=native#text/plain
 tests/test/tmul1.pp svneol=native#text/pascal
+tests/test/tnest1.pp svneol=native#text/plain
 tests/test/tnoext1.pp svneol=native#text/plain
 tests/test/tnoext2.pp svneol=native#text/plain
 tests/test/tnoext3.pp svneol=native#text/plain
--- a/compiler/aarch64/cgcpu.pas
+++ b/compiler/aarch64/cgcpu.pas
@ -95,7 +95,7 @@ interface
        procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override;
        procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override;
       private
-        procedure save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
+        function save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
        procedure load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
      end;

@ -519,7 +519,7 @@ implementation
      begin
        inherited init_register_allocators;

-        rg[R_INTREGISTER]:=Trgcpu.create(R_INTREGISTER,R_SUBWHOLE,
+        rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,
            [RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8,
             RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17,
             RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28
@ -1399,12 +1399,13 @@ implementation

  { *********** entry/exit code and address loading ************ }

-    procedure tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
+    function tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
      var
        ref: treference;
        sr: tsuperregister;
        pairreg: tregister;
      begin
+        result:=0;
        reference_reset_base(ref,NR_SP,-16,16);
        ref.addressmode:=AM_PREINDEXED;
        pairreg:=NR_NO;
@ -1415,18 +1416,38 @@ implementation
              pairreg:=newreg(rt,sr,sub)
            else
              begin
+                inc(result,16);
                list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
                pairreg:=NR_NO
              end;
        { one left -> store twice (stack must be 16 bytes aligned) }
        if pairreg<>NR_NO then
-          list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
+          begin
+            list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
+            inc(result,16);
+          end;
      end;


+    procedure FixupOffsets(p:TObject;arg:pointer);
+      var
+        sym: tabstractnormalvarsym absolute p;
+      begin
+        if (tsym(p).typ in [paravarsym,localvarsym]) and
+          (sym.localloc.loc=LOC_REFERENCE) and
+          (sym.localloc.reference.base=NR_STACK_POINTER_REG) then
+          begin
+            sym.localloc.reference.base:=NR_FRAME_POINTER_REG;
+            dec(sym.localloc.reference.offset,PLongint(arg)^);
+          end;
+      end;
+
+
+
    procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
      var
        ref: treference;
+        totalstackframesize: longint;
      begin
        if nostackframe then
          exit;
@ -1440,12 +1461,15 @@ implementation
        { initialise frame pointer }
        a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);

+        totalstackframesize:=localsize;
        { save modified integer registers }
-        save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE);
+        inc(totalstackframesize,
+          save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE));
        { only the lower 64 bits of the modified vector registers need to be
          saved; if the caller needs the upper 64 bits, it has to save them
          itself }
-        save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD);
+        inc(totalstackframesize,
+          save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD));

        { allocate stack space }
        if localsize<>0 then
@ -1454,6 +1478,37 @@ implementation
            current_procinfo.final_localsize:=localsize;
            handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true);
          end;
+        { By default, we use the frame pointer to access parameters passed via
+          the stack and the stack pointer to address local variables and temps
+          because
+           a) we can use bigger positive than negative offsets (so accessing
+              locals via negative offsets from the frame pointer would be less
+              efficient)
+           b) we don't know the local size while generating the code, so
+              accessing the parameters via the stack pointer is not possible
+              without copying them
+          The problem with this is the get_frame() intrinsic:
+           a) it must return the same value as what we pass as parentfp
+              parameter, since that's how it's used in the TP-style objects unit
+           b) its return value must be usable to access all local data from a
+              routine (locals and parameters), since it's all the nested
+              routines have access to
+           c) its return value must be usable to construct a backtrace, as it's
+              also used by the exception handling routines
+
+          The solution we use here, based on something similar that's done in
+          the MIPS port, is to generate all accesses to locals in the routine
+          itself SP-relative, and then after the code is generated and the local
+          size is known (namely, here), we change all SP-relative variables/
+          parameters into FP-relative ones. This means that they'll be accessed
+          less efficiently from nested routines, but those accesses are indirect
+          anyway and at least this way they can be accessed at all
+        }
+        if current_procinfo.has_nestedprocs then
+          begin
+            current_procinfo.procdef.localst.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
+            current_procinfo.procdef.parast.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
+          end;
      end;


--- a/compiler/aarch64/cpunode.pas
+++ b/compiler/aarch64/cpunode.pas
@ -31,7 +31,7 @@ implementation

  uses
    ncgbas,ncgflw,ncgcal,ncgcnv,ncgld,ncgmem,ncgcon,ncgset,ncgobjc,
-    ncpuadd,ncpumat,ncpuinl,ncpucnv,{ncpuset,}
+    ncpuadd,ncpumat,ncpumem,ncpuinl,ncpucnv,{ncpuset,}
    { this not really a node }
    rgcpu,
    { symtable }
--- a/compiler/aarch64/cpupara.pas
+++ b/compiler/aarch64/cpupara.pas
@ -535,10 +535,6 @@ unit cpupara;
               begin
                  paraloc^.size:=paracgsize;
                  paraloc^.loc:=LOC_REFERENCE;
-                  if side=callerside then
-                    paraloc^.reference.index:=NR_STACK_POINTER_REG
-                  else
-                    paraloc^.reference.index:=NR_FRAME_POINTER_REG;

                  { the current stack offset may not be properly aligned in
                    case we're on Darwin have allocated a non-variadic argument
@ -563,6 +559,13 @@ unit cpupara;
                    paraloc^.reference.offset:=curstackoffset
                  else
                    paraloc^.reference.offset:=curstackoffset+stackslotlen-paralen;
+                  if side=callerside then
+                    paraloc^.reference.index:=NR_STACK_POINTER_REG
+                  else
+                    begin
+                      paraloc^.reference.index:=NR_FRAME_POINTER_REG;
+                      inc(paraloc^.reference.offset,16);
+                    end;
                  inc(curstackoffset,stackslotlen);
                  paralen:=0
               end;
--- a/compiler/aarch64/cpupi.pas
+++ b/compiler/aarch64/cpupi.pas
@ -26,18 +26,42 @@ unit cpupi;
 interface

  uses
+    procinfo,
    psub;

  type
    taarch64procinfo=class(tcgprocinfo)
-      { no need to override anything, as the ABI requires us to use a frame
-        pointer at all times }
+      constructor create(aparent: tprocinfo); override;
+      procedure set_first_temp_offset; override;
    end;

 implementation

  uses
-    procinfo;
+    tgobj,
+    cpubase;
+
+  constructor taarch64procinfo.create(aparent: tprocinfo);
+    begin
+      inherited;
+      { use the stack pointer as framepointer, because
+         1) we exactly know the offsets of the temps from the stack pointer
+            after pass 1 (based on the required parameter stack size for called
+            routines), while we don't know it for the frame pointer (it depends
+            on the number of saved registers)
+         2) temp offsets from the stack pointer are positive while those from
+            the frame pointer are negative, and we can directly encode much
+            bigger positive offsets in the instructions
+      }
+      framepointer:=NR_STACK_POINTER_REG;
+    end;
+
+  procedure taarch64procinfo.set_first_temp_offset;
+    begin
+     { leave room for allocated parameters }
+     tg.setfirsttemp(align(maxpushedparasize,16));
+    end;
+

 begin
  cprocinfo:=taarch64procinfo;
--- a/compiler/aarch64/ncpuinl.pas
+++ b/compiler/aarch64/ncpuinl.pas
@ -41,6 +41,7 @@ interface
        procedure second_abs_long; override;
        procedure second_round_real; override;
        procedure second_trunc_real; override;
+        procedure second_get_frame; override;
      private
        procedure load_fpu_location;
      end;
@ -167,6 +168,17 @@ implementation
        current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FCVTZS,location.register,left.location.register));
      end;

+
+    procedure taarch64inlinenode.second_get_frame;
+      begin
+        location_reset(location,LOC_CREGISTER,OS_ADDR);
+        { this routine is used to get the frame pointer for backtracing
+          purposes. current_procinfo.framepointer is set to SP because that one
+          is used to access temps. On most platforms these two frame pointers
+          are the same, but not on AArch64. }
+        location.register:=NR_FRAME_POINTER_REG;
+      end;
+
 begin
  cinlinenode:=taarch64inlinenode;
 end.
--- a/compiler/aarch64/ncpumem.pas
+++ b/compiler/aarch64/ncpumem.pas
@ -0,0 +1,66 @@
+{
+    Copyright (c) 2014 by Jonas Maebe
+
+    Generate AArch64 code for memory related nodes
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ ****************************************************************************
+}
+unit ncpumem;
+
+{$i fpcdefs.inc}
+
+interface
+
+  uses
+    globtype,
+    node,nmem,ncgmem;
+
+  type
+    taarch64loadparentfpnode = class(tcgloadparentfpnode)
+      procedure pass_generate_code; override;
+    end;
+
+implementation
+
+  uses
+    aasmdata,cgbase,cpubase,
+    cgobj;
+
+  { taarch64loadparentfpnode }
+
+  procedure taarch64loadparentfpnode.pass_generate_code;
+    begin
+      inherited pass_generate_code;
+      { see the comments in tcgaarch64.g_proc_entry }
+      if (location.loc in [LOC_REGISTER,LOC_CREGISTER]) and
+         (location.register=NR_STACK_POINTER_REG) then
+        if (kind=lpf_forpara) then
+          location.register:=NR_FRAME_POINTER_REG
+        else
+          begin
+            { load stack pointer in a different register, as many instructions
+              cannot directly work with the stack pointer. The register
+              allocator can merge them if possible }
+            location.register:=cg.getaddressregister(current_asmdata.CurrAsmList);
+            cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_ADDR,OS_ADDR,NR_STACK_POINTER_REG,location.register);
+            location.loc:=LOC_REGISTER;
+          end;
+    end;
+
+begin
+  cloadparentfpnode:=taarch64loadparentfpnode;
+end.
--- a/compiler/aarch64/rgcpu.pas
+++ b/compiler/aarch64/rgcpu.pas
@ -40,6 +40,10 @@ unit rgcpu;
        procedure do_spill_op(list: tasmlist; op: tasmop; pos: tai; const spilltemp: treference; tempreg: tregister);
      end;

+      trgintcpu=class(trgcpu)
+        procedure add_cpu_interferences(p: tai); override;
+      end;
+

 implementation

@ -97,4 +101,62 @@ implementation
      end;


+    procedure trgintcpu.add_cpu_interferences(p: tai);
+     var
+       i: longint;
+     begin
+       if p.typ=ait_instruction then
+         begin
+           { add interferences for instructions that can have SP as a register
+             operand }
+           case taicpu(p).opcode of
+             A_MOV:
+               { all operands can be SP }
+               exit;
+             A_ADD,
+             A_SUB,
+             A_CMP,
+             A_CMN:
+               { ok as destination or first source in immediate or extended
+                 register form }
+               if (taicpu(p).oper[taicpu(p).ops-1]^.typ<>top_shifterop) or
+                  valid_shifter_operand(taicpu(p).opcode,false,true,
+                    reg_cgsize(taicpu(p).oper[0]^.reg) in [OS_64,OS_S64],
+                    taicpu(p).oper[taicpu(p).ops-1]^.shifterop^.shiftmode,
+                    taicpu(p).oper[taicpu(p).ops-1]^.shifterop^.shiftimm) then
+                 begin
+                   if taicpu(p).oper[taicpu(p).ops-1]^.typ=top_shifterop then
+                     i:=taicpu(p).ops-2
+                   else
+                     i:=taicpu(p).ops-1;
+                   if (taicpu(p).oper[i]^.typ=top_reg) then
+                     add_edge(getsupreg(taicpu(p).oper[i]^.reg),RS_SP);
+                   exit;
+                 end;
+             A_AND,
+             A_EOR,
+             A_ORR,
+             A_TST:
+               { ok in immediate form }
+               if taicpu(p).oper[taicpu(p).ops-1]^.typ=top_const then
+                 exit;
+           end;
+           { add interferences for other registers }
+           for i:=0 to taicpu(p).ops-1 do
+             begin
+               case taicpu(p).oper[i]^.typ of
+                 top_reg:
+                   if getregtype(taicpu(p).oper[i]^.reg)=R_INTREGISTER then
+                     add_edge(getsupreg(taicpu(p).oper[i]^.reg),RS_SP);
+                 top_ref:
+                   begin
+                     { sp can always be base, never be index }
+                     if taicpu(p).oper[i]^.ref^.index<>NR_NO then
+                       add_edge(getsupreg(taicpu(p).oper[i]^.ref^.index),RS_SP);
+                   end;
+               end;
+             end;
+         end;
+     end;
+
 end.
--- a/compiler/fpcdefs.inc
+++ b/compiler/fpcdefs.inc
@ -242,6 +242,7 @@
  {$define cpurox}
  {$define cputargethasfixedstack}
  {$define cpurefshaveindexreg}
+  {$define SUPPORT_GET_FRAME}
 {$endif aarch64}

 {$IFDEF MACOS}
--- a/compiler/tgobj.pas
+++ b/compiler/tgobj.pas
@ -179,7 +179,7 @@ implementation
       tempfreelist:=nil;
       templist:=nil;
       { we could create a new child class for this but I don't if it is worth the effort (FK) }
-{$if defined(powerpc) or defined(powerpc64) or defined(avr) or defined(jvm)}
+{$if defined(powerpc) or defined(powerpc64) or defined(avr) or defined(jvm) or defined(aarch64)}
       direction:=1;
 {$else}
       direction:=-1;
--- a/rtl/aarch64/aarch64.inc
+++ b/rtl/aarch64/aarch64.inc
@ -82,12 +82,6 @@ procedure fpc_cpuinit;
                                 String
 ****************************************************************************}

-{$define FPC_SYSTEM_HAS_GET_FRAME}
-function get_frame:pointer;assembler; nostackframe;
-  asm
-    mov x0, x29
-  end;
-
 {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
 function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;assembler; nostackframe;
  asm
--- a/tests/test/tnest1.pp
+++ b/tests/test/tnest1.pp
@ -0,0 +1,38 @@
+{$inline on}
+
+procedure test(l1, l2: longint);
+
+var
+  a1: cardinal;
+  d1, d2: double;
+  a2: cardinal;
+
+  procedure nested; inline;
+    begin
+      l1:=1;
+      l2:=2;
+      d1:=3.0;
+      d2:=4.0;
+    end;
+
+begin
+  a1:=$deadbeef;
+  a2:=$cafe0000;
+  nested;
+  if a1<>$deadbeef then
+    halt(1);
+  if a2<>$cafe0000 then
+    halt(2);
+  if l1<>1 then
+    halt(3);
+  if l2<>2 then
+    halt(4);
+  if d1<>3.0 then
+    halt(5);
+  if d2<>4.0 then
+    halt(6);
+end;
+
+begin
+  test(5,6);
+end.