From 41fba0c4f7ec96f526e70b22bb349e733e06c642 Mon Sep 17 00:00:00 2001 From: Jonas Maebe Date: Mon, 23 Feb 2015 22:54:03 +0000 Subject: [PATCH] * switched to using the stack pointer as base register for the temp allocator instead of the frame pointer register: 1) we exactly know the offsets of the temps from the stack pointer after pass 1 (based on the required parameter stack size for called routines), while we don't know it for the frame pointer (it depends on the number of saved registers) 2) temp offsets from the stack pointer are positive while those from the frame pointer are negative, and we can directly encode much bigger positive offsets in the instructions o move the stack pointer register to a virtual register in loadparentfpn, because many instructions cannot directly operate on/with the stack pointer o add the necessary register interference edges for the stack pointer register git-svn-id: trunk@29938 - --- .gitattributes | 2 ++ compiler/aarch64/cgcpu.pas | 67 ++++++++++++++++++++++++++++++++---- compiler/aarch64/cpunode.pas | 2 +- compiler/aarch64/cpupara.pas | 11 +++--- compiler/aarch64/cpupi.pas | 30 ++++++++++++++-- compiler/aarch64/ncpuinl.pas | 12 +++++++ compiler/aarch64/ncpumem.pas | 66 +++++++++++++++++++++++++++++++++++ compiler/aarch64/rgcpu.pas | 62 +++++++++++++++++++++++++++++++++ compiler/fpcdefs.inc | 1 + compiler/tgobj.pas | 2 +- rtl/aarch64/aarch64.inc | 6 ---- tests/test/tnest1.pp | 38 ++++++++++++++++++++ 12 files changed, 278 insertions(+), 21 deletions(-) create mode 100644 compiler/aarch64/ncpumem.pas create mode 100644 tests/test/tnest1.pp diff --git a/.gitattributes b/.gitattributes index 3e24b8af1d..a98fed7650 100644 --- a/.gitattributes +++ b/.gitattributes @@ -31,6 +31,7 @@ compiler/aarch64/ncpuadd.pas svneol=native#text/plain compiler/aarch64/ncpucnv.pas svneol=native#text/plain compiler/aarch64/ncpuinl.pas svneol=native#text/plain compiler/aarch64/ncpumat.pas svneol=native#text/plain +compiler/aarch64/ncpumem.pas 
svneol=native#text/plain compiler/aarch64/ra64con.inc svneol=native#text/plain compiler/aarch64/ra64dwa.inc svneol=native#text/plain compiler/aarch64/ra64nor.inc svneol=native#text/plain @@ -11945,6 +11946,7 @@ tests/test/tmsg3.pp svneol=native#text/plain tests/test/tmsg4.pp svneol=native#text/plain tests/test/tmt1.pp svneol=native#text/plain tests/test/tmul1.pp svneol=native#text/pascal +tests/test/tnest1.pp svneol=native#text/plain tests/test/tnoext1.pp svneol=native#text/plain tests/test/tnoext2.pp svneol=native#text/plain tests/test/tnoext3.pp svneol=native#text/plain diff --git a/compiler/aarch64/cgcpu.pas b/compiler/aarch64/cgcpu.pas index 5b58749ac7..309edeece9 100644 --- a/compiler/aarch64/cgcpu.pas +++ b/compiler/aarch64/cgcpu.pas @@ -95,7 +95,7 @@ interface procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override; procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override; private - procedure save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister); + function save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint; procedure load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister); end; @@ -519,7 +519,7 @@ implementation begin inherited init_register_allocators; - rg[R_INTREGISTER]:=Trgcpu.create(R_INTREGISTER,R_SUBWHOLE, + rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE, [RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8, RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17, RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28 @@ -1399,12 +1399,13 @@ implementation { *********** entry/exit code and address loading ************ } - procedure tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister); + function tcgaarch64.save_regs(list: TAsmList; rt: 
tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint; var ref: treference; sr: tsuperregister; pairreg: tregister; begin + result:=0; reference_reset_base(ref,NR_SP,-16,16); ref.addressmode:=AM_PREINDEXED; pairreg:=NR_NO; @@ -1415,18 +1416,38 @@ implementation pairreg:=newreg(rt,sr,sub) else begin + inc(result,16); list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref)); pairreg:=NR_NO end; { one left -> store twice (stack must be 16 bytes aligned) } if pairreg<>NR_NO then - list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref)); + begin + list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref)); + inc(result,16); + end; end; + procedure FixupOffsets(p:TObject;arg:pointer); + var + sym: tabstractnormalvarsym absolute p; + begin + if (tsym(p).typ in [paravarsym,localvarsym]) and + (sym.localloc.loc=LOC_REFERENCE) and + (sym.localloc.reference.base=NR_STACK_POINTER_REG) then + begin + sym.localloc.reference.base:=NR_FRAME_POINTER_REG; + dec(sym.localloc.reference.offset,PLongint(arg)^); + end; + end; + + + procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean); var ref: treference; + totalstackframesize: longint; begin if nostackframe then exit; @@ -1440,12 +1461,15 @@ implementation { initialise frame pointer } a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP); + totalstackframesize:=localsize; { save modified integer registers } - save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE); + inc(totalstackframesize, + save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE)); { only the lower 64 bits of the modified vector registers need to be saved; if the caller needs the upper 64 bits, it has to save them itself } - save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD); + inc(totalstackframesize, + save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD)); { allocate stack space } if localsize<>0 then @@ -1454,6 +1478,37 @@ implementation current_procinfo.final_localsize:=localsize; 
handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true); end; + { By default, we use the frame pointer to access parameters passed via + the stack and the stack pointer to address local variables and temps + because + a) we can use bigger positive than negative offsets (so accessing + locals via negative offsets from the frame pointer would be less + efficient) + b) we don't know the local size while generating the code, so + accessing the parameters via the stack pointer is not possible + without copying them + The problem with this is the get_frame() intrinsic: + a) it must return the same value as what we pass as parentfp + parameter, since that's how it's used in the TP-style objects unit + b) its return value must be usable to access all local data from a + routine (locals and parameters), since that's all the nested + routines have access to + c) its return value must be usable to construct a backtrace, as it's + also used by the exception handling routines + + The solution we use here, based on something similar that's done in + the MIPS port, is to generate all accesses to locals in the routine + itself SP-relative, and then after the code is generated and the local + size is known (namely, here), we change all SP-relative variables/ + parameters into FP-relative ones. 
This means that they'll be accessed + less efficiently from nested routines, but those accesses are indirect + anyway and at least this way they can be accessed at all + } + if current_procinfo.has_nestedprocs then + begin + current_procinfo.procdef.localst.SymList.ForEachCall(@FixupOffsets,@totalstackframesize); + current_procinfo.procdef.parast.SymList.ForEachCall(@FixupOffsets,@totalstackframesize); + end; end; diff --git a/compiler/aarch64/cpunode.pas b/compiler/aarch64/cpunode.pas index c155bc88ae..7456746a70 100644 --- a/compiler/aarch64/cpunode.pas +++ b/compiler/aarch64/cpunode.pas @@ -31,7 +31,7 @@ implementation uses ncgbas,ncgflw,ncgcal,ncgcnv,ncgld,ncgmem,ncgcon,ncgset,ncgobjc, - ncpuadd,ncpumat,ncpuinl,ncpucnv,{ncpuset,} + ncpuadd,ncpumat,ncpumem,ncpuinl,ncpucnv,{ncpuset,} { this not really a node } rgcpu, { symtable } diff --git a/compiler/aarch64/cpupara.pas b/compiler/aarch64/cpupara.pas index 41a5f72862..463f852dec 100644 --- a/compiler/aarch64/cpupara.pas +++ b/compiler/aarch64/cpupara.pas @@ -535,10 +535,6 @@ unit cpupara; begin paraloc^.size:=paracgsize; paraloc^.loc:=LOC_REFERENCE; - if side=callerside then - paraloc^.reference.index:=NR_STACK_POINTER_REG - else - paraloc^.reference.index:=NR_FRAME_POINTER_REG; { the current stack offset may not be properly aligned in case we're on Darwin have allocated a non-variadic argument @@ -563,6 +559,13 @@ unit cpupara; paraloc^.reference.offset:=curstackoffset else paraloc^.reference.offset:=curstackoffset+stackslotlen-paralen; + if side=callerside then + paraloc^.reference.index:=NR_STACK_POINTER_REG + else + begin + paraloc^.reference.index:=NR_FRAME_POINTER_REG; + inc(paraloc^.reference.offset,16); + end; inc(curstackoffset,stackslotlen); paralen:=0 end; diff --git a/compiler/aarch64/cpupi.pas b/compiler/aarch64/cpupi.pas index 0a8c1e7d16..ff6505c4fd 100644 --- a/compiler/aarch64/cpupi.pas +++ b/compiler/aarch64/cpupi.pas @@ -26,18 +26,42 @@ unit cpupi; interface uses + procinfo, psub; type 
taarch64procinfo=class(tcgprocinfo) - { no need to override anything, as the ABI requires us to use a frame - pointer at all times } + constructor create(aparent: tprocinfo); override; + procedure set_first_temp_offset; override; end; implementation uses - procinfo; + tgobj, + cpubase; + + constructor taarch64procinfo.create(aparent: tprocinfo); + begin + inherited; + { use the stack pointer as framepointer, because + 1) we exactly know the offsets of the temps from the stack pointer + after pass 1 (based on the required parameter stack size for called + routines), while we don't know it for the frame pointer (it depends + on the number of saved registers) + 2) temp offsets from the stack pointer are positive while those from + the frame pointer are negative, and we can directly encode much + bigger positive offsets in the instructions + } + framepointer:=NR_STACK_POINTER_REG; + end; + + procedure taarch64procinfo.set_first_temp_offset; + begin + { leave room for allocated parameters } + tg.setfirsttemp(align(maxpushedparasize,16)); + end; + begin cprocinfo:=taarch64procinfo; diff --git a/compiler/aarch64/ncpuinl.pas b/compiler/aarch64/ncpuinl.pas index 596eefe6c4..93f3456d69 100644 --- a/compiler/aarch64/ncpuinl.pas +++ b/compiler/aarch64/ncpuinl.pas @@ -41,6 +41,7 @@ interface procedure second_abs_long; override; procedure second_round_real; override; procedure second_trunc_real; override; + procedure second_get_frame; override; private procedure load_fpu_location; end; @@ -167,6 +168,17 @@ implementation current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FCVTZS,location.register,left.location.register)); end; + + procedure taarch64inlinenode.second_get_frame; + begin + location_reset(location,LOC_CREGISTER,OS_ADDR); + { this routine is used to get the frame pointer for backtracing + purposes. current_procinfo.framepointer is set to SP because that one + is used to access temps. On most platforms these two frame pointers + are the same, but not on AArch64. 
} + location.register:=NR_FRAME_POINTER_REG; + end; + begin cinlinenode:=taarch64inlinenode; end. diff --git a/compiler/aarch64/ncpumem.pas b/compiler/aarch64/ncpumem.pas new file mode 100644 index 0000000000..2547334b55 --- /dev/null +++ b/compiler/aarch64/ncpumem.pas @@ -0,0 +1,66 @@ +{ + Copyright (c) 2014 by Jonas Maebe + + Generate AArch64 code for in memory related nodes + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + **************************************************************************** +} +unit ncpumem; + +{$i fpcdefs.inc} + +interface + + uses + globtype, + node,nmem,ncgmem; + + type + taarch64loadparentfpnode = class(tcgloadparentfpnode) + procedure pass_generate_code; override; + end; + +implementation + + uses + aasmdata,cgbase,cpubase, + cgobj; + + { taarch64loadparentfpnode } + + procedure taarch64loadparentfpnode.pass_generate_code; + begin + inherited pass_generate_code; + { see the comments in tcgaarch64.g_proc_entry } + if (location.loc in [LOC_REGISTER,LOC_CREGISTER]) and + (location.register=NR_STACK_POINTER_REG) then + if (kind=lpf_forpara) then + location.register:=NR_FRAME_POINTER_REG + else + begin + { load stack pointer in a different register, as many instructions + cannot directly work with the stack pointer. 
The register + allocator can merge them if possible } + location.register:=cg.getaddressregister(current_asmdata.CurrAsmList); + cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_ADDR,OS_ADDR,NR_STACK_POINTER_REG,location.register); + location.loc:=LOC_REGISTER; + end; + end; + +begin + cloadparentfpnode:=taarch64loadparentfpnode; +end. diff --git a/compiler/aarch64/rgcpu.pas b/compiler/aarch64/rgcpu.pas index 7590597416..0a55c9af72 100644 --- a/compiler/aarch64/rgcpu.pas +++ b/compiler/aarch64/rgcpu.pas @@ -40,6 +40,10 @@ unit rgcpu; procedure do_spill_op(list: tasmlist; op: tasmop; pos: tai; const spilltemp: treference; tempreg: tregister); end; + trgintcpu=class(trgcpu) + procedure add_cpu_interferences(p: tai); override; + end; + implementation @@ -97,4 +101,62 @@ implementation end; + procedure trgintcpu.add_cpu_interferences(p: tai); + var + i: longint; + begin + if p.typ=ait_instruction then + begin + { add interferences for instructions that can have SP as a register + operand } + case taicpu(p).opcode of + A_MOV: + { all operands can be SP } + exit; + A_ADD, + A_SUB, + A_CMP, + A_CMN: + { ok as destination or first source in immediate or extended + register form } + if (taicpu(p).oper[taicpu(p).ops-1]^.typ<>top_shifterop) or + valid_shifter_operand(taicpu(p).opcode,false,true, + reg_cgsize(taicpu(p).oper[0]^.reg) in [OS_64,OS_S64], + taicpu(p).oper[taicpu(p).ops-1]^.shifterop^.shiftmode, + taicpu(p).oper[taicpu(p).ops-1]^.shifterop^.shiftimm) then + begin + if taicpu(p).oper[taicpu(p).ops-1]^.typ=top_shifterop then + i:=taicpu(p).ops-2 + else + i:=taicpu(p).ops-1; + if (taicpu(p).oper[i]^.typ=top_reg) then + add_edge(getsupreg(taicpu(p).oper[i]^.reg),RS_SP); + exit; + end; + A_AND, + A_EOR, + A_ORR, + A_TST: + { ok in immediate form } + if taicpu(p).oper[taicpu(p).ops-1]^.typ=top_const then + exit; + end; + { add interferences for other registers } + for i:=0 to taicpu(p).ops-1 do + begin + case taicpu(p).oper[i]^.typ of + top_reg: + if 
getregtype(taicpu(p).oper[i]^.reg)=R_INTREGISTER then + add_edge(getsupreg(taicpu(p).oper[i]^.reg),RS_SP); + top_ref: + begin + { sp can always be base, never be index } + if taicpu(p).oper[i]^.ref^.index<>NR_NO then + add_edge(getsupreg(taicpu(p).oper[i]^.ref^.index),RS_SP); + end; + end; + end; + end; + end; + end. diff --git a/compiler/fpcdefs.inc b/compiler/fpcdefs.inc index cf24905b7b..83e7c7f54e 100644 --- a/compiler/fpcdefs.inc +++ b/compiler/fpcdefs.inc @@ -242,6 +242,7 @@ {$define cpurox} {$define cputargethasfixedstack} {$define cpurefshaveindexreg} + {$define SUPPORT_GET_FRAME} {$endif aarch64} {$IFDEF MACOS} diff --git a/compiler/tgobj.pas b/compiler/tgobj.pas index 6c47ba2ad0..de49cb6f63 100644 --- a/compiler/tgobj.pas +++ b/compiler/tgobj.pas @@ -179,7 +179,7 @@ implementation tempfreelist:=nil; templist:=nil; { we could create a new child class for this but I don't if it is worth the effort (FK) } -{$if defined(powerpc) or defined(powerpc64) or defined(avr) or defined(jvm)} +{$if defined(powerpc) or defined(powerpc64) or defined(avr) or defined(jvm) or defined(aarch64)} direction:=1; {$else} direction:=-1; diff --git a/rtl/aarch64/aarch64.inc b/rtl/aarch64/aarch64.inc index 6e5dae0579..38b51ff09d 100644 --- a/rtl/aarch64/aarch64.inc +++ b/rtl/aarch64/aarch64.inc @@ -82,12 +82,6 @@ procedure fpc_cpuinit; String ****************************************************************************} -{$define FPC_SYSTEM_HAS_GET_FRAME} -function get_frame:pointer;assembler; nostackframe; - asm - mov x0, x29 - end; - {$define FPC_SYSTEM_HAS_GET_CALLER_ADDR} function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;assembler; nostackframe; asm diff --git a/tests/test/tnest1.pp b/tests/test/tnest1.pp new file mode 100644 index 0000000000..21f32feda6 --- /dev/null +++ b/tests/test/tnest1.pp @@ -0,0 +1,38 @@ +{$inline on} + +procedure test(l1, l2: longint); + +var + a1: cardinal; + d1, d2: double; + a2: cardinal; + + procedure nested; inline; + begin + l1:=1; 
+ l2:=2; + d1:=3.0; + d2:=4.0; + end; + +begin + a1:=$deadbeef; + a2:=$cafe0000; + nested; + if a1<>$deadbeef then + halt(1); + if a2<>$cafe0000 then + halt(2); + if l1<>1 then + halt(3); + if l2<>2 then + halt(4); + if d1<>3.0 then + halt(5); + if d2<>4.0 then + halt(6); +end; + +begin + test(5,6); +end.