* switched to using the stack pointer as base register for the temp allocator

instead of the frame pointer register:
      1) we exactly know the offsets of the temps from the stack pointer
         after pass 1 (based on the required parameter stack size for called
         routines), while we don't know it for the frame pointer (it depends
         on the number of saved registers)
      2) temp offsets from the stack pointer are positive while those from
         the frame pointer are negative, and we can directly encode much
         bigger positive offsets in the instructions
   o move the stack pointer register to a virtual register in
     loadparentfpn, because many instructions cannot directly operate
     on/with the stack pointer
   o add the necessary register interference edges for the stack pointer
     register

git-svn-id: trunk@29938 -
This commit is contained in:
Jonas Maebe 2015-02-23 22:54:03 +00:00
parent 5bb89cc2f0
commit 41fba0c4f7
12 changed files with 278 additions and 21 deletions

2
.gitattributes vendored
View File

@ -31,6 +31,7 @@ compiler/aarch64/ncpuadd.pas svneol=native#text/plain
compiler/aarch64/ncpucnv.pas svneol=native#text/plain
compiler/aarch64/ncpuinl.pas svneol=native#text/plain
compiler/aarch64/ncpumat.pas svneol=native#text/plain
compiler/aarch64/ncpumem.pas svneol=native#text/plain
compiler/aarch64/ra64con.inc svneol=native#text/plain
compiler/aarch64/ra64dwa.inc svneol=native#text/plain
compiler/aarch64/ra64nor.inc svneol=native#text/plain
@ -11945,6 +11946,7 @@ tests/test/tmsg3.pp svneol=native#text/plain
tests/test/tmsg4.pp svneol=native#text/plain
tests/test/tmt1.pp svneol=native#text/plain
tests/test/tmul1.pp svneol=native#text/pascal
tests/test/tnest1.pp svneol=native#text/plain
tests/test/tnoext1.pp svneol=native#text/plain
tests/test/tnoext2.pp svneol=native#text/plain
tests/test/tnoext3.pp svneol=native#text/plain

View File

@ -95,7 +95,7 @@ interface
procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override;
procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override;
private
procedure save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
function save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
procedure load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
end;
@ -519,7 +519,7 @@ implementation
begin
inherited init_register_allocators;
rg[R_INTREGISTER]:=Trgcpu.create(R_INTREGISTER,R_SUBWHOLE,
rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE,
[RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8,
RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17,
RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28
@ -1399,12 +1399,13 @@ implementation
{ *********** entry/exit code and address loading ************ }
procedure tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
function tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint;
var
ref: treference;
sr: tsuperregister;
pairreg: tregister;
begin
result:=0;
reference_reset_base(ref,NR_SP,-16,16);
ref.addressmode:=AM_PREINDEXED;
pairreg:=NR_NO;
@ -1415,18 +1416,38 @@ implementation
pairreg:=newreg(rt,sr,sub)
else
begin
inc(result,16);
list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref));
pairreg:=NR_NO
end;
{ one left -> store twice (stack must be 16 bytes aligned) }
if pairreg<>NR_NO then
list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
begin
list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref));
inc(result,16);
end;
end;
{ Symbol-list callback: rebases one parameter/local variable from the stack
  pointer to the frame pointer.

  p   - the symbol being visited; only paravarsym and localvarsym entries
        are touched
  arg - pointer to a longint holding the amount that is subtracted from the
        symbol's offset when its base register is switched

  Symbols that do not live in memory, or whose reference is not based on the
  stack pointer, are left untouched. }
procedure FixupOffsets(p:TObject;arg:pointer);
  var
    varsym: tabstractnormalvarsym absolute p;
  begin
    { only parameters and local variables are candidates }
    if not(tsym(p).typ in [paravarsym,localvarsym]) then
      exit;
    { must be located in memory ... }
    if varsym.localloc.loc<>LOC_REFERENCE then
      exit;
    { ... and currently be addressed relative to the stack pointer }
    if varsym.localloc.reference.base<>NR_STACK_POINTER_REG then
      exit;
    { switch the base register and compensate the offset accordingly }
    varsym.localloc.reference.base:=NR_FRAME_POINTER_REG;
    dec(varsym.localloc.reference.offset,PLongint(arg)^);
  end;
procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
var
ref: treference;
totalstackframesize: longint;
begin
if nostackframe then
exit;
@ -1440,12 +1461,15 @@ implementation
{ initialise frame pointer }
a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);
totalstackframesize:=localsize;
{ save modified integer registers }
save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE);
inc(totalstackframesize,
save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE));
{ only the lower 64 bits of the modified vector registers need to be
saved; if the caller needs the upper 64 bits, it has to save them
itself }
save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD);
inc(totalstackframesize,
save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD));
{ allocate stack space }
if localsize<>0 then
@ -1454,6 +1478,37 @@ implementation
current_procinfo.final_localsize:=localsize;
handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true);
end;
{ By default, we use the frame pointer to access parameters passed via
the stack and the stack pointer to address local variables and temps
because
a) we can use bigger positive than negative offsets (so accessing
locals via negative offsets from the frame pointer would be less
efficient)
b) we don't know the local size while generating the code, so
accessing the parameters via the stack pointer is not possible
without copying them
The problem with this is the get_frame() intrinsic:
a) it must return the same value as what we pass as parentfp
parameter, since that's how it's used in the TP-style objects unit
b) its return value must be usable to access all local data from a
routine (locals and parameters), since it's all the nested
routines have access to
c) its return value must be usable to construct a backtrace, as it's
also used by the exception handling routines
The solution we use here, based on something similar that's done in
the MIPS port, is to generate all accesses to locals in the routine
itself SP-relative, and then after the code is generated and the local
size is known (namely, here), we change all SP-relative variables/
parameters into FP-relative ones. This means that they'll be accessed
less efficiently from nested routines, but those accesses are indirect
anyway and at least this way they can be accessed at all
}
if current_procinfo.has_nestedprocs then
begin
current_procinfo.procdef.localst.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
current_procinfo.procdef.parast.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
end;
end;

View File

@ -31,7 +31,7 @@ implementation
uses
ncgbas,ncgflw,ncgcal,ncgcnv,ncgld,ncgmem,ncgcon,ncgset,ncgobjc,
ncpuadd,ncpumat,ncpuinl,ncpucnv,{ncpuset,}
ncpuadd,ncpumat,ncpumem,ncpuinl,ncpucnv,{ncpuset,}
{ this is not really a node }
rgcpu,
{ symtable }

View File

@ -535,10 +535,6 @@ unit cpupara;
begin
paraloc^.size:=paracgsize;
paraloc^.loc:=LOC_REFERENCE;
if side=callerside then
paraloc^.reference.index:=NR_STACK_POINTER_REG
else
paraloc^.reference.index:=NR_FRAME_POINTER_REG;
{ the current stack offset may not be properly aligned in
case we're on Darwin and have allocated a non-variadic argument
@ -563,6 +559,13 @@ unit cpupara;
paraloc^.reference.offset:=curstackoffset
else
paraloc^.reference.offset:=curstackoffset+stackslotlen-paralen;
if side=callerside then
paraloc^.reference.index:=NR_STACK_POINTER_REG
else
begin
paraloc^.reference.index:=NR_FRAME_POINTER_REG;
inc(paraloc^.reference.offset,16);
end;
inc(curstackoffset,stackslotlen);
paralen:=0
end;

View File

@ -26,18 +26,42 @@ unit cpupi;
interface
uses
procinfo,
psub;
type
taarch64procinfo=class(tcgprocinfo)
{ no need to override anything, as the ABI requires us to use a frame
pointer at all times }
constructor create(aparent: tprocinfo); override;
procedure set_first_temp_offset; override;
end;
implementation
uses
procinfo;
tgobj,
cpubase;
{ Creates the AArch64 procinfo: runs the inherited constructor and then
  selects the stack pointer register as this routine's framepointer. }
constructor taarch64procinfo.create(aparent: tprocinfo);
begin
inherited;
{ use the stack pointer as framepointer, because
1) we know the exact offsets of the temps from the stack pointer
after pass 1 (based on the required parameter stack size for called
routines), while we don't know them for the frame pointer (they depend
on the number of saved registers)
2) temp offsets from the stack pointer are positive while those from
the frame pointer are negative, and we can directly encode much
bigger positive offsets in the instructions
}
framepointer:=NR_STACK_POINTER_REG;
end;
{ Tells the temp generator where the first temp may be placed: at
  maxpushedparasize rounded up to a multiple of 16. }
procedure taarch64procinfo.set_first_temp_offset;
begin
{ leave room for allocated parameters }
tg.setfirsttemp(align(maxpushedparasize,16));
end;
begin
cprocinfo:=taarch64procinfo;

View File

@ -41,6 +41,7 @@ interface
procedure second_abs_long; override;
procedure second_round_real; override;
procedure second_trunc_real; override;
procedure second_get_frame; override;
private
procedure load_fpu_location;
end;
@ -167,6 +168,17 @@ implementation
current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_FCVTZS,location.register,left.location.register));
end;
{ Code generation for the get_frame intrinsic: the result location is the
  frame pointer register itself (an address-sized LOC_CREGISTER); no
  instruction is emitted here. }
procedure taarch64inlinenode.second_get_frame;
begin
location_reset(location,LOC_CREGISTER,OS_ADDR);
{ this routine is used to get the frame pointer for backtracing
purposes. current_procinfo.framepointer is set to SP because that one
is used to access temps. On most platforms these two frame pointers
are the same, but not on AArch64. }
location.register:=NR_FRAME_POINTER_REG;
end;
{ unit initialisation: make cinlinenode refer to the AArch64 inline node
  class }
begin
cinlinenode:=taarch64inlinenode;
end.

View File

@ -0,0 +1,66 @@
{
Copyright (c) 2014 by Jonas Maebe
Generate AArch64 code for in memory related nodes
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
****************************************************************************
}
unit ncpumem;
{$i fpcdefs.inc}
interface
uses
globtype,
node,nmem,ncgmem;
type
{ AArch64 variant of the load-parent-frame-pointer node; it only overrides
  code generation to post-process a result that is the stack pointer }
taarch64loadparentfpnode = class(tcgloadparentfpnode)
procedure pass_generate_code; override;
end;
implementation
uses
aasmdata,cgbase,cpubase,
cgobj;
{ taarch64loadparentfpnode }
{ Generates the parent frame pointer load via the inherited implementation
  and then post-processes the result if it is the stack pointer register:
    - for lpf_forpara the frame pointer register is substituted
    - otherwise the stack pointer is copied into a freshly allocated
      address register and the location becomes LOC_REGISTER
  Any other result location is left as the inherited code produced it. }
procedure taarch64loadparentfpnode.pass_generate_code;
  begin
    inherited pass_generate_code;
    { see the comments in tcgaarch64.g_proc_entry }

    { nothing to fix up unless the result is a register ... }
    if not(location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
      exit;
    { ... and that register is the stack pointer }
    if location.register<>NR_STACK_POINTER_REG then
      exit;

    if kind=lpf_forpara then
      begin
        { value is passed on as a parameter -> hand out the frame pointer }
        location.register:=NR_FRAME_POINTER_REG;
        exit;
      end;

    { many instructions cannot operate directly on the stack pointer, so
      copy it to another register first; the register allocator can merge
      the two again where that is possible }
    location.register:=cg.getaddressregister(current_asmdata.CurrAsmList);
    cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_ADDR,OS_ADDR,NR_STACK_POINTER_REG,location.register);
    location.loc:=LOC_REGISTER;
  end;
{ unit initialisation: make cloadparentfpnode refer to the AArch64 node
  class declared in this unit }
begin
cloadparentfpnode:=taarch64loadparentfpnode;
end.

View File

@ -40,6 +40,10 @@ unit rgcpu;
procedure do_spill_op(list: tasmlist; op: tasmop; pos: tai; const spilltemp: treference; tempreg: tregister);
end;
{ trgcpu descendant that overrides add_cpu_interferences to add
  interference edges with the stack pointer register (RS_SP) }
trgintcpu=class(trgcpu)
procedure add_cpu_interferences(p: tai); override;
end;
implementation
@ -97,4 +101,62 @@ implementation
end;
{ Adds interference edges between RS_SP and the operands of instruction p
  that are not allowed to be the stack pointer.

  - non-instruction entries are ignored
  - MOV: no edges are added
  - ADD/SUB/CMP/CMN without a shifter operand, or with one accepted by
    valid_shifter_operand: only the last register operand (the one before
    the shifter operand, if any) gets an edge
  - AND/EOR/ORR/TST whose last operand is a constant: no edges are added
  - everything else: every integer register operand and every index
    register of a memory reference gets an edge }
procedure trgintcpu.add_cpu_interferences(p: tai);
  var
    instr: taicpu;
    lastop,
    opnr: longint;
  begin
    if p.typ<>ait_instruction then
      exit;
    instr:=taicpu(p);
    lastop:=instr.ops-1;

    { first deal with the instructions that can have SP as a register
      operand }
    case instr.opcode of
      A_MOV:
        { all operands can be SP }
        exit;
      A_ADD,
      A_SUB,
      A_CMP,
      A_CMN:
        { ok as destination or first source in immediate or extended
          register form }
        if (instr.oper[lastop]^.typ<>top_shifterop) or
           valid_shifter_operand(instr.opcode,false,true,
             reg_cgsize(instr.oper[0]^.reg) in [OS_64,OS_S64],
             instr.oper[lastop]^.shifterop^.shiftmode,
             instr.oper[lastop]^.shifterop^.shiftimm) then
          begin
            { select the last operand that is not the shifter operand }
            opnr:=lastop;
            if instr.oper[lastop]^.typ=top_shifterop then
              dec(opnr);
            if instr.oper[opnr]^.typ=top_reg then
              add_edge(getsupreg(instr.oper[opnr]^.reg),RS_SP);
            exit;
          end;
      A_AND,
      A_EOR,
      A_ORR,
      A_TST:
        { ok in immediate form }
        if instr.oper[lastop]^.typ=top_const then
          exit;
    end;

    { add interferences for all other register operands }
    for opnr:=0 to lastop do
      if instr.oper[opnr]^.typ=top_reg then
        begin
          if getregtype(instr.oper[opnr]^.reg)=R_INTREGISTER then
            add_edge(getsupreg(instr.oper[opnr]^.reg),RS_SP);
        end
      else if instr.oper[opnr]^.typ=top_ref then
        begin
          { sp can always be base, never be index }
          if instr.oper[opnr]^.ref^.index<>NR_NO then
            add_edge(getsupreg(instr.oper[opnr]^.ref^.index),RS_SP);
        end;
  end;
end.

View File

@ -242,6 +242,7 @@
{$define cpurox}
{$define cputargethasfixedstack}
{$define cpurefshaveindexreg}
{$define SUPPORT_GET_FRAME}
{$endif aarch64}
{$IFDEF MACOS}

View File

@ -179,7 +179,7 @@ implementation
tempfreelist:=nil;
templist:=nil;
{ we could create a new child class for this but I don't know if it is worth the effort (FK) }
{$if defined(powerpc) or defined(powerpc64) or defined(avr) or defined(jvm)}
{$if defined(powerpc) or defined(powerpc64) or defined(avr) or defined(jvm) or defined(aarch64)}
direction:=1;
{$else}
direction:=-1;

View File

@ -82,12 +82,6 @@ procedure fpc_cpuinit;
String
****************************************************************************}
{$define FPC_SYSTEM_HAS_GET_FRAME}
function get_frame:pointer;assembler; nostackframe;
asm
mov x0, x29
end;
{$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;assembler; nostackframe;
asm

38
tests/test/tnest1.pp Normal file
View File

@ -0,0 +1,38 @@
{$inline on}
{ Checks that an inlined nested procedure can write its parent's value
  parameters (l1, l2) and locals (d1, d2), and that the parent's other
  locals (a1, a2) keep their values. Each failed check halts with a
  distinct exit code (1..6). }
procedure test(l1, l2: longint);
var
{ a1 and a2 are declared before and after d1/d2 and hold marker values;
  presumably this is meant to catch writes that land at a wrong offset }
a1: cardinal;
d1, d2: double;
a2: cardinal;
{ writes only to variables that belong to the enclosing routine }
procedure nested; inline;
begin
l1:=1;
l2:=2;
d1:=3.0;
d2:=4.0;
end;
begin
a1:=$deadbeef;
a2:=$cafe0000;
nested;
{ the marker values must be unchanged }
if a1<>$deadbeef then
halt(1);
if a2<>$cafe0000 then
halt(2);
{ the values written by nested must be visible here }
if l1<>1 then
halt(3);
if l2<>2 then
halt(4);
if d1<>3.0 then
halt(5);
if d2<>4.0 then
halt(6);
end;
{ main program: the arguments 5 and 6 differ from the values nested assigns
  to l1 and l2 }
begin
test(5,6);
end.