From a47f153daed4b95925d299168baf916a65573f73 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20Kl=C3=A4mpfl?= Date: Tue, 23 Oct 2018 22:35:34 +0200 Subject: [PATCH] * avoid to create a stack frame on aarch64 if possible --- compiler/aarch64/cgcpu.pas | 71 ++++++++++++++++++------------ compiler/armgen/aoptarm.pas | 3 +- compiler/globtype.pas | 4 +- compiler/psub.pas | 14 ++++-- compiler/utils/ppuutils/ppudump.pp | 5 ++- 5 files changed, 63 insertions(+), 34 deletions(-) diff --git a/compiler/aarch64/cgcpu.pas b/compiler/aarch64/cgcpu.pas index 0a453753bf..be5966b9fc 100644 --- a/compiler/aarch64/cgcpu.pas +++ b/compiler/aarch64/cgcpu.pas @@ -1872,6 +1872,13 @@ implementation ref: treference; totalstackframesize: longint; begin + { on aarch64, we need to store the link register and then generate a frame pointer if the subroutine either + - receives parameters on the stack + - is not a leaf procedure + - has nested procedures + - helpers retrieve the stack pointer + } + hitem:=list.last; { pi_has_unwind_info may already be set at this point if there are SEH directives in assembler body. 
In this case, .seh_endprologue @@ -1885,28 +1892,30 @@ implementation if target_info.system=system_aarch64_win64 then include(current_procinfo.flags,pi_has_unwind_info); - - { save stack pointer and return address } - reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]); - ref.addressmode:=AM_PREINDEXED; - list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref)); - current_asmdata.asmcfi.cfa_def_cfa_offset(list,2*sizeof(pint)); - current_asmdata.asmcfi.cfa_offset(list,NR_FP,-16); - current_asmdata.asmcfi.cfa_offset(list,NR_LR,-8); - if target_info.system=system_aarch64_win64 then - list.concat(cai_seh_directive.create_offset(ash_savefplr_x,16)); - { initialise frame pointer } - if current_procinfo.procdef.proctypeoption<>potype_exceptfilter then + if not(pi_no_framepointer_needed in current_procinfo.flags) then begin - a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP); - current_asmdata.asmcfi.cfa_def_cfa_register(list,NR_FP); + { save frame pointer and return address } + reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]); + ref.addressmode:=AM_PREINDEXED; + list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref)); + current_asmdata.asmcfi.cfa_def_cfa_offset(list,2*sizeof(pint)); + current_asmdata.asmcfi.cfa_offset(list,NR_FP,-16); + current_asmdata.asmcfi.cfa_offset(list,NR_LR,-8); if target_info.system=system_aarch64_win64 then - list.concat(cai_seh_directive.create(ash_setfp)); - end - else - begin - gen_load_frame_for_exceptfilter(list); - localsize:=current_procinfo.maxpushedparasize; + list.concat(cai_seh_directive.create_offset(ash_savefplr_x,16)); + { initialise frame pointer } + if current_procinfo.procdef.proctypeoption<>potype_exceptfilter then + begin + a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP); + current_asmdata.asmcfi.cfa_def_cfa_register(list,NR_FP); + if target_info.system=system_aarch64_win64 then + list.concat(cai_seh_directive.create(ash_setfp)); + end + else + begin + gen_load_frame_for_exceptfilter(list); + 
localsize:=current_procinfo.maxpushedparasize; + end; end; totalstackframesize:=localsize; @@ -2081,7 +2090,6 @@ implementation end; - procedure tcgaarch64.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean); var ref: treference; @@ -2122,13 +2130,22 @@ implementation load_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE); end else if current_procinfo.final_localsize<>0 then - { restore stack pointer } - a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP); + begin + { restore stack pointer } + if pi_no_framepointer_needed in current_procinfo.flags then + handle_reg_imm12_reg(list,A_ADD,OS_ADDR,current_procinfo.framepointer,current_procinfo.final_localsize, + current_procinfo.framepointer,NR_IP0,false,true) + else + a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP); + end; - { restore framepointer and return address } - reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]); - ref.addressmode:=AM_POSTINDEXED; - list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref)); + if not(pi_no_framepointer_needed in current_procinfo.flags) then + begin + { restore framepointer and return address } + reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]); + ref.addressmode:=AM_POSTINDEXED; + list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref)); + end; end; { return } diff --git a/compiler/armgen/aoptarm.pas b/compiler/armgen/aoptarm.pas index 66e54c61cd..40175737f5 100644 --- a/compiler/armgen/aoptarm.pas +++ b/compiler/armgen/aoptarm.pas @@ -1133,7 +1133,8 @@ Implementation GetNextInstruction(p, hp1) and (hp1.typ = ait_instruction) and (taicpu(hp1).condition = C_None) and - (taicpu(hp1).oppostfix = taicpu(p).oppostfix) then + (taicpu(hp1).oppostfix = taicpu(p).oppostfix) and + (taicpu(hp1).ops>0) and (taicpu(hp1).oper[0]^.typ=top_reg) then begin { Saves constant dereferencing and makes it easier to change the size if necessary } SrcReg := taicpu(p).oper[0]^.reg; diff --git a/compiler/globtype.pas b/compiler/globtype.pas index f8977f4f95..1536978d6b 100644 
--- a/compiler/globtype.pas +++ b/compiler/globtype.pas @@ -786,7 +786,9 @@ interface { subroutine uses get_frame } pi_uses_get_frame, { x86 only: subroutine uses ymm registers, requires vzeroupper call } - pi_uses_ymm + pi_uses_ymm, + { set if no frame pointer is needed, the rules for when this applies are target specific } + pi_no_framepointer_needed ); tprocinfoflags=set of tprocinfoflag; diff --git a/compiler/psub.pas b/compiler/psub.pas index 24979aa20d..6c89f86362 100644 --- a/compiler/psub.pas +++ b/compiler/psub.pas @@ -1046,7 +1046,7 @@ implementation end; -{$if defined(i386) or defined(x86_64) or defined(arm) or defined(riscv32) or defined(riscv64) or defined(m68k)} +{$if defined(i386) or defined(x86_64) or defined(arm) or defined(aarch64) or defined(riscv32) or defined(riscv64) or defined(m68k)} const exception_flags: array[boolean] of tprocinfoflags = ( [], @@ -1058,7 +1058,7 @@ implementation begin tg:=tgobjclass.create; -{$if defined(i386) or defined(x86_64) or defined(arm) or defined(m68k)} +{$if defined(i386) or defined(x86_64) or defined(arm) or defined(aarch64) or defined(m68k)} {$if defined(arm)} { frame and stack pointer must be always the same on arm thumb so it makes no sense to fiddle with a frame pointer } @@ -1102,11 +1102,16 @@ implementation not(cs_generate_stackframes in current_settings.localswitches) and not(cs_profile in current_settings.moduleswitches) and not(po_assembler in procdef.procoptions) and +{$if defined(aarch64)} + { on aarch64, it must be a leaf subroutine } + not(pi_do_call in flags) and +{$endif defined(aarch64)} not ((pi_has_stackparameter in flags) -{$ifndef arm} { Outgoing parameter(s) on stack do not need stackframe on x86 targets +{$if defined(i386) or defined(x86_64)} + { Outgoing parameter(s) on stack do not need stackframe on x86 targets with fixed stack. 
On ARM it fails, see bug #25050 } and (not paramanager.use_fixed_stack) -{$endif arm} +{$endif defined(i386) or defined(x86_64)} ) and ((flags*([pi_has_assembler_block,pi_is_assembler, pi_needs_stackframe]+ @@ -1137,6 +1142,7 @@ implementation { Only need to set the framepointer } framepointer:=NR_STACK_POINTER_REG; tg.direction:=1; + Include(flags,pi_no_framepointer_needed) end {$if defined(arm)} { On arm, the stack frame size can be estimated to avoid using an extra frame pointer, diff --git a/compiler/utils/ppuutils/ppudump.pp b/compiler/utils/ppuutils/ppudump.pp index df032744d1..64e4e4e523 100644 --- a/compiler/utils/ppuutils/ppudump.pp +++ b/compiler/utils/ppuutils/ppudump.pp @@ -1661,7 +1661,10 @@ const (mask:pi_uses_get_frame; str:' uses get_frame'), (mask:pi_uses_ymm; - str:' uses ymm register (x86 only)') + str:' uses ymm register (x86 only)'), + (mask:pi_no_framepointer_needed; + str:' set if no frame pointer is needed, the rules when this applies is target specific' + ) ); var procinfooptions : tprocinfoflags;