From 94cf650d9a79f85b294f1ee90e89f0bd1fc1505a Mon Sep 17 00:00:00 2001 From: florian Date: Fri, 28 Jun 2013 17:06:57 +0000 Subject: [PATCH] * use lea to adjust stack pointer, this is equal or faster on all modern CPUs than add/sub git-svn-id: trunk@25010 - --- compiler/i386/cgcpu.pas | 16 +++++++++++++--- compiler/x86/cgx86.pas | 24 +++++++++++++++++------- compiler/x86_64/cgcpu.pas | 12 +++++++++++- 3 files changed, 41 insertions(+), 11 deletions(-) diff --git a/compiler/i386/cgcpu.pas b/compiler/i386/cgcpu.pas index c962a17abc..02b446c7c5 100644 --- a/compiler/i386/cgcpu.pas +++ b/compiler/i386/cgcpu.pas @@ -293,6 +293,16 @@ unit cgcpu; procedure tcg386.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean); + + procedure increase_fp(a : tcgint); + var + href : treference; + begin + reference_reset_base(href,current_procinfo.framepointer,a,0); + { normally, lea is a better choice than an add } + list.concat(Taicpu.op_ref_reg(A_LEA,TCGSize2OpSize[OS_ADDR],href,current_procinfo.framepointer)); + end; + var stacksize : longint; begin @@ -304,7 +314,7 @@ unit cgcpu; { remove stackframe } if not nostackframe then begin - if (current_procinfo.framepointer=NR_STACK_POINTER_REG) then + if current_procinfo.framepointer=NR_STACK_POINTER_REG then begin stacksize:=current_procinfo.calc_stackframe_size; if (target_info.stackalign>4) and @@ -314,8 +324,8 @@ unit cgcpu; { if you (think you) know what you are doing } (po_assembler in current_procinfo.procdef.procoptions)) then stacksize := align(stacksize+sizeof(aint),target_info.stackalign) - sizeof(aint); - if (stacksize<>0) then - cg.a_op_const_reg(list,OP_ADD,OS_ADDR,stacksize,current_procinfo.framepointer); + if stacksize<>0 then + increase_fp(stacksize); end else list.concat(Taicpu.op_none(A_LEAVE,S_NO)); diff --git a/compiler/x86/cgx86.pas b/compiler/x86/cgx86.pas index 1cf53e0819..40af0ca0a0 100644 --- a/compiler/x86/cgx86.pas +++ b/compiler/x86/cgx86.pas @@ -2318,6 +2318,16 @@ unit cgx86; procedure tcgx86.g_stackpointer_alloc(list : TAsmList;localsize : longint); + + procedure decrease_sp(a : tcgint); + var + href : treference; + begin + reference_reset_base(href,NR_STACK_POINTER_REG,-a,0); + { normally, lea is a better choice than a sub to adjust the stack pointer } + list.concat(Taicpu.op_ref_reg(A_LEA,TCGSize2OpSize[OS_ADDR],href,NR_STACK_POINTER_REG)); + end; + {$ifdef x86} {$ifndef NOTARGETWIN} var @@ -2338,7 +2348,7 @@ unit cgx86; begin if localsize div winstackpagesize<=5 then begin - list.concat(Taicpu.Op_const_reg(A_SUB,S_L,localsize-4,NR_ESP)); + decrease_sp(localsize-4); for i:=1 to localsize div winstackpagesize do begin reference_reset_base(href,NR_ESP,localsize-i*winstackpagesize,4); @@ -2353,11 +2363,11 @@ unit cgx86; list.concat(Taicpu.op_reg(A_PUSH,S_L,NR_EDI)); list.concat(Taicpu.op_const_reg(A_MOV,S_L,localsize div winstackpagesize,NR_EDI)); a_label(list,again); - list.concat(Taicpu.op_const_reg(A_SUB,S_L,winstackpagesize-4,NR_ESP)); + decrease_sp(winstackpagesize-4); list.concat(Taicpu.op_reg(A_PUSH,S_L,NR_EAX)); list.concat(Taicpu.op_reg(A_DEC,S_L,NR_EDI)); a_jmp_cond(list,OC_NE,again); - list.concat(Taicpu.op_const_reg(A_SUB,S_L,localsize mod winstackpagesize - 4,NR_ESP)); + decrease_sp(localsize mod winstackpagesize-4); reference_reset_base(href,NR_ESP,localsize-4,4); list.concat(Taicpu.op_ref_reg(A_MOV,S_L,href,NR_EDI)); ungetcpuregister(list,NR_EDI); @@ -2375,7 +2385,7 @@ unit cgx86; begin if localsize div winstackpagesize<=5 then begin - list.concat(Taicpu.Op_const_reg(A_SUB,S_Q,localsize,NR_RSP)); + decrease_sp(localsize); for i:=1 to localsize div winstackpagesize do begin reference_reset_base(href,NR_RSP,localsize-i*winstackpagesize+4,4); @@ -2390,19 +2400,19 @@ unit cgx86; getcpuregister(list,NR_R10); list.concat(Taicpu.op_const_reg(A_MOV,S_Q,localsize div winstackpagesize,NR_R10)); a_label(list,again); - list.concat(Taicpu.op_const_reg(A_SUB,S_Q,winstackpagesize,NR_RSP)); + decrease_sp(winstackpagesize); reference_reset_base(href,NR_RSP,0,4); list.concat(Taicpu.op_reg_ref(A_MOV,S_L,NR_EAX,href)); list.concat(Taicpu.op_reg(A_DEC,S_Q,NR_R10)); a_jmp_cond(list,OC_NE,again); - list.concat(Taicpu.op_const_reg(A_SUB,S_Q,localsize mod winstackpagesize,NR_RSP)); + decrease_sp(localsize mod winstackpagesize); ungetcpuregister(list,NR_R10); end end else {$endif NOTARGETWIN} {$endif x86_64} - list.concat(Taicpu.Op_const_reg(A_SUB,tcgsize2opsize[OS_ADDR],localsize,NR_STACK_POINTER_REG)); + decrease_sp(localsize); end; end; diff --git a/compiler/x86_64/cgcpu.pas b/compiler/x86_64/cgcpu.pas index bd564f9f87..8caaeffbea 100644 --- a/compiler/x86_64/cgcpu.pas +++ b/compiler/x86_64/cgcpu.pas @@ -177,6 +177,16 @@ unit cgcpu; procedure tcgx86_64.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean); + + procedure increase_sp(a : tcgint); + var + href : treference; + begin + reference_reset_base(href,NR_STACK_POINTER_REG,a,0); + { normally, lea is a better choice than an add } + list.concat(Taicpu.op_ref_reg(A_LEA,TCGSize2OpSize[OS_ADDR],href,NR_STACK_POINTER_REG)); + end; + var href : treference; begin @@ -195,7 +205,7 @@ unit cgcpu; (current_procinfo.procdef.proctypeoption=potype_exceptfilter) then begin if (current_procinfo.final_localsize<>0) then - cg.a_op_const_reg(list,OP_ADD,OS_ADDR,current_procinfo.final_localsize,NR_STACK_POINTER_REG); + increase_sp(current_procinfo.final_localsize); if (current_procinfo.procdef.proctypeoption=potype_exceptfilter) then list.concat(Taicpu.op_reg(A_POP,tcgsize2opsize[OS_ADDR],NR_FRAME_POINTER_REG)); end