diff --git a/compiler/x86_64/cgcpu.pas b/compiler/x86_64/cgcpu.pas index b2615bcd30..52be18cb53 100644 --- a/compiler/x86_64/cgcpu.pas +++ b/compiler/x86_64/cgcpu.pas @@ -40,9 +40,14 @@ unit cgcpu; procedure g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);override; procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override; procedure g_local_unwind(list: TAsmList; l: TAsmLabel);override; + procedure g_save_registers(list: TAsmList);override; + procedure g_restore_registers(list: TAsmList);override; procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize;intreg, mmreg: tregister; shuffle: pmmshuffle); override; procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize;mmreg, intreg: tregister;shuffle : pmmshuffle); override; + private + function use_push: boolean; + function saved_xmm_reg_size: longint; end; procedure create_codegen; @@ -103,6 +108,29 @@ unit cgcpu; end; + function tcgx86_64.use_push: boolean; + begin + result:=(current_procinfo.framepointer=NR_STACK_POINTER_REG) or + (current_procinfo.procdef.proctypeoption=potype_exceptfilter); + end; + + + function tcgx86_64.saved_xmm_reg_size: longint; + var + i: longint; + begin + result:=0; + if (target_info.system<>system_x86_64_win64) or + (not uses_registers(R_MMREGISTER)) then + exit; + for i:=low(saved_mm_registers) to high(saved_mm_registers) do + begin + if (saved_mm_registers[i] in rg[R_MMREGISTER].used_in_proc) then + inc(result,tcgsize2size[OS_VECTOR]); + end; + end; + + procedure tcgx86_64.g_proc_entry(list : TAsmList;localsize:longint;nostackframe:boolean); var hitem: tlinkedlistitem; @@ -113,7 +141,31 @@ unit cgcpu; suppress_endprologue: boolean; stackmisalignment: longint; para: tparavarsym; + xmmsize: longint; + + procedure push_one_reg(reg: tregister); begin + list.concat(taicpu.op_reg(A_PUSH,tcgsize2opsize[OS_ADDR],reg)); + if (target_info.system=system_x86_64_win64) then + begin + list.concat(cai_seh_directive.create_reg(ash_pushreg,reg)); + include(current_procinfo.flags,pi_has_unwind_info); + end; + end; + + procedure push_regs; + var + r: longint; + begin + for r := low(saved_standard_registers) to high(saved_standard_registers) do + if saved_standard_registers[r] in rg[R_INTREGISTER].used_in_proc then + begin + inc(stackmisalignment,sizeof(pint)); + push_one_reg(newreg(R_INTREGISTER,saved_standard_registers[r],R_SUBWHOLE)); + end; + end; + + begin hitem:=list.last; { pi_has_unwind_info may already be set at this point if there are SEH directives in assembler body. In this case, .seh_endprologue @@ -127,17 +179,15 @@ unit cgcpu; stackmisalignment := sizeof(pint); list.concat(tai_regalloc.alloc(current_procinfo.framepointer,nil)); if current_procinfo.framepointer=NR_STACK_POINTER_REG then - CGmessage(cg_d_stackframe_omited) + begin + push_regs; + CGmessage(cg_d_stackframe_omited); + end else begin { push } inc(stackmisalignment,sizeof(pint)); - list.concat(Taicpu.op_reg(A_PUSH,tcgsize2opsize[OS_ADDR],NR_FRAME_POINTER_REG)); - if (target_info.system=system_x86_64_win64) then - begin - list.concat(cai_seh_directive.create_reg(ash_pushreg,NR_FRAME_POINTER_REG)); - include(current_procinfo.flags,pi_has_unwind_info); - end; + push_one_reg(NR_FRAME_POINTER_REG); { Return address and FP are both on stack } current_asmdata.asmcfi.cfa_def_cfa_offset(list,2*sizeof(pint)); current_asmdata.asmcfi.cfa_offset(list,NR_FRAME_POINTER_REG,-(2*sizeof(pint))); @@ -145,6 +195,7 @@ unit cgcpu; list.concat(Taicpu.op_reg_reg(A_MOV,tcgsize2opsize[OS_ADDR],NR_STACK_POINTER_REG,NR_FRAME_POINTER_REG)) else begin + push_regs; { load framepointer from hidden $parentfp parameter } para:=tparavarsym(current_procinfo.procdef.paras[0]); if not (vo_is_parentfp in para.varoptions) then @@ -170,6 +221,14 @@ unit cgcpu; } end; + xmmsize:=saved_xmm_reg_size; + if use_push and (xmmsize<>0) then + begin + localsize:=align(localsize,target_info.stackalign)+xmmsize; + reference_reset_base(current_procinfo.save_regs_ref,NR_STACK_POINTER_REG, + localsize-xmmsize,tcgsize2size[OS_VECTOR]); + end; + { allocate stackframe space } if (localsize<>0) or ((target_info.stackalign>sizeof(pint)) and @@ -188,6 +247,16 @@ unit cgcpu; if localsize<>0 then list.concat(cai_seh_directive.create_offset(ash_stackalloc,localsize)); include(current_procinfo.flags,pi_has_unwind_info); + if use_push and (xmmsize<>0) then + begin + href:=current_procinfo.save_regs_ref; + for r:=low(saved_mm_registers) to high(saved_mm_registers) do + if saved_mm_registers[r] in rg[R_MMREGISTER].used_in_proc then + begin + a_loadmm_reg_ref(list,OS_VECTOR,OS_VECTOR,newreg(R_MMREGISTER,saved_mm_registers[r],R_SUBMMWHOLE),href,nil); + inc(href.offset,tcgsize2size[OS_VECTOR]); + end; + end; end; end; end; @@ -209,6 +278,8 @@ unit cgcpu; since registers are not modified before they are saved, and saves do not change RSP, 'logically' all saves can happen at the end of prologue. } href:=current_procinfo.save_regs_ref; + if (not use_push) then + begin for r:=low(saved_standard_registers) to high(saved_standard_registers) do if saved_standard_registers[r] in rg[R_INTREGISTER].used_in_proc then begin @@ -217,6 +288,7 @@ unit cgcpu; href.offset+frame_offset)); inc(href.offset,sizeof(aint)); end; + end; if uses_registers(R_MMREGISTER) then begin if (href.offset mod tcgsize2size[OS_VECTOR])<>0 then @@ -256,6 +328,8 @@ unit cgcpu; var href : treference; + hreg : tregister; + r : longint; begin { Release PIC register } if cs_create_pic in current_settings.moduleswitches then @@ -268,11 +342,26 @@ unit cgcpu; { remove stackframe } if not nostackframe then begin - if (current_procinfo.framepointer=NR_STACK_POINTER_REG) or - (current_procinfo.procdef.proctypeoption=potype_exceptfilter) then + if use_push then begin + if (saved_xmm_reg_size<>0) then + begin + href:=current_procinfo.save_regs_ref; + for r:=low(saved_mm_registers) to high(saved_mm_registers) do + if saved_mm_registers[r] in rg[R_MMREGISTER].used_in_proc then + begin + { Allocate register so the optimizer does not remove the load } + hreg:=newreg(R_MMREGISTER,saved_mm_registers[r],R_SUBMMWHOLE); + a_reg_alloc(list,hreg); + a_loadmm_ref_reg(list,OS_VECTOR,OS_VECTOR,href,hreg,nil); + inc(href.offset,tcgsize2size[OS_VECTOR]); + end; + end; + if (current_procinfo.final_localsize<>0) then increase_sp(current_procinfo.final_localsize); + internal_restore_regs(list,true); + if (current_procinfo.procdef.proctypeoption=potype_exceptfilter) then list.concat(Taicpu.op_reg(A_POP,tcgsize2opsize[OS_ADDR],NR_FRAME_POINTER_REG)); end @@ -300,6 +389,20 @@ unit cgcpu; end; + procedure tcgx86_64.g_save_registers(list: TAsmList); + begin + if (not use_push) then + inherited g_save_registers(list); + end; + + + procedure tcgx86_64.g_restore_registers(list: TAsmList); + begin + if (not use_push) then + inherited g_restore_registers(list); + end; + + procedure tcgx86_64.g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint); var make_global : boolean;