diff --git a/.gitattributes b/.gitattributes index 1eb0625ab8..8ffacaecf1 100644 --- a/.gitattributes +++ b/.gitattributes @@ -17,6 +17,7 @@ compiler/aarch64/aasmcpu.pas svneol=native#text/plain compiler/aarch64/aoptcpu.pas svneol=native#text/plain compiler/aarch64/aoptcpub.pas svneol=native#text/plain compiler/aarch64/aoptcpud.pas svneol=native#text/plain +compiler/aarch64/cgcpu.pas svneol=native#text/plain compiler/aarch64/cpubase.pas svneol=native#text/plain compiler/aarch64/cpuinfo.pas svneol=native#text/plain compiler/aarch64/cpupara.pas svneol=native#text/plain diff --git a/compiler/aarch64/cgcpu.pas b/compiler/aarch64/cgcpu.pas new file mode 100644 index 0000000000..fcc4ccb810 --- /dev/null +++ b/compiler/aarch64/cgcpu.pas @@ -0,0 +1,1816 @@ +{ + Copyright (c) 2014 by Jonas Maebe + + This unit implements the code generator for AArch64 + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + + **************************************************************************** +} +unit cgcpu; + +{$i fpcdefs.inc} + +interface + + uses + globtype,parabase, + cgbase,cgutils,cgobj, + aasmbase,aasmtai,aasmdata,aasmcpu, + cpubase,cpuinfo, + node,symconst,SymType,symdef, + rgcpu; + + type + tcgaarch64=class(tcg) + protected + { simplifies "ref" so it can be used with "op". 
If "ref" can be used + with a different load/Store operation that has the same meaning as the + original one, "op" will be replaced with the alternative } + procedure make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister); + { changes register size without adding register allocation info } + function makeregsize(reg: tregister; size: tcgsize): tregister; overload; + public + function getfpuregister(list: TAsmList; size: Tcgsize): Tregister; override; + procedure handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean); + procedure init_register_allocators;override; + procedure done_register_allocators;override; + function getmmregister(list:TAsmList;size:tcgsize):tregister;override; + function handle_load_store(list:TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference; + procedure a_call_name(list:TAsmList;const s:string; weak: boolean);override; + procedure a_call_reg(list:TAsmList;Reg:tregister);override; + { General purpose instructions } + procedure maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister); + procedure a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);override; + procedure a_op_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src, dst: tregister);override; + procedure a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);override; + procedure a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);override; + procedure a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);override; + procedure a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);override; + { move instructions } + procedure a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg: tregister);override; + procedure a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister;const ref: TReference);override; + procedure a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference); override; + procedure a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister);override; + procedure a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister); override; + procedure a_load_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);override; + procedure a_loadaddr_ref_reg(list: TAsmList; const ref: TReference; r: tregister);override; + { fpu move instructions (not used, all floating point is vector unit-based) } + procedure a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister); override; + procedure a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister); override; + procedure a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference); override; + procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister;shuffle : pmmshuffle);override; + procedure a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister; shuffle: 
pmmshuffle);override; + procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: TReference; shuffle: pmmshuffle);override; + { comparison operations } + procedure a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);override; + procedure a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);override; + procedure a_jmp_always(list: TAsmList; l: TAsmLabel);override; + procedure a_jmp_name(list: TAsmList; const s: string);override; + procedure a_jmp_cond(list: TAsmList; cond: TOpCmp; l: tasmlabel);{ override;} + procedure a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);override; + procedure g_flags2reg(list: TAsmList; size: tcgsize; const f:tresflags; reg: tregister);override; + procedure g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);override; + procedure g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc: tlocation);override; + procedure g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);override; + procedure g_proc_exit(list: TAsmList; parasize: longint; nostackframe: boolean);override; + procedure g_maybe_got_init(list: TAsmList); override; + procedure g_restore_registers(list: TAsmList);override; + procedure g_save_registers(list: TAsmList);override; + procedure g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);override; + procedure g_concatcopy_unaligned(list: TAsmList; const source, dest: treference; len: tcgint);override; + procedure g_concatcopy_move(list: TAsmList; const source, dest: treference; len : tcgint); + procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override; + procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override; + private + procedure save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister); + procedure load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister); + end; + + procedure create_codegen; + + const + TOpCG2AsmOpReg: array[topcg] of TAsmOp = ( + A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASRV,A_LSLV,A_LSRV,A_SUB,A_EOR,A_NONE,A_RORV + ); + TOpCG2AsmOpImm: array[topcg] of TAsmOp = ( + A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASR,A_LSL,A_LSR,A_SUB,A_EOR,A_NONE,A_ROR + ); + TOpCmp2AsmCond: array[topcmp] of TAsmCond = (C_NONE,C_EQ,C_GT, + C_LT,C_GE,C_LE,C_NE,C_LS,C_CC,C_CS,C_HI + ); + + +implementation + + uses + globals,verbose,systems,cutils, + paramgr,fmodule, + symtable,symsym, + tgobj, + procinfo,cpupi; + + + procedure tcgaarch64.make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister); + var + href: treference; + so: tshifterop; + accesssize: longint; + begin + if (ref.base=NR_NO) then + begin + if ref.shiftmode<>SM_None then + internalerror(2014110701); + ref.base:=ref.index; + ref.index:=NR_NO; + end; + { no abitrary scale factor support (the generic code doesn't set it, + AArch-specific code shouldn't either) } + if not(ref.scalefactor in [0,1]) then + internalerror(2014111002); + + case simple_ref_type(op,size,oppostfix,ref) of + sr_simple: + exit; + sr_internal_illegal: + internalerror(2014121702); + sr_complex: + { continue } ; + end; + + if assigned(ref.symbol) then + begin + { internal "load symbol" 
instructions should already be valid } + if assigned(ref.symboldata) or + (ref.refaddr in [addr_pic,addr_gotpage,addr_gotpageoffset]) then + internalerror(2014110802); + { no relative symbol support (needed) yet } + if assigned(ref.relsymbol) then + internalerror(2014111001); + { on Darwin: load the address from the GOT. There does not appear to + be a non-GOT variant. This consists of first loading the address + of the page containing the GOT entry for this variable, and then + the address of the entry itself from that page (can be relaxed by + the linker in case the variable itself can be stored directly in + the GOT) } + if target_info.system in systems_darwin then + begin + if (preferred_newbasereg=NR_NO) or + (ref.base=preferred_newbasereg) or + (ref.index=preferred_newbasereg) then + preferred_newbasereg:=getaddressregister(list); + { load the GOT page } + reference_reset_symbol(href,ref.symbol,0,8); + href.refaddr:=addr_gotpage; + list.concat(taicpu.op_reg_ref(A_ADRP,preferred_newbasereg,href)); + { load the GOT entry (= address of the variable) } + reference_reset_base(href,preferred_newbasereg,0,sizeof(pint)); + href.symbol:=ref.symbol; + href.refaddr:=addr_gotpageoffset; + { use a_load_ref_reg() rather than directly encoding the LDR, + so that we'll check the validity of the reference } + a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,preferred_newbasereg); + { set as new base register } + if ref.base=NR_NO then + ref.base:=preferred_newbasereg + else if ref.index=NR_NO then + ref.index:=preferred_newbasereg + else + begin + { make sure it's valid in case ref.base is SP -> make it + the second operand} + a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,preferred_newbasereg,ref.base,preferred_newbasereg); + ref.base:=preferred_newbasereg + end; + ref.symbol:=nil; + end + else + { todo } + internalerror(2014111003); + end; + + { base & index } + if (ref.base<>NR_NO) and + (ref.index<>NR_NO) then + begin + case op of + A_LDR, A_STR: + begin + if (ref.shiftmode=SM_None) and + (ref.shiftimm<>0) then + internalerror(2014110805); + { wrong shift? 
(possible in case of something like + array_of_2byte_rec[x].bytefield -> shift will be set 1, but + the final load is a 1 byte -> can't use shift after all } + if (ref.shiftmode in [SM_LSL,SM_UXTW,SM_SXTW]) and + ((ref.shiftimm<>BsfDWord(tcgsizep2size[size])) or + (ref.offset<>0)) then + begin + if preferred_newbasereg=NR_NO then + preferred_newbasereg:=getaddressregister(list); + { "add" supports a superset of the shift modes supported by + load/store instructions } + shifterop_reset(so); + so.shiftmode:=ref.shiftmode; + so.shiftimm:=ref.shiftimm; + list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so)); + reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.alignment); + { possibly still an invalid offset -> fall through } + end + else if ref.offset<>0 then + begin + if (preferred_newbasereg=NR_NO) or + { we keep ref.index, so it must not be overwritten } + (ref.index=preferred_newbasereg) then + preferred_newbasereg:=getaddressregister(list); + { add to the base and not to the index, because the index + may be scaled; this works even if the base is SP } + a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg); + ref.offset:=0; + ref.base:=preferred_newbasereg; + { finished } + exit; + end + else + { valid -> exit } + exit; + end; + { todo } + A_LD1,A_LD2,A_LD3,A_LD4, + A_ST1,A_ST2,A_ST3,A_ST4: + internalerror(2014110704); + { these don't support base+index } + A_LDUR,A_STUR, + A_LDP,A_STP: + begin + { these either don't support pre-/post-indexing, or don't + support it with base+index } + if ref.addressmode<>AM_OFFSET then + internalerror(2014110911); + if preferred_newbasereg=NR_NO then + preferred_newbasereg:=getaddressregister(list); + if ref.shiftmode<>SM_None then + begin + { "add" supports a superset of the shift modes supported by + load/store instructions } + shifterop_reset(so); + so.shiftmode:=ref.shiftmode; + so.shiftimm:=ref.shiftimm; + list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so)); + end + else + a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,ref.index,ref.base,preferred_newbasereg); + reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.alignment); + { fall through to the handling of base + offset, since the + offset may still be too big } + end; + else + internalerror(2014110901); + end; + end; + + { base + offset } + if ref.base<>NR_NO then + begin + { valid offset for LDUR/STUR -> use that } + if (ref.addressmode=AM_OFFSET) and + (op in [A_LDR,A_STR]) and + (ref.offset>=-256) and + (ref.offset<=255) then + begin + if op=A_LDR then + op:=A_LDUR + else + op:=A_STUR + end + { if it's not a valid LDUR/STUR, use LDR/STR } + else if (op in [A_LDUR,A_STUR]) and + ((ref.offset<-256) or + (ref.offset>255) or + (ref.addressmode<>AM_OFFSET)) then + begin + if op=A_LDUR then + op:=A_LDR + else + op:=A_STR + end; + case op of + A_LDR,A_STR: + begin + case ref.addressmode of + AM_PREINDEXED: + begin + { since the loaded/stored register cannot be the same + as the base register, we can safely add the + offset to the base if it doesn't fit} + if (ref.offset<-256) or + (ref.offset>255) then + begin + a_op_const_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base); + ref.offset:=0; + end; + end; + AM_POSTINDEXED: + begin + { cannot emulate post-indexing if we have to fold the + offset into the base register } + if (ref.offset<-256) or + (ref.offset>255) then + internalerror(2014110909); + { ok } + end; + AM_OFFSET: + begin + { unsupported offset -> fold into base register } + 
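+              { e.g. (illustrative sketch): for a 64 bit load at offset 40000
+                from x1, the offset exceeds the scaled imm12 range (4095*8),
+                so it is split below into
+                  add x2, x1, #36864    // 40000 and not(4095), an imm12 shl 12
+                  ldr x0, [x2, #3136]   // remainder, a multiple of 8
+                with x2 standing in for preferred_newbasereg }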
accesssize:=1 shl tcgsizep2size[size]; + if (ref.offset<0) or + (ref.offset>(((1 shl 12)-1)*accesssize)) or + ((ref.offset mod accesssize)<>0) then + begin + if preferred_newbasereg=NR_NO then + preferred_newbasereg:=getaddressregister(list); + { can we split the offset beween an + "add/sub (imm12 shl 12)" and the load (also an + imm12)? + -- the offset from the load will always be added, + that's why the lower bound has a smaller range + than the upper bound; it must also be a multiple + of the access size } + if (ref.offset>=-(((1 shl 12)-1) shl 12)) and + (ref.offset<=((1 shl 12)-1) shl 12 + ((1 shl 12)-1)) and + ((ref.offset mod accesssize)=0) then + begin + a_op_const_reg_reg(list,OP_ADD,OS_ADDR,(ref.offset shr 12) shl 12,ref.base,preferred_newbasereg); + ref.offset:=ref.offset-(ref.offset shr 12) shl 12; + end + else + begin + a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg); + ref.offset:=0; + end; + reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.alignment); + end; + end + else + internalerror(2014110904); + end; + end; + A_LDP,A_STP: + begin + { unsupported offset -> fold into base register (these + instructions support all addressmodes) } + if (ref.offset<-(1 shl (6+tcgsizep2size[size]))) or + (ref.offset>(1 shl (6+tcgsizep2size[size]))-1) then + begin + case ref.addressmode of + AM_POSTINDEXED: + { don't emulate post-indexing if we have to fold the + offset into the base register } + internalerror(2014110910); + AM_PREINDEXED: + { this means the offset must be added to the current + base register } + preferred_newbasereg:=ref.base; + AM_OFFSET: + if preferred_newbasereg=NR_NO then + preferred_newbasereg:=getaddressregister(list); + end; + a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg); + reference_reset_base(ref,preferred_newbasereg,0,ref.alignment); + end + end; + A_LDUR,A_STUR: + begin + { valid, checked above } + end; + { todo } + A_LD1,A_LD2,A_LD3,A_LD4, + A_ST1,A_ST2,A_ST3,A_ST4: + internalerror(2014110908); + else + internalerror(2014110708); + end; + { done } + exit; + end; + + { only an offset -> change to base (+ offset 0) } + if preferred_newbasereg=NR_NO then + preferred_newbasereg:=getaddressregister(list); + a_load_const_reg(list,OS_ADDR,ref.offset,preferred_newbasereg); + reference_reset_base(ref,preferred_newbasereg,0,newalignment(8,ref.offset)); + end; + + + function tcgaarch64.makeregsize(reg: tregister; size: tcgsize): tregister; + var + subreg:Tsubregister; + begin + subreg:=cgsize2subreg(getregtype(reg),size); + result:=reg; + setsubreg(result,subreg); + end; + + + function tcgaarch64.getfpuregister(list: TAsmList; size: Tcgsize): Tregister; + begin + internalerror(2014122110); + { squash warning } + result:=NR_NO; + end; + + + function tcgaarch64.handle_load_store(list: TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference; + begin + make_simple_ref(list,op,size,oppostfix,ref,NR_NO); + list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),oppostfix)); + result:=ref; + end; + + + procedure tcgaarch64.handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean); + var + instr: taicpu; + so: tshifterop; + hadtmpreg: boolean; + begin + { imm12 } + if (a>=0) and + (a<=((1 shl 12)-1)) then + if usedest then + instr:=taicpu.op_reg_reg_const(op,dst,src,a) + else + instr:=taicpu.op_reg_const(op,src,a) + { imm12 lsl 12 } + else if (a and 
not(((tcgint(1) shl 12)-1) shl 12))=0 then + begin + so.shiftmode:=SM_LSL; + so.shiftimm:=12; + if usedest then + instr:=taicpu.op_reg_reg_const_shifterop(op,dst,src,a shr 12,so) + else + instr:=taicpu.op_reg_const_shifterop(op,src,a shr 12,so) + end + else + begin + { todo: other possible optimizations (e.g. load 16 bit constant in + register and then add/sub/cmp/cmn shifted the rest) } + if tmpreg=NR_NO then + begin + hadtmpreg:=false; + tmpreg:=getintregister(list,size); + end + else + begin + hadtmpreg:=true; + getcpuregister(list,tmpreg); + end; + a_load_const_reg(list,size,a,tmpreg); + if usedest then + instr:=taicpu.op_reg_reg_reg(op,dst,src,tmpreg) + else + instr:=taicpu.op_reg_reg(op,src,tmpreg); + if hadtmpreg then + ungetcpuregister(list,tmpreg); + end; + if setflags then + setoppostfix(instr,PF_S); + list.concat(instr); + end; + + +{**************************************************************************** + Assembler code +****************************************************************************} + + procedure tcgaarch64.init_register_allocators; + begin + inherited init_register_allocators; + + rg[R_INTREGISTER]:=Trgcpu.create(R_INTREGISTER,R_SUBWHOLE, + [RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8, + RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17, + RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28 + { maybe we can enable this in the future for leaf functions (it's + the frame pointer) + ,RS_X29 }], + first_int_imreg,[]); + + { needs at least one element for rgobj not to crash } + rg[R_FPUREGISTER]:=trgcpu.create(R_FPUREGISTER,R_SUBNONE, + [RS_B0],first_fpu_imreg,[]); + + rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBMMD, + [RS_Q0,RS_Q1,RS_Q2,RS_Q3,RS_Q4,RS_Q5,RS_Q6,RS_Q7, + RS_Q8,RS_Q9,RS_Q10,RS_Q11,RS_Q12,RS_Q13,RS_Q14,RS_Q15, + RS_Q16,RS_Q17,RS_Q18,RS_Q19,RS_Q20,RS_Q21,RS_Q22,RS_Q23, + RS_Q24,RS_Q25,RS_Q26,RS_Q27,RS_Q28,RS_Q29,RS_Q30,RS_Q31], + first_mm_imreg,[]); + end; + + + procedure tcgaarch64.done_register_allocators; + begin + rg[R_INTREGISTER].free; + rg[R_FPUREGISTER].free; + rg[R_MMREGISTER].free; + inherited done_register_allocators; + end; + + + function tcgaarch64.getmmregister(list: TAsmList; size: tcgsize):tregister; + begin + case size of + OS_F32: + result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS); + OS_F64: + result:=rg[R_MMREGISTER].getregister(list,R_SUBMMD) + else + internalerror(2014102701); + end; + end; + + + procedure tcgaarch64.a_call_name(list: TAsmList; const s: string; weak: boolean); + begin + if not weak then + list.concat(taicpu.op_sym(A_BL,current_asmdata.RefAsmSymbol(s))) + else + list.concat(taicpu.op_sym(A_BL,current_asmdata.WeakRefAsmSymbol(s))); + end; + + + procedure tcgaarch64.a_call_reg(list:TAsmList;Reg:tregister); + begin + list.concat(taicpu.op_reg(A_BLR,reg)); + end; + + + {********************** load instructions ********************} + + procedure tcgaarch64.a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg : tregister); + var + preva: tcgint; + opc: tasmop; + shift,maxshift: byte; + so: tshifterop; + reginited: boolean; + mask: tcgint; + begin + { if we load a value into a 32 bit register, it is automatically + zero-extended to 64 bit } + if (high(a)=0) and + (size in [OS_64,OS_S64]) then + begin + size:=OS_32; + reg:=makeregsize(reg,size); + end; + { values <= 32 bit are stored in a 32 bit register } + if not(size in [OS_64,OS_S64]) then + a:=cardinal(a); + + if size in [OS_64,OS_S64] then + begin + mask:=-1; + maxshift:=64; + end + else + begin + 
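+            { 32 bit value: mask and maxshift only need to cover the low 32 bits }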
+            mask:=$ffffffff;
+            maxshift:=32;
+          end;
+        { single movn enough? (to be extended) }
+        shift:=16;
+        preva:=a;
+        repeat
+          if (a shr shift)=(mask shr shift) then
+            begin
+              if shift=16 then
+                list.concat(taicpu.op_reg_const(A_MOVN,reg,not(word(preva))))
+              else
+                begin
+                  shifterop_reset(so);
+                  so.shiftmode:=SM_LSL;
+                  so.shiftimm:=shift-16;
+                  list.concat(taicpu.op_reg_const_shifterop(A_MOVN,reg,not(word(preva)),so));
+                end;
+              exit;
+            end;
+          { only try the next 16 bits if the current one is all 1 bits, since
+            the movn will set all lower bits to 1 }
+          if word(a shr (shift-16))<>$ffff then
+            break;
+          inc(shift,16);
+        until shift=maxshift;
+        reginited:=false;
+        shift:=0;
+        { can be optimized later to use more movn }
+        repeat
+          { leftover is shifterconst? (don't check if we can represent it just
+            as effectively with movz/movk, as this check is expensive) }
+          if ((shift<maxshift) and
+              (a>0) and
+              ((a shr 16)<>0)) and
+             is_shifter_const(a shl shift,size) then
+            begin
+              if reginited then
+                list.concat(taicpu.op_reg_reg_const(A_ORR,reg,reg,a shl shift))
+              else
+                list.concat(taicpu.op_reg_reg_const(A_ORR,reg,makeregsize(NR_XZR,size),a shl shift));
+              exit;
+            end;
+          { set all 16 bit parts <> 0 }
+          if (word(a)<>0) or
+             ((shift=0) and
+              (a=0)) then
+            if shift=0 then
+              begin
+                list.concat(taicpu.op_reg_const(A_MOVZ,reg,word(a)));
+                reginited:=true;
+              end
+            else
+              begin
+                shifterop_reset(so);
+                so.shiftmode:=SM_LSL;
+                so.shiftimm:=shift;
+                if not reginited then
+                  begin
+                    opc:=A_MOVZ;
+                    reginited:=true;
+                  end
+                else
+                  opc:=A_MOVK;
+                list.concat(taicpu.op_reg_const_shifterop(opc,reg,word(a),so));
+              end;
+          preva:=a;
+          a:=a shr 16;
+          inc(shift,16);
+        until word(preva)=preva;
+        if not reginited then
+          internalerror(2014102702);
+      end;
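+
+
+    { examples (illustrative; x0 standing in for "reg"): the loop above turns
+        $0000123400005678  into  movz x0, #$5678
+                                 movk x0, #$1234, lsl #32
+      while the movn pass turns
+        $fffffffffffffff0  into  movn x0, #15 }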
+
+
+    procedure tcgaarch64.a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
+      var
+        oppostfix:toppostfix;
+        hreg: tregister;
+      begin
+        if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
+          fromsize:=tosize
+        { have a 32 bit register but need a 64 bit one? }
+        else if tosize in [OS_64,OS_S64] then
+          begin
+            { sign extend if necessary }
+            if fromsize in [OS_S8,OS_S16,OS_S32] then
+              begin
+                { can't overwrite reg, may be a constant reg }
+                hreg:=getintregister(list,tosize);
+                a_load_reg_reg(list,fromsize,tosize,reg,hreg);
+                reg:=hreg;
+              end
+            else
+              { top 32 bit are zero by default }
+              reg:=makeregsize(reg,OS_64);
+            fromsize:=tosize;
+          end;
+        if (ref.alignment<>0) and
+           (ref.alignment<tcgsize2Size[tosize]) then
+          begin
+            a_load_reg_ref_unaligned(list,fromsize,tosize,reg,ref);
+            exit;
+          end;
+        case tosize of
+          OS_8,
+          OS_S8:
+            oppostfix:=PF_B;
+          OS_16,
+          OS_S16:
+            oppostfix:=PF_H;
+          OS_32,
+          OS_S32,
+          OS_64,
+          OS_S64:
+            oppostfix:=PF_None;
+          else
+            internalerror(2014121703);
+        end;
+        handle_load_store(list,A_STR,tosize,oppostfix,reg,ref);
+      end;
+
+
+    procedure tcgaarch64.a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
+      var
+        oppostfix:toppostfix;
+      begin
+        if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
+          fromsize:=tosize;
+        { ensure that all bits of the 32/64 register are always correctly set:
+           * default behaviour is always to zero-extend to the entire (64 bit)
+             register -> unsigned 8/16/32 bit loads only exist with a 32 bit
+             target register, as the upper 32 bit will be zeroed implicitly
+             -> always make target register 32 bit
+           * signed loads exist both with 32 and 64 bit target registers,
+             depending on whether the value should be sign extended to 32 or
+             to 64 bit (if sign extended to 32 bit, the upper 32 bits of the
+             corresponding 64 bit register are again zeroed) -> no need to
+             change anything (we only have 32 and 64 bit registers), except that
+             when loading an OS_S32 to a 32 bit register, we don't need/can't
+             use sign extension
+        }
+        if fromsize in [OS_8,OS_16,OS_32] then
+          reg:=makeregsize(reg,OS_32);
+        if (ref.alignment<>0) and
+           (ref.alignment<tcgsize2Size[fromsize]) then
+          begin
+            a_load_ref_reg_unaligned(list,fromsize,tosize,ref,reg);
+            exit;
+          end;
+        case fromsize of
+          OS_8:
+            oppostfix:=PF_B;
+          OS_S8:
+            oppostfix:=PF_SB;
+          OS_16:
+            oppostfix:=PF_H;
+          OS_S16:
+            oppostfix:=PF_SH;
+          OS_S32:
+            { to a 32 bit register: plain load; to a 64 bit register: load
+              with sign extension }
+            if getsubreg(reg)=R_SUBD then
+              oppostfix:=PF_None
+            else
+              oppostfix:=PF_SW;
+          OS_32,
+          OS_64,
+          OS_S64:
+            oppostfix:=PF_None;
+          else
+            internalerror(2014121704);
+        end;
+        handle_load_store(list,A_LDR,fromsize,oppostfix,reg,ref);
+        { clear the upper 16 bits if a signed byte was loaded into a 16 bit
+          value (see a_load_reg_reg below) }
+        if (fromsize=OS_S8) and
+           (tosize=OS_16) then
+          a_load_reg_reg(list,fromsize,tosize,reg,reg);
+      end;
+
+
+    procedure tcgaarch64.a_load_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);
+      var
+        instr: taicpu;
+      begin
+        { we use both 32 and 64 bit registers -> insert conversion when
+          we have to truncate/sign extend inside the (32 or 64 bit) register
+          holding the value, and when we sign extend from a 32 to a 64 bit
+          register }
+        if ((tcgsize2size[fromsize]>tcgsize2size[tosize]) and
+            not(tosize in [OS_32,OS_S32])) or
+           ((tcgsize2size[fromsize]=tcgsize2size[tosize]) and
+            (fromsize<>tosize) and
+            not(fromsize in [OS_32,OS_S32,OS_64,OS_S64])) or
+           ((fromsize in [OS_S8,OS_S16,OS_S32]) and
+            (tosize in [OS_64,OS_S64])) or
+           { needs to mask out the sign in the top 16 bits }
+           ((fromsize=OS_S8) and
+            (tosize=OS_16)) then
+          begin
+            case tosize of
+              OS_8:
+                list.concat(setoppostfix(taicpu.op_reg_reg(A_UXT,reg2,makeregsize(reg1,OS_32)),PF_B));
+              OS_16:
+                list.concat(setoppostfix(taicpu.op_reg_reg(A_UXT,reg2,makeregsize(reg1,OS_32)),PF_H));
+              OS_S8:
+                list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_B));
+              OS_S16:
+                list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_H));
+              OS_64,
+              OS_S64:
+                list.concat(setoppostfix(taicpu.op_reg_reg(A_SXT,reg2,makeregsize(reg1,OS_32)),PF_W));
+              else
+                internalerror(2002090901);
+            end;
+          end
+        else
+          begin
+            { 32 -> 32 bit move implies zero extension (sign extensions have
+              been handled above) -> also use for 32 <-> 64 bit moves }
+            if not(fromsize in [OS_64,OS_S64]) or
+               not(tosize in [OS_64,OS_S64]) then
+              instr:=taicpu.op_reg_reg(A_MOV,makeregsize(reg2,OS_32),makeregsize(reg1,OS_32))
+            else
+              instr:=taicpu.op_reg_reg(A_MOV,reg2,reg1);
+            list.Concat(instr);
+            { Notify the register allocator that we have written a move instruction so
+              it can try to eliminate it. }
+            add_move_instruction(instr);
+          end;
+      end;
+
+
+    procedure tcgaarch64.a_loadaddr_ref_reg(list: TAsmList; const ref: treference; r: tregister);
+      var
+        href: treference;
+        so: tshifterop;
+        op: tasmop;
+      begin
+        op:=A_LDR;
+        href:=ref;
+        { simplify as if we're going to perform a regular 64 bit load, using
+          "r" as the new base register if possible/necessary }
+        make_simple_ref(list,op,OS_ADDR,PF_None,href,r);
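+        { e.g. (illustrative): for a reference like [x1, x2, lsl #3] with
+          offset 42, the shifted index cannot be combined with an offset in
+          a single load, so make_simple_ref emits
+            add x3, x1, x2, lsl #3
+          and falls through to the base+offset handling with x3 + 42
+          (x3 standing in for the register picked by getaddressregister) }
+        { load literal?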
} + if assigned(href.symbol) then + begin + if (href.base<>NR_NO) or + (href.index<>NR_NO) or + not assigned(href.symboldata) then + internalerror(2014110912); + list.concat(taicpu.op_reg_sym_ofs(A_ADR,r,href.symbol,href.offset)); + end + else + begin + if href.index<>NR_NO then + begin + if href.shiftmode<>SM_None then + begin + { "add" supports a supperset of the shift modes supported by + load/store instructions } + shifterop_reset(so); + so.shiftmode:=href.shiftmode; + so.shiftimm:=href.shiftimm; + list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,r,href.base,href.index,so)); + end + else + a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,href.index,href.base,r); + end + else if href.offset<>0 then + a_op_const_reg_reg(list,OP_ADD,OS_ADDR,href.offset,href.base,r) + else + a_load_reg_reg(list,OS_ADDR,OS_ADDR,href.base,r); + end; + end; + + + procedure tcgaarch64.a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister); + begin + internalerror(2014122107) + end; + + + procedure tcgaarch64.a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister); + begin + internalerror(2014122108) + end; + + + procedure tcgaarch64.a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference); + begin + internalerror(2014122109) + end; + + + procedure tcgaarch64.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister; shuffle: pmmshuffle); + var + instr: taicpu; + begin + if assigned(shuffle) and + not shufflescalar(shuffle) then + internalerror(2014122104); + if fromsize=tosize then + begin + instr:=taicpu.op_reg_reg(A_FMOV,reg2,reg1); + { Notify the register allocator that we have written a move + instruction so it can try to eliminate it. } + add_move_instruction(instr); + end + else + begin + if (reg_cgsize(reg1)<>fromsize) or + (reg_cgsize(reg2)<>tosize) then + internalerror(2014110913); + instr:=taicpu.op_reg_reg(A_FCVT,reg2,reg1); + end; + list.Concat(instr); + end; + + + procedure tcgaarch64.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister; shuffle: pmmshuffle); + var + tmpreg: tregister; + begin + if assigned(shuffle) and + not shufflescalar(shuffle) then + internalerror(2014122105); + tmpreg:=NR_NO; + if (fromsize<>tosize) then + begin + tmpreg:=reg; + reg:=getmmregister(list,fromsize); + end; + handle_load_store(list,A_LDR,fromsize,PF_None,reg,ref); + if (fromsize<>tosize) then + a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil); + end; + + + procedure tcgaarch64.a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference; shuffle: pmmshuffle); + var + tmpreg: tregister; + begin + if assigned(shuffle) and + not shufflescalar(shuffle) then + internalerror(2014122106); + if (fromsize<>tosize) then + begin + tmpreg:=getmmregister(list,tosize); + a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil); + reg:=tmpreg; + end; + handle_load_store(list,A_STR,tosize,PF_NONE,reg,ref); + end; + + + procedure tcgaarch64.a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference); + var + href: treference; + hreg1, hreg2, tmpreg: tregister; + begin + if fromsize in [OS_64,OS_S64] then + begin + { split into two 32 bit stores } + hreg1:=makeregsize(register,OS_32); + hreg2:=getintregister(list,OS_32); + a_op_const_reg_reg(list,OP_SHR,OS_64,32,register,makeregsize(hreg2,OS_64)); + if target_info.endian=endian_big then + begin + tmpreg:=hreg1; + 
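+                { swap so that the high half is stored at the lower address }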
hreg1:=hreg2; + hreg2:=tmpreg; + end; + { can we use STP? } + if (ref.alignment=4) and + (simple_ref_type(A_STP,OS_32,PF_None,ref)=sr_simple) then + list.concat(taicpu.op_reg_reg_ref(A_STP,hreg1,hreg2,ref)) + else + begin + a_load_reg_ref(list,OS_32,OS_32,hreg1,ref); + href:=ref; + inc(href.offset,4); + a_load_reg_ref(list,OS_32,OS_32,hreg2,href); + end; + end + else + inherited; + end; + + + procedure tcgaarch64.maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister); + const + overflowops = [OP_MUL,OP_IMUL,OP_SHL,OP_ADD,OP_SUB,OP_NOT,OP_NEG]; + begin + if (op in overflowops) and + (size in [OS_8,OS_S8,OS_16,OS_S16]) then + a_load_reg_reg(list,OS_32,size,makeregsize(dst,OS_32),makeregsize(dst,OS_32)) + end; + + + procedure tcgaarch64.a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister); + begin + optimize_op_const(size,op,a); + case op of + OP_NONE: + exit; + OP_MOVE: + a_load_const_reg(list,size,a,reg); + OP_NEG,OP_NOT: + internalerror(200306011); + else + a_op_const_reg_reg(list,op,size,a,reg,reg); + end; + end; + + + procedure tcgaarch64.a_op_reg_reg(list:TAsmList;op:topcg;size:tcgsize;src,dst:tregister); + begin + Case op of + OP_NEG, + OP_NOT: + begin + list.concat(taicpu.op_reg_reg(TOpCG2AsmOpReg[op],dst,src)); + maybeadjustresult(list,op,size,dst); + end + else + a_op_reg_reg_reg(list,op,size,src,dst,dst); + end; + end; + + + procedure tcgaarch64.a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister); + var + l: tlocation; + begin + a_op_const_reg_reg_checkoverflow(list,op,size,a,src,dst,false,l); + end; + + + procedure tcgaarch64.a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister); + var + hreg: tregister; + begin + { no ROLV opcode... 
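+        }
+        { e.g. (illustrative): a 64 bit "x0 := x1 rol x2" becomes
+            mov  x3, #64
+            sub  x3, x3, x2
+            rorv x0, x1, x3
+          with x3 a temporary integer register }
+        { implement ROL as ROR with an inverted count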
} + if op=OP_ROL then + begin + case size of + OS_32,OS_S32, + OS_64,OS_S64: + begin + hreg:=getintregister(list,size); + a_load_const_reg(list,size,tcgsize2size[size]*8,hreg); + a_op_reg_reg(list,OP_SUB,size,src1,hreg); + a_op_reg_reg_reg(list,OP_ROR,size,hreg,src2,dst); + exit; + end; + else + internalerror(2014111005); + end; + end + else if (op=OP_ROR) and + not(size in [OS_32,OS_S32,OS_64,OS_S64]) then + internalerror(2014111006); + if TOpCG2AsmOpReg[op]=A_NONE then + internalerror(2014111007); + list.concat(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1)); + maybeadjustresult(list,op,size,dst); + end; + + + procedure tcgaarch64.a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation); + var + shiftcountmask: longint; + constreg: tregister; + begin + { add/sub instructions have only positive immediate operands } + if (op in [OP_ADD,OP_SUB]) and + (a<0) then + begin + if op=OP_ADD then + op:=op_SUB + else + op:=OP_ADD; + { avoid range/overflow error in case a = low(tcgint) } +{$push}{$r-}{$q-} + a:=-a; +{$pop} + end; + ovloc.loc:=LOC_VOID; + optimize_op_const(size,op,a); + case op of + OP_NONE: + begin + a_load_reg_reg(list,size,size,src,dst); + exit; + end; + OP_MOVE: + begin + a_load_const_reg(list,size,a,dst); + exit; + end; + end; + case op of + OP_ADD, + OP_SUB: + begin + handle_reg_imm12_reg(list,TOpCG2AsmOpImm[op],size,src,a,dst,NR_NO,setflags,true); + { on a 64 bit target, overflows with smaller data types + are handled via range errors } + if setflags and + (size in [OS_64,OS_S64]) then + begin + location_reset(ovloc,LOC_FLAGS,OS_8); + if size=OS_64 then + if op=OP_ADD then + ovloc.resflags:=F_CS + else + ovloc.resflags:=F_CC + else + ovloc.resflags:=F_VS; + end; + end; + OP_OR, + OP_AND, + OP_XOR: + begin + if not(size in [OS_64,OS_S64]) then + a:=cardinal(a); + if is_shifter_const(a,size) then + list.concat(taicpu.op_reg_reg_const(TOpCG2AsmOpReg[op],dst,src,a)) + else + begin + constreg:=getintregister(list,size); + a_load_const_reg(list,size,a,constreg); + a_op_reg_reg_reg(list,op,size,constreg,src,dst); + end; + end; + OP_SHL, + OP_SHR, + OP_SAR: + begin + if size in [OS_64,OS_S64] then + shiftcountmask:=63 + else + shiftcountmask:=31; + if (a and shiftcountmask)<>0 Then + list.concat(taicpu.op_reg_reg_const( + TOpCG2AsmOpImm[Op],dst,src,a and shiftcountmask)) + else + a_load_reg_reg(list,size,size,src,dst); + if (a and not(tcgint(shiftcountmask)))<>0 then + internalError(2014112101); + end; + OP_ROL, + OP_ROR: + begin + case size of + OS_32,OS_S32: + if (a and not(tcgint(31)))<>0 then + internalError(2014112102); + OS_64,OS_S64: + if (a and not(tcgint(63)))<>0 then + internalError(2014112103); + else + internalError(2014112104); + end; + { there's only a ror opcode } + if op=OP_ROL then + a:=(tcgsize2size[size]*8)-a; + list.concat(taicpu.op_reg_reg_const(A_ROR,dst,src,a)); + end; + OP_MUL, + OP_IMUL, + OP_DIV, + OP_IDIV: + begin + constreg:=getintregister(list,size); + a_load_const_reg(list,size,a,constreg); + a_op_reg_reg_reg_checkoverflow(list,op,size,constreg,src,dst,setflags,ovloc); + end; + else + internalerror(2014111403); + end; + maybeadjustresult(list,op,size,dst); + end; + + + procedure tcgaarch64.a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation); + var + tmpreg1: tregister; + begin + ovloc.loc:=LOC_VOID; + { overflow can only occur with 64 bit calculations on 64 bit 
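cpus }
+        { e.g. (sketch): a 64 bit unsigned multiplication with overflow
+          checking comes out below as
+            umulh x3, x1, x0
+            cmp   x3, #0
+            mul   x2, x1, x0
+          with "ne" as the overflow condition (registers illustrative) }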
+        if setflags and
+           (size in [OS_64,OS_S64]) then
+          begin
+            case op of
+              OP_ADD,
+              OP_SUB:
+                begin
+                  list.concat(setoppostfix(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1),PF_S));
+                  ovloc.loc:=LOC_FLAGS;
+                  if size=OS_64 then
+                    if op=OP_ADD then
+                      ovloc.resflags:=F_CS
+                    else
+                      ovloc.resflags:=F_CC
+                  else
+                    ovloc.resflags:=F_VS;
+                  { finished; since we won't call through to a_op_reg_reg_reg,
+                    adjust the result here if necessary }
+                  maybeadjustresult(list,op,size,dst);
+                  exit;
+                end;
+              OP_MUL:
+                begin
+                  { check whether the upper 64 bit of the 128 bit product is 0 }
+                  tmpreg1:=getintregister(list,OS_64);
+                  list.concat(taicpu.op_reg_reg_reg(A_UMULH,tmpreg1,src2,src1));
+                  list.concat(taicpu.op_reg_const(A_CMP,tmpreg1,0));
+                  ovloc.loc:=LOC_FLAGS;
+                  ovloc.resflags:=F_NE;
+                  { still have to perform the actual multiplication }
+                end;
+              OP_IMUL:
+                begin
+                  { check whether the sign bit of the (128 bit) result is the
+                    same as "sign bit of src1" xor "signbit of src2" (if so, no
+                    overflow and the xor-product of all sign bits is 0) }
+                  tmpreg1:=getintregister(list,OS_64);
+                  list.concat(taicpu.op_reg_reg_reg(A_SMULH,tmpreg1,src2,src1));
+                  list.concat(taicpu.op_reg_reg_reg(A_EOR,tmpreg1,tmpreg1,src1));
+                  list.concat(taicpu.op_reg_reg_reg(A_EOR,tmpreg1,tmpreg1,src2));
+                  list.concat(taicpu.op_reg_const(A_TST,tmpreg1,$80000000));
+                  ovloc.loc:=LOC_FLAGS;
+                  ovloc.resflags:=F_NE;
+                  { still have to perform the actual multiplication }
+                end;
+              OP_IDIV,
+              OP_DIV:
+                begin
+                  { not handled here: needs a div-by-zero check (dividing by
+                    zero just gives a 0 result on aarch64) and a
+                    low(int64) div -1 overflow check }
+                  internalerror(2014122101);
+                end;
+            end;
+          end;
+        a_op_reg_reg_reg(list,op,size,src1,src2,dst);
+      end;
+
+
+    {*************** compare instructions ****************}
+
+    procedure tcgaarch64.a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);
+      var
+        op: tasmop;
+      begin
+        if a>=0 then
+          op:=A_CMP
+        else
+          op:=A_CMN;
+        { avoid range/overflow error in case a=low(tcgint) }
+{$push}{$r-}{$q-}
+        handle_reg_imm12_reg(list,op,size,reg,abs(a),NR_XZR,NR_NO,false,false);
+{$pop}
+        a_jmp_cond(list,cmp_op,l);
+      end;
+
+
+    procedure tcgaarch64.a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1,reg2: tregister; l: tasmlabel);
+      begin
+        list.concat(taicpu.op_reg_reg(A_CMP,reg2,reg1));
+        a_jmp_cond(list,cmp_op,l);
+      end;
+
+
+    procedure tcgaarch64.a_jmp_always(list: TAsmList; l: TAsmLabel);
+      var
+        ai: taicpu;
+      begin
+        ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(l.name));
+        ai.is_jmp:=true;
+        list.Concat(ai);
+      end;
+
+
+    procedure tcgaarch64.a_jmp_name(list: TAsmList; const s: string);
+      var
+        ai: taicpu;
+      begin
+        ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(s));
+        ai.is_jmp:=true;
+        list.Concat(ai);
+      end;
+
+
+    procedure tcgaarch64.a_jmp_cond(list: TAsmList; cond: TOpCmp; l: TAsmLabel);
+      var
+        ai: taicpu;
+      begin
+        ai:=TAiCpu.op_sym(A_B,l);
+        ai.is_jmp:=true;
+        ai.SetCondition(TOpCmp2AsmCond[cond]);
+        list.Concat(ai);
+      end;
+
+
+    procedure tcgaarch64.a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);
+      var
+        ai : taicpu;
+      begin
+        ai:=Taicpu.op_sym(A_B,l);
+        ai.is_jmp:=true;
+        ai.SetCondition(flags_to_cond(f));
+        list.Concat(ai);
+      end;
+
+
+    procedure tcgaarch64.g_flags2reg(list: TAsmList; size: tcgsize; const f: tresflags; reg: tregister);
+      var
+        ai: taicpu;
+      begin
+        ai:=taicpu.op_reg(A_CSET,reg);
+        ai.SetCondition(flags_to_cond(f));
+        list.concat(ai);
+      end;
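+
+
+    { e.g. (illustrative): materialising a signed "greater than" result as
+      0/1 after a compare:
+        cmp  x1, x0
+        cset w2, gt
+      where the condition comes from flags_to_cond and the registers are
+      examples only }
+
+
+    procedure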
tcgaarch64.g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef); + begin + { we need an explicit overflow location, because there are many + possibilities (not just the overflow flag, which is only used for + signed add/sub) } + internalerror(2014112303); + end; + + + procedure tcgaarch64.g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc : tlocation); + var + hl : tasmlabel; + hflags : tresflags; + begin + if not(cs_check_overflow in current_settings.localswitches) then + exit; + current_asmdata.getjumplabel(hl); + case ovloc.loc of + LOC_FLAGS: + begin + hflags:=ovloc.resflags; + inverse_flags(hflags); + cg.a_jmp_flags(list,hflags,hl); + end; + else + internalerror(2014112304); + end; + a_call_name(list,'FPC_OVERFLOW',false); + a_label(list,hl); + end; + + { *********** entry/exit code and address loading ************ } + + procedure tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister); + var + ref: treference; + sr: tsuperregister; + pairreg: tregister; + begin + reference_reset_base(ref,NR_SP,-16,16); + ref.addressmode:=AM_PREINDEXED; + pairreg:=NR_NO; + { store all used registers pairwise } + for sr:=lowsr to highsr do + if sr in rg[rt].used_in_proc then + if pairreg=NR_NO then + pairreg:=newreg(rt,sr,sub) + else + begin + list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref)); + pairreg:=NR_NO + end; + { one left -> store twice (stack must be 16 bytes aligned) } + if pairreg<>NR_NO then + list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref)); + end; + + + procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean); + var + ref: treference; + begin + if nostackframe then + exit; + { stack pointer has to be aligned to 16 bytes at all times } + localsize:=align(localsize,16); + + { save stack pointer and return address } + reference_reset_base(ref,NR_SP,-16,16); + ref.addressmode:=AM_PREINDEXED; + list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref)); + { initialise frame pointer } + a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP); + + { save modified integer registers } + save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE); + { only the lower 64 bits of the modified vector registers need to be + saved; if the caller needs the upper 64 bits, it has to save them + itself } + save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD); + + { allocate stack space } + if localsize<>0 then + begin + localsize:=align(localsize,16); + current_procinfo.final_localsize:=localsize; + handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true); + end; + end; + + + procedure tcgaarch64.g_maybe_got_init(list : TAsmList); + begin + { nothing to do on Darwin; check on ELF targets } + if not(target_info.system in systems_darwin) then + internalerror(2014112601); + end; + + + procedure tcgaarch64.g_restore_registers(list:TAsmList); + begin + { done in g_proc_exit } + end; + + + procedure tcgaarch64.load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister); + var + ref: treference; + sr, highestsetsr: tsuperregister; + pairreg: tregister; + regcount: longint; + begin + reference_reset_base(ref,NR_SP,16,16); + ref.addressmode:=AM_POSTINDEXED; + { highest reg stored twice? 
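+      }
+      { e.g. (illustrative): if x19..x21 were saved by save_regs as
+          stp x19, x20, [sp, #-16]!
+          stp x21, x21, [sp, #-16]!
+        then the odd register count is detected below and x21 is reloaded
+        with a single ldr before the remaining pair is popped with ldp }
+      { reload the odd register first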
} + regcount:=0; + highestsetsr:=RS_NO; + for sr:=lowsr to highsr do + if sr in rg[rt].used_in_proc then + begin + inc(regcount); + highestsetsr:=sr; + end; + if odd(regcount) then + begin + list.concat(taicpu.op_reg_ref(A_LDR,newreg(rt,highestsetsr,sub),ref)); + highestsetsr:=pred(highestsetsr); + end; + { load all (other) used registers pairwise } + pairreg:=NR_NO; + for sr:=highestsetsr downto lowsr do + if sr in rg[rt].used_in_proc then + if pairreg=NR_NO then + pairreg:=newreg(rt,sr,sub) + else + begin + list.concat(taicpu.op_reg_reg_ref(A_LDP,newreg(rt,sr,sub),pairreg,ref)); + pairreg:=NR_NO + end; + { There can't be any register left } + if pairreg<>NR_NO then + internalerror(2014112602); + end; + + + + procedure tcgaarch64.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean); + var + ref: treference; + regsstored: boolean; + sr: tsuperregister; + begin + if not nostackframe then + begin + { if no registers have been stored, we don't have to subtract the + allocated temp space from the stack pointer } + regsstored:=false; + for sr:=RS_X19 to RS_X28 do + if sr in rg[R_INTREGISTER].used_in_proc then + begin + regsstored:=true; + break; + end; + if not regsstored then + for sr:=RS_D8 to RS_D15 do + if sr in rg[R_MMREGISTER].used_in_proc then + begin + regsstored:=true; + break; + end; + { restore registers (and stack pointer) } + if regsstored then + begin + if current_procinfo.final_localsize<>0 then + handle_reg_imm12_reg(list,A_ADD,OS_ADDR,NR_SP,current_procinfo.final_localsize,NR_SP,NR_IP0,false,true); + load_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD); + load_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE); + end + else if current_procinfo.final_localsize<>0 then + { restore stack pointer } + a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP); + + { restore framepointer and return address } + reference_reset_base(ref,NR_SP,16,16); + ref.addressmode:=AM_POSTINDEXED; + list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref)); + end; + + { return } + list.concat(taicpu.op_none(A_RET)); + end; + + + procedure tcgaarch64.g_save_registers(list : TAsmList); + begin + { done in g_proc_entry } + end; + + + { ************* concatcopy ************ } + + procedure tcgaarch64.g_concatcopy_move(list : TAsmList;const source,dest : treference;len : tcgint); + var + paraloc1,paraloc2,paraloc3 : TCGPara; + pd : tprocdef; + begin + pd:=search_system_proc('MOVE'); + paraloc1.init; + paraloc2.init; + paraloc3.init; + paramanager.getintparaloc(pd,1,paraloc1); + paramanager.getintparaloc(pd,2,paraloc2); + paramanager.getintparaloc(pd,3,paraloc3); + a_load_const_cgpara(list,OS_SINT,len,paraloc3); + a_loadaddr_ref_cgpara(list,dest,paraloc2); + a_loadaddr_ref_cgpara(list,source,paraloc1); + paramanager.freecgpara(list,paraloc3); + paramanager.freecgpara(list,paraloc2); + paramanager.freecgpara(list,paraloc1); + alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default)); + alloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default)); + a_call_name(list,'FPC_MOVE',false); + dealloccpuregisters(list,R_MMREGISTER,paramanager.get_volatile_registers_mm(pocall_default)); + dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default)); + paraloc3.done; + paraloc2.done; + paraloc1.done; + end; + + + procedure tcgaarch64.g_concatcopy(list:TAsmList;const source,dest:treference;len:tcgint); +(* + var + tmpreg1, + hreg, + countreg: tregister; + src, dst: treference; + lab: tasmlabel; + count, count2: aint; 
+*) + begin +(* + { anybody wants to determine a good value here :)? } + if len>100 then +*) + g_concatcopy_move(list,source,dest,len) +(* + else + begin + count:=len div 4; + if (count<=4) and reference_is_reusable(source) then + src:=source + else + begin + reference_reset_base(src,getintregister(list,OS_ADDR),0,sizeof(aint)); + a_loadaddr_ref_reg(list,source,src.base); + end; + if (count<=4) and reference_is_reusable(dest) then + dst:=dest + else + begin + reference_reset_base(dst,getintregister(list,OS_ADDR),0,sizeof(aint)); + a_loadaddr_ref_reg(list,dest,dst.base); + end; + { generate a loop } + if count>4 then + begin + countreg:=GetIntRegister(list,OS_INT); + tmpreg1:=GetIntRegister(list,OS_INT); + a_load_const_reg(list,OS_INT,count,countreg); + current_asmdata.getjumplabel(lab); + a_label(list, lab); + list.concat(taicpu.op_ref_reg(A_LD,src,tmpreg1)); + list.concat(taicpu.op_reg_ref(A_ST,tmpreg1,dst)); + list.concat(taicpu.op_reg_const_reg(A_ADD,src.base,4,src.base)); + list.concat(taicpu.op_reg_const_reg(A_ADD,dst.base,4,dst.base)); + list.concat(taicpu.op_reg_const_reg(A_SUBcc,countreg,1,countreg)); + a_jmp_cond(list,OC_NE,lab); + len := len mod 4; + end; + { unrolled loop } + count:=len div 4; + if count>0 then + begin + tmpreg1:=GetIntRegister(list,OS_INT); + for count2 := 1 to count do + begin + list.concat(taicpu.op_ref_reg(A_LD,src,tmpreg1)); + list.concat(taicpu.op_reg_ref(A_ST,tmpreg1,dst)); + inc(src.offset,4); + inc(dst.offset,4); + end; + len := len mod 4; + end; + if (len and 4) <> 0 then + begin + hreg:=GetIntRegister(list,OS_INT); + a_load_ref_reg(list,OS_32,OS_32,src,hreg); + a_load_reg_ref(list,OS_32,OS_32,hreg,dst); + inc(src.offset,4); + inc(dst.offset,4); + end; + { copy the leftovers } + if (len and 2) <> 0 then + begin + hreg:=GetIntRegister(list,OS_INT); + a_load_ref_reg(list,OS_16,OS_16,src,hreg); + a_load_reg_ref(list,OS_16,OS_16,hreg,dst); + inc(src.offset,2); + inc(dst.offset,2); + end; + if (len and 1) <> 0 then + begin + hreg:=GetIntRegister(list,OS_INT); + a_load_ref_reg(list,OS_8,OS_8,src,hreg); + a_load_reg_ref(list,OS_8,OS_8,hreg,dst); + end; + end; +*) + end; + + + procedure tcgaarch64.g_concatcopy_unaligned(list : TAsmList;const source,dest : treference;len : tcgint); +(* + var + src, dst: treference; + tmpreg1, + countreg: tregister; + i : aint; + lab: tasmlabel; +*) + begin +(* + if len>31 then +*) + g_concatcopy_move(list,source,dest,len) +(* + else + begin + reference_reset(src,source.alignment); + reference_reset(dst,dest.alignment); + { load the address of source into src.base } + src.base:=GetAddressRegister(list); + a_loadaddr_ref_reg(list,source,src.base); + { load the address of dest into dst.base } + dst.base:=GetAddressRegister(list); + a_loadaddr_ref_reg(list,dest,dst.base); + { generate a loop } + if len>4 then + begin + countreg:=GetIntRegister(list,OS_INT); + tmpreg1:=GetIntRegister(list,OS_INT); + a_load_const_reg(list,OS_INT,len,countreg); + current_asmdata.getjumplabel(lab); + a_label(list, lab); + list.concat(taicpu.op_ref_reg(A_LDUB,src,tmpreg1)); + list.concat(taicpu.op_reg_ref(A_STB,tmpreg1,dst)); + list.concat(taicpu.op_reg_const_reg(A_ADD,src.base,1,src.base)); + list.concat(taicpu.op_reg_const_reg(A_ADD,dst.base,1,dst.base)); + list.concat(taicpu.op_reg_const_reg(A_SUBcc,countreg,1,countreg)); + a_jmp_cond(list,OC_NE,lab); + end + else + begin + { unrolled loop } + tmpreg1:=GetIntRegister(list,OS_INT); + for i:=1 to len do + begin + list.concat(taicpu.op_ref_reg(A_LDUB,src,tmpreg1)); + 
list.concat(taicpu.op_reg_ref(A_STB,tmpreg1,dst)); + inc(src.offset); + inc(dst.offset); + end; + end; + end; +*) + end; + + + procedure tcgaarch64.g_adjust_self_value(list:TAsmList;procdef: tprocdef;ioffset: tcgint); + begin + { This method is integrated into g_intf_wrapper and shouldn't be called separately } + InternalError(2013020102); + end; + + + procedure tcgaarch64.g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint); + var + make_global: boolean; + href: treference; + hsym: tsym; + paraloc: pcgparalocation; + op: tasmop; + begin + if not(procdef.proctypeoption in [potype_function,potype_procedure]) then + Internalerror(200006137); + if not assigned(procdef.struct) or + (procdef.procoptions*[po_classmethod, po_staticmethod, + po_methodpointer, po_interrupt, po_iocheck]<>[]) then + Internalerror(200006138); + if procdef.owner.symtabletype<>ObjectSymtable then + Internalerror(200109191); + + make_global:=false; + if (not current_module.is_unit) or create_smartlink_library or + (procdef.owner.defowner.owner.symtabletype=globalsymtable) then + make_global:=true; + + if make_global then + list.concat(Tai_symbol.Createname_global(labelname,AT_FUNCTION,0)) + else + list.concat(Tai_symbol.Createname(labelname,AT_FUNCTION,0)); + + { set param1 interface to self } + procdef.init_paraloc_info(callerside); + hsym:=tsym(procdef.parast.Find('self')); + if not(assigned(hsym) and + (hsym.typ=paravarsym)) then + internalerror(2010103101); + paraloc:=tparavarsym(hsym).paraloc[callerside].location; + if assigned(paraloc^.next) then + InternalError(2013020101); + + case paraloc^.loc of + LOC_REGISTER: + handle_reg_imm12_reg(list,A_SUB,paraloc^.size,paraloc^.register,ioffset,paraloc^.register,NR_IP0,false,true); + else + internalerror(2010103102); + end; + + if (po_virtualmethod in procdef.procoptions) and + not is_objectpascal_helper(procdef.struct) then + begin + if (procdef.extnumber=$ffff) then + Internalerror(200006139); + { mov 0(%rdi),%rax ; load vmt} + reference_reset_base(href,paraloc^.register,0,sizeof(pint)); + getcpuregister(list,NR_IP0); + a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,NR_IP0); + { jmp *vmtoffs(%eax) ; method offs } + reference_reset_base(href,NR_IP0,tobjectdef(procdef.struct).vmtmethodoffset(procdef.extnumber),sizeof(pint)); + op:=A_LDR; + make_simple_ref(list,op,OS_ADDR,PF_None,href,NR_IP0); + list.concat(taicpu.op_reg_ref(op,NR_IP0,href)); + ungetcpuregister(list,NR_IP0); + list.concat(taicpu.op_reg(A_BR,NR_IP0)); + end + else + a_jmp_name(list,procdef.mangledname); + list.concat(Tai_symbol_end.Createname(labelname)); + end; + + + procedure create_codegen; + begin + cg:=tcgaarch64.Create; + cg128:=tcg128.Create; + end; + +end.
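
Illustrative frame shape (a sketch of what g_proc_entry/g_proc_exit above emit
for a routine with 32 bytes of locals that uses x19/x20; register choices and
the exact localsize are examples only):

    stp  x29, x30, [sp, #-16]!   // save frame pointer and return address
    mov  x29, sp                 // initialise the frame pointer
    stp  x19, x20, [sp, #-16]!   // callee-saved pair (save_regs)
    sub  sp, sp, #32             // allocate local stack space
    ...
    add  sp, sp, #32             // release locals
    ldp  x19, x20, [sp], #16     // load_regs
    ldp  x29, x30, [sp], #16     // restore FP/LR
    ret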