{ Copyright (c) 2014 by Jonas Maebe This unit implements the code generator for AArch64 This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. **************************************************************************** } unit cgcpu; {$i fpcdefs.inc} interface uses globtype,parabase, cgbase,cgutils,cgobj, aasmbase,aasmtai,aasmdata,aasmcpu, cpubase,cpuinfo, node,symconst,SymType,symdef, rgcpu; type { tcgaarch64 } tcgaarch64=class(tcg) protected { changes register size without adding register allocation info } function makeregsize(reg: tregister; size: tcgsize): tregister; overload; public { simplifies "ref" so it can be used with "op". If "ref" can be used with a different load/Store operation that has the same meaning as the original one, "op" will be replaced with the alternative } procedure make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister); function getfpuregister(list: TAsmList; size: Tcgsize): Tregister; override; procedure handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean); procedure init_register_allocators;override; procedure done_register_allocators;override; function getmmregister(list:TAsmList;size:tcgsize):tregister;override; function handle_load_store(list:TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference; procedure a_call_name(list:TAsmList;const s:string; weak: boolean);override; procedure a_call_reg(list:TAsmList;Reg:tregister);override; { General purpose instructions } procedure maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister); procedure a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister);override; procedure a_op_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src, dst: tregister);override; procedure a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister);override; procedure a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister);override; procedure a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation);override; procedure a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation);override; { move instructions } procedure a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg: tregister);override; procedure a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference); override; procedure a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister;const ref: TReference);override; procedure a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: 
treference); override; procedure a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister);override; procedure a_load_ref_reg_unaligned(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; register: tregister); override; procedure a_load_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);override; procedure a_loadaddr_ref_reg(list: TAsmList; const ref: TReference; r: tregister);override; { fpu move instructions (not used, all floating point is vector unit-based) } procedure a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister); override; procedure a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister); override; procedure a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference); override; procedure a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister;shuffle : pmmshuffle);override; procedure a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: TReference; reg: tregister; shuffle: pmmshuffle);override; procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: TReference; shuffle: pmmshuffle);override; procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); override; procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle); override; procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle); override; procedure a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister); override; { comparison operations } procedure a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);override; procedure a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1, reg2: tregister; l: tasmlabel);override; procedure a_jmp_always(list: TAsmList; l: TAsmLabel);override; procedure a_jmp_name(list: TAsmList; const s: string);override; procedure a_jmp_cond(list: TAsmList; cond: TOpCmp; l: tasmlabel);{ override;} procedure a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);override; procedure g_flags2reg(list: TAsmList; size: tcgsize; const f:tresflags; reg: tregister);override; procedure g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);override; procedure g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc: tlocation);override; procedure g_stackpointer_alloc(list: TAsmList; localsize: longint);override; procedure g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);override; procedure g_proc_exit(list: TAsmList; parasize: longint; nostackframe: boolean);override; procedure g_maybe_got_init(list: TAsmList); override; procedure g_restore_registers(list: TAsmList);override; procedure g_save_registers(list: TAsmList);override; procedure g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);override; procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override; procedure g_check_for_fpu_exception(list: TAsmList; force, clear: boolean);override; procedure g_profilecode(list: TAsmList);override; private function save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint; procedure load_regs(list: 
TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
      end;

    procedure create_codegen;

  const
    TOpCG2AsmOpReg: array[topcg] of TAsmOp = (
      A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASRV,A_LSLV,A_LSRV,A_SUB,A_EOR,A_NONE,A_RORV
    );
    TOpCG2AsmOpImm: array[topcg] of TAsmOp = (
      A_NONE,A_MOV,A_ADD,A_AND,A_UDIV,A_SDIV,A_MUL,A_MUL,A_NEG,A_MVN,A_ORR,A_ASR,A_LSL,A_LSR,A_SUB,A_EOR,A_NONE,A_ROR
    );
    TOpCmp2AsmCond: array[topcmp] of TAsmCond = (C_NONE,C_EQ,C_GT,
      C_LT,C_GE,C_LE,C_NE,C_LS,C_CC,C_CS,C_HI
    );
    winstackpagesize = 4096;

implementation

  uses
    globals,verbose,systems,cutils,cclasses,
    paramgr,fmodule,
    symtable,symsym,
    tgobj,
    ncgutil,
    procinfo,cpupi;


  procedure tcgaarch64.make_simple_ref(list:TAsmList; var op: tasmop; size: tcgsize; oppostfix: toppostfix; var ref: treference; preferred_newbasereg: tregister);
    var
      href: treference;
      so: tshifterop;
      accesssize: longint;
    begin
      if (ref.base=NR_NO) then
        begin
          if ref.shiftmode<>SM_None then
            internalerror(2014110701);
          ref.base:=ref.index;
          ref.index:=NR_NO;
        end;
      { no arbitrary scale factor support (the generic code doesn't set it, AArch64-specific code shouldn't either) }
      if not(ref.scalefactor in [0,1]) then
        internalerror(2014111002);
      case simple_ref_type(op,size,oppostfix,ref) of
        sr_simple:
          exit;
        sr_internal_illegal:
          internalerror(2014121702);
        sr_complex:
          { continue } ;
      end;
      if assigned(ref.symbol) then
        begin
          { internal "load symbol" instructions should already be valid }
          if assigned(ref.symboldata) or
             (ref.refaddr in [addr_pic,addr_gotpage,addr_gotpageoffset,addr_page,addr_pageoffset]) then
            internalerror(2014110802);
          { no relative symbol support (needed) yet }
          if assigned(ref.relsymbol) then
            internalerror(2014111001);
          { loading a symbol address (whether it's in the GOT or not) consists of two parts: first load
            the page on which it is located, then either the offset in the page or load the value at
            that offset in the page.
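            As an illustrative sketch (GNU-style relocation operators; the exact spelling varies by
            assembler and platform), a GOT-indirect access is roughly "adrp xN, :got:sym" followed by
            "ldr xN, [xN, :got_lo12:sym]", while a locally resolved symbol becomes "adrp xN, sym"
            plus "add xN, xN, :lo12:sym".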
This final GOT-load can be relaxed by the linker in case the variable itself can be stored directly in the GOT } if (preferred_newbasereg=NR_NO) or (ref.base=preferred_newbasereg) or (ref.index=preferred_newbasereg) then preferred_newbasereg:=getaddressregister(list); { load the (GOT) page } reference_reset_symbol(href,ref.symbol,0,8,[]); if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL,AT_DATA]) and (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or ((ref.symbol.typ=AT_DATA) and (ref.symbol.bind=AB_LOCAL)) or (target_info.system=system_aarch64_win64) then href.refaddr:=addr_page else href.refaddr:=addr_gotpage; list.concat(taicpu.op_reg_ref(A_ADRP,preferred_newbasereg,href)); { load the GOT entry (= address of the variable) } reference_reset_base(href,preferred_newbasereg,0,ctempposinvalid,sizeof(pint),[]); href.symbol:=ref.symbol; { code symbols defined in the current compilation unit do not have to be accessed via the GOT } if ((ref.symbol.typ in [AT_FUNCTION,AT_LABEL,AT_DATA]) and (ref.symbol.bind in [AB_LOCAL,AB_GLOBAL])) or ((ref.symbol.typ=AT_DATA) and (ref.symbol.bind=AB_LOCAL)) or (target_info.system=system_aarch64_win64) then begin href.base:=NR_NO; href.refaddr:=addr_pageoffset; list.concat(taicpu.op_reg_reg_ref(A_ADD,preferred_newbasereg,preferred_newbasereg,href)); end else begin href.refaddr:=addr_gotpageoffset; { use a_load_ref_reg() rather than directly encoding the LDR, so that we'll check the validity of the reference } a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,preferred_newbasereg); end; { set as new base register } if ref.base=NR_NO then ref.base:=preferred_newbasereg else if ref.index=NR_NO then ref.index:=preferred_newbasereg else begin { make sure it's valid in case ref.base is SP -> make it the second operand} a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,preferred_newbasereg,ref.base,preferred_newbasereg); ref.base:=preferred_newbasereg end; ref.symbol:=nil; end; { base & index } if (ref.base<>NR_NO) and (ref.index<>NR_NO) then begin case op of A_LDR, A_STR: begin if (ref.shiftmode=SM_None) and (ref.shiftimm<>0) then internalerror(2014110805); { wrong shift? 
(possible in case of something like array_of_2byte_rec[x].bytefield -> shift will be set 1, but the final load is a 1 byte -> can't use shift after all } if (ref.shiftmode in [SM_LSL,SM_UXTW,SM_SXTW]) and ((ref.shiftimm<>BsfDWord(tcgsizep2size[size])) or (ref.offset<>0)) then begin if preferred_newbasereg=NR_NO then preferred_newbasereg:=getaddressregister(list); { "add" supports a superset of the shift modes supported by load/store instructions } shifterop_reset(so); so.shiftmode:=ref.shiftmode; so.shiftimm:=ref.shiftimm; list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so)); reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility); { possibly still an invalid offset -> fall through } end else if ref.offset<>0 then begin if (preferred_newbasereg=NR_NO) or { we keep ref.index, so it must not be overwritten } (ref.index=preferred_newbasereg) then preferred_newbasereg:=getaddressregister(list); { add to the base and not to the index, because the index may be scaled; this works even if the base is SP } a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg); ref.offset:=0; ref.base:=preferred_newbasereg; { finished } exit; end else { valid -> exit } exit; end; { todo } A_LD1,A_LD2,A_LD3,A_LD4, A_ST1,A_ST2,A_ST3,A_ST4: internalerror(2014110702); { these don't support base+index } A_LDUR,A_STUR, A_LDP,A_STP: begin { these either don't support pre-/post-indexing, or don't support it with base+index } if ref.addressmode<>AM_OFFSET then internalerror(2014110911); if preferred_newbasereg=NR_NO then preferred_newbasereg:=getaddressregister(list); if ref.shiftmode<>SM_None then begin { "add" supports a superset of the shift modes supported by load/store instructions } shifterop_reset(so); so.shiftmode:=ref.shiftmode; so.shiftimm:=ref.shiftimm; list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,preferred_newbasereg,ref.base,ref.index,so)); end else a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,ref.index,ref.base,preferred_newbasereg); reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility); { fall through to the handling of base + offset, since the offset may still be too big } end; else internalerror(2014110903); end; end; { base + offset } if ref.base<>NR_NO then begin if ref.offset=0 then exit; { valid offset for LDUR/STUR -> use that } if (ref.addressmode=AM_OFFSET) and (op in [A_LDR,A_STR]) and (ref.offset>=-256) and (ref.offset<=255) then begin if op=A_LDR then op:=A_LDUR else op:=A_STUR end { if it's not a valid LDUR/STUR, use LDR/STR } else if (op in [A_LDUR,A_STUR]) and ((ref.offset<-256) or (ref.offset>255) or (ref.addressmode<>AM_OFFSET)) then begin if op=A_LDUR then op:=A_LDR else op:=A_STR end; case op of A_LDR,A_STR: begin case ref.addressmode of AM_PREINDEXED: begin { since the loaded/stored register cannot be the same as the base register, we can safely add the offset to the base if it doesn't fit} if (ref.offset<-256) or (ref.offset>255) then begin a_op_const_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base); ref.offset:=0; end; end; AM_POSTINDEXED: begin { cannot emulate post-indexing if we have to fold the offset into the base register } if (ref.offset<-256) or (ref.offset>255) then internalerror(2014110909); { ok } end; AM_OFFSET: begin { unsupported offset -> fold into base register } accesssize:=1 shl tcgsizep2size[size]; if (ref.offset<0) or (ref.offset>(((1 shl 12)-1)*accesssize)) or ((ref.offset mod accesssize)<>0) then begin if 
preferred_newbasereg=NR_NO then
                        preferred_newbasereg:=getaddressregister(list);
                      { can we split the offset between an "add/sub (imm12 shl 12)" and the load (also
                        an imm12)? -- the offset from the load will always be added, that's why the
                        lower bound has a smaller range than the upper bound; it must also be a
                        multiple of the access size }
                      if (ref.offset>=-(((1 shl 12)-1) shl 12)) and
                         (ref.offset<=((1 shl 12)-1) shl 12 + ((1 shl 12)-1)) and
                         ((ref.offset mod accesssize)=0) then
                        begin
                          a_op_const_reg_reg(list,OP_ADD,OS_ADDR,(ref.offset shr 12) shl 12,ref.base,preferred_newbasereg);
                          ref.offset:=ref.offset-(ref.offset shr 12) shl 12;
                        end
                      else
                        begin
                          a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                          ref.offset:=0;
                        end;
                      reference_reset_base(ref,preferred_newbasereg,ref.offset,ref.temppos,ref.alignment,ref.volatility);
                    end;
                end
              end;
            end;
          A_LDP,A_STP:
            begin
              { unsupported offset -> fold into base register (these instructions support all addressmodes) }
              if (ref.offset<-(1 shl (6+tcgsizep2size[size]))) or
                 (ref.offset>(1 shl (6+tcgsizep2size[size]))-1) then
                begin
                  case ref.addressmode of
                    AM_POSTINDEXED:
                      { don't emulate post-indexing if we have to fold the offset into the base register }
                      internalerror(2014110910);
                    AM_PREINDEXED:
                      { this means the offset must be added to the current base register }
                      preferred_newbasereg:=ref.base;
                    AM_OFFSET:
                      if preferred_newbasereg=NR_NO then
                        preferred_newbasereg:=getaddressregister(list);
                  end;
                  a_op_const_reg_reg(list,OP_ADD,OS_ADDR,ref.offset,ref.base,preferred_newbasereg);
                  reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,ref.alignment,ref.volatility);
                end
            end;
          A_LDUR,A_STUR:
            begin
              { valid, checked above }
            end;
          { todo }
          A_LD1,A_LD2,A_LD3,A_LD4,
          A_ST1,A_ST2,A_ST3,A_ST4:
            internalerror(2014110908);
          else
            internalerror(2014110708);
        end;
        { done }
        exit;
      end;
      { only an offset -> change to base (+ offset 0) }
      if preferred_newbasereg=NR_NO then
        preferred_newbasereg:=getaddressregister(list);
      a_load_const_reg(list,OS_ADDR,ref.offset,preferred_newbasereg);
      reference_reset_base(ref,preferred_newbasereg,0,ref.temppos,newalignment(8,ref.offset),ref.volatility);
    end;


  function tcgaarch64.makeregsize(reg: tregister; size: tcgsize): tregister;
    var
      subreg:Tsubregister;
    begin
      subreg:=cgsize2subreg(getregtype(reg),size);
      result:=reg;
      setsubreg(result,subreg);
    end;


  function tcgaarch64.getfpuregister(list: TAsmList; size: Tcgsize): Tregister;
    begin
      internalerror(2014122110);
      { squash warning }
      result:=NR_NO;
    end;


  function tcgaarch64.handle_load_store(list: TAsmList; op: tasmop; size: tcgsize; oppostfix: toppostfix; reg: tregister; ref: treference):treference;
    begin
      make_simple_ref(list,op,size,oppostfix,ref,NR_NO);
      list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),oppostfix));
      result:=ref;
    end;


  procedure tcgaarch64.handle_reg_imm12_reg(list: TAsmList; op: Tasmop; size: tcgsize; src: tregister; a: tcgint; dst: tregister; tmpreg: tregister; setflags, usedest: boolean);
    var
      instr: taicpu;
      so: tshifterop;
      hadtmpreg: boolean;
    begin
      { imm12 }
      if (a>=0) and (a<=((1 shl 12)-1)) then
        if usedest then
          instr:=taicpu.op_reg_reg_const(op,dst,src,a)
        else
          instr:=taicpu.op_reg_const(op,src,a)
      { imm12 lsl 12 }
      else if (a and not(((tcgint(1) shl 12)-1) shl 12))=0 then
        begin
          shifterop_reset(so);
          so.shiftmode:=SM_LSL;
          so.shiftimm:=12;
          if usedest then
            instr:=taicpu.op_reg_reg_const_shifterop(op,dst,src,a shr 12,so)
          else
            instr:=taicpu.op_reg_const_shifterop(op,src,a shr 12,so)
        end
      else
        begin
          { todo: other possible optimizations (e.g.
load 16 bit constant in register and then add/sub/cmp/cmn shifted the rest) } if tmpreg=NR_NO then begin hadtmpreg:=false; tmpreg:=getintregister(list,size); end else begin hadtmpreg:=true; getcpuregister(list,tmpreg); end; a_load_const_reg(list,size,a,tmpreg); if usedest then instr:=taicpu.op_reg_reg_reg(op,dst,src,tmpreg) else instr:=taicpu.op_reg_reg(op,src,tmpreg); if hadtmpreg then ungetcpuregister(list,tmpreg); end; if setflags then setoppostfix(instr,PF_S); list.concat(instr); end; {**************************************************************************** Assembler code ****************************************************************************} procedure tcgaarch64.init_register_allocators; begin inherited init_register_allocators; rg[R_INTREGISTER]:=trgintcpu.create(R_INTREGISTER,R_SUBWHOLE, [RS_X0,RS_X1,RS_X2,RS_X3,RS_X4,RS_X5,RS_X6,RS_X7,RS_X8, RS_X9,RS_X10,RS_X11,RS_X12,RS_X13,RS_X14,RS_X15,RS_X16,RS_X17, RS_X19,RS_X20,RS_X21,RS_X22,RS_X23,RS_X24,RS_X25,RS_X26,RS_X27,RS_X28 { maybe we can enable this in the future for leaf functions (it's the frame pointer) ,RS_X29 }], first_int_imreg,[]); rg[R_MMREGISTER]:=trgcpu.create(R_MMREGISTER,R_SUBMMD, [RS_Q0,RS_Q1,RS_Q2,RS_Q3,RS_Q4,RS_Q5,RS_Q6,RS_Q7, RS_Q8,RS_Q9,RS_Q10,RS_Q11,RS_Q12,RS_Q13,RS_Q14,RS_Q15, RS_Q16,RS_Q17,RS_Q18,RS_Q19,RS_Q20,RS_Q21,RS_Q22,RS_Q23, RS_Q24,RS_Q25,RS_Q26,RS_Q27,RS_Q28,RS_Q29,RS_Q30,RS_Q31], first_mm_imreg,[]); end; procedure tcgaarch64.done_register_allocators; begin rg[R_INTREGISTER].free; rg[R_FPUREGISTER].free; rg[R_MMREGISTER].free; inherited done_register_allocators; end; function tcgaarch64.getmmregister(list: TAsmList; size: tcgsize):tregister; begin case size of OS_F32: result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS); OS_F64: result:=rg[R_MMREGISTER].getregister(list,R_SUBMMD) else internalerror(2014102701); end; end; procedure tcgaarch64.a_call_name(list: TAsmList; const s: string; weak: boolean); begin if not weak then list.concat(taicpu.op_sym(A_BL,current_asmdata.RefAsmSymbol(s,AT_FUNCTION))) else list.concat(taicpu.op_sym(A_BL,current_asmdata.WeakRefAsmSymbol(s,AT_FUNCTION))); end; procedure tcgaarch64.a_call_reg(list:TAsmList;Reg:tregister); begin list.concat(taicpu.op_reg(A_BLR,reg)); end; {********************** load instructions ********************} procedure tcgaarch64.a_load_const_reg(list: TAsmList; size: tcgsize; a: tcgint; reg : tregister); var opc: tasmop; shift: byte; so: tshifterop; reginited,doinverted,extendedsize: boolean; manipulated_a: tcgint; leftover_a: word; begin {$ifdef extdebug} list.concat(tai_comment.Create(strpnew('Generating constant ' + tostr(a) + ' / $' + hexstr(a, 16)))); {$endif extdebug} extendedsize := (size in [OS_64,OS_S64]); case a of { Small positive number } $0..$FFFF: begin list.concat(taicpu.op_reg_const(A_MOVZ, reg, a)); Exit; end; { Small negative number } -65536..-1: begin list.concat(taicpu.op_reg_const(A_MOVN, reg, Word(not a))); Exit; end; { Can be represented as a negative number more compactly } $FFFF0000..$FFFFFFFF: begin { if we load a value into a 32 bit register, it is automatically zero-extended to 64 bit } list.concat(taicpu.op_reg_const(A_MOVN, makeregsize(reg,OS_32), Word(not a))); Exit; end; else begin if not extendedsize then { Mostly so programmers don't get confused when they view the disassembly and 'a' is sign-extended to 64-bit, say, but also avoids potential problems with third-party assemblers if the number is out of bounds for a given size } a := Cardinal(a); { Check to see if a is a valid shifter constant that can be 
encoded in ORR as is }
            if is_shifter_const(a,size) then
              begin
                { Use synthetic "MOV" instruction instead of "ORR reg,wzr,#a" (an alias), since
                  AArch64 conventions prefer this, and it's clearer in the disassembly }
                list.concat(taicpu.op_reg_const(A_MOV,reg,a));
                Exit;
              end;
            { If the value of a fits into 32 bits, it's fastest to use movz/movk regardless }
            if extendedsize and ((a shr 32) <> 0) then
              begin
                { This determines whether this write can be performed with an ORR followed by MOVK by
                  copying the 3rd word to the 1st word for the ORR constant, then overwriting the 1st
                  word. The alternative would require 4 instructions. This sequence is common when
                  division reciprocals are calculated (e.g. 3 produces AAAAAAAAAAAAAAAB). }
                leftover_a := word(a and $FFFF);
                manipulated_a := (a and $FFFFFFFFFFFF0000) or ((a shr 32) and $FFFF);
                { if manipulated_a = a, don't check, because is_shifter_const was already called for a
                  and it returned False. Reduces processing time. [Kit] }
                if (manipulated_a <> a) and is_shifter_const(manipulated_a, OS_64) then
                  begin
                    { Encode value as:
                        orr  reg,xzr,manipulated_a
                        movk reg,#(leftover_a)
                      Use "orr" instead of "mov" here for the assembly dump so it better implies that
                      something special is happening with the number arrangement. }
                    list.concat(taicpu.op_reg_reg_const(A_ORR, reg, NR_XZR, manipulated_a));
                    list.concat(taicpu.op_reg_const(A_MOVK, reg, leftover_a));
                    Exit;
                  end;
                { This determines whether this write can be performed with an ORR followed by MOVK by
                  copying the 2nd word to the 4th word for the ORR constant, then overwriting the 4th
                  word. The alternative would require 3 instructions }
                leftover_a := word(a shr 48);
                manipulated_a := (a and $0000FFFFFFFFFFFF);
                if manipulated_a = $0000FFFFFFFFFFFF then
                  begin
                    { This is even better, as we can just use a single MOVN on the last word }
                    shifterop_reset(so);
                    so.shiftmode := SM_LSL;
                    so.shiftimm := 48;
                    list.concat(taicpu.op_reg_const_shifterop(A_MOVN, reg, word(not leftover_a), so));
                    Exit;
                  end;
                manipulated_a := manipulated_a or (((a shr 16) and $FFFF) shl 48);
                { if manipulated_a = a, don't check, because is_shifter_const was already called for a
                  and it returned False. Reduces processing time. [Kit] }
                if (manipulated_a <> a) and is_shifter_const(manipulated_a, OS_64) then
                  begin
                    { Encode value as:
                        orr  reg,xzr,manipulated_a
                        movk reg,#(leftover_a),lsl #48
                      Use "orr" instead of "mov" here for the assembly dump so it better implies that
                      something special is happening with the number arrangement. }
                    list.concat(taicpu.op_reg_reg_const(A_ORR, reg, NR_XZR, manipulated_a));
                    shifterop_reset(so);
                    so.shiftmode := SM_LSL;
                    so.shiftimm := 48;
                    list.concat(taicpu.op_reg_const_shifterop(A_MOVK, reg, leftover_a, so));
                    Exit;
                  end;
                case a of
                  { If a is in the given negative range, it can be stored more efficiently if it is
                    inverted. }
                  TCgInt($FFFF000000000000)..-65537:
                    begin
                      { NOTE: This excluded range can be more efficiently stored as the first 16 bits
                        followed by a shifter constant }
                      case a of
                        TCgInt($FFFF0000FFFF0000)..TCgInt($FFFF0000FFFFFFFF):
                          doinverted := False;
                        else
                          begin
                            doinverted := True;
                            a := not a;
                          end;
                      end;
                    end;
                  else
                    doinverted := False;
                end;
              end
            else
              doinverted:=False;
          end;
      end;
      reginited:=false;
      shift:=0;
      if doinverted then
        opc:=A_MOVN
      else
        opc:=A_MOVZ;
      repeat
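        { Illustrative example of this chunked loading (hypothetical value, not taken from the code):
          loading $123456789ABCDEF0 emits roughly
            movz reg,#$DEF0
            movk reg,#$9ABC,lsl #16
            movk reg,#$5678,lsl #32
            movk reg,#$1234,lsl #48
          and the inverted (MOVN/MOVK) variant works analogously on "not a". }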
        { leftover is shifterconst? (don't check if we can represent it just as effectively with
          movz/movk, as this check is expensive) }
        if (word(a)<>0) then
          begin
            if not doinverted and
               ((shift<16) or ((a shr 16)<>0)) and
               is_shifter_const(a shl shift,size) then
              begin
                if reginited then
                  list.concat(taicpu.op_reg_reg_const(A_ORR,reg,reg,a shl shift))
                else
                  list.concat(taicpu.op_reg_reg_const(A_ORR,reg,makeregsize(NR_XZR,size),a shl shift));
                exit;
              end;
            { set all 16 bit parts <> 0 }
            if shift=0 then
              begin
                list.concat(taicpu.op_reg_const(opc,reg,word(a)));
                reginited:=true;
              end
            else
              begin
                shifterop_reset(so);
                so.shiftmode:=SM_LSL;
                so.shiftimm:=shift;
                if not reginited then
                  begin
                    list.concat(taicpu.op_reg_const_shifterop(opc,reg,word(a),so));
                    reginited:=true;
                  end
                else
                  begin
                    if doinverted then
                      list.concat(taicpu.op_reg_const_shifterop(A_MOVK,reg,word(not a),so))
                    else
                      list.concat(taicpu.op_reg_const_shifterop(A_MOVK,reg,word(a),so));
                  end;
              end;
          end;
        a:=a shr 16;
        inc(shift,16);
      until a = 0;
      if not reginited then
        internalerror(2014102702);
    end;


  procedure tcgaarch64.a_load_const_ref(list: TAsmList; size: tcgsize; a: tcgint; const ref: treference);
    var
      reg: tregister;
      href: treference;
      i: Integer;
    begin
      { use the zero register if possible }
      if a=0 then
        begin
          href:=ref;
          inc(href.offset,tcgsize2size[size]-1);
          if (tcgsize2size[size]>1) and
             (ref.alignment=1) and
             (simple_ref_type(A_STUR,OS_8,PF_None,ref)=sr_simple) and
             (simple_ref_type(A_STUR,OS_8,PF_None,href)=sr_simple) then
            begin
              href:=ref;
              for i:=0 to tcgsize2size[size]-1 do
                begin
                  a_load_const_ref(list,OS_8,0,href);
                  inc(href.offset);
                end;
            end
          else
            begin
              if size in [OS_64,OS_S64] then
                reg:=NR_XZR
              else
                reg:=NR_WZR;
              a_load_reg_ref(list,size,size,reg,ref);
            end;
        end
      else
        inherited;
    end;


  procedure tcgaarch64.a_load_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
    var
      oppostfix:toppostfix;
      hreg: tregister;
    begin
      if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
        begin
          fromsize:=tosize;
          reg:=makeregsize(list,reg,fromsize);
        end
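      { note: no explicit truncation is needed when narrowing for a store, since an 8/16/32 bit store
        instruction only writes the low bits of the source register anyway }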
      { have a 32 bit register but need a 64 bit one? }
      else if tosize in [OS_64,OS_S64] then
        begin
          { sign extend if necessary }
          if fromsize in [OS_S8,OS_S16,OS_S32] then
            begin
              { can't overwrite reg, may be a constant reg }
              hreg:=getintregister(list,tosize);
              a_load_reg_reg(list,fromsize,tosize,reg,hreg);
              reg:=hreg;
            end
          else
            { top 32 bit are zero by default }
            reg:=makeregsize(reg,OS_64);
          fromsize:=tosize;
        end;
      if not(target_info.system=system_aarch64_darwin) and
         (ref.alignment<>0) and
         (ref.alignment<tcgsize2Size[tosize]) then
        begin
          a_load_reg_ref_unaligned(list,fromsize,tosize,reg,ref);
        end
      else
        begin
          case tosize of
            OS_8,
            OS_S8:
              oppostfix:=PF_B;
            OS_16,
            OS_S16:
              oppostfix:=PF_H;
            OS_32,
            OS_S32,
            OS_64,
            OS_S64:
              oppostfix:=PF_None;
          end;
          handle_load_store(list,A_STR,tosize,oppostfix,reg,ref);
        end;
    end;


  procedure tcgaarch64.a_load_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
    var
      oppostfix:toppostfix;
    begin
      if tcgsize2Size[fromsize]>=tcgsize2Size[tosize] then
        fromsize:=tosize;
      { ensure that all bits of the 32/64 register are always correctly set:
        * default behaviour is always to zero-extend to the entire (64 bit) register -> unsigned
          8/16/32 bit loads only exist with a 32 bit target register, as the upper 32 bit will be
          zeroed implicitly -> always make target register 32 bit
        * signed loads exist both with 32 and 64 bit target registers, depending on whether the value
          should be sign extended to 32 or to 64 bit (if sign extended to 32 bit, the upper 32 bits of
          the corresponding 64 bit register are again zeroed) -> no need to change anything (we only
          have 32 and 64 bit registers), except that when loading an OS_S32 to a 32 bit register, we
          don't need/can't use sign extension }
      if fromsize in [OS_8,OS_16,OS_32] then
        reg:=makeregsize(reg,OS_32);
      if not(target_info.system=system_aarch64_darwin) and
         (ref.alignment<>0) and
         (ref.alignment<tcgsize2Size[fromsize]) then
        begin
          a_load_ref_reg_unaligned(list,fromsize,tosize,ref,reg);
        end
      else
        begin
          case fromsize of
            OS_S8:
              oppostfix:=PF_SB;
            OS_S16:
              oppostfix:=PF_SH;
            OS_S32:
              if reg_cgsize(reg)=OS_64 then
                oppostfix:=PF_SW
              else
                oppostfix:=PF_None;
            OS_8,
            OS_16,
            OS_32,
            OS_64,
            OS_S64:
              oppostfix:=PF_None;
          end;
          handle_load_store(list,A_LDR,fromsize,oppostfix,reg,ref);
        end;
    end;


  procedure tcgaarch64.a_load_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);
    var
      instr: taicpu;
    begin
      { we use both 32 and 64 bit registers for 32 bit values -> insert conversion when we have to
        truncate/sign extend inside the (32 or 64 bit) register holding the value, and when we sign
        extend from a 32 to a 64 bit register }
      if (tcgsize2size[fromsize]>tcgsize2size[tosize]) or
         ((tcgsize2size[fromsize]=tcgsize2size[tosize]) and
          (fromsize<>tosize) and
          not(fromsize in [OS_32,OS_S32,OS_64,OS_S64])) or
         ((fromsize in [OS_S8,OS_S16,OS_S32]) and
          (tosize in [OS_64,OS_S64])) or
         { needs to mask out the sign in the top 16 bits }
         ((fromsize=OS_S8) and
          (tosize=OS_16)) then
        begin
          case tosize of
            OS_8:
              list.concat(taicpu.op_reg_reg(A_UXTB,reg2,makeregsize(reg1,OS_32)));
            OS_16:
              list.concat(taicpu.op_reg_reg(A_UXTH,reg2,makeregsize(reg1,OS_32)));
            OS_S8:
              list.concat(taicpu.op_reg_reg(A_SXTB,reg2,makeregsize(reg1,OS_32)));
            OS_S16:
              list.concat(taicpu.op_reg_reg(A_SXTH,reg2,makeregsize(reg1,OS_32)));
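            { e.g. an OS_S8 -> OS_S32 conversion above is emitted as "sxtb wD,wS" }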
            { while "mov wN, wM" automatically inserts a zero-extension and hence we could encode a
              64->32 bit move like that, the problem is that we then can't distinguish 64->32 from
              32->32 moves, and the 64->32 truncation could be removed altogether... So use a
              different instruction }
            OS_32,
            OS_S32:
              { in theory, reg1 should be 64 bit here (since fromsize>tosize), but because of the way
                location_force_register() tries to avoid superfluous zero/sign extensions, it's not
                always the case -> also force reg1 to 64 bit }
              list.concat(taicpu.op_reg_reg_const_const(A_UBFIZ,makeregsize(reg2,OS_64),makeregsize(reg1,OS_64),0,32));
            OS_64,
            OS_S64:
              case fromsize of
                OS_8:
                  list.concat(taicpu.op_reg_reg(A_UXTB,reg2,makeregsize(reg1,OS_64)));
                OS_S8:
                  list.concat(taicpu.op_reg_reg(A_SXTB,reg2,makeregsize(reg1,OS_32)));
                OS_16:
                  list.concat(taicpu.op_reg_reg(A_UXTH,reg2,makeregsize(reg1,OS_64)));
                OS_S16:
                  list.concat(taicpu.op_reg_reg(A_SXTH,reg2,makeregsize(reg1,OS_32)));
                OS_32:
                  list.concat(taicpu.op_reg_reg_const_const(A_UBFIZ,makeregsize(reg2,OS_64),makeregsize(reg1,OS_64),0,32));
                OS_S32:
                  list.concat(taicpu.op_reg_reg(A_SXTW,reg2,makeregsize(reg1,OS_32)));
                else
                  internalerror(2024070701);
              end;
            else
              internalerror(2002090901);
          end;
        end
      else
        begin
          { 32 -> 32 bit move implies zero extension (sign extensions have been handled above) ->
            also use for 32 <-> 64 bit moves }
          if not(fromsize in [OS_64,OS_S64]) or
             not(tosize in [OS_64,OS_S64]) then
            instr:=taicpu.op_reg_reg(A_MOV,makeregsize(reg2,OS_32),makeregsize(reg1,OS_32))
          else
            instr:=taicpu.op_reg_reg(A_MOV,reg2,reg1);
          list.Concat(instr);
          { Notify the register allocator that we have written a move instruction so it can try to
            eliminate it. }
          add_move_instruction(instr);
        end;
    end;


  procedure tcgaarch64.a_loadaddr_ref_reg(list: TAsmList; const ref: treference; r: tregister);
    var
      href: treference;
      so: tshifterop;
      op: tasmop;
    begin
      op:=A_LDR;
      href:=ref;
      { simplify as if we're going to perform a regular 64 bit load, using "r" as the new base
        register if possible/necessary }
      make_simple_ref(list,op,OS_ADDR,PF_None,href,r);
      { load literal? }
      if assigned(href.symbol) then
        begin
          if (href.base<>NR_NO) or
             (href.index<>NR_NO) or
             not assigned(href.symboldata) then
            internalerror(2014110912);
          list.concat(taicpu.op_reg_sym_ofs(A_ADR,r,href.symbol,href.offset));
        end
      else
        begin
          if href.index<>NR_NO then
            begin
              if href.shiftmode<>SM_None then
                begin
                  { "add" supports a superset of the shift modes supported by load/store instructions }
                  shifterop_reset(so);
                  so.shiftmode:=href.shiftmode;
                  so.shiftimm:=href.shiftimm;
                  list.concat(taicpu.op_reg_reg_reg_shifterop(A_ADD,r,href.base,href.index,so));
                end
              else
                a_op_reg_reg_reg(list,OP_ADD,OS_ADDR,href.index,href.base,r);
            end
          else if href.offset<>0 then
            a_op_const_reg_reg(list,OP_ADD,OS_ADDR,href.offset,href.base,r)
          else
            a_load_reg_reg(list,OS_ADDR,OS_ADDR,href.base,r);
        end;
    end;


  procedure tcgaarch64.a_loadfpu_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister);
    begin
      internalerror(2014122107)
    end;


  procedure tcgaarch64.a_loadfpu_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister);
    begin
      internalerror(2014122108)
    end;


  procedure tcgaarch64.a_loadfpu_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference);
    begin
      internalerror(2014122109)
    end;


  procedure tcgaarch64.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize: tcgsize; reg1, reg2: tregister; shuffle: pmmshuffle);
    var
      instr: taicpu;
    begin
      if assigned(shuffle) and
         not shufflescalar(shuffle) then
        internalerror(2014122104);
      if fromsize=tosize then
        begin
          instr:=taicpu.op_reg_reg(A_FMOV,reg2,reg1);
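          { FMOV between equally sized registers is a pure bit copy; the FCVT in the other branch
            performs an actual single<->double conversion and may therefore raise FPU exceptions }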
          { Notify the register allocator that we have written a move instruction so it can try to
            eliminate it. }
          add_move_instruction(instr);
          { FMOV cannot generate a floating point exception }
        end
      else
        begin
          if (reg_cgsize(reg1)<>fromsize) or
             (reg_cgsize(reg2)<>tosize) then
            internalerror(2014110913);
          instr:=taicpu.op_reg_reg(A_FCVT,reg2,reg1);
          maybe_check_for_fpu_exception(list);
        end;
      list.Concat(instr);
    end;


  procedure tcgaarch64.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize: tcgsize; const ref: treference; reg: tregister; shuffle: pmmshuffle);
    var
      tmpreg: tregister;
    begin
      if assigned(shuffle) and
         not shufflescalar(shuffle) then
        internalerror(2014122105);
      tmpreg:=NR_NO;
      if (fromsize<>tosize) then
        begin
          tmpreg:=reg;
          reg:=getmmregister(list,fromsize);
        end;
      handle_load_store(list,A_LDR,fromsize,PF_None,reg,ref);
      if (fromsize<>tosize) then
        a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
    end;


  procedure tcgaarch64.a_loadmm_reg_ref(list: TAsmList; fromsize, tosize: tcgsize; reg: tregister; const ref: treference; shuffle: pmmshuffle);
    var
      tmpreg: tregister;
    begin
      if assigned(shuffle) and
         not shufflescalar(shuffle) then
        internalerror(2014122106);
      if (fromsize<>tosize) then
        begin
          tmpreg:=getmmregister(list,tosize);
          a_loadmm_reg_reg(list,fromsize,tosize,reg,tmpreg,nil);
          reg:=tmpreg;
        end;
      handle_load_store(list,A_STR,tosize,PF_NONE,reg,ref);
    end;


  procedure tcgaarch64.a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize: tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle);
    begin
      if not shufflescalar(shuffle) then
        internalerror(2014122801);
      if tcgsize2size[fromsize]<>tcgsize2size[tosize] then
        internalerror(2014122803);
      case tcgsize2size[tosize] of
        4:
          setsubreg(mmreg,R_SUBMMS);
        8:
          setsubreg(mmreg,R_SUBMMD);
        else
          internalerror(2020101310);
      end;
      list.concat(taicpu.op_indexedreg_reg(A_INS,mmreg,0,intreg));
    end;


  procedure tcgaarch64.a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize: tcgsize; mmreg, intreg: tregister; shuffle: pmmshuffle);
    var
      r : tregister;
    begin
      if not shufflescalar(shuffle) then
        internalerror(2014122802);
      if tcgsize2size[fromsize]>tcgsize2size[tosize] then
        internalerror(2014122804);
      case tcgsize2size[fromsize] of
        4:
          setsubreg(mmreg,R_SUBMMS);
        8:
          setsubreg(mmreg,R_SUBMMD);
        else
          internalerror(2020101311);
      end;
      if tcgsize2size[fromsize]<tcgsize2size[tosize] then
        begin
          r:=makeregsize(intreg,fromsize);
          list.concat(taicpu.op_reg_indexedreg(A_UMOV,r,mmreg,0));
          a_load_reg_reg(list,fromsize,tosize,r,intreg);
        end
      else
        list.concat(taicpu.op_reg_indexedreg(A_UMOV,intreg,mmreg,0));
    end;


  procedure tcgaarch64.a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size: tcgsize; src, dst: tregister; shuffle: pmmshuffle);
    begin
      case op of
        OP_XOR:
          begin
            if (src<>dst) or
               (reg_cgsize(src)<>size) or
               assigned(shuffle) then
              internalerror(2015011401)
            else
              case size of
                OS_F32,
                OS_F64:
                  list.concat(taicpu.op_reg_const(A_MOVI,makeregsize(dst,OS_F64),0));
                else
                  internalerror(2015011402);
              end;
          end
        else
          internalerror(2015011403);
      end;
    end;


  procedure tcgaarch64.a_bit_scan_reg_reg(list: TAsmList; reverse: boolean; srcsize, dstsize: tcgsize; src, dst: TRegister);
    var
      bitsize: longint;
    begin
      if srcsize in [OS_64,OS_S64] then
        begin
          bitsize:=64;
        end
      else
        begin
          bitsize:=32;
        end;
      { source is 0 -> dst will have to become 255 }
      list.concat(taicpu.op_reg_const(A_CMP,src,0));
      if reverse then
        begin
          list.Concat(taicpu.op_reg_reg(A_CLZ,makeregsize(dst,srcsize),src));
          { xor 31/63 is the same as setting the lower 5/6 bits to
            "31/63-(lower 5/6 bits of dst)" }
          list.Concat(taicpu.op_reg_reg_const(A_EOR,dst,dst,bitsize-1));
        end
      else
        begin
          list.Concat(taicpu.op_reg_reg(A_RBIT,makeregsize(dst,srcsize),src));
          list.Concat(taicpu.op_reg_reg(A_CLZ,dst,dst));
        end;
      { set dst to -1 if src was 0 }
      list.Concat(taicpu.op_reg_reg_reg_cond(A_CSINV,dst,dst,makeregsize(NR_XZR,dstsize),C_NE));
      { mask the -1 to 255 if src was 0 (anyone find a two-instruction branch-free version? All of
        mine are 3...)
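        - as a concrete sequence, the forward scan ends up as roughly:
          cmp src,#0 / rbit dst,src / clz dst,dst / csinv dst,dst,xzr,ne / uxtb dst,dst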
} list.Concat(taicpu.op_reg_reg(A_UXTB,makeregsize(dst,OS_32),makeregsize(dst,OS_32))); end; procedure tcgaarch64.a_load_reg_ref_unaligned(list: TAsmList; fromsize, tosize: tcgsize; register: tregister; const ref: treference); var href: treference; hreg1, hreg2, tmpreg: tregister; begin if fromsize in [OS_64,OS_S64] then begin { split into two 32 bit stores } hreg1:=getintregister(list,OS_32); hreg2:=getintregister(list,OS_32); a_load_reg_reg(list,OS_32,OS_32,makeregsize(register,OS_32),hreg1); a_op_const_reg_reg(list,OP_SHR,OS_64,32,register,makeregsize(hreg2,OS_64)); if target_info.endian=endian_big then begin tmpreg:=hreg1; hreg1:=hreg2; hreg2:=tmpreg; end; { can we use STP? } if (ref.alignment=4) and (simple_ref_type(A_STP,OS_32,PF_None,ref)=sr_simple) then list.concat(taicpu.op_reg_reg_ref(A_STP,hreg1,hreg2,ref)) else begin a_load_reg_ref(list,OS_32,OS_32,hreg1,ref); href:=ref; inc(href.offset,4); a_load_reg_ref(list,OS_32,OS_32,hreg2,href); end; end else inherited; end; procedure tcgaarch64.maybeadjustresult(list: TAsmList; op: topcg; size: tcgsize; dst: tregister); const overflowops = [OP_MUL,OP_IMUL,OP_SHL,OP_ADD,OP_SUB,OP_NOT,OP_NEG]; begin if (op in overflowops) and (size in [OS_8,OS_S8,OS_16,OS_S16]) then a_load_reg_reg(list,OS_32,size,makeregsize(dst,OS_32),makeregsize(dst,OS_32)) end; procedure tcgaarch64.a_op_const_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; reg: tregister); begin optimize_op_const(size,op,a); case op of OP_NONE: exit; OP_MOVE: a_load_const_reg(list,size,a,reg); OP_NEG,OP_NOT: internalerror(200306011); else a_op_const_reg_reg(list,op,size,a,reg,reg); end; end; procedure tcgaarch64.a_op_reg_reg(list:TAsmList;op:topcg;size:tcgsize;src,dst:tregister); begin Case op of OP_NEG, OP_NOT: begin if (op=OP_NOT) and (size in [OS_8,OS_S8]) then list.concat(taicpu.op_reg_reg_const(A_EOR,dst,src,255)) else begin list.concat(taicpu.op_reg_reg(TOpCG2AsmOpReg[op],dst,src)); maybeadjustresult(list,op,size,dst); end; end else a_op_reg_reg_reg(list,op,size,src,dst,dst); end; end; procedure tcgaarch64.a_op_const_reg_reg(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister); var l: tlocation; begin a_op_const_reg_reg_checkoverflow(list,op,size,a,src,dst,false,l); end; procedure tcgaarch64.a_op_reg_reg_reg(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister); var hreg: tregister; begin { no ROLV opcode... 
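        - rotate-left is therefore lowered to rotate-right with the complemented count,
        rol(x,n) = ror(x,bits-n); the code below computes bits-n into a temporary register first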
} if op=OP_ROL then begin case size of OS_32,OS_S32, OS_64,OS_S64: begin hreg:=getintregister(list,size); a_load_const_reg(list,size,tcgsize2size[size]*8,hreg); a_op_reg_reg(list,OP_SUB,size,src1,hreg); a_op_reg_reg_reg(list,OP_ROR,size,hreg,src2,dst); exit; end; else internalerror(2014111005); end; end else if (op=OP_ROR) and not(size in [OS_32,OS_S32,OS_64,OS_S64]) then internalerror(2014111006); if TOpCG2AsmOpReg[op]=A_NONE then internalerror(2014111007); list.concat(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1)); maybeadjustresult(list,op,size,dst); end; procedure tcgaarch64.a_op_const_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; a: tcgint; src, dst: tregister; setflags : boolean; var ovloc : tlocation); var shiftcountmask: longint; constreg: tregister; begin { add/sub instructions have only positive immediate operands } if (op in [OP_ADD,OP_SUB]) and (a<0) and { this might result in a false positive overflow in case of a+0 } (a<>$8000000000000000) then begin if op=OP_ADD then op:=op_SUB else op:=OP_ADD; { avoid range/overflow error in case a = low(tcgint) } {$push}{$r-}{$q-} a:=-a; {$pop} end; ovloc.loc:=LOC_VOID; optimize_op_const(size,op,a); case op of OP_NONE: begin a_load_reg_reg(list,size,size,src,dst); exit; end; OP_MOVE: begin a_load_const_reg(list,size,a,dst); exit; end; else ; end; case op of OP_ADD, OP_SUB: begin handle_reg_imm12_reg(list,TOpCG2AsmOpImm[op],size,src,a,dst,NR_NO,setflags,true); { on a 64 bit target, overflows with smaller data types are handled via range errors } if setflags and (size in [OS_64,OS_S64]) then begin location_reset(ovloc,LOC_FLAGS,OS_8); if size=OS_64 then if op=OP_ADD then ovloc.resflags:=F_CS else ovloc.resflags:=F_CC else ovloc.resflags:=F_VS; end; end; OP_OR, OP_AND, OP_XOR: begin if not(size in [OS_64,OS_S64]) then a:=cardinal(a); if is_shifter_const(a,size) then list.concat(taicpu.op_reg_reg_const(TOpCG2AsmOpReg[op],dst,src,a)) else begin constreg:=getintregister(list,size); a_load_const_reg(list,size,a,constreg); a_op_reg_reg_reg(list,op,size,constreg,src,dst); end; end; OP_SHL, OP_SHR, OP_SAR: begin if size in [OS_64,OS_S64] then shiftcountmask:=63 else shiftcountmask:=31; if (a and shiftcountmask)<>0 Then list.concat(taicpu.op_reg_reg_const( TOpCG2AsmOpImm[Op],dst,src,a and shiftcountmask)) else a_load_reg_reg(list,size,size,src,dst); if (a and not(tcgint(shiftcountmask)))<>0 then internalError(2014112101); end; OP_ROL, OP_ROR: begin case size of OS_32,OS_S32: if (a and not(tcgint(31)))<>0 then internalError(2014112102); OS_64,OS_S64: if (a and not(tcgint(63)))<>0 then internalError(2014112103); else internalError(2014112104); end; { there's only a ror opcode } if op=OP_ROL then a:=(tcgsize2size[size]*8)-a; list.concat(taicpu.op_reg_reg_const(A_ROR,dst,src,a)); end; OP_MUL, OP_IMUL, OP_DIV, OP_IDIV: begin constreg:=getintregister(list,size); a_load_const_reg(list,size,a,constreg); a_op_reg_reg_reg_checkoverflow(list,op,size,constreg,src,dst,setflags,ovloc); end; else internalerror(2014111403); end; maybeadjustresult(list,op,size,dst); end; procedure tcgaarch64.a_op_reg_reg_reg_checkoverflow(list: TAsmList; op: topcg; size: tcgsize; src1, src2, dst: tregister; setflags : boolean; var ovloc : tlocation); var tmpreg1, tmpreg2: tregister; begin ovloc.loc:=LOC_VOID; { overflow can only occur with 64 bit calculations on 64 bit cpus } if setflags and (size in [OS_64,OS_S64]) then begin case op of OP_ADD, OP_SUB: begin list.concat(setoppostfix(taicpu.op_reg_reg_reg(TOpCG2AsmOpReg[op],dst,src2,src1),PF_S)); 
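              { e.g. a checked 64 bit addition is emitted as "adds xD,xS2,xS1"; the caller then tests
                the carry flag for unsigned or the overflow flag for signed arithmetic, as selected
                below }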
              ovloc.loc:=LOC_FLAGS;
              if size=OS_64 then
                if op=OP_ADD then
                  ovloc.resflags:=F_CS
                else
                  ovloc.resflags:=F_CC
              else
                ovloc.resflags:=F_VS;
              { finished }
              exit;
            end;
          OP_MUL:
            begin
              { check whether the upper 64 bit of the 128 bit product is 0 }
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_UMULH,tmpreg1,src2,src1));
              list.concat(taicpu.op_reg_const(A_CMP,tmpreg1,0));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { still have to perform the actual multiplication }
            end;
          OP_IMUL:
            begin
              { check whether the upper 64 bits of the 128 bit multiplication result have the same
                value as the replicated sign bit of the lower 64 bits }
              tmpreg1:=getintregister(list,OS_64);
              list.concat(taicpu.op_reg_reg_reg(A_SMULH,tmpreg1,src2,src1));
              { calculate lower 64 bits (afterwards, because dst may be equal to src1 or src2) }
              a_op_reg_reg_reg(list,op,size,src1,src2,dst);
              { replicate sign bit }
              tmpreg2:=getintregister(list,OS_64);
              a_op_const_reg_reg(list,OP_SAR,OS_S64,63,dst,tmpreg2);
              list.concat(taicpu.op_reg_reg(A_CMP,tmpreg1,tmpreg2));
              ovloc.loc:=LOC_FLAGS;
              ovloc.resflags:=F_NE;
              { finished }
              exit;
            end;
          OP_IDIV,
          OP_DIV:
            begin
              { not handled here, needs div-by-zero check (dividing by zero just gives a 0 result on
                aarch64), and a low(int64) div -1 check for overflow }
              internalerror(2014122101);
            end;
          else
            internalerror(2019050936);
        end;
      end;
      a_op_reg_reg_reg(list,op,size,src1,src2,dst);
    end;


{*************** compare instructions ****************}

  procedure tcgaarch64.a_cmp_const_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; a: tcgint; reg: tregister; l: tasmlabel);
    var
      op: tasmop;
    begin
      if a>=0 then
        op:=A_CMP
      else
        op:=A_CMN;
      { avoid range/overflow error in case a=low(tcgint) }
      {$push}{$r-}{$q-}
      handle_reg_imm12_reg(list,op,size,reg,abs(a),NR_XZR,NR_NO,false,false);
      {$pop}
      a_jmp_cond(list,cmp_op,l);
    end;


  procedure tcgaarch64.a_cmp_reg_reg_label(list: TAsmList; size: tcgsize; cmp_op: topcmp; reg1,reg2: tregister; l: tasmlabel);
    begin
      list.concat(taicpu.op_reg_reg(A_CMP,reg2,reg1));
      a_jmp_cond(list,cmp_op,l);
    end;


  procedure tcgaarch64.a_jmp_always(list: TAsmList; l: TAsmLabel);
    var
      ai: taicpu;
    begin
      ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(l.name,AT_FUNCTION));
      ai.is_jmp:=true;
      list.Concat(ai);
    end;


  procedure tcgaarch64.a_jmp_name(list: TAsmList; const s: string);
    var
      ai: taicpu;
    begin
      ai:=TAiCpu.op_sym(A_B,current_asmdata.RefAsmSymbol(s,AT_FUNCTION));
      ai.is_jmp:=true;
      list.Concat(ai);
    end;


  procedure tcgaarch64.a_jmp_cond(list: TAsmList; cond: TOpCmp; l: TAsmLabel);
    var
      ai: taicpu;
    begin
      ai:=TAiCpu.op_sym(A_B,l);
      ai.is_jmp:=true;
      ai.SetCondition(TOpCmp2AsmCond[cond]);
      list.Concat(ai);
    end;


  procedure tcgaarch64.a_jmp_flags(list: TAsmList; const f: tresflags; l: tasmlabel);
    var
      ai : taicpu;
    begin
      ai:=Taicpu.op_sym(A_B,l);
      ai.is_jmp:=true;
      ai.SetCondition(flags_to_cond(f));
      list.Concat(ai);
    end;


  procedure tcgaarch64.g_flags2reg(list: TAsmList; size: tcgsize; const f: tresflags; reg: tregister);
    begin
      list.concat(taicpu.op_reg_cond(A_CSET,reg,flags_to_cond(f)));
    end;


  procedure tcgaarch64.g_overflowcheck(list: TAsmList; const loc: tlocation; def: tdef);
    begin
      { we need an explicit overflow location, because there are many possibilities (not just the
        overflow flag, which is only used for signed add/sub) }
      internalerror(2014112303);
    end;


  procedure tcgaarch64.g_overflowcheck_loc(list: TAsmList; const loc: tlocation; def: tdef; ovloc : tlocation);
    var
      hl : tasmlabel;
      hflags : tresflags;
    begin
      if not(cs_check_overflow in current_settings.localswitches) then
        exit;
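      { the generated pattern is roughly: b.<inverted cond> Lok; bl FPC_OVERFLOW; Lok: }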
current_asmdata.getjumplabel(hl); case ovloc.loc of LOC_FLAGS: begin hflags:=ovloc.resflags; inverse_flags(hflags); cg.a_jmp_flags(list,hflags,hl); end; else internalerror(2014112304); end; a_call_name(list,'FPC_OVERFLOW',false); a_label(list,hl); end; { *********** entry/exit code and address loading ************ } function tcgaarch64.save_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister): longint; var ref: treference; sr: tsuperregister; pairreg: tregister; sehreg,sehregp : TAsmSehDirective; begin result:=0; reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]); ref.addressmode:=AM_PREINDEXED; pairreg:=NR_NO; { for SEH on Win64 we can only store consecutive register pairs, others need to be stored with STR } if target_info.system=system_aarch64_win64 then begin if rt=R_INTREGISTER then begin sehreg:=ash_savereg_x; sehregp:=ash_saveregp_x; end else if rt=R_MMREGISTER then begin sehreg:=ash_savefreg_x; sehregp:=ash_savefregp_x; end else internalerror(2020041304); for sr:=lowsr to highsr do if sr in rg[rt].used_in_proc then if pairreg=NR_NO then pairreg:=newreg(rt,sr,sub) else begin inc(result,16); if getsupreg(pairreg)=sr-1 then begin list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref)); list.concat(cai_seh_directive.create_reg_offset(sehregp,pairreg,16)); pairreg:=NR_NO; end else begin list.concat(taicpu.op_reg_ref(A_STR,pairreg,ref)); list.concat(cai_seh_directive.create_reg_offset(sehreg,pairreg,16)); pairreg:=newreg(rt,sr,sub); end; end; if pairreg<>NR_NO then begin inc(result,16); list.concat(taicpu.op_reg_ref(A_STR,pairreg,ref)); list.concat(cai_seh_directive.create_reg_offset(sehreg,pairreg,16)); end; end else begin { store all used registers pairwise } for sr:=lowsr to highsr do if sr in rg[rt].used_in_proc then if pairreg=NR_NO then pairreg:=newreg(rt,sr,sub) else begin inc(result,16); list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,newreg(rt,sr,sub),ref)); pairreg:=NR_NO end; { one left -> store twice (stack must be 16 bytes aligned) } if pairreg<>NR_NO then begin list.concat(taicpu.op_reg_reg_ref(A_STP,pairreg,pairreg,ref)); inc(result,16); end; end; end; procedure FixupOffsets(p:TObject;arg:pointer); var sym: tabstractnormalvarsym absolute p; begin if (tsym(p).typ in [paravarsym,localvarsym]) and (sym.localloc.loc=LOC_REFERENCE) and (sym.localloc.reference.base=NR_STACK_POINTER_REG) then begin sym.localloc.reference.base:=NR_FRAME_POINTER_REG; dec(sym.localloc.reference.offset,PLongint(arg)^); end; end; procedure tcgaarch64.g_stackpointer_alloc(list : TAsmList;localsize : longint); var href : treference; i : integer; again : tasmlabel; begin if localsize>0 then begin { windows guards only a few pages for stack growing, so we have to access every page first } if (target_info.system=system_aarch64_win64) and (localsize>=winstackpagesize) then begin if localsize div winstackpagesize<=4 then begin handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true); for i:=1 to localsize div winstackpagesize do begin reference_reset_base(href,NR_SP,localsize-i*winstackpagesize+4,ctempposinvalid,4,[]); list.concat(Taicpu.op_reg_ref(A_STR,NR_WZR,href)); end; reference_reset_base(href,NR_SP,0,ctempposinvalid,4,[]); list.concat(Taicpu.op_reg_ref(A_STR,NR_WZR,href)); end else begin current_asmdata.getjumplabel(again); getcpuregister(list,NR_IP0); a_load_const_reg(list,OS_ADDR,localsize div winstackpagesize,NR_IP0); a_label(list,again); 
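                  { each iteration of the loop emitted below touches one guard page; with ip0 (x16)
                    as the page counter it is roughly:
                      again: sub sp,sp,#4096; str wzr,[sp]; subs x16,x16,#1; b.ne again }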
handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,winstackpagesize,NR_SP,NR_IP1,false,true);
                  reference_reset_base(href,NR_SP,0,ctempposinvalid,4,[]);
                  list.concat(Taicpu.op_reg_ref(A_STR,NR_WZR,href));
                  list.concat(setoppostfix(Taicpu.op_reg_reg_const(A_SUB,NR_IP0,NR_IP0,1),PF_S));
                  a_jmp_cond(list,OC_NE,again);
                  handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize mod winstackpagesize,NR_SP,NR_IP1,false,true);
                  ungetcpuregister(list,NR_IP0);
                end
            end
          else
            begin
              handle_reg_imm12_reg(list,A_SUB,OS_ADDR,NR_SP,localsize,NR_SP,NR_IP0,false,true);
              if target_info.system=system_aarch64_win64 then
                list.concat(cai_seh_directive.create_offset(ash_stackalloc,localsize));
            end;
        end;
    end;


  procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
    var
      hitem: tlinkedlistitem;
      seh_proc: tai_seh_directive;
      templist: TAsmList;
      suppress_endprologue: boolean;
      ref: treference;
      totalstackframesize: longint;
    begin
      { on aarch64, we need to store the link register and generate a frame pointer if the subroutine
        either
        - receives parameters on the stack
        - is not a leaf procedure
        - has nested procedures
        - helpers retrieve the stack pointer }
      hitem:=list.last;
      { pi_has_unwind_info may already be set at this point if there are SEH directives in the
        assembler body. In this case, .seh_endprologue is expected to be one of those directives,
        and not generated here. }
      suppress_endprologue:=(pi_has_unwind_info in current_procinfo.flags);
      if not nostackframe then
        begin
          { stack pointer has to be aligned to 16 bytes at all times }
          localsize:=align(localsize,16);
          if target_info.system=system_aarch64_win64 then
            include(current_procinfo.flags,pi_has_unwind_info);
          if not(pi_no_framepointer_needed in current_procinfo.flags) then
            begin
              { save stack pointer and return address }
              reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
              ref.addressmode:=AM_PREINDEXED;
              list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref));
              current_asmdata.asmcfi.cfa_def_cfa_offset(list,2*sizeof(pint));
              current_asmdata.asmcfi.cfa_offset(list,NR_FP,-16);
              current_asmdata.asmcfi.cfa_offset(list,NR_LR,-8);
              if target_info.system=system_aarch64_win64 then
                list.concat(cai_seh_directive.create_offset(ash_savefplr_x,16));
              { initialise frame pointer }
              if current_procinfo.procdef.proctypeoption<>potype_exceptfilter then
                begin
                  a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);
                  current_asmdata.asmcfi.cfa_def_cfa_register(list,NR_FP);
                  if target_info.system=system_aarch64_win64 then
                    list.concat(cai_seh_directive.create(ash_setfp));
                end
              else
                begin
                  gen_load_frame_for_exceptfilter(list);
                  localsize:=current_procinfo.maxpushedparasize;
                end;
            end;
          totalstackframesize:=localsize;
          { save modified integer registers }
          inc(totalstackframesize,
            save_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE));
          { only the lower 64 bits of the modified vector registers need to be saved; if the caller
            needs the upper 64 bits, it has to save them itself }
          inc(totalstackframesize,
            save_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD));
          { allocate stack space }
          if localsize<>0 then
            begin
              localsize:=align(localsize,16);
              current_procinfo.final_localsize:=localsize;
              g_stackpointer_alloc(list,localsize);
            end;
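          { a typical non-leaf prologue produced by the code above:
              stp x29,x30,[sp,#-16]!
              mov x29,sp
              ...callee-saved stp/str stores...
              sub sp,sp,#localsize }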
          { By default, we use the frame pointer to access parameters passed via the stack and the
            stack pointer to address local variables and temps, because
            a) we can use bigger positive than negative offsets (so accessing locals via negative
               offsets from the frame pointer would be less efficient)
            b) we don't know the local size while generating the code, so accessing the parameters
               via the stack pointer is not possible without copying them
            The problem with this is the get_frame() intrinsic:
            a) it must return the same value as what we pass as parentfp parameter, since that's how
               it's used in the TP-style objects unit
            b) its return value must be usable to access all local data from a routine (locals and
               parameters), since it's all that the nested routines have access to
            c) its return value must be usable to construct a backtrace, as it's also used by the
               exception handling routines
            The solution we use here, based on something similar that's done in the MIPS port, is to
            generate all accesses to locals in the routine itself SP-relative, and then after the
            code is generated and the local size is known (namely, here), we change all SP-relative
            variables/parameters into FP-relative ones. This means that they'll be accessed less
            efficiently from nested routines, but those accesses are indirect anyway and at least
            this way they can be accessed at all }
          if current_procinfo.has_nestedprocs or
             (
               (target_info.system=system_aarch64_win64) and
               (current_procinfo.flags*[pi_has_implicit_finally,pi_needs_implicit_finally,pi_uses_exceptions]<>[])
             ) then
            begin
              current_procinfo.procdef.localst.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
              current_procinfo.procdef.parast.SymList.ForEachCall(@FixupOffsets,@totalstackframesize);
            end;
        end;
      if not (pi_has_unwind_info in current_procinfo.flags) then
        exit;
      { Generate unwind data for aarch64-win64 }
      seh_proc:=cai_seh_directive.create_name(ash_proc,current_procinfo.procdef.mangledname);
      if assigned(hitem) then
        list.insertafter(seh_proc,hitem)
      else
        list.insert(seh_proc);
      { the directive creates another section }
      inc(list.section_count);
      templist:=TAsmList.Create;
      if not suppress_endprologue then
        begin
          templist.concat(cai_seh_directive.create(ash_endprologue));
        end;
      if assigned(current_procinfo.endprologue_ai) then
        current_procinfo.aktproccode.insertlistafter(current_procinfo.endprologue_ai,templist)
      else
        list.concatlist(templist);
      templist.free;
    end;


  procedure tcgaarch64.g_maybe_got_init(list : TAsmList);
    begin
      { nothing to do on Darwin or Linux }
    end;


  procedure tcgaarch64.g_restore_registers(list:TAsmList);
    begin
      { done in g_proc_exit }
    end;


  procedure tcgaarch64.load_regs(list: TAsmList; rt: tregistertype; lowsr, highsr: tsuperregister; sub: tsubregister);
    var
      ref: treference;
      sr, highestsetsr: tsuperregister;
      pairreg: tregister;
      i, regcount: longint;
      aiarr : array of tai;
    begin
      reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
      ref.addressmode:=AM_POSTINDEXED;
      regcount:=0;
      { due to SEH on Win64 we can only load consecutive registers and single ones are done using
        LDR, so we need to handle this differently there }
      if target_info.system=system_aarch64_win64 then
        begin
          setlength(aiarr,highsr-lowsr+1);
          pairreg:=NR_NO;
          for sr:=lowsr to highsr do
            if sr in rg[rt].used_in_proc then
              begin
                if pairreg=NR_NO then
                  pairreg:=newreg(rt,sr,sub)
                else
                  begin
                    if getsupreg(pairreg)=sr-1 then
                      begin
                        aiarr[regcount]:=taicpu.op_reg_reg_ref(A_LDP,pairreg,newreg(rt,sr,sub),ref);
                        inc(regcount);
                        pairreg:=NR_NO;
                      end
                    else
                      begin
                        aiarr[regcount]:=taicpu.op_reg_ref(A_LDR,pairreg,ref);
                        inc(regcount);
                        pairreg:=newreg(rt,sr,sub);
                      end;
                  end;
              end;
          if pairreg<>NR_NO then
            begin
              aiarr[regcount]:=taicpu.op_reg_ref(A_LDR,pairreg,ref);
              inc(regcount);
              pairreg:=NR_NO;
            end;
          for i:=regcount-1 downto 0 do
            list.concat(aiarr[i]);
        end
      else
        begin
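          { the registers were saved with "stp rX,rY,[sp,#-16]!", so they are reloaded here in
            reverse order with post-indexed "ldp rX,rY,[sp],#16", undoing the pushes pair by pair }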
          { highest reg stored twice? }
          highestsetsr:=RS_NO;
          for sr:=lowsr to highsr do
            if sr in rg[rt].used_in_proc then
              begin
                inc(regcount);
                highestsetsr:=sr;
              end;
          if odd(regcount) then
            begin
              list.concat(taicpu.op_reg_ref(A_LDR,newreg(rt,highestsetsr,sub),ref));
              highestsetsr:=pred(highestsetsr);
            end;
          { load all (other) used registers pairwise }
          pairreg:=NR_NO;
          for sr:=highestsetsr downto lowsr do
            if sr in rg[rt].used_in_proc then
              if pairreg=NR_NO then
                pairreg:=newreg(rt,sr,sub)
              else
                begin
                  list.concat(taicpu.op_reg_reg_ref(A_LDP,newreg(rt,sr,sub),pairreg,ref));
                  pairreg:=NR_NO
                end;
        end;
      { There can't be any register left }
      if pairreg<>NR_NO then
        internalerror(2014112602);
    end;


  procedure tcgaarch64.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);
    var
      ref: treference;
      regsstored: boolean;
      sr: tsuperregister;
    begin
      if not(nostackframe) and
         { we do not need an exit stack frame when we never return
           * the final ret is left so the peephole optimizer can easily do call/ret -> jmp or call
             conversions
           * the entry stack frame must be normally generated because the subroutine could still be
             left by an exception and then the unwinding code might need to restore the registers
             stored by the entry code }
         not(po_noreturn in current_procinfo.procdef.procoptions) then
        begin
          { if no registers have been stored, we don't have to subtract the allocated temp space
            from the stack pointer }
          regsstored:=false;
          for sr:=RS_X19 to RS_X28 do
            if sr in rg[R_INTREGISTER].used_in_proc then
              begin
                regsstored:=true;
                break;
              end;
          if not regsstored then
            for sr:=RS_D8 to RS_D15 do
              if sr in rg[R_MMREGISTER].used_in_proc then
                begin
                  regsstored:=true;
                  break;
                end;
          { restore registers (and stack pointer) }
          if regsstored then
            begin
              if current_procinfo.final_localsize<>0 then
                handle_reg_imm12_reg(list,A_ADD,OS_ADDR,NR_SP,current_procinfo.final_localsize,NR_SP,NR_IP0,false,true);
              load_regs(list,R_MMREGISTER,RS_D8,RS_D15,R_SUBMMD);
              load_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE);
            end
          else if current_procinfo.final_localsize<>0 then
            begin
              { restore stack pointer }
              if pi_no_framepointer_needed in current_procinfo.flags then
                handle_reg_imm12_reg(list,A_ADD,OS_ADDR,current_procinfo.framepointer,current_procinfo.final_localsize,
                  current_procinfo.framepointer,NR_IP0,false,true)
              else
                a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP);
            end;
          if not(pi_no_framepointer_needed in current_procinfo.flags) then
            begin
              { restore framepointer and return address }
              reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
              ref.addressmode:=AM_POSTINDEXED;
              list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref));
            end;
        end;
      { return }
      list.concat(taicpu.op_none(A_RET));
      if (pi_has_unwind_info in current_procinfo.flags) then
        begin
          tcpuprocinfo(current_procinfo).dump_scopes(list);
          list.concat(cai_seh_directive.create(ash_endproc));
        end;
    end;


  procedure tcgaarch64.g_save_registers(list : TAsmList);
    begin
      { done in g_proc_entry }
    end;


  procedure tcgaarch64.g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);
    var
      sourcebasereplaced, destbasereplaced: boolean;

    { get optimal memory operation to use for loading/storing data in an unrolled loop }
    procedure getmemop(scaledop, unscaledop: tasmop; const startref, endref: treference; opsize: tcgsize; postfix: toppostfix; out memop: tasmop; out needsimplify: boolean);
      begin
        if (simple_ref_type(scaledop,opsize,postfix,startref)=sr_simple) and
           (simple_ref_type(scaledop,opsize,postfix,endref)=sr_simple) then
          begin
            memop:=scaledop;
            needsimplify:=false;
          end
        else if (unscaledop<>A_NONE) and
    procedure tcgaarch64.g_save_registers(list : TAsmList);
      begin
        { done in g_proc_entry }
      end;


    procedure tcgaarch64.g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);
      var
        sourcebasereplaced, destbasereplaced: boolean;

      { get optimal memory operation to use for loading/storing data
        in an unrolled loop }
      procedure getmemop(scaledop, unscaledop: tasmop; const startref, endref: treference; opsize: tcgsize; postfix: toppostfix; out memop: tasmop; out needsimplify: boolean);
        begin
          if (simple_ref_type(scaledop,opsize,postfix,startref)=sr_simple) and
             (simple_ref_type(scaledop,opsize,postfix,endref)=sr_simple) then
            begin
              memop:=scaledop;
              needsimplify:=false;
            end
          else if (unscaledop<>A_NONE) and
             (simple_ref_type(unscaledop,opsize,postfix,startref)=sr_simple) and
             (simple_ref_type(unscaledop,opsize,postfix,endref)=sr_simple) then
            begin
              memop:=unscaledop;
              needsimplify:=false;
            end
          else
            begin
              memop:=scaledop;
              needsimplify:=true;
            end;
        end;

      { adjust the offset and/or addressing mode after a load/store so it's
        correct for the next one of the same size }
      procedure updaterefafterloadstore(var ref: treference; oplen: longint);
        begin
          case ref.addressmode of
            AM_OFFSET:
              inc(ref.offset,oplen);
            AM_POSTINDEXED:
              { base register updated by instruction, next offset can remain
                the same }
              ;
            AM_PREINDEXED:
              begin
                { base register updated by instruction -> next instruction can
                  use post-indexing with offset = sizeof(operation) }
                ref.offset:=0;
                ref.addressmode:=AM_OFFSET;
              end;
          end;
        end;

      { generate a load/store and adjust the reference offset to the next
        memory location if necessary }
      procedure genloadstore(list: TAsmList; op: tasmop; reg: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
        begin
          list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),postfix));
          updaterefafterloadstore(ref,tcgsize2size[opsize]);
        end;

      { generate a dual load/store (ldp/stp) and adjust the reference offset
        to the next memory location if necessary }
      procedure gendualloadstore(list: TAsmList; op: tasmop; reg1, reg2: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
        begin
          list.concat(setoppostfix(taicpu.op_reg_reg_ref(op,reg1,reg2,ref),postfix));
          updaterefafterloadstore(ref,tcgsize2size[opsize]*2);
        end;

      { turn a reference into a pre- or post-indexed reference for use in
        a load/store of a particular size }
      procedure makesimpleforcopy(list: TAsmList; var scaledop: tasmop; opsize: tcgsize; postfix: toppostfix; forcepostindexing: boolean; var ref: treference; var basereplaced: boolean);
        var
          tmpreg: tregister;
          scaledoffset: longint;
          orgaddressmode: taddressmode;
        begin
          scaledoffset:=tcgsize2size[opsize];
          if scaledop in [A_LDP,A_STP] then
            scaledoffset:=scaledoffset*2;
          { can we use the reference as post-indexed without changes? }
          if forcepostindexing then
            begin
              orgaddressmode:=ref.addressmode;
              ref.addressmode:=AM_POSTINDEXED;
              if (orgaddressmode=AM_POSTINDEXED) or
                 ((ref.offset=0) and
                  (simple_ref_type(scaledop,opsize,postfix,ref)=sr_simple)) then
                begin
                  { just change the post-indexed offset to the access size }
                  ref.offset:=scaledoffset;
                  { and replace the base register if that didn't happen yet
                    (could be sp or a regvar) }
                  if not basereplaced then
                    begin
                      tmpreg:=getaddressregister(list);
                      a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
                      ref.base:=tmpreg;
                      basereplaced:=true;
                    end;
                  exit;
                end;
              ref.addressmode:=orgaddressmode;
            end;
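          { For illustration: a post-indexed access advances the base register
            as a side effect of the memory operation itself, e.g.
                ldp x2, x3, [x4], #16   // x4 := x4 + 16 after the load
            which is why a post-indexed reference with the right offset needs
            no further adjustment between copy iterations. }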
{$ifdef dummy}
          This could in theory be useful in case you have a concatcopy from
          e.g. x1+255 to x1+267 *and* the reference is aligned, but this seems
          very unlikely. Disabled because it still needs fixes, as it also
          generates pre-indexed loads right now at the very end for the
          left-over gencopies

          { can we turn it into a pre-indexed reference for free? (after the
            first operation, it will be turned into an offset one) }
          if not forcepostindexing and
             (ref.offset<>0) then
            begin
              orgaddressmode:=ref.addressmode;
              ref.addressmode:=AM_PREINDEXED;
              tmpreg:=ref.base;
              if not basereplaced and
                 (ref.base=tmpreg) then
                begin
                  tmpreg:=getaddressregister(list);
                  a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
                  ref.base:=tmpreg;
                  basereplaced:=true;
                end;
              if simple_ref_type(scaledop,opsize,postfix,ref)<>sr_simple then
                make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
              exit;
            end;
{$endif dummy}
          if not forcepostindexing then
            begin
              ref.addressmode:=AM_OFFSET;
              make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
              { this may still cause problems if the final offset is no longer
                a simple ref; it's a bit complicated to pass all information
                through at all places and check that here, so play safe: we
                currently never generate unrolled copies for more than 64
                bytes (32 with non-double-register copies) }
              if ref.index=NR_NO then
                begin
                  if ((scaledop in [A_LDP,A_STP]) and
                      (ref.offset<((64-8)*tcgsize2size[opsize]))) or
                     ((scaledop in [A_LDUR,A_STUR]) and
                      (ref.offset<(255-8*tcgsize2size[opsize]))) or
                     ((scaledop in [A_LDR,A_STR]) and
                      (ref.offset<((4096-8)*tcgsize2size[opsize]))) then
                    exit;
                end;
            end;
          tmpreg:=getaddressregister(list);
          a_loadaddr_ref_reg(list,ref,tmpreg);
          basereplaced:=true;
          if forcepostindexing then
            begin
              reference_reset_base(ref,tmpreg,scaledoffset,ref.temppos,ref.alignment,ref.volatility);
              ref.addressmode:=AM_POSTINDEXED;
            end
          else
            begin
              reference_reset_base(ref,tmpreg,0,ref.temppos,ref.alignment,ref.volatility);
              ref.addressmode:=AM_OFFSET;
            end
        end;

      { prepare a reference for use by gencopy. This is done both after the
        unrolled and regular copy loop -> get rid of post-indexing mode, make
        sure ref is valid }
      procedure preparecopy(list: tasmlist; scaledop, unscaledop: tasmop; var ref: treference; opsize: tcgsize; postfix: toppostfix; out op: tasmop; var basereplaced: boolean);
        var
          simplify: boolean;
        begin
          if ref.addressmode=AM_POSTINDEXED then
            ref.offset:=tcgsize2size[opsize];
          getmemop(scaledop,scaledop,ref,ref,opsize,postfix,op,simplify);
          if simplify then
            begin
              makesimpleforcopy(list,scaledop,opsize,postfix,false,ref,basereplaced);
              op:=scaledop;
            end;
        end;

      { generate a copy from source to dest of size opsize/postfix }
      procedure gencopy(list: TAsmList; var source, dest: treference; postfix: toppostfix; opsize: tcgsize);
        var
          reg: tregister;
          loadop, storeop: tasmop;
        begin
          preparecopy(list,A_LDR,A_LDUR,source,opsize,postfix,loadop,sourcebasereplaced);
          preparecopy(list,A_STR,A_STUR,dest,opsize,postfix,storeop,destbasereplaced);
          reg:=getintregister(list,opsize);
          genloadstore(list,loadop,reg,source,postfix,opsize);
          genloadstore(list,storeop,reg,dest,postfix,opsize);
        end;

      { copy the leftovers after an unrolled or regular copy loop }
      procedure gencopyleftovers(list: TAsmList; var source, dest: treference; len: longint);
        begin
          { stop post-indexing if we did so in the loop, since in that case
            all offsets definitely can be represented now }
          if source.addressmode=AM_POSTINDEXED then
            begin
              source.addressmode:=AM_OFFSET;
              source.offset:=0;
            end;
          if dest.addressmode=AM_POSTINDEXED then
            begin
              dest.addressmode:=AM_OFFSET;
              dest.offset:=0;
            end;
          { transfer the leftovers }
          if len>=8 then
            begin
              dec(len,8);
              gencopy(list,source,dest,PF_NONE,OS_64);
            end;
          if len>=4 then
            begin
              dec(len,4);
              gencopy(list,source,dest,PF_NONE,OS_32);
            end;
          if len>=2 then
            begin
              dec(len,2);
              gencopy(list,source,dest,PF_H,OS_16);
            end;
          if len>=1 then
            begin
              dec(len);
              gencopy(list,source,dest,PF_B,OS_8);
            end;
        end;
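      { Example (illustrative): with 15 leftover bytes, gencopyleftovers emits
        one 8-byte, one 4-byte, one 2-byte and one 1-byte copy, in that order;
        each size is transferred at most once. }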
      const
        { load_length + loop dec + cbnz }
        loopoverhead=12;
        { loop overhead + load + store }
        totallooplen=loopoverhead + 8;
      var
        totalalign: longint;
        maxlenunrolled: tcgint;
        loadop, storeop: tasmop;
        opsize: tcgsize;
        postfix: toppostfix;
        tmpsource, tmpdest: treference;
        scaledstoreop, unscaledstoreop,
        scaledloadop, unscaledloadop: tasmop;
        regs: array[1..8] of tregister;
        countreg: tregister;
        i, regcount: longint;
        hl: tasmlabel;
        simplifysource, simplifydest: boolean;
      begin
        if len=0 then
          exit;
        sourcebasereplaced:=false;
        destbasereplaced:=false;
        { maximum common alignment }
        totalalign:=max(1,newalignment(source.alignment,dest.alignment));
        { use a simple load/store? }
        if (len in [1,2,4,8]) and
           ((totalalign>=(len div 2)) or
            (source.alignment=len) or
            (dest.alignment=len)) then
          begin
            opsize:=int_cgsize(len);
            a_load_ref_ref(list,opsize,opsize,source,dest);
            exit;
          end;
        { alignment > length is not useful, and would break some checks below }
        while totalalign>len do
          totalalign:=totalalign div 2;
        { operation sizes to use based on common alignment }
        case totalalign of
          1:
            begin
              postfix:=PF_B;
              opsize:=OS_8;
            end;
          2:
            begin
              postfix:=PF_H;
              opsize:=OS_16;
            end;
          4:
            begin
              postfix:=PF_None;
              opsize:=OS_32;
            end
          else
            begin
              totalalign:=8;
              postfix:=PF_None;
              opsize:=OS_64;
            end;
        end;
        { maximum length to be handled with an unrolled loop (4 loads + 4 stores) }
        maxlenunrolled:=min(totalalign,8)*4;
        { ldp/stp -> 2 registers per instruction }
        if (totalalign>=4) and
           (len>=totalalign*2) then
          begin
            maxlenunrolled:=maxlenunrolled*2;
            scaledstoreop:=A_STP;
            scaledloadop:=A_LDP;
            unscaledstoreop:=A_NONE;
            unscaledloadop:=A_NONE;
          end
        else
          begin
            scaledstoreop:=A_STR;
            scaledloadop:=A_LDR;
            unscaledstoreop:=A_STUR;
            unscaledloadop:=A_LDUR;
          end;
        { we only need 4 instructions extra to call FPC_MOVE }
        if cs_opt_size in current_settings.optimizerswitches then
          maxlenunrolled:=maxlenunrolled div 2;
        if (len>maxlenunrolled) and
           (len>totalalign*8) and
           (pi_do_call in current_procinfo.flags) then
          begin
            g_concatcopy_move(list,source,dest,len);
            exit;
          end;
        simplifysource:=true;
        simplifydest:=true;
        tmpsource:=source;
        tmpdest:=dest;
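        { Worked example (illustrative): for an 8-byte aligned copy,
          maxlenunrolled starts at min(8,8)*4 = 32 bytes and is doubled to 64
          when LDP/STP can be used, matching the "64 bytes (32 with
          non-double-register copies)" limit mentioned in makesimpleforcopy. }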
        { can we directly encode all offsets in an unrolled loop? }
        if len<=maxlenunrolled then
          begin
{$ifdef extdebug}
            list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop; len/opsize/align: '+tostr(len)+'/'+tostr(tcgsize2size[opsize])+'/'+tostr(totalalign))));
{$endif extdebug}
            { the leftovers will be handled separately -> -(len mod opsize) }
            inc(tmpsource.offset,len-(len mod tcgsize2size[opsize]));
            { additionally, the last regular load/store will be at
              offset+len-opsize (if len-(len mod opsize)>0) }
            if tmpsource.offset>source.offset then
              dec(tmpsource.offset,tcgsize2size[opsize]);
            getmemop(scaledloadop,unscaledloadop,source,tmpsource,opsize,postfix,loadop,simplifysource);
            inc(tmpdest.offset,len-(len mod tcgsize2size[opsize]));
            if tmpdest.offset>dest.offset then
              dec(tmpdest.offset,tcgsize2size[opsize]);
            getmemop(scaledstoreop,unscaledstoreop,dest,tmpdest,opsize,postfix,storeop,simplifydest);
            tmpsource:=source;
            tmpdest:=dest;
            { if we can't directly encode all offsets, simplify }
            if simplifysource then
              begin
                loadop:=scaledloadop;
                makesimpleforcopy(list,loadop,opsize,postfix,false,tmpsource,sourcebasereplaced);
              end;
            if simplifydest then
              begin
                storeop:=scaledstoreop;
                makesimpleforcopy(list,storeop,opsize,postfix,false,tmpdest,destbasereplaced);
              end;
            regcount:=len div tcgsize2size[opsize];
            { in case we transfer two registers at a time, we copy an even
              number of registers }
            if loadop=A_LDP then
              regcount:=regcount and not(1);
            { initialise for dfa }
            regs[low(regs)]:=NR_NO;
            { max 4 loads/stores -> max 8 registers (in case of ldp/stp) }
            for i:=1 to regcount do
              regs[i]:=getintregister(list,opsize);
            if loadop=A_LDP then
              begin
                { load registers }
                for i:=1 to (regcount div 2) do
                  gendualloadstore(list,loadop,regs[i*2-1],regs[i*2],tmpsource,postfix,opsize);
                { store registers }
                for i:=1 to (regcount div 2) do
                  gendualloadstore(list,storeop,regs[i*2-1],regs[i*2],tmpdest,postfix,opsize);
              end
            else
              begin
                for i:=1 to regcount do
                  genloadstore(list,loadop,regs[i],tmpsource,postfix,opsize);
                for i:=1 to regcount do
                  genloadstore(list,storeop,regs[i],tmpdest,postfix,opsize);
              end;
            { leftover }
            len:=len-regcount*tcgsize2size[opsize];
{$ifdef extdebug}
            list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop leftover: '+tostr(len))));
{$endif extdebug}
          end
        else
          begin
{$ifdef extdebug}
            list.concat(tai_comment.Create(strpnew('concatcopy regular loop; len/align: '+tostr(len)+'/'+tostr(totalalign))));
{$endif extdebug}
            { regular loop -> definitely use post-indexing }
            loadop:=scaledloadop;
            makesimpleforcopy(list,loadop,opsize,postfix,true,tmpsource,sourcebasereplaced);
            storeop:=scaledstoreop;
            makesimpleforcopy(list,storeop,opsize,postfix,true,tmpdest,destbasereplaced);
            current_asmdata.getjumplabel(hl);
            countreg:=getintregister(list,OS_32);
            if loadop=A_LDP then
              a_load_const_reg(list,OS_32,len div (tcgsize2size[opsize]*2),countreg)
            else
              a_load_const_reg(list,OS_32,len div tcgsize2size[opsize],countreg);
            a_label(list,hl);
            a_op_const_reg(list,OP_SUB,OS_32,1,countreg);
            if loadop=A_LDP then
              begin
                regs[1]:=getintregister(list,opsize);
                regs[2]:=getintregister(list,opsize);
                gendualloadstore(list,loadop,regs[1],regs[2],tmpsource,postfix,opsize);
                gendualloadstore(list,storeop,regs[1],regs[2],tmpdest,postfix,opsize);
              end
            else
              begin
                regs[1]:=getintregister(list,opsize);
                genloadstore(list,loadop,regs[1],tmpsource,postfix,opsize);
                genloadstore(list,storeop,regs[1],tmpdest,postfix,opsize);
              end;
            list.concat(taicpu.op_reg_sym_ofs(A_CBNZ,countreg,hl,0));
            if loadop=A_LDP then
              len:=len mod (tcgsize2size[opsize]*2)
            else
              len:=len mod tcgsize2size[opsize];
          end;
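        { The regular loop generated above has roughly this shape when
          LDP/STP is used (illustrative; the actual registers are chosen by
          the register allocator):
              mov   w4, #(len div 16)
          .Lloop:
              sub   w4, w4, #1
              ldp   x2, x3, [x0], #16
              stp   x2, x3, [x1], #16
              cbnz  w4, .Lloop }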
        gencopyleftovers(list,tmpsource,tmpdest,len);
      end;


    procedure tcgaarch64.g_adjust_self_value(list:TAsmList;procdef: tprocdef;ioffset: tcgint);
      begin
        { This method is integrated into g_intf_wrapper and shouldn't be
          called separately }
        InternalError(2013020102);
      end;


    procedure tcgaarch64.g_check_for_fpu_exception(list: TAsmList;force,clear : boolean);
      var
        r, tmpreg: TRegister;
        ai: taicpu;
        l1,l2: TAsmLabel;
      begin
        { so far, we assume all flavours of AArch64 need explicit floating
          point exception checking }
        if ((cs_check_fpu_exceptions in current_settings.localswitches) and
            (force or current_procinfo.FPUExceptionCheckNeeded)) then
          begin
            r:=getintregister(list,OS_INT);
            tmpreg:=getintregister(list,OS_INT);
            list.concat(taicpu.op_reg_reg(A_MRS,r,NR_FPSR));
            list.concat(taicpu.op_reg_reg_const(A_AND,tmpreg,r,$1f));
            current_asmdata.getjumplabel(l1);
            current_asmdata.getjumplabel(l2);
            ai:=taicpu.op_reg_sym_ofs(A_CBNZ,tmpreg,l1,0);
            ai.is_jmp:=true;
            list.concat(ai);
            list.concat(taicpu.op_reg_reg_const(A_AND,tmpreg,r,$80));
            ai:=taicpu.op_reg_sym_ofs(A_CBZ,tmpreg,l2,0);
            ai.is_jmp:=true;
            list.concat(ai);
            a_label(list,l1);
            alloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
            cg.a_call_name(list,'FPC_THROWFPUEXCEPTION',false);
            dealloccpuregisters(list,R_INTREGISTER,paramanager.get_volatile_registers_int(pocall_default));
            a_label(list,l2);
            if clear then
              current_procinfo.FPUExceptionCheckNeeded:=false;
          end;
      end;


    procedure tcgaarch64.g_profilecode(list : TAsmList);
      begin
        if target_info.system = system_aarch64_linux then
          begin
            list.concat(taicpu.op_reg_reg(A_MOV,NR_X0,NR_X30));
            a_call_name(list,'_mcount',false);
          end
        else
          internalerror(2020021901);
      end;


    procedure create_codegen;
      begin
        cg:=tcgaarch64.Create;
        cg128:=tcg128.Create;
      end;

end.