+ inlined versions of concatcopy

git-svn-id: trunk@29959 -
Jonas Maebe 2015-02-23 22:55:26 +00:00
parent e6ab39765d
commit 07455fb889


@@ -92,9 +92,8 @@ interface
procedure g_maybe_got_init(list: TAsmList); override;
procedure g_restore_registers(list: TAsmList);override;
procedure g_save_registers(list: TAsmList);override;
procedure g_concatcopy_move(list: TAsmList; const source, dest: treference; len: tcgint);
procedure g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);override;
procedure g_concatcopy_unaligned(list: TAsmList; const source, dest: treference; len: tcgint);override;
procedure g_concatcopy_move(list: TAsmList; const source, dest: treference; len : tcgint);
procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override;
procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override;
private
@@ -1468,7 +1467,6 @@ implementation
end;
procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
var
ref: treference;
@@ -1676,152 +1674,433 @@ implementation
end;
procedure tcgaarch64.g_concatcopy(list:TAsmList;const source,dest:treference;len:tcgint);
(*
var
tmpreg1,
hreg,
countreg: tregister;
src, dst: treference;
lab: tasmlabel;
count, count2: aint;
*)
begin
(*
{ anybody wants to determine a good value here :)? }
if len>100 then
*)
g_concatcopy_move(list,source,dest,len)
(*
else
begin
count:=len div 4;
if (count<=4) and reference_is_reusable(source) then
src:=source
else
begin
reference_reset_base(src,getintregister(list,OS_ADDR),0,sizeof(aint));
a_loadaddr_ref_reg(list,source,src.base);
end;
if (count<=4) and reference_is_reusable(dest) then
dst:=dest
else
begin
reference_reset_base(dst,getintregister(list,OS_ADDR),0,sizeof(aint));
a_loadaddr_ref_reg(list,dest,dst.base);
end;
{ generate a loop }
if count>4 then
begin
countreg:=GetIntRegister(list,OS_INT);
tmpreg1:=GetIntRegister(list,OS_INT);
a_load_const_reg(list,OS_INT,count,countreg);
current_asmdata.getjumplabel(lab);
a_label(list, lab);
list.concat(taicpu.op_ref_reg(A_LD,src,tmpreg1));
list.concat(taicpu.op_reg_ref(A_ST,tmpreg1,dst));
list.concat(taicpu.op_reg_const_reg(A_ADD,src.base,4,src.base));
list.concat(taicpu.op_reg_const_reg(A_ADD,dst.base,4,dst.base));
list.concat(taicpu.op_reg_const_reg(A_SUBcc,countreg,1,countreg));
a_jmp_cond(list,OC_NE,lab);
len := len mod 4;
end;
{ unrolled loop }
count:=len div 4;
if count>0 then
begin
tmpreg1:=GetIntRegister(list,OS_INT);
for count2 := 1 to count do
begin
list.concat(taicpu.op_ref_reg(A_LD,src,tmpreg1));
list.concat(taicpu.op_reg_ref(A_ST,tmpreg1,dst));
inc(src.offset,4);
inc(dst.offset,4);
end;
len := len mod 4;
end;
if (len and 4) <> 0 then
begin
hreg:=GetIntRegister(list,OS_INT);
a_load_ref_reg(list,OS_32,OS_32,src,hreg);
a_load_reg_ref(list,OS_32,OS_32,hreg,dst);
inc(src.offset,4);
inc(dst.offset,4);
end;
{ copy the leftovers }
if (len and 2) <> 0 then
begin
hreg:=GetIntRegister(list,OS_INT);
a_load_ref_reg(list,OS_16,OS_16,src,hreg);
a_load_reg_ref(list,OS_16,OS_16,hreg,dst);
inc(src.offset,2);
inc(dst.offset,2);
end;
if (len and 1) <> 0 then
begin
hreg:=GetIntRegister(list,OS_INT);
a_load_ref_reg(list,OS_8,OS_8,src,hreg);
a_load_reg_ref(list,OS_8,OS_8,hreg,dst);
end;
end;
*)
end;

procedure tcgaarch64.g_concatcopy_unaligned(list : TAsmList;const source,dest : treference;len : tcgint);
(*
var
src, dst: treference;
tmpreg1,
countreg: tregister;
i : aint;
lab: tasmlabel;
*)
begin
(*
if len>31 then
*)
g_concatcopy_move(list,source,dest,len)
(*
else
begin
reference_reset(src,source.alignment);
reference_reset(dst,dest.alignment);
{ load the address of source into src.base }
src.base:=GetAddressRegister(list);
a_loadaddr_ref_reg(list,source,src.base);
{ load the address of dest into dst.base }
dst.base:=GetAddressRegister(list);
a_loadaddr_ref_reg(list,dest,dst.base);
{ generate a loop }
if len>4 then
begin
countreg:=GetIntRegister(list,OS_INT);
tmpreg1:=GetIntRegister(list,OS_INT);
a_load_const_reg(list,OS_INT,len,countreg);
current_asmdata.getjumplabel(lab);
a_label(list, lab);
list.concat(taicpu.op_ref_reg(A_LDUB,src,tmpreg1));
list.concat(taicpu.op_reg_ref(A_STB,tmpreg1,dst));
list.concat(taicpu.op_reg_const_reg(A_ADD,src.base,1,src.base));
list.concat(taicpu.op_reg_const_reg(A_ADD,dst.base,1,dst.base));
list.concat(taicpu.op_reg_const_reg(A_SUBcc,countreg,1,countreg));
a_jmp_cond(list,OC_NE,lab);
end
else
begin
{ unrolled loop }
tmpreg1:=GetIntRegister(list,OS_INT);
for i:=1 to len do
begin
list.concat(taicpu.op_ref_reg(A_LDUB,src,tmpreg1));
list.concat(taicpu.op_reg_ref(A_STB,tmpreg1,dst));
inc(src.offset);
inc(dst.offset);
end;
end;
end;
*)
end;

procedure tcgaarch64.g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);
var
sourcebasereplaced, destbasereplaced: boolean;
{ get optimal memory operation to use for loading/storing data
in an unrolled loop }
procedure getmemop(scaledop, unscaledop: tasmop; const startref, endref: treference; opsize: tcgsize; postfix: toppostfix; out memop: tasmop; out needsimplify: boolean);
begin
if (simple_ref_type(scaledop,opsize,postfix,startref)=sr_simple) and
(simple_ref_type(scaledop,opsize,postfix,endref)=sr_simple) then
begin
memop:=scaledop;
needsimplify:=false;
end
else if (unscaledop<>A_NONE) and
(simple_ref_type(unscaledop,opsize,postfix,startref)=sr_simple) and
(simple_ref_type(unscaledop,opsize,postfix,endref)=sr_simple) then
begin
memop:=unscaledop;
needsimplify:=false;
end
else
begin
memop:=scaledop;
needsimplify:=true;
end;
end;
{ adjust the offset and/or addressing mode after a load/store so it's
correct for the next one of the same size }
procedure updaterefafterloadstore(var ref: treference; oplen: longint);
begin
case ref.addressmode of
AM_OFFSET:
inc(ref.offset,oplen);
AM_POSTINDEXED:
{ base register updated by instruction, next offset can remain
the same }
;
AM_PREINDEXED:
begin
{ base register updated by instruction -> next instruction can
use post-indexing with offset = sizeof(operation) }
ref.offset:=0;
ref.addressmode:=AM_OFFSET;
end;
end;
end;
{ generate a load/store and adjust the reference offset to the next
memory location if necessary }
procedure genloadstore(list: TAsmList; op: tasmop; reg: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
begin
list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),postfix));
updaterefafterloadstore(ref,tcgsize2size[opsize]);
end;
{ generate a dual load/store (ldp/stp) and adjust the reference offset to
the next memory location if necessary }
procedure gendualloadstore(list: TAsmList; op: tasmop; reg1, reg2: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
begin
list.concat(setoppostfix(taicpu.op_reg_reg_ref(op,reg1,reg2,ref),postfix));
updaterefafterloadstore(ref,tcgsize2size[opsize]*2);
end;
{ turn a reference into a pre- or post-indexed reference for use in a
load/store of a particular size }
procedure makesimpleforcopy(list: TAsmList; var scaledop: tasmop; opsize: tcgsize; postfix: toppostfix; forcepostindexing: boolean; var ref: treference; var basereplaced: boolean);
var
tmpreg: tregister;
scaledoffset: longint;
orgaddressmode: taddressmode;
begin
scaledoffset:=tcgsize2size[opsize];
if scaledop in [A_LDP,A_STP] then
scaledoffset:=scaledoffset*2;
{ can we use the reference as post-indexed without changes? }
if forcepostindexing then
begin
orgaddressmode:=ref.addressmode;
ref.addressmode:=AM_POSTINDEXED;
if (orgaddressmode=AM_POSTINDEXED) or
((ref.offset=0) and
(simple_ref_type(scaledop,opsize,postfix,ref)=sr_simple)) then
begin
{ just change the post-indexed offset to the access size }
ref.offset:=scaledoffset;
{ and replace the base register if that didn't happen yet
(could be sp or a regvar) }
if not basereplaced then
begin
tmpreg:=getaddressregister(list);
a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
ref.base:=tmpreg;
basereplaced:=true;
end;
exit;
end;
ref.addressmode:=orgaddressmode;
end;
{$ifdef dummy}
This could in theory be useful in case you have a concatcopy from
e.g. x1+255 to x1+267 *and* the reference is aligned, but this seems
very unlikely. Disabled because it still needs fixes, as it
also generates pre-indexed loads right now at the very end for the
left-over gencopies
{ can we turn it into a pre-indexed reference for free? (after the
first operation, it will be turned into an offset one) }
if not forcepostindexing and
(ref.offset<>0) then
begin
orgaddressmode:=ref.addressmode;
ref.addressmode:=AM_PREINDEXED;
tmpreg:=ref.base;
if not basereplaced and
(ref.base=tmpreg) then
begin
tmpreg:=getaddressregister(list);
a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
ref.base:=tmpreg;
basereplaced:=true;
end;
if simple_ref_type(scaledop,opsize,postfix,ref)<>sr_simple then
make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
exit;
end;
{$endif dummy}
if not forcepostindexing then
begin
ref.addressmode:=AM_OFFSET;
make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
{ this may still cause problems if the final offset is no longer
a simple ref; it's a bit complicated to pass all information
through at all places and check that here, so play safe: we
currently never generate unrolled copies for more than 64
bytes (32 with non-double-register copies) }
if ref.index=NR_NO then
begin
if ((scaledop in [A_LDP,A_STP]) and
(ref.offset<((64-8)*tcgsize2size[opsize]))) or
((scaledop in [A_LDUR,A_STUR]) and
(ref.offset<(255-8*tcgsize2size[opsize]))) or
((scaledop in [A_LDR,A_STR]) and
(ref.offset<((4096-8)*tcgsize2size[opsize]))) then
exit;
end;
end;
tmpreg:=getaddressregister(list);
a_loadaddr_ref_reg(list,ref,tmpreg);
basereplaced:=true;
if forcepostindexing then
begin
reference_reset_base(ref,tmpreg,scaledoffset,ref.alignment);
ref.addressmode:=AM_POSTINDEXED;
end
else
begin
reference_reset_base(ref,tmpreg,0,ref.alignment);
ref.addressmode:=AM_OFFSET;
end
end;
{ prepare a reference for use by gencopy. This is done both after the
unrolled and regular copy loop -> get rid of post-indexing mode, make
sure ref is valid }
procedure preparecopy(list: tasmlist; scaledop, unscaledop: tasmop; var ref: treference; opsize: tcgsize; postfix: toppostfix; out op: tasmop; var basereplaced: boolean);
var
simplify: boolean;
begin
if ref.addressmode=AM_POSTINDEXED then
ref.offset:=tcgsize2size[opsize];
getmemop(scaledop,unscaledop,ref,ref,opsize,postfix,op,simplify);
if simplify then
begin
makesimpleforcopy(list,scaledop,opsize,postfix,false,ref,basereplaced);
op:=scaledop;
end;
end;
{ generate a copy from source to dest of size opsize/postfix }
procedure gencopy(list: TAsmList; var source, dest: treference; postfix: toppostfix; opsize: tcgsize);
var
reg: tregister;
loadop, storeop: tasmop;
begin
preparecopy(list,A_LDR,A_LDUR,source,opsize,postfix,loadop,sourcebasereplaced);
preparecopy(list,A_STR,A_STUR,dest,opsize,postfix,storeop,destbasereplaced);
reg:=getintregister(list,opsize);
genloadstore(list,loadop,reg,source,postfix,opsize);
genloadstore(list,storeop,reg,dest,postfix,opsize);
end;
{ copy the leftovers after an unrolled or regular copy loop }
procedure gencopyleftovers(list: TAsmList; var source, dest: treference; len: longint);
begin
{ stop post-indexing if we did so in the loop, since in that case all
offsets definitely can be represented now }
if source.addressmode=AM_POSTINDEXED then
begin
source.addressmode:=AM_OFFSET;
source.offset:=0;
end;
if dest.addressmode=AM_POSTINDEXED then
begin
dest.addressmode:=AM_OFFSET;
dest.offset:=0;
end;
{ transfer the leftovers }
if len>=8 then
begin
dec(len,8);
gencopy(list,source,dest,PF_NONE,OS_64);
end;
if len>=4 then
begin
dec(len,4);
gencopy(list,source,dest,PF_NONE,OS_32);
end;
if len>=2 then
begin
dec(len,2);
gencopy(list,source,dest,PF_H,OS_16);
end;
if len>=1 then
begin
dec(len);
gencopy(list,source,dest,PF_B,OS_8);
end;
end;
const
{ load_length + loop dec + cbnz }
loopoverhead=12;
{ loop overhead + load + store }
totallooplen=loopoverhead + 8;
var
totalalign: longint;
maxlenunrolled: tcgint;
loadop, storeop: tasmop;
opsize: tcgsize;
postfix: toppostfix;
tmpsource, tmpdest: treference;
scaledstoreop, unscaledstoreop,
scaledloadop, unscaledloadop: tasmop;
regs: array[1..8] of tregister;
countreg: tregister;
i, regcount: longint;
hl: tasmlabel;
simplifysource, simplifydest: boolean;
begin
if len=0 then
exit;
sourcebasereplaced:=false;
destbasereplaced:=false;
{ maximum common alignment }
totalalign:=max(1,newalignment(source.alignment,dest.alignment));
{ use a simple load/store? }
if (len in [1,2,4,8]) and
((totalalign>=(len div 2)) or
(source.alignment=len) or
(dest.alignment=len)) then
begin
opsize:=int_cgsize(len);
a_load_ref_ref(list,opsize,opsize,source,dest);
exit;
end;
{ alignment > length is not useful, and would break some checks below }
while totalalign>len do
totalalign:=totalalign div 2;
{ operation sizes to use based on common alignment }
case totalalign of
1:
begin
postfix:=PF_B;
opsize:=OS_8;
end;
2:
begin
postfix:=PF_H;
opsize:=OS_16;
end;
4:
begin
postfix:=PF_None;
opsize:=OS_32;
end
else
begin
totalalign:=8;
postfix:=PF_None;
opsize:=OS_64;
end;
end;
{ maximum length to be handled with an unrolled loop (4 loads + 4 stores) }
maxlenunrolled:=min(totalalign,8)*4;
{ ldp/stp -> 2 registers per instruction }
if (totalalign>=4) and
(len>=totalalign*2) then
begin
maxlenunrolled:=maxlenunrolled*2;
scaledstoreop:=A_STP;
scaledloadop:=A_LDP;
unscaledstoreop:=A_NONE;
unscaledloadop:=A_NONE;
end
else
begin
scaledstoreop:=A_STR;
scaledloadop:=A_LDR;
unscaledstoreop:=A_STUR;
unscaledloadop:=A_LDUR;
end;
{ we only need 4 instructions extra to call FPC_MOVE }
if cs_opt_size in current_settings.optimizerswitches then
maxlenunrolled:=maxlenunrolled div 2;
if (len>maxlenunrolled) and
(len>totalalign*8) then
begin
g_concatcopy_move(list,source,dest,len);
exit;
end;
simplifysource:=true;
simplifydest:=true;
tmpsource:=source;
tmpdest:=dest;
{ can we directly encode all offsets in an unrolled loop? }
if len<=maxlenunrolled then
begin
{$ifdef extdebug}
list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop; len/opsize/align: '+tostr(len)+'/'+tostr(tcgsize2size[opsize])+'/'+tostr(totalalign))));
{$endif extdebug}
{ the leftovers will be handled separately -> -(len mod opsize) }
inc(tmpsource.offset,len-(len mod tcgsize2size[opsize]));
{ additionally, the last regular load/store will be at
offset+len-opsize (if len-(len mod opsize)>0) }
if tmpsource.offset>source.offset then
dec(tmpsource.offset,tcgsize2size[opsize]);
getmemop(scaledloadop,unscaledloadop,source,tmpsource,opsize,postfix,loadop,simplifysource);
inc(tmpdest.offset,len-(len mod tcgsize2size[opsize]));
if tmpdest.offset>dest.offset then
dec(tmpdest.offset,tcgsize2size[opsize]);
getmemop(scaledstoreop,unscaledstoreop,dest,tmpdest,opsize,postfix,storeop,simplifydest);
tmpsource:=source;
tmpdest:=dest;
{ if we can't directly encode all offsets, simplify }
if simplifysource then
begin
loadop:=scaledloadop;
makesimpleforcopy(list,loadop,opsize,postfix,false,tmpsource,sourcebasereplaced);
end;
if simplifydest then
begin
storeop:=scaledstoreop;
makesimpleforcopy(list,storeop,opsize,postfix,false,tmpdest,destbasereplaced);
end;
regcount:=len div tcgsize2size[opsize];
{ in case we transfer two registers at a time, we copy an even
number of registers }
if loadop=A_LDP then
regcount:=regcount and not(1);
{ max 4 loads/stores -> max 8 registers (in case of ldp/stp) }
for i:=1 to regcount do
regs[i]:=getintregister(list,opsize);
if loadop=A_LDP then
begin
{ load registers }
for i:=1 to (regcount div 2) do
gendualloadstore(list,loadop,regs[i*2-1],regs[i*2],tmpsource,postfix,opsize);
{ store registers }
for i:=1 to (regcount div 2) do
gendualloadstore(list,storeop,regs[i*2-1],regs[i*2],tmpdest,postfix,opsize);
end
else
begin
for i:=1 to regcount do
genloadstore(list,loadop,regs[i],tmpsource,postfix,opsize);
for i:=1 to regcount do
genloadstore(list,storeop,regs[i],tmpdest,postfix,opsize);
end;
{ leftover }
len:=len-regcount*tcgsize2size[opsize];
{$ifdef extdebug}
list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop leftover: '+tostr(len))));
{$endif extdebug}
end
else
begin
{$ifdef extdebug}
list.concat(tai_comment.Create(strpnew('concatcopy regular loop; len/align: '+tostr(len)+'/'+tostr(totalalign))));
{$endif extdebug}
{ regular loop -> definitely use post-indexing }
loadop:=scaledloadop;
makesimpleforcopy(list,loadop,opsize,postfix,true,tmpsource,sourcebasereplaced);
storeop:=scaledstoreop;
makesimpleforcopy(list,storeop,opsize,postfix,true,tmpdest,destbasereplaced);
current_asmdata.getjumplabel(hl);
countreg:=getintregister(list,OS_32);
if loadop=A_LDP then
a_load_const_reg(list,OS_32,len div (tcgsize2size[opsize]*2),countreg)
else
a_load_const_reg(list,OS_32,len div tcgsize2size[opsize],countreg);
a_label(list,hl);
a_op_const_reg(list,OP_SUB,OS_32,1,countreg);
if loadop=A_LDP then
begin
regs[1]:=getintregister(list,opsize);
regs[2]:=getintregister(list,opsize);
gendualloadstore(list,loadop,regs[1],regs[2],tmpsource,postfix,opsize);
gendualloadstore(list,storeop,regs[1],regs[2],tmpdest,postfix,opsize);
end
else
begin
regs[1]:=getintregister(list,opsize);
genloadstore(list,loadop,regs[1],tmpsource,postfix,opsize);
genloadstore(list,storeop,regs[1],tmpdest,postfix,opsize);
end;
list.concat(taicpu.op_reg_sym_ofs(A_CBNZ,countreg,hl,0));
len:=len mod tcgsize2size[opsize];
end;
gencopyleftovers(list,tmpsource,tmpdest,len);
end;
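
Illustration (editor's sketch, not part of the commit): for a 32-byte copy where source and destination are both known to be 8-byte aligned, the unrolled path above picks OS_64 with ldp/stp and aims to emit roughly the following AArch64 sequence, assuming the two addresses have already been brought into base registers (shown here as x0 and x1; the scratch registers are whatever the allocator happens to pick):

ldp x2, x3, [x0]        // load 16 bytes from the source
ldp x4, x5, [x0, #16]   // load the next 16 bytes
stp x2, x3, [x1]        // store 16 bytes to the destination
stp x4, x5, [x1, #16]   // store the next 16 bytes

Blocks too long for the unrolled path but still within totalalign*8 bytes go through the post-indexed loop instead (sub plus ldp/stp with writeback, e.g. ldp x2, x3, [x0], #16, terminated by cbnz), and anything larger is handed to FPC_MOVE via g_concatcopy_move; the exact thresholds depend on the common alignment and on cs_opt_size, as computed in the code above.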