* perform 4 instead of 8 byte copies at a time if source or dest is
  unaligned
* use integer instead of floating point for 8 byte copies because the
  integer unit has lower latency

git-svn-id: trunk@9347 -
This commit is contained in:
Jonas Maebe 2007-11-29 17:18:52 +00:00
parent bc545b077f
commit 96180029b2

View File

@ -1738,18 +1738,14 @@ end;
{ ************* concatcopy ************ } { ************* concatcopy ************ }
const
maxmoveunit = 8;
procedure tcgppc.g_concatcopy(list: TAsmList; const source, dest: treference; procedure tcgppc.g_concatcopy(list: TAsmList; const source, dest: treference;
len: aint); len: aint);
var var
countreg, tempreg: TRegister; countreg, tempreg:TRegister;
src, dst: TReference; src, dst: TReference;
lab: tasmlabel; lab: tasmlabel;
count, count2: longint; count, count2, step: longint;
size: tcgsize; size: tcgsize;
begin begin
@ -1759,7 +1755,8 @@ begin
list.concat(tai_comment.create(strpnew('g_concatcopy1 ' + inttostr(len) + ' bytes left '))); list.concat(tai_comment.create(strpnew('g_concatcopy1 ' + inttostr(len) + ' bytes left ')));
{$ENDIF extdebug} {$ENDIF extdebug}
{ if the references are equal, exit, there is no need to copy anything } { if the references are equal, exit, there is no need to copy anything }
if (references_equal(source, dest)) then if references_equal(source, dest) or
(len=0) then
exit; exit;
{ make sure short loads are handled as optimally as possible; { make sure short loads are handled as optimally as possible;
@ -1768,7 +1765,7 @@ begin
NOTE: maybe use some scratch registers to pair load/store instructions NOTE: maybe use some scratch registers to pair load/store instructions
} }
if (len <= maxmoveunit) then begin if (len <= 8) then begin
src := source; dst := dest; src := source; dst := dest;
{$IFDEF extdebug} {$IFDEF extdebug}
list.concat(tai_comment.create(strpnew('g_concatcopy3 ' + inttostr(src.offset) + ' ' + inttostr(dst.offset)))); list.concat(tai_comment.create(strpnew('g_concatcopy3 ' + inttostr(src.offset) + ' ' + inttostr(dst.offset))));
@ -1798,16 +1795,29 @@ begin
{$ENDIF extdebug} {$ENDIF extdebug}
count := len div maxmoveunit; if not(source.alignment in [1,2]) and
not(dest.alignment in [1,2]) then
begin
count:=len div 8;
step:=8;
size:=OS_64;
end
else
begin
count:=len div 4;
step:=4;
size:=OS_32;
end;
tempreg:=getintregister(list,size);
reference_reset(src); reference_reset(src);
reference_reset(dst); reference_reset(dst);
{ load the address of source into src.base } { load the address of source into src.base }
if (count > 4) or if (count > 4) or
not issimpleref(source) or not issimpleref(source) or
((source.index <> NR_NO) and ((source.index <> NR_NO) and
((source.offset + len) > high(smallint))) then begin ((source.offset + len) > high(smallint))) then begin
src.base := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE); src.base := getaddressregister(list);
a_loadaddr_ref_reg(list, source, src.base); a_loadaddr_ref_reg(list, source, src.base);
end else begin end else begin
src := source; src := source;
@ -1817,7 +1827,7 @@ begin
not issimpleref(dest) or not issimpleref(dest) or
((dest.index <> NR_NO) and ((dest.index <> NR_NO) and
((dest.offset + len) > high(smallint))) then begin ((dest.offset + len) > high(smallint))) then begin
dst.base := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE); dst.base := getaddressregister(list);
a_loadaddr_ref_reg(list, dest, dst.base); a_loadaddr_ref_reg(list, dest, dst.base);
end else begin end else begin
dst := dest; dst := dest;
@ -1826,64 +1836,63 @@ begin
{ generate a loop } { generate a loop }
if count > 4 then begin if count > 4 then begin
{ the offsets are zero after the a_loadaddress_ref_reg and just { the offsets are zero after the a_loadaddress_ref_reg and just
have to be set to 8. I put an Inc there so debugging may be have to be set to step. I put an Inc there so debugging may be
easier (should offset be different from zero here, it will be easier (should offset be different from zero here, it will be
easy to notice in the generated assembler } easy to notice in the generated assembler }
inc(dst.offset, 8); inc(dst.offset, step);
inc(src.offset, 8); inc(src.offset, step);
list.concat(taicpu.op_reg_reg_const(A_SUBI, src.base, src.base, 8)); list.concat(taicpu.op_reg_reg_const(A_SUBI, src.base, src.base, step));
list.concat(taicpu.op_reg_reg_const(A_SUBI, dst.base, dst.base, 8)); list.concat(taicpu.op_reg_reg_const(A_SUBI, dst.base, dst.base, step));
countreg := rg[R_INTREGISTER].getregister(list, R_SUBWHOLE); countreg := getintregister(list, OS_INT);
a_load_const_reg(list, OS_64, count, countreg); a_load_const_reg(list, OS_INT, count, countreg);
{ explicitely allocate F0 since it can be used safely here
(for holding date that's being copied) }
a_reg_alloc(list, NR_F0);
current_asmdata.getjumplabel(lab); current_asmdata.getjumplabel(lab);
a_label(list, lab); a_label(list, lab);
list.concat(taicpu.op_reg_reg_const(A_SUBIC_, countreg, countreg, 1)); list.concat(taicpu.op_reg_reg_const(A_SUBIC_, countreg, countreg, 1));
list.concat(taicpu.op_reg_ref(A_LFDU, NR_F0, src)); if (size=OS_64) then
list.concat(taicpu.op_reg_ref(A_STFDU, NR_F0, dst)); begin
list.concat(taicpu.op_reg_ref(A_LDU, tempreg, src));
list.concat(taicpu.op_reg_ref(A_STDU, tempreg, dst));
end
else
begin
list.concat(taicpu.op_reg_ref(A_LWZU, tempreg, src));
list.concat(taicpu.op_reg_ref(A_STWU, tempreg, dst));
end;
a_jmp(list, A_BC, C_NE, 0, lab); a_jmp(list, A_BC, C_NE, 0, lab);
a_reg_dealloc(list, NR_F0); a_reg_sync(list,src.base);
len := len mod 8; a_reg_sync(list,dst.base);
a_reg_sync(list,countreg);
len := len mod step;
count := 0;
end; end;
count := len div 8;
{ unrolled loop } { unrolled loop }
if count > 0 then begin if count > 0 then begin
a_reg_alloc(list, NR_F0);
for count2 := 1 to count do begin for count2 := 1 to count do begin
a_loadfpu_ref_reg(list, OS_F64, OS_F64, src, NR_F0); a_load_ref_reg(list, size, size, src, tempreg);
a_loadfpu_reg_ref(list, OS_F64, OS_F64, NR_F0, dst); a_load_reg_ref(list, size, size, tempreg, dst);
inc(src.offset, 8); inc(src.offset, step);
inc(dst.offset, 8); inc(dst.offset, step);
end; end;
a_reg_dealloc(list, NR_F0); len := len mod step;
len := len mod 8;
end; end;
if (len and 4) <> 0 then begin if (len and 4) <> 0 then begin
a_reg_alloc(list, NR_R0); a_load_ref_reg(list, OS_32, OS_32, src, tempreg);
a_load_ref_reg(list, OS_32, OS_32, src, NR_R0); a_load_reg_ref(list, OS_32, OS_32, tempreg, dst);
a_load_reg_ref(list, OS_32, OS_32, NR_R0, dst);
inc(src.offset, 4); inc(src.offset, 4);
inc(dst.offset, 4); inc(dst.offset, 4);
a_reg_dealloc(list, NR_R0);
end; end;
{ copy the leftovers } { copy the leftovers }
if (len and 2) <> 0 then begin if (len and 2) <> 0 then begin
a_reg_alloc(list, NR_R0); a_load_ref_reg(list, OS_16, OS_16, src, tempreg);
a_load_ref_reg(list, OS_16, OS_16, src, NR_R0); a_load_reg_ref(list, OS_16, OS_16, tempreg, dst);
a_load_reg_ref(list, OS_16, OS_16, NR_R0, dst);
inc(src.offset, 2); inc(src.offset, 2);
inc(dst.offset, 2); inc(dst.offset, 2);
a_reg_dealloc(list, NR_R0);
end; end;
if (len and 1) <> 0 then begin if (len and 1) <> 0 then begin
a_reg_alloc(list, NR_R0); a_load_ref_reg(list, OS_8, OS_8, src, tempreg);
a_load_ref_reg(list, OS_8, OS_8, src, NR_R0); a_load_reg_ref(list, OS_8, OS_8, tempreg, dst);
a_load_reg_ref(list, OS_8, OS_8, NR_R0, dst);
a_reg_dealloc(list, NR_R0);
end; end;
end; end;