+ concatcopy variants using sse and avx, only activated if optimization for size is done because at least on an i7-4770 it has shown no benefit

git-svn-id: trunk@26588 -
This commit is contained in:
florian 2014-01-26 12:37:54 +00:00
parent 060aa2a7fe
commit 48ae2d215a

View File

@ -2168,9 +2168,9 @@ unit cgx86;
push_segment_size = S_W;
{$endif}
type copymode=(copy_move,copy_mmx,copy_string);
type copymode=(copy_move,copy_mmx,copy_string,copy_mm,copy_avx);
var srcref,dstref:Treference;
var srcref,dstref,tmpref:Treference;
r,r0,r1,r2,r3:Tregister;
helpsize:tcgint;
copysize:byte;
@ -2182,6 +2182,28 @@ unit cgx86;
helpsize:=3*sizeof(aword);
if cs_opt_size in current_settings.optimizerswitches then
helpsize:=2*sizeof(aword);
{$ifndef i8086}
{ avx helps only to reduce size, using it in general does at least not help on
an i7-4770 (FK) }
if (CPUX86_HAS_AVXUNIT in cpu_capabilities[current_settings.cputype]) and
// (cs_opt_size in current_settings.optimizerswitches) and
((len=8) or (len=16) or (len=24) or (len=32) { or (len=40) or (len=48)}) then
cm:=copy_avx
else
{$ifdef dummy}
{ I'am not sure what CPUs would benefit from using sse instructions for moves (FK) }
if
{$ifdef x86_64}
((current_settings.fputype>=fpu_sse64)
{$else x86_64}
((current_settings.fputype>=fpu_sse)
{$endif x86_64}
or (CPUX86_HAS_SSEUNIT in cpu_capabilities[current_settings.cputype])) and
((len=8) or (len=16) or (len=24) or (len=32) or (len=40) or (len=48)) then
cm:=copy_mm
else
{$endif dummy}
{$endif i8086}
if (cs_mmx in current_settings.localswitches) and
not(pi_uses_fpu in current_procinfo.flags) and
((len=8) or (len=16) or (len=24) or (len=32)) then
@ -2189,7 +2211,7 @@ unit cgx86;
if (len>helpsize) then
cm:=copy_string;
if (cs_opt_size in current_settings.optimizerswitches) and
not((len<=16) and (cm=copy_mmx)) and
not((len<=16) and (cm in [copy_mmx,copy_mm,copy_avx])) and
not(len in copy_len_sizes) then
cm:=copy_string;
{$ifndef i8086}
@ -2239,6 +2261,7 @@ unit cgx86;
inc(dstref.offset,copysize);
end;
end;
copy_mmx:
begin
dstref:=dest;
@ -2282,6 +2305,129 @@ unit cgx86;
inc(dstref.offset,8);
a_loadmm_reg_ref(list,OS_M64,OS_M64,r3,dstref,nil);
end;
end;
copy_mm:
begin
dstref:=dest;
srcref:=source;
r0:=NR_NO;
r1:=NR_NO;
r2:=NR_NO;
r3:=NR_NO;
if len>=16 then
begin
r0:=getmmregister(list,OS_M128);
a_loadmm_ref_reg(list,OS_M128,OS_M128,srcref,r0,nil);
inc(srcref.offset,16);
end;
if len>=32 then
begin
r1:=getmmregister(list,OS_M128);
a_loadmm_ref_reg(list,OS_M128,OS_M128,srcref,r1,nil);
inc(srcref.offset,16);
end;
if len>=48 then
begin
r2:=getmmregister(list,OS_M128);
a_loadmm_ref_reg(list,OS_M128,OS_M128,srcref,r2,nil);
inc(srcref.offset,16);
end;
if (len=8) or (len=24) or (len=40) then
begin
r3:=getmmregister(list,OS_M64);
a_loadmm_ref_reg(list,OS_M64,OS_M64,srcref,r3,nil);
end;
if len>=16 then
begin
a_loadmm_reg_ref(list,OS_M128,OS_M128,r0,dstref,nil);
inc(dstref.offset,16);
end;
if len>=32 then
begin
a_loadmm_reg_ref(list,OS_M128,OS_M128,r1,dstref,nil);
inc(dstref.offset,16);
end;
if len>=48 then
begin
a_loadmm_reg_ref(list,OS_M128,OS_M128,r2,dstref,nil);
inc(dstref.offset,16);
end;
if (len=8) or (len=24) or (len=40) then
begin
a_loadmm_reg_ref(list,OS_M64,OS_M64,r3,dstref,nil);
end;
end;
copy_avx:
begin
dstref:=dest;
srcref:=source;
r0:=NR_NO;
r1:=NR_NO;
r2:=NR_NO;
r3:=NR_NO;
if len>=16 then
begin
r0:=getmmregister(list,OS_M128);
{ we want to force the use of vmovups, so do not use a_loadmm_ref_reg }
tmpref:=srcref;
make_simple_ref(list,tmpref);
list.concat(taicpu.op_ref_reg(A_VMOVUPS,S_NO,tmpref,r0));
inc(srcref.offset,16);
end;
if len>=32 then
begin
r1:=getmmregister(list,OS_M128);
tmpref:=srcref;
make_simple_ref(list,tmpref);
list.concat(taicpu.op_ref_reg(A_VMOVUPS,S_NO,tmpref,r1));
inc(srcref.offset,16);
end;
if len>=48 then
begin
r2:=getmmregister(list,OS_M128);
tmpref:=srcref;
make_simple_ref(list,tmpref);
list.concat(taicpu.op_ref_reg(A_VMOVUPS,S_NO,tmpref,r2));
inc(srcref.offset,16);
end;
if (len=8) or (len=24) or (len=40) then
begin
r3:=getmmregister(list,OS_M64);
tmpref:=srcref;
make_simple_ref(list,tmpref);
list.concat(taicpu.op_ref_reg(A_VMOVSD,S_NO,tmpref,r3));
end;
if len>=16 then
begin
tmpref:=dstref;
make_simple_ref(list,tmpref);
list.concat(taicpu.op_reg_ref(A_VMOVUPS,S_NO,r0,tmpref));
inc(dstref.offset,16);
end;
if len>=32 then
begin
tmpref:=dstref;
make_simple_ref(list,tmpref);
list.concat(taicpu.op_reg_ref(A_VMOVUPS,S_NO,r1,tmpref));
inc(dstref.offset,16);
end;
if len>=48 then
begin
tmpref:=dstref;
make_simple_ref(list,tmpref);
list.concat(taicpu.op_reg_ref(A_VMOVUPS,S_NO,r2,tmpref));
inc(dstref.offset,16);
end;
if (len=8) or (len=24) or (len=40) then
begin
tmpref:=dstref;
make_simple_ref(list,tmpref);
list.concat(taicpu.op_reg_ref(A_VMOVSD,S_NO,r3,tmpref));
end;
end
else {copy_string, should be a good fallback in case of unhandled}
begin