mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-20 11:29:27 +02:00
+ concatcopy variants using sse and avx, only activated if optimization for size is done because at least on an i7-4770 it has shown no benefit
git-svn-id: trunk@26588 -
This commit is contained in:
parent
060aa2a7fe
commit
48ae2d215a
@ -2168,9 +2168,9 @@ unit cgx86;
|
||||
push_segment_size = S_W;
|
||||
{$endif}
|
||||
|
||||
type copymode=(copy_move,copy_mmx,copy_string);
|
||||
type copymode=(copy_move,copy_mmx,copy_string,copy_mm,copy_avx);
|
||||
|
||||
var srcref,dstref:Treference;
|
||||
var srcref,dstref,tmpref:Treference;
|
||||
r,r0,r1,r2,r3:Tregister;
|
||||
helpsize:tcgint;
|
||||
copysize:byte;
|
||||
@ -2182,6 +2182,28 @@ unit cgx86;
|
||||
helpsize:=3*sizeof(aword);
|
||||
if cs_opt_size in current_settings.optimizerswitches then
|
||||
helpsize:=2*sizeof(aword);
|
||||
{$ifndef i8086}
|
||||
{ avx helps only to reduce size, using it in general does at least not help on
|
||||
an i7-4770 (FK) }
|
||||
if (CPUX86_HAS_AVXUNIT in cpu_capabilities[current_settings.cputype]) and
|
||||
// (cs_opt_size in current_settings.optimizerswitches) and
|
||||
((len=8) or (len=16) or (len=24) or (len=32) { or (len=40) or (len=48)}) then
|
||||
cm:=copy_avx
|
||||
else
|
||||
{$ifdef dummy}
|
||||
{ I'am not sure what CPUs would benefit from using sse instructions for moves (FK) }
|
||||
if
|
||||
{$ifdef x86_64}
|
||||
((current_settings.fputype>=fpu_sse64)
|
||||
{$else x86_64}
|
||||
((current_settings.fputype>=fpu_sse)
|
||||
{$endif x86_64}
|
||||
or (CPUX86_HAS_SSEUNIT in cpu_capabilities[current_settings.cputype])) and
|
||||
((len=8) or (len=16) or (len=24) or (len=32) or (len=40) or (len=48)) then
|
||||
cm:=copy_mm
|
||||
else
|
||||
{$endif dummy}
|
||||
{$endif i8086}
|
||||
if (cs_mmx in current_settings.localswitches) and
|
||||
not(pi_uses_fpu in current_procinfo.flags) and
|
||||
((len=8) or (len=16) or (len=24) or (len=32)) then
|
||||
@ -2189,7 +2211,7 @@ unit cgx86;
|
||||
if (len>helpsize) then
|
||||
cm:=copy_string;
|
||||
if (cs_opt_size in current_settings.optimizerswitches) and
|
||||
not((len<=16) and (cm=copy_mmx)) and
|
||||
not((len<=16) and (cm in [copy_mmx,copy_mm,copy_avx])) and
|
||||
not(len in copy_len_sizes) then
|
||||
cm:=copy_string;
|
||||
{$ifndef i8086}
|
||||
@ -2239,6 +2261,7 @@ unit cgx86;
|
||||
inc(dstref.offset,copysize);
|
||||
end;
|
||||
end;
|
||||
|
||||
copy_mmx:
|
||||
begin
|
||||
dstref:=dest;
|
||||
@ -2282,6 +2305,129 @@ unit cgx86;
|
||||
inc(dstref.offset,8);
|
||||
a_loadmm_reg_ref(list,OS_M64,OS_M64,r3,dstref,nil);
|
||||
end;
|
||||
end;
|
||||
|
||||
copy_mm:
|
||||
begin
|
||||
dstref:=dest;
|
||||
srcref:=source;
|
||||
r0:=NR_NO;
|
||||
r1:=NR_NO;
|
||||
r2:=NR_NO;
|
||||
r3:=NR_NO;
|
||||
if len>=16 then
|
||||
begin
|
||||
r0:=getmmregister(list,OS_M128);
|
||||
a_loadmm_ref_reg(list,OS_M128,OS_M128,srcref,r0,nil);
|
||||
inc(srcref.offset,16);
|
||||
end;
|
||||
if len>=32 then
|
||||
begin
|
||||
r1:=getmmregister(list,OS_M128);
|
||||
a_loadmm_ref_reg(list,OS_M128,OS_M128,srcref,r1,nil);
|
||||
inc(srcref.offset,16);
|
||||
end;
|
||||
if len>=48 then
|
||||
begin
|
||||
r2:=getmmregister(list,OS_M128);
|
||||
a_loadmm_ref_reg(list,OS_M128,OS_M128,srcref,r2,nil);
|
||||
inc(srcref.offset,16);
|
||||
end;
|
||||
if (len=8) or (len=24) or (len=40) then
|
||||
begin
|
||||
r3:=getmmregister(list,OS_M64);
|
||||
a_loadmm_ref_reg(list,OS_M64,OS_M64,srcref,r3,nil);
|
||||
end;
|
||||
|
||||
if len>=16 then
|
||||
begin
|
||||
a_loadmm_reg_ref(list,OS_M128,OS_M128,r0,dstref,nil);
|
||||
inc(dstref.offset,16);
|
||||
end;
|
||||
if len>=32 then
|
||||
begin
|
||||
a_loadmm_reg_ref(list,OS_M128,OS_M128,r1,dstref,nil);
|
||||
inc(dstref.offset,16);
|
||||
end;
|
||||
if len>=48 then
|
||||
begin
|
||||
a_loadmm_reg_ref(list,OS_M128,OS_M128,r2,dstref,nil);
|
||||
inc(dstref.offset,16);
|
||||
end;
|
||||
if (len=8) or (len=24) or (len=40) then
|
||||
begin
|
||||
a_loadmm_reg_ref(list,OS_M64,OS_M64,r3,dstref,nil);
|
||||
end;
|
||||
end;
|
||||
|
||||
copy_avx:
|
||||
begin
|
||||
dstref:=dest;
|
||||
srcref:=source;
|
||||
r0:=NR_NO;
|
||||
r1:=NR_NO;
|
||||
r2:=NR_NO;
|
||||
r3:=NR_NO;
|
||||
if len>=16 then
|
||||
begin
|
||||
r0:=getmmregister(list,OS_M128);
|
||||
{ we want to force the use of vmovups, so do not use a_loadmm_ref_reg }
|
||||
tmpref:=srcref;
|
||||
make_simple_ref(list,tmpref);
|
||||
list.concat(taicpu.op_ref_reg(A_VMOVUPS,S_NO,tmpref,r0));
|
||||
inc(srcref.offset,16);
|
||||
end;
|
||||
if len>=32 then
|
||||
begin
|
||||
r1:=getmmregister(list,OS_M128);
|
||||
tmpref:=srcref;
|
||||
make_simple_ref(list,tmpref);
|
||||
list.concat(taicpu.op_ref_reg(A_VMOVUPS,S_NO,tmpref,r1));
|
||||
inc(srcref.offset,16);
|
||||
end;
|
||||
if len>=48 then
|
||||
begin
|
||||
r2:=getmmregister(list,OS_M128);
|
||||
tmpref:=srcref;
|
||||
make_simple_ref(list,tmpref);
|
||||
list.concat(taicpu.op_ref_reg(A_VMOVUPS,S_NO,tmpref,r2));
|
||||
inc(srcref.offset,16);
|
||||
end;
|
||||
if (len=8) or (len=24) or (len=40) then
|
||||
begin
|
||||
r3:=getmmregister(list,OS_M64);
|
||||
tmpref:=srcref;
|
||||
make_simple_ref(list,tmpref);
|
||||
list.concat(taicpu.op_ref_reg(A_VMOVSD,S_NO,tmpref,r3));
|
||||
end;
|
||||
|
||||
if len>=16 then
|
||||
begin
|
||||
tmpref:=dstref;
|
||||
make_simple_ref(list,tmpref);
|
||||
list.concat(taicpu.op_reg_ref(A_VMOVUPS,S_NO,r0,tmpref));
|
||||
inc(dstref.offset,16);
|
||||
end;
|
||||
if len>=32 then
|
||||
begin
|
||||
tmpref:=dstref;
|
||||
make_simple_ref(list,tmpref);
|
||||
list.concat(taicpu.op_reg_ref(A_VMOVUPS,S_NO,r1,tmpref));
|
||||
inc(dstref.offset,16);
|
||||
end;
|
||||
if len>=48 then
|
||||
begin
|
||||
tmpref:=dstref;
|
||||
make_simple_ref(list,tmpref);
|
||||
list.concat(taicpu.op_reg_ref(A_VMOVUPS,S_NO,r2,tmpref));
|
||||
inc(dstref.offset,16);
|
||||
end;
|
||||
if (len=8) or (len=24) or (len=40) then
|
||||
begin
|
||||
tmpref:=dstref;
|
||||
make_simple_ref(list,tmpref);
|
||||
list.concat(taicpu.op_reg_ref(A_VMOVSD,S_NO,r3,tmpref));
|
||||
end;
|
||||
end
|
||||
else {copy_string, should be a good fallback in case of unhandled}
|
||||
begin
|
||||
|
Loading…
Reference in New Issue
Block a user