{ Mirror of https://gitlab.com/freepascal.org/fpc/source.git
  (synced 2025-04-06 14:08:02 +02:00). }
{$ifndef FPC_SYSTEM_HAS_MOVE}
|
||
{$define FPC_SYSTEM_HAS_MOVE}
|
||
|
||
{ at least valgrind up to 3.3 has a bug which prevents the default code to
  work so we use a rather simple implementation here }
procedure Move_8OrMore_Valgrind; assembler; nostackframe;
{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8).
  Simple byte-by-byte move used when running under valgrind.
  Must pop ebx (pushed by the caller, Move) before returning. }
asm
        sub     %edx, %eax          { eax = source - dest; source bytes are read via (eax+edx) from here on }
        jae     .LForward           { source >= dest => forward copy is always safe }
        mov     %ecx, %ebx
        add     %eax, %ebx          { eax is negative => ecx+eax > 0 if regions overlap }
        jb      .LBack              { if no overlap, still do forward move }

.LForward:
{$ifdef FPC_ENABLED_CLD}
        cld
{$endif FPC_ENABLED_CLD}
        push    %esi
        push    %edi
        lea     (%eax,%edx), %esi   { esi = original source pointer }
        mov     %edx, %edi          { edi = dest }
        rep movsb                   { plain forward byte copy }
        pop     %edi
        pop     %esi
        pop     %ebx
        ret

.LBack:                             { overlapping with dest > source: copy backwards, byte by byte }
        add     %ecx, %edx          { edx = one past the end of dest }
.LNextb:
        dec     %edx
        mov     (%eax,%edx), %bl    { (eax+edx) = matching source byte }
        mov     %bl, (%edx)
        dec     %ecx
        jnz     .LNextb
        pop     %ebx
end;
|
||
|
||
procedure Move_8OrMore_IA32; assembler; nostackframe;
{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8).
  Plain i386 fallback: copies 8 bytes at a time through the FPU stack
  (fildq/fistpq round-trip any 64-bit pattern unchanged).
  First and last 8 bytes are kept on the FPU stack for the whole move and
  written last, which makes unaligned edges and small overlaps safe. }
asm
        fildq   (%eax)              { First and last 8 bytes, used both in .L33OrMore and ladder ending (.L9to16). }
        fildq   -8(%eax,%ecx)       { st0 = last 8, st1 = first 8 from here on }
        cmp     $16, %ecx
        jle     .L9to16
        cmp     $32, %ecx
        jg      .L33OrMore
        fildq   8(%eax)             { 17–32 bytes: two more qwords cover the middle }
        fildq   -16(%eax,%ecx)
        fistpq  -16(%edx,%ecx)
        fistpq  8(%edx)
.L9to16:
        fistpq  -8(%edx,%ecx)       { 9–16 bytes }
        fistpq  (%edx)
        pop     %ebx
        ret

.Lcancel:
        fucompp                     { Pop two elements loaded at the beginning. }
        pop     %ebx
        ret
        .byte 0x66,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16f into a no-op. }

.L33OrMore:
        sub     %edx, %eax          { eax = src - dest }
        jz      .Lcancel            { exit if src=dest }
        jnb     .LForward           { src>dest => forward move }

        mov     %ecx, %ebx
        add     %eax, %ebx          { eax is negative => ecx+eax > 0 if regions overlap }
        jb      .Lback              { if no overlap, still do forward move }

.LForward:
        mov     %edx, %ebx          { remember original dest to write first 8 bytes }
        add     %edx, %ecx          { Move dest to the next 8-byte boundary. +8 if already aligned, as first 8 bytes will be written separately anyway. }
        add     $8, %edx
        and     $-8, %edx
        sub     %edx, %ecx          { ecx = bytes remaining past the aligned dest }

        sub     $16, %ecx           { loop accounting: ecx = remaining - 16 during the loop }
        jbe     .LPost16f

        .balign 16                  { no-op }
.Lloop16f:
        fildq   (%eax,%edx)
        fistpq  (%edx)
        fildq   8(%eax,%edx)
        fistpq  8(%edx)
        add     $16, %edx
        sub     $16, %ecx
        ja      .Lloop16f

.LPost16f:                          { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
        cmp     $-8, %ecx
        jle     .LFirstAndLast8f
        fildq   (%eax,%edx)         { one more qword still fits before the tail }
        fistpq  (%edx)
.LFirstAndLast8f:
        fistpq  8(%edx,%ecx)        { Write first and last 8 bytes after everything else. }
        fistpq  (%ebx)              { Important for <8-byte step between src and dest. }
        pop     %ebx
        ret
        .byte 0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16b into a no-op. }

{ backwards move }
.Lback:
        lea     -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
        mov     %ebx, %ecx          { move dest to the previous 8-byte boundary... }
        and     $-8, %ecx
        sub     %edx, %ecx          { ...ecx = bytes from dest up to that boundary... }
        add     %ecx, %edx          { ...edx = aligned end of the region to copy in the loop }

        sub     $16, %ecx
        jbe     .LPost16b

        .balign 16                  { no-op }
.Lloop16b:
        sub     $16, %edx
        fildq   8(%eax,%edx)
        fistpq  8(%edx)
        fildq   (%eax,%edx)
        fistpq  (%edx)
        sub     $16, %ecx
        ja      .Lloop16b

.LPost16b:
        cmp     $-8, %ecx
        jle     .LFirstAndLast8b
        fildq   -8(%eax,%edx)
        fistpq  -8(%edx)
.LFirstAndLast8b:
        sub     %ecx, %edx
        fistpq  -7(%ebx)            { last 8 bytes (ebx = last byte of dest) }
        fistpq  -16(%edx)           { first 8 bytes, written very last }
        pop     %ebx
end;
|
||
|
||
procedure Move_8OrMore_MMX; assembler; nostackframe;
{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8).
  Same structure as Move_8OrMore_IA32 but moves 8-byte blocks through MMX
  registers; small counts are delegated to the IA32 version because EMMS
  overhead is not worth it.  mm4/mm5 hold the first/last 8 source bytes and
  are stored last, which covers unaligned edges and small overlaps. }
asm
        cmp     $72, %ecx           { Size at which using MMX becomes worthwhile. }
        jl      Move_8OrMore_IA32
        movq    (%eax), %mm4        { First and last 8 bytes. }
        movq    -8(%eax,%ecx), %mm5
        sub     %edx, %eax          { eax = src - dest }
        jz      .Lquit              { exit if src=dest }
        jnb     .LForward           { src>dest => forward move }

        mov     %ecx, %ebx
        add     %eax, %ebx          { eax is negative => ecx+eax > 0 if regions overlap }
        jb      .Lback              { if no overlap, still do forward move }

.LForward:
        mov     %edx, %ebx          { remember original dest to write first 8 bytes }
        add     %edx, %ecx          { Move dest to the next 8-byte boundary. +8 if already aligned, as first 8 bytes will be written separately anyway. }
        add     $8, %edx
        and     $-8, %edx
        sub     %edx, %ecx          { ecx = bytes remaining past the aligned dest }

        sub     $16, %ecx           { loop accounting: ecx = remaining - 16 during the loop }
        jbe     .LPost16f

        .balign 16
.Lloop16f:
        movq    (%eax,%edx), %mm0
        movq    %mm0, (%edx)
        movq    8(%eax,%edx), %mm0
        movq    %mm0, 8(%edx)
        add     $16, %edx
        sub     $16, %ecx
        ja      .Lloop16f

.LPost16f:                          { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
        cmp     $-8, %ecx
        jle     .LFirstAndLast8f
        movq    (%eax,%edx), %mm0   { one more qword still fits before the tail }
        movq    %mm0, (%edx)
.LFirstAndLast8f:
        movq    %mm5, 8(%edx,%ecx)  { Write first and last 8 bytes after everything else. }
        movq    %mm4, (%ebx)        { Important for <8-byte step between src and dest. }
.Lquit:
        emms                        { restore FPU state after MMX use }
        pop     %ebx
        ret
        .byte 0x66,0x66,0x66,0x66,0x66,0x2E,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16b into a no-op. }

{ backwards move }
.Lback:
        lea     -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
        mov     %ebx, %ecx          { move dest to the previous 8-byte boundary... }
        and     $-8, %ecx
        sub     %edx, %ecx          { ...ecx = bytes from dest up to that boundary... }
        add     %ecx, %edx          { ...edx = aligned end of the region to copy in the loop }

        sub     $16, %ecx
        jbe     .LPost16b

        .balign 16                  { no-op }
.Lloop16b:
        sub     $16, %edx
        movq    8(%eax,%edx), %mm0
        movq    %mm0, 8(%edx)
        movq    (%eax,%edx), %mm0
        movq    %mm0, (%edx)
        sub     $16, %ecx
        ja      .Lloop16b

.LPost16b:
        cmp     $-8, %ecx
        jle     .LFirstAndLast8b
        movq    -8(%eax,%edx), %mm0
        movq    %mm0, -8(%edx)
.LFirstAndLast8b:
        sub     %ecx, %edx
        movq    %mm4, -16(%edx)     { first 8 bytes }
        movq    %mm5, -7(%ebx)      { last 8 bytes (ebx = last byte of dest) }
        emms                        { restore FPU state after MMX use }
        pop     %ebx
end;
|
||
|
||
{$ifndef FASTMOVE_DISABLE_SSE}
|
||
procedure Move_8OrMore_SSE; assembler; nostackframe;
{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8).
  SSE version: 16-byte blocks in xmm registers, with three fast paths for
  large forward moves — aligned MOVAPS loads, "rep movsb" on CPUs that report
  fast ERMS, and non-temporal MOVNTPS stores (with prefetch) above
  NtThreshold.  xmm4/xmm5 hold the first/last 16 source bytes and are stored
  after the bulk copy, covering unaligned edges and small overlaps. }
const
  ErmsThreshold = 1536;
  NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
  PrefetchDistance = 512;
asm
        cmp     $16, %ecx
        jle     .L9to16
        movups  (%eax), %xmm4       { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
        movups  -16(%eax,%ecx), %xmm5
        cmp     $32, %ecx
        jg      .L33OrMore
        movups  %xmm4, (%edx)       { 17–32 bytes }
        movups  %xmm5, -16(%edx,%ecx)
        pop     %ebx
        ret

.L9to16:                            { 9–16 bytes: two possibly overlapping 8-byte stores }
        movq    (%eax), %xmm0
        movq    -8(%eax,%ecx), %xmm1
        movq    %xmm0, (%edx)
        movq    %xmm1, -8(%edx,%ecx)
.Lquit:
        pop     %ebx
        ret
        .byte 0x66,0x66,0x66,0x66,0x66,0x2E,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop32f into a no-op. }

.L33OrMore:
        sub     %edx, %eax          { eax = src - dest }
        jz      .Lquit              { exit if src=dest }
        jnb     .LForward           { src>dest => forward move }

        mov     %ecx, %ebx
        add     %eax, %ebx          { eax is negative => ecx+eax > 0 if regions overlap }
        jb      .Lback              { if no overlap, still do forward move }

.LForward:
        mov     %edx, %ebx          { remember original dest to write first 16 bytes }
        add     %edx, %ecx          { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be written separately anyway. }
        add     $16, %edx
        and     $-16, %edx
        sub     %edx, %ecx          { ecx = bytes remaining past the aligned dest }

.LRestAfterNTf:
        sub     $32, %ecx           { During the N× loop, ecx is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
        jbe     .LPost32f
        cmp     $NtThreshold-32, %ecx
        jae     .Lntf               { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
.LNtIsNotBetter:
        cmp     $ErmsThreshold-32, %ecx { Even enhanced REP MOV does not seem to use NT so falls behind on huge moves. So prioritize NT. }
        jae     .LRepMovsF
.LRepMovsIsNotBetter:
        test    $15, %eax           { src - dest also 16-aligned? then loads can use movaps too }
        jz      .Lalignedloop32f

        .balign 16                  { no-op }
.Lloop32f:
        movups  (%eax,%edx), %xmm0
        movaps  %xmm0, (%edx)       { dest is 16-aligned by now, so aligned stores }
        movups  16(%eax,%edx), %xmm0
        movaps  %xmm0, 16(%edx)
        add     $32, %edx
        sub     $32, %ecx
        ja      .Lloop32f

.LPost32f:                          { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
        cmp     $-16, %ecx
        jle     .LFirstAndLast16f
        movups  (%eax,%edx), %xmm0  { one more 16-byte block still fits before the tail }
        movaps  %xmm0, (%edx)
.LFirstAndLast16f:
        movups  %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
        movups  %xmm4, (%ebx)       { Important for <16-byte step between src and dest. }
        pop     %ebx
        ret

        .balign 16
.Lalignedloop32f:                   { Same as above starting from .Lloop32f but with MOVAPSes. }
        movaps  (%eax,%edx), %xmm0
        movaps  %xmm0, (%edx)
        movaps  16(%eax,%edx), %xmm0
        movaps  %xmm0, 16(%edx)
        add     $32, %edx
        sub     $32, %ecx
        ja      .Lalignedloop32f

.LalignedPost32f:
        cmp     $-16, %ecx
        jle     .LalignedFirstAndLast16f
        movaps  (%eax,%edx), %xmm0
        movaps  %xmm0, (%edx)
.LalignedFirstAndLast16f:
        movups  %xmm5, 16(%edx,%ecx)
        movups  %xmm4, (%ebx)
        pop     %ebx
        ret

.LRepMovsF:                         { forward move via "rep movsb", only if the CPU reports fast ERMS }
{$ifdef FPC_PIC}
        push    %ebx
        call    fpc_geteipasebx
        addl    $_GLOBAL_OFFSET_TABLE_, %ebx
        movl    fast_large_repmovstosb@GOT(%ebx), %ebx
        cmpb    $1, (%ebx)
        pop     %ebx
{$else FPC_PIC}
        cmpb    $1, fast_large_repmovstosb
{$endif FPC_PIC}
        jne     .LRepMovsIsNotBetter
        push    %esi
        push    %edi
        lea     (%eax,%edx), %esi   { esi = current source position }
        mov     %edx, %edi          { edi = current (aligned) dest position }
        add     $32, %ecx           { undo the loop-accounting bias: ecx = real remaining count }
        rep movsb
        movups  %xmm4, (%ebx)       { last 16 aren't required }
        pop     %edi
        pop     %esi
        pop     %ebx
        ret

.Lntf:
        cmp     $NtThreshold, %eax  { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
        jb      .LNtIsNotBetter     { (this check is performed here to not stand in the way of smaller counts) }
        sub     $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
        test    $15, %eax
        jz      .Lalignedntloop64f

        .balign 16
.Lntloop64f:
        prefetchnta 0+PrefetchDistance(%eax,%edx,1)
        movups  (%eax,%edx,1), %xmm0
        movntps %xmm0, (%edx)       { non-temporal store: bypass the cache on huge moves }
        movups  16(%eax,%edx,1), %xmm0
        movntps %xmm0, 16(%edx)
        movups  32(%eax,%edx,1), %xmm0
        movntps %xmm0, 32(%edx)
        movups  48(%eax,%edx,1), %xmm0
        movntps %xmm0, 48(%edx)
        add     $64, %edx
        sub     $64, %ecx
        jae     .Lntloop64f

        sfence                      { order NT stores before anything after the move }
        add     $PrefetchDistance+64, %ecx { restore the 32-biased loop accounting }
        jmp     .LRestAfterNTf      { go handle remaining bytes }

        .balign 16
.Lalignedntloop64f:                 { Same as above starting from .Lntloop64f but with MOVAPSes. }
        prefetchnta 0+PrefetchDistance(%eax,%edx,1)
        movaps  (%eax,%edx,1), %xmm0
        movntps %xmm0, (%edx)
        movaps  16(%eax,%edx,1), %xmm0
        movntps %xmm0, 16(%edx)
        movaps  32(%eax,%edx,1), %xmm0
        movntps %xmm0, 32(%edx)
        movaps  48(%eax,%edx,1), %xmm0
        movntps %xmm0, 48(%edx)
        add     $64, %edx
        sub     $64, %ecx
        jae     .Lalignedntloop64f

        sfence
        add     $PrefetchDistance+64, %ecx
        jmp     .LRestAfterNTf
        .byte 0x66,0x0F,0x1F,0x44,0,0 { Turns .balign 16 before .Lloop32b into a no-op. }

{ backwards move }
.Lback:
        lea     -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 16 bytes }
        mov     %ebx, %ecx          { move dest to the previous 16-byte boundary... }
        and     $-16, %ecx
        sub     %edx, %ecx          { ...ecx = bytes from dest up to that boundary... }
        add     %ecx, %edx          { ...edx = aligned end of the region to copy in the loop }

.LRestAfterNTb:
        sub     $32, %ecx           { same -32 loop-accounting bias as the forward path }
        jbe     .LPost32b
        cmp     $NtThreshold-32, %ecx
        jae     .Lntb

        .balign 16                  { no-op }
.Lloop32b:
        sub     $32, %edx
        movups  16(%eax,%edx), %xmm0
        movaps  %xmm0, 16(%edx)
        movups  (%eax,%edx), %xmm0
        movaps  %xmm0, (%edx)
        sub     $32, %ecx
        ja      .Lloop32b

.LPost32b:
        cmp     $-16, %ecx
        jle     .LFirstAndLast16b
        movups  -16(%eax,%edx), %xmm0
        movaps  %xmm0, -16(%edx)
.LFirstAndLast16b:
        sub     %ecx, %edx
        movups  %xmm4, -32(%edx)    { first 16 bytes }
        movups  %xmm5, -15(%ebx)    { last 16 bytes (ebx = last byte of dest) }
        pop     %ebx
        ret

.Lntb:
        cmp     $-NtThreshold, %eax { src below dest but far enough away for NT stores? }
        jnb     .Lloop32b           { too close => regular backward loop }
        sub     $PrefetchDistance+32, %ecx

        .balign 16
.Lntloop64b:
        prefetchnta -PrefetchDistance(%eax,%edx,1)
        sub     $64, %edx
        movups  48(%eax,%edx,1), %xmm0
        movntps %xmm0, 48(%edx)
        movups  32(%eax,%edx,1), %xmm0
        movntps %xmm0, 32(%edx)
        movups  16(%eax,%edx,1), %xmm0
        movntps %xmm0, 16(%edx)
        movups  (%eax,%edx,1), %xmm0
        movntps %xmm0, (%edx)
        sub     $64, %ecx
        jae     .Lntloop64b

        sfence                      { order NT stores before anything after the move }
        add     $PrefetchDistance+64, %ecx
        jmp     .LRestAfterNTb
end;
|
||
{$endif ndef FASTMOVE_DISABLE_SSE}
|
||
|
||
procedure Move_8OrMore_Dispatch; forward;

var
  { Current Move implementation for counts > 8.  Starts at the dispatcher,
    which selects the best routine for this CPU; once CPU detection has run,
    the chosen routine is cached here (see Move_8OrMore_HumanFriendlyDispatch). }
  fastmoveproc : pointer = @Move_8OrMore_Dispatch;
{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  { Flag provided by the RTL startup code: true when running under valgrind. }
  valgrind_used : boolean;external name '__fpc_valgrind';
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
|
||
|
||
function Move_8OrMore_HumanFriendlyDispatch: pointer;
{ Picks the best Move_8OrMore_* implementation for this environment and
  returns it.  Priority: valgrind workaround, then SSE (unless disabled at
  build time), then MMX, then plain IA32.  The choice is cached in
  fastmoveproc only after fpc_cpucodeinit has run, since the capability
  flags are not meaningful before that. }
begin
  { workaround valgrind bug }
{$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  if EntryInformation.valgrind_used then
{$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  if valgrind_used then
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    result:=@Move_8OrMore_Valgrind
{$ifndef FASTMOVE_DISABLE_SSE}
  else if has_sse_support then
    result:=@Move_8OrMore_SSE
{$endif ndef FASTMOVE_DISABLE_SSE}
  else if has_mmx_support then
    result:=@Move_8OrMore_MMX
  else
    result:=@Move_8OrMore_IA32;
  if fpc_cpucodeinit_performed then
    fastmoveproc:=result;
end;
|
||
|
||
procedure Move_8OrMore_Dispatch; assembler; nostackframe;
{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8).
  Trampoline used on the first large Move: saves the argument registers,
  asks Move_8OrMore_HumanFriendlyDispatch for the right implementation,
  restores the arguments and tail-jumps to it.  ebx is free to hold the
  target because the chosen routine pops it before returning. }
asm
        push    %eax                { preserve Move's register arguments across the call }
        push    %edx
        push    %ecx
        call    Move_8OrMore_HumanFriendlyDispatch
        mov     %eax, %ebx          { ebx = selected implementation }
        pop     %ecx
        pop     %edx
        pop     %eax
        jmp     %ebx                { tail-call; the target pops ebx and returns to Move's caller }
end;
|
||
|
||
procedure Move(const source;var dest;count:SizeInt); [public, alias: 'FPC_MOVE']; assembler; nostackframe;
{ System-unit Move: copies count bytes from source to dest, handling
  overlap.  Register convention: eax = source, edx = dest, ecx = count.
  Counts <= 8 (including zero and negative) are handled inline below;
  larger counts jump to the routine cached in fastmoveproc, which expects
  ebx already pushed and pops it before returning. }
asm
        push    %ebx
        cmp     $8, %ecx
        jle     .L8OrLess
{$ifdef FPC_PIC}
        call    fpc_geteipasebx
        addl    $_GLOBAL_OFFSET_TABLE_, %ebx
        movl    fastmoveproc@GOT(%ebx), %ebx
        jmp     (%ebx)
{$else}
        jmp     fastmoveproc
{$endif}

.L8OrLess:
        cmp     $3, %ecx
        jle     .L3OrLess
        { 4–8 bytes: two possibly overlapping 4-byte moves }
        mov     (%eax), %ebx
        mov     -4(%eax,%ecx), %eax
        mov     %ebx, (%edx)
        mov     %eax, -4(%edx,%ecx)
        pop     %ebx
        ret

.L3OrLess:
        cmp     $1, %ecx
        jl      .LZero              { count <= 0: nothing to do }
        movzbl  (%eax), %ebx        { movzbl leaves flags from the cmp above intact }
        je      .LOne               { count = 1: only the single byte }
        { 2–3 bytes: first byte plus a possibly overlapping 2-byte move }
        movzwl  -2(%eax,%ecx), %eax
        mov     %ax, -2(%edx,%ecx)
.LOne:
        mov     %bl, (%edx)
.LZero:
        pop     %ebx
end;
|
||
|
||
{$endif FPC_SYSTEM_HAS_MOVE}
|