{ fpc/rtl/i386/fastmove.inc — optimized FPC_MOVE implementations for i386.
  Snapshot dated 2023-12-10 13:26:39 +00:00 (534 lines, 15 KiB).
  Note: the "PHP" language tag, "Raw Blame History" line and the ambiguous-
  Unicode warnings above were web-viewer chrome from the hosting site, not
  part of this Free Pascal include file. }
{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}
{ at least valgrind up to 3.3 has a bug which prevents the default code to
work so we use a rather simple implementation here }
procedure Move_8OrMore_Valgrind; assembler; nostackframe;
{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8).
  Simple byte-wise move: REP MOVSB forward, or a manual byte loop backward
  when the regions overlap with dest above src. }
asm
sub %edx, %eax { eax = src - dest; sign decides the copy direction }
jae .LForward { src >= dest => forward copy is always safe }
mov %ecx, %ebx
add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
jb .LBack { if no overlap, still do forward move }
.LForward:
{$ifdef FPC_ENABLED_CLD}
cld
{$endif FPC_ENABLED_CLD}
push %esi { esi/edi must be preserved, hence the push/pop pair }
push %edi
lea (%eax,%edx), %esi { esi = (src - dest) + dest = src }
mov %edx, %edi { edi = dest }
rep movsb
pop %edi
pop %esi
pop %ebx
ret
.LBack: { dest overlaps src from above: copy byte-by-byte from the end }
add %ecx, %edx { edx = one past the end of dest }
.LNextb:
dec %edx
mov (%eax,%edx), %bl { bl = source byte (eax still holds src - dest) }
mov %bl, (%edx)
dec %ecx
jnz .LNextb
pop %ebx
end;
procedure Move_8OrMore_IA32; assembler; nostackframe;
{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8).
  Moves 8 bytes at a time through the x87 FPU (fildq/fistpq), so it works
  on any i386-class CPU without MMX or SSE. Overlap is handled by writing
  the (pre-loaded) first and last 8 bytes after the bulk of the move. }
asm
fildq (%eax) { First and last 8 bytes, used both in .L33OrMore and ladder ending (.L9to16). }
fildq -8(%eax,%ecx)
cmp $16, %ecx
jle .L9to16
cmp $32, %ecx
jg .L33OrMore
fildq 8(%eax) { 17..32 bytes: two more, possibly overlapping, 8-byte chunks cover the middle }
fildq -16(%eax,%ecx)
fistpq -16(%edx,%ecx)
fistpq 8(%edx)
.L9to16:
fistpq -8(%edx,%ecx) { 9..16 bytes }
fistpq (%edx)
pop %ebx
ret
.Lcancel:
fucompp { Pop two elements loaded at the beginning. }
pop %ebx
ret
.byte 0x66,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16f into a no-op. }
.L33OrMore:
sub %edx, %eax { eax = src - dest }
jz .Lcancel { exit if src=dest }
jnb .LForward { src>dest => forward move }
mov %ecx, %ebx
add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
jb .Lback { if no overlap, still do forward move }
.LForward:
mov %edx, %ebx { remember original dest to write first 8 bytes }
add %edx, %ecx { Move dest to the next 8-byte boundary. +8 if already aligned, as first 8 bytes will be written separately anyway. }
add $8, %edx
and $-8, %edx
sub %edx, %ecx
sub $16, %ecx
jbe .LPost16f
.balign 16 { no-op }
.Lloop16f:
fildq (%eax,%edx) { 16 bytes per iteration; source addr = dest + (src - dest) }
fistpq (%edx)
fildq 8(%eax,%edx)
fistpq 8(%edx)
add $16, %edx
sub $16, %ecx
ja .Lloop16f
.LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
cmp $-8, %ecx
jle .LFirstAndLast8f
fildq (%eax,%edx) { one more 8-byte chunk when more than 8 bytes remain }
fistpq (%edx)
.LFirstAndLast8f:
fistpq 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
fistpq (%ebx) { Important for <8-byte step between src and dest. }
pop %ebx
ret
.byte 0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
and $-8, %ecx
sub %edx, %ecx { ...ecx = bytes from dest up to that boundary... }
add %ecx, %edx { ...edx = 8-byte-aligned end of the region }
sub $16, %ecx
jbe .LPost16b
.balign 16 { no-op }
.Lloop16b:
sub $16, %edx
fildq 8(%eax,%edx)
fistpq 8(%edx)
fildq (%eax,%edx)
fistpq (%edx)
sub $16, %ecx
ja .Lloop16b
.LPost16b:
cmp $-8, %ecx
jle .LFirstAndLast8b
fildq -8(%eax,%edx)
fistpq -8(%edx)
.LFirstAndLast8b:
sub %ecx, %edx
fistpq -7(%ebx) { last 8 bytes (ebx = address of the last byte of dest) }
fistpq -16(%edx) { first 8 bytes }
pop %ebx
end;
procedure Move_8OrMore_MMX; assembler; nostackframe;
{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8).
  Same structure as Move_8OrMore_IA32 but moves 8-byte chunks through MMX
  registers; falls back to the x87 version for small counts. }
asm
cmp $72, %ecx { Size at which using MMX becomes worthwhile. }
jl Move_8OrMore_IA32 { tail-jump: ebx is still pushed, as that routine expects }
movq (%eax), %mm4 { First and last 8 bytes. }
movq -8(%eax,%ecx), %mm5
sub %edx, %eax { eax = src - dest }
jz .Lquit { exit if src=dest }
jnb .LForward { src>dest => forward move }
mov %ecx, %ebx
add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
jb .Lback { if no overlap, still do forward move }
.LForward:
mov %edx, %ebx { remember original dest to write first 8 bytes }
add %edx, %ecx { Move dest to the next 8-byte boundary. +8 if already aligned, as first 8 bytes will be written separately anyway. }
add $8, %edx
and $-8, %edx
sub %edx, %ecx
sub $16, %ecx
jbe .LPost16f
.balign 16
.Lloop16f:
movq (%eax,%edx), %mm0 { 16 bytes per iteration; source addr = dest + (src - dest) }
movq %mm0, (%edx)
movq 8(%eax,%edx), %mm0
movq %mm0, 8(%edx)
add $16, %edx
sub $16, %ecx
ja .Lloop16f
.LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
cmp $-8, %ecx
jle .LFirstAndLast8f
movq (%eax,%edx), %mm0 { one more 8-byte chunk when more than 8 bytes remain }
movq %mm0, (%edx)
.LFirstAndLast8f:
movq %mm5, 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
movq %mm4, (%ebx) { Important for <8-byte step between src and dest. }
.Lquit:
emms { reset FPU tag word so x87 code keeps working after MMX use }
pop %ebx
ret
.byte 0x66,0x66,0x66,0x66,0x66,0x2E,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
and $-8, %ecx
sub %edx, %ecx { ...ecx = bytes from dest up to that boundary... }
add %ecx, %edx { ...edx = 8-byte-aligned end of the region }
sub $16, %ecx
jbe .LPost16b
.balign 16 { no-op }
.Lloop16b:
sub $16, %edx
movq 8(%eax,%edx), %mm0
movq %mm0, 8(%edx)
movq (%eax,%edx), %mm0
movq %mm0, (%edx)
sub $16, %ecx
ja .Lloop16b
.LPost16b:
cmp $-8, %ecx
jle .LFirstAndLast8b
movq -8(%eax,%edx), %mm0
movq %mm0, -8(%edx)
.LFirstAndLast8b:
sub %ecx, %edx
movq %mm4, -16(%edx) { first 8 bytes }
movq %mm5, -7(%ebx) { last 8 bytes (ebx = address of the last byte of dest) }
emms
pop %ebx
end;
{$ifndef FASTMOVE_DISABLE_SSE}
procedure Move_8OrMore_SSE; assembler; nostackframe;
{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8).
  32 bytes per loop iteration through SSE registers. Very large counts use
  either REP MOVSB (when fast_large_repmovstosb is set) or non-temporal
  MOVNTPS stores with PREFETCHNTA once count exceeds NtThreshold. }
const
ErmsThreshold = 1536; { From this size on, enhanced REP MOVSB is preferred over the SSE loop. }
NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
PrefetchDistance = 512;
asm
cmp $16, %ecx
jle .L9to16
movups (%eax), %xmm4 { First and last 16 bytes, used both in .L33OrMore and 17..32 branch. }
movups -16(%eax,%ecx), %xmm5
cmp $32, %ecx
jg .L33OrMore
movups %xmm4, (%edx) { 17..32 bytes }
movups %xmm5, -16(%edx,%ecx)
pop %ebx
ret
.L9to16:
movq (%eax), %xmm0 { 9..16 bytes: two possibly overlapping 8-byte moves }
movq -8(%eax,%ecx), %xmm1
movq %xmm0, (%edx)
movq %xmm1, -8(%edx,%ecx)
.Lquit:
pop %ebx
ret
.byte 0x66,0x66,0x66,0x66,0x66,0x2E,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop32f into a no-op. }
.L33OrMore:
sub %edx, %eax { eax = src - dest }
jz .Lquit { exit if src=dest }
jnb .LForward { src>dest => forward move }
mov %ecx, %ebx
add %eax, %ebx { eax is negative => ecx+eax > 0 if regions overlap }
jb .Lback { if no overlap, still do forward move }
.LForward:
mov %edx, %ebx { remember original dest to write first 16 bytes }
add %edx, %ecx { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be writen separately anyway. }
add $16, %edx
and $-16, %edx
sub %edx, %ecx
.LRestAfterNTf:
sub $32, %ecx { During the N× loop, ecx is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
jbe .LPost32f
cmp $NtThreshold-32, %ecx
jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
.LNtIsNotBetter:
cmp $ErmsThreshold-32, %ecx { Even enhanced REP MOV does not seem to use NT so falls behind on huge moves. So prioritize NT. }
jae .LRepMovsF
.LRepMovsIsNotBetter:
test $15, %eax { src - dest a multiple of 16? Then loads can be MOVAPS too. }
jz .Lalignedloop32f
.balign 16 { no-op }
.Lloop32f:
movups (%eax,%edx), %xmm0 { unaligned loads, aligned stores (dest aligned above) }
movaps %xmm0, (%edx)
movups 16(%eax,%edx), %xmm0
movaps %xmm0, 16(%edx)
add $32, %edx
sub $32, %ecx
ja .Lloop32f
.LPost32f: { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
cmp $-16, %ecx
jle .LFirstAndLast16f
movups (%eax,%edx), %xmm0 { one more 16-byte chunk when more than 16 bytes remain }
movaps %xmm0, (%edx)
.LFirstAndLast16f:
movups %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
movups %xmm4, (%ebx) { Important for <16-byte step between src and dest. }
pop %ebx
ret
.balign 16
.Lalignedloop32f: { Same as above starting from .Lloop32f but with MOVAPSes. }
movaps (%eax,%edx), %xmm0
movaps %xmm0, (%edx)
movaps 16(%eax,%edx), %xmm0
movaps %xmm0, 16(%edx)
add $32, %edx
sub $32, %ecx
ja .Lalignedloop32f
.LalignedPost32f:
cmp $-16, %ecx
jle .LalignedFirstAndLast16f
movaps (%eax,%edx), %xmm0
movaps %xmm0, (%edx)
.LalignedFirstAndLast16f:
movups %xmm5, 16(%edx,%ecx)
movups %xmm4, (%ebx)
pop %ebx
ret
.LRepMovsF:
{$ifdef FPC_PIC}
push %ebx { ebx currently holds original dest; preserve it across the GOT access }
call fpc_geteipasebx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
movl fast_large_repmovstosb@GOT(%ebx), %ebx
cmpb $1, (%ebx)
pop %ebx
{$else FPC_PIC}
cmpb $1, fast_large_repmovstosb { flag set elsewhere by CPU detection — presumably ERMS; TODO confirm in cpucodeinit }
{$endif FPC_PIC}
jne .LRepMovsIsNotBetter
push %esi
push %edi
lea (%eax,%edx), %esi { esi = source address of the remaining bytes }
mov %edx, %edi { edi = dest }
add $32, %ecx { undo the -32 bias: ecx = bytes actually remaining }
rep movsb
movups %xmm4, (%ebx) { last 16 aren't required }
pop %edi
pop %esi
pop %ebx
ret
.Lntf:
cmp $NtThreshold, %eax { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
jb .LNtIsNotBetter { (this check is performed here to not stand in the way of smaller counts) }
sub $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
test $15, %eax
jz .Lalignedntloop64f
.balign 16
.Lntloop64f:
prefetchnta 0+PrefetchDistance(%eax,%edx,1)
movups (%eax,%edx,1), %xmm0
movntps %xmm0, (%edx)
movups 16(%eax,%edx,1), %xmm0
movntps %xmm0, 16(%edx)
movups 32(%eax,%edx,1), %xmm0
movntps %xmm0, 32(%edx)
movups 48(%eax,%edx,1), %xmm0
movntps %xmm0, 48(%edx)
add $64, %edx
sub $64, %ecx
jae .Lntloop64f
sfence { make the non-temporal stores globally visible before continuing }
add $PrefetchDistance+64, %ecx
jmp .LRestAfterNTf { go handle remaining bytes }
.balign 16
.Lalignedntloop64f: { Same as above starting from .Lntloop64f but with MOVAPSes. }
prefetchnta 0+PrefetchDistance(%eax,%edx,1)
movaps (%eax,%edx,1), %xmm0
movntps %xmm0, (%edx)
movaps 16(%eax,%edx,1), %xmm0
movntps %xmm0, 16(%edx)
movaps 32(%eax,%edx,1), %xmm0
movntps %xmm0, 32(%edx)
movaps 48(%eax,%edx,1), %xmm0
movntps %xmm0, 48(%edx)
add $64, %edx
sub $64, %ecx
jae .Lalignedntloop64f
sfence
add $PrefetchDistance+64, %ecx
jmp .LRestAfterNTf
.byte 0x66,0x0F,0x1F,0x44,0,0 { Turns .balign 16 before .Lloop32b into a no-op. }
{ backwards move }
.Lback:
lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 16 bytes }
mov %ebx, %ecx { move dest to the previous 16-byte boundary... }
and $-16, %ecx
sub %edx, %ecx { ...ecx = bytes from dest up to that boundary... }
add %ecx, %edx { ...edx = 16-byte-aligned end of the region }
.LRestAfterNTb:
sub $32, %ecx { same -32 bias as in the forward path }
jbe .LPost32b
cmp $NtThreshold-32, %ecx
jae .Lntb
.balign 16 { no-op }
.Lloop32b:
sub $32, %edx
movups 16(%eax,%edx), %xmm0
movaps %xmm0, 16(%edx)
movups (%eax,%edx), %xmm0
movaps %xmm0, (%edx)
sub $32, %ecx
ja .Lloop32b
.LPost32b:
cmp $-16, %ecx
jle .LFirstAndLast16b
movups -16(%eax,%edx), %xmm0
movaps %xmm0, -16(%edx)
.LFirstAndLast16b:
sub %ecx, %edx
movups %xmm4, -32(%edx) { first 16 bytes }
movups %xmm5, -15(%ebx) { last 16 bytes (ebx = address of the last byte of dest) }
pop %ebx
ret
.Lntb:
cmp $-NtThreshold, %eax { eax = src - dest is negative here; skip NT stores when regions are closer than NtThreshold }
jnb .Lloop32b
sub $PrefetchDistance+32, %ecx
.balign 16
.Lntloop64b:
prefetchnta -PrefetchDistance(%eax,%edx,1)
sub $64, %edx
movups 48(%eax,%edx,1), %xmm0
movntps %xmm0, 48(%edx)
movups 32(%eax,%edx,1), %xmm0
movntps %xmm0, 32(%edx)
movups 16(%eax,%edx,1), %xmm0
movntps %xmm0, 16(%edx)
movups (%eax,%edx,1), %xmm0
movntps %xmm0, (%edx)
sub $64, %ecx
jae .Lntloop64b
sfence
add $PrefetchDistance+64, %ecx
jmp .LRestAfterNTb
end;
{$endif ndef FASTMOVE_DISABLE_SSE}
procedure Move_8OrMore_Dispatch; forward;
var
{ Current Move implementation for counts > 8; starts at the dispatcher,
  which lets Move_8OrMore_HumanFriendlyDispatch replace it with the best
  candidate once CPU feature detection has run. }
fastmoveproc : pointer = @Move_8OrMore_Dispatch;
{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
{ External RTL flag '__fpc_valgrind' — presumably set when running under
  valgrind; used to select the valgrind-safe Move. TODO confirm in startup code. }
valgrind_used : boolean;external name '__fpc_valgrind';
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
{ Picks the Move implementation for this environment: the valgrind-safe
  version when running under valgrind, otherwise SSE > MMX > plain IA-32
  by CPU capability. Caches the choice in fastmoveproc once
  fpc_cpucodeinit_performed is set, so later calls bypass dispatching. }
function Move_8OrMore_HumanFriendlyDispatch: pointer;
begin
{ workaround valgrind bug }
{$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
if EntryInformation.valgrind_used then
{$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
if valgrind_used then
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
result:=@Move_8OrMore_Valgrind
{$ifndef FASTMOVE_DISABLE_SSE}
else if has_sse_support then
result:=@Move_8OrMore_SSE
{$endif ndef FASTMOVE_DISABLE_SSE}
else if has_mmx_support then
result:=@Move_8OrMore_MMX
else
result:=@Move_8OrMore_IA32;
if fpc_cpucodeinit_performed then
fastmoveproc:=result; { cache so subsequent Moves jump straight to the implementation }
end;
procedure Move_8OrMore_Dispatch; assembler; nostackframe;
{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8).
  Trampoline: saves the Move arguments, asks the Pascal dispatcher which
  implementation to use, then tail-jumps to it with the arguments restored. }
asm
push %eax { the Pascal call below may clobber the argument registers }
push %edx
push %ecx
call Move_8OrMore_HumanFriendlyDispatch
mov %eax, %ebx { ebx = chosen routine; safe to clobber — Move pushed the caller's ebx, and the target pops it }
pop %ecx
pop %edx
pop %eax
jmp %ebx
end;
procedure Move(const source;var dest;count:SizeInt); [public, alias: 'FPC_MOVE']; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count.
  Counts <= 8 are handled inline with overlap-safe load-all-then-store
  sequences; larger counts go through fastmoveproc (dispatched per CPU). }
asm
push %ebx { ebx is preserved across Move; every fastmoveproc target pops it }
cmp $8, %ecx
jle .L8OrLess
{$ifdef FPC_PIC}
call fpc_geteipasebx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
movl fastmoveproc@GOT(%ebx), %ebx
jmp (%ebx)
{$else}
jmp fastmoveproc
{$endif}
.L8OrLess:
cmp $3, %ecx
jle .L3OrLess
mov (%eax), %ebx { 4..8 bytes: two possibly overlapping dword moves }
mov -4(%eax,%ecx), %eax { both loaded before any store, so overlap is safe }
mov %ebx, (%edx)
mov %eax, -4(%edx,%ecx)
pop %ebx
ret
.L3OrLess:
cmp $1, %ecx
jl .LZero { signed check: count <= 0 (incl. negative) does nothing }
movzbl (%eax), %ebx { first byte, loaded before any store }
je .LOne { count = 1 }
movzwl -2(%eax,%ecx), %eax { last two bytes cover counts 2 and 3 }
mov %ax, -2(%edx,%ecx)
.LOne:
mov %bl, (%edx)
.LZero:
pop %ebx
end;
{$endif FPC_SYSTEM_HAS_MOVE}