{ fpc/rtl/i386/fastmove.inc: optimized Move (memmove) implementations for i386. }
{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}
{ At least Valgrind up to 3.3 has a bug which prevents the default code from
working, so we use a rather simple implementation here. }
procedure Move_8OrMore_Valgrind; assembler; nostackframe;
{ Simple byte-wise Move used when running under Valgrind.
eax = source, edx = dest, ecx = count (ecx >= 8).
If FPC_PIC: ebx pushed (restored before returning on every path). }
asm
sub %eax, %edx { edx = dest - src; dest itself is re-derived below as (edx,eax) }
cmp %edx, %ecx
ja .LBack { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
{ Forward copy with rep movsb (requires DF=0). }
{$ifdef FPC_ENABLED_CLD}
cld
{$endif FPC_ENABLED_CLD}
push %esi
push %edi
mov %eax, %esi { esi = source }
lea (%edx,%eax), %edi { edi = (dest - src) + src = dest }
rep movsb
pop %edi
pop %esi
{$ifdef FPC_PIC}
pop %ebx
{$endif}
ret
{ Backward (descending) byte copy for overlapping regions with dest > src. }
.LBack:
{$ifndef FPC_PIC}
push %ebx { bl is used as the transfer register below }
{$endif}
add %ecx, %eax { eax = one past the end of source }
.LNextb:
dec %eax
mov (%eax), %bl
mov %bl, (%edx,%eax) { (dest - src) + eax = matching dest byte }
dec %ecx
jnz .LNextb
pop %ebx
end;
{$if not defined(CPUX86_HAS_SSEUNIT) or defined(FASTMOVE_DISABLE_SSE)}
{$define fastmove_has_ia32_and_mmx}
procedure Move_8OrMore_IA32; assembler; nostackframe;
{ x87-based Move for CPUs without SSE: 8 bytes at a time via fildq/fistpq
(integer load/store, so the data passes through the FPU unmodified).
eax = source, edx = dest, ecx = count (ecx >= 8).
If FPC_PIC: ebx pushed. }
asm
fildq (%eax) { First and last 8 bytes, used both in .L33OrMore and ladder ending (.L9to16). }
fildq -8(%eax,%ecx)
cmp $16, %ecx
jle .L9to16
cmp $32, %ecx
jg .L33OrMore
fildq 8(%eax) { 17..32 bytes: two more (possibly overlapping) middle chunks. }
fildq -16(%eax,%ecx)
fistpq -16(%edx,%ecx)
fistpq 8(%edx)
.L9to16:
fistpq -8(%edx,%ecx) { 9..16 bytes }
fistpq (%edx)
{$ifdef FPC_PIC}
pop %ebx
{$endif}
ret
.Lcancel: { src = dest: nothing to copy; just clean up the x87 stack. }
fstp %st(0) { Pop the "second int64 from the end" .L33OrMore loads. }
fucompp { Pop two elements loaded at the beginning. }
pop %ebx
ret
.byte 102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
.L33OrMore:
fildq -16(%eax,%ecx) { Second int64 from the end. }
{$ifndef FPC_PIC}
push %ebx
{$endif}
sub %edx, %eax { eax = src - dest }
jz .Lcancel { exit if src=dest }
mov %eax, %ebx
neg %ebx
cmp %ebx, %ecx
ja .Lback { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
mov %edx, %ebx { remember original dest to write first 8 bytes }
add %edx, %ecx { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
add $8, %edx
and $-8, %edx
sub %edx, %ecx
sub $16, %ecx
jbe .LPost16f
.balign 16 { no-op }
.Lloop16f: { Forward loop: 16 bytes per iteration to the 8-byte-aligned dest. }
fildq (%eax,%edx) { eax = src - dest, so (eax,edx) addresses the source. }
fistpq (%edx)
fildq 8(%eax,%edx)
fistpq 8(%edx)
add $16, %edx
sub $16, %ecx
ja .Lloop16f
.LPost16f: { +16 fixup not applied after the 16-bytes-per-iteration loop, ecx = remaining - 16 here. }
fistpq (%edx,%ecx)
fistpq 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
fistpq (%ebx) { Important for <8-byte step between src and dest. }
pop %ebx
ret
.byte 102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
fstp %st(0) { Drop the "second int64 from the end"; the backward path uses the second from the start instead. }
fildq 8(%eax,%edx) { Second int64 from the start. }
lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
and $-8, %ecx
sub %edx, %ecx { ...ecx = length of the aligned middle part... }
add %ecx, %edx { ...edx = aligned upper end of the region still to copy. }
sub $16, %ecx
jbe .LPost16b
.balign 16 { no-op }
.Lloop16b: { Backward loop: 16 bytes per iteration, walking down. }
sub $16, %edx
fildq 8(%eax,%edx)
fistpq 8(%edx)
fildq (%eax,%edx)
fistpq (%edx)
sub $16, %ecx
ja .Lloop16b
.LPost16b: { ecx <= 0 here; back out the remainder so edx - 16 = dest. }
sub %ecx, %edx
fistpq -8(%edx) { Second int64 from the start. }
fistpq -7(%ebx) { Last 8 bytes (ebx = end of dest - 1). }
fistpq -16(%edx) { First 8 bytes. }
pop %ebx
end;
procedure Move_8OrMore_MMX; assembler; nostackframe;
{ MMX-based Move for CPUs without SSE: 8-byte movq transfers, same structure
as Move_8OrMore_IA32 but keeping the first/last/second-from-end chunks in
MMX registers instead of the x87 stack.
eax = source, edx = dest, ecx = count (ecx >= 8).
If FPC_PIC: ebx pushed. }
asm
cmp $72, %ecx { Size at which using MMX becomes worthwhile. }
jl Move_8OrMore_IA32
{$ifndef FPC_PIC}
push %ebx
{$endif}
movq (%eax), %mm4 { First and last 8 bytes. }
movq -8(%eax,%ecx), %mm5
movq -16(%eax,%ecx), %mm3 { Second vector from the end. }
sub %edx, %eax { eax = src - dest }
jz .Lquit { exit if src=dest }
mov %eax, %ebx
neg %ebx
cmp %ebx, %ecx
ja .Lback { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
mov %edx, %ebx { remember original dest to write first 8 bytes }
add %edx, %ecx { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
add $8, %edx
and $-8, %edx
sub %edx, %ecx
sub $16, %ecx
jbe .LPost16f
.balign 16
.Lloop16f: { Forward loop: 16 bytes per iteration to the 8-byte-aligned dest. }
movq (%eax,%edx), %mm0 { eax = src - dest, so (eax,edx) addresses the source. }
movq %mm0, (%edx)
movq 8(%eax,%edx), %mm0
movq %mm0, 8(%edx)
add $16, %edx
sub $16, %ecx
ja .Lloop16f
.LPost16f: { +16 fixup not applied after the 16-bytes-per-iteration loop, ecx = remaining - 16 here. }
movq %mm3, (%edx,%ecx)
movq %mm5, 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
movq %mm4, (%ebx) { Important for <8-byte step between src and dest. }
.Lquit:
emms { Leave the FPU usable for x87 code again. }
pop %ebx
ret
.byte 144 { Turns .balign 16 before .Lloop16b into a no-op. }
{ backwards move }
.Lback:
movq 8(%eax,%edx), %mm3 { Second vector from the start. }
lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
and $-8, %ecx
sub %edx, %ecx
add %ecx, %edx
sub $16, %ecx
jbe .LPost16b
.balign 16 { no-op }
.Lloop16b: { Backward loop: 16 bytes per iteration, walking down. }
sub $16, %edx
movq 8(%eax,%edx), %mm0
movq %mm0, 8(%edx)
movq (%eax,%edx), %mm0
movq %mm0, (%edx)
sub $16, %ecx
ja .Lloop16b
.LPost16b: { ecx <= 0 here; back out the remainder so edx - 16 = dest. }
sub %ecx, %edx
movq %mm3, -8(%edx) { Second 8 bytes from the start. }
movq %mm4, -16(%edx) { First 8 bytes. }
movq %mm5, -7(%ebx) { Last 8 bytes (ebx = end of dest - 1). }
emms
pop %ebx
end;
{$endif need IA32 and MMX versions}
{$ifndef FASTMOVE_DISABLE_SSE}
label
{ Entry points of Move_8OrMore_SSE that Move_8OrMore_SSE_ERMS jumps into. }
Move_8OrMore_SSE_9to15, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;
const
{ Byte count from which non-temporal stores are considered. }
Move_NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
procedure Move_8OrMore_SSE; assembler; nostackframe;
{ SSE Move: 16-byte movups/movaps copies, switching to non-temporal movntps
stores (plus prefetchnta) for very large blocks to avoid evicting the cache.
eax = source, edx = dest, ecx = count (ecx >= 8).
If FPC_PIC: ebx pushed. }
const
PrefetchDistance = 512;
asm
cmp $15, %ecx
jle Move_8OrMore_SSE_9to15
movups (%eax), %xmm4 { First and last 16 bytes, used both in Move_8OrMore_SSE_33OrMore and the 17..32 byte branch. }
movups -16(%eax,%ecx), %xmm5
cmp $32, %ecx
jg Move_8OrMore_SSE_33OrMore
movups %xmm4, (%edx) { 16..32 bytes }
movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
pop %ebx
{$endif}
ret
Move_8OrMore_SSE_9to15: { 9..15 bytes: two possibly overlapping 8-byte moves. }
movlps (%eax), %xmm0
movlps -8(%eax,%ecx), %xmm1
movlps %xmm0, (%edx)
movlps %xmm1, -8(%edx,%ecx)
.Lquit:
{$ifdef FPC_PIC}
pop %ebx
{$endif}
ret
.byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
Move_8OrMore_SSE_33OrMore:
movups -32(%eax,%ecx), %xmm3 { Second vector from the end. Wasted read if .Lback branch is taken (it uses second vector from the start instead), }
{ but -32(%eax,%ecx) is about to become not accessible so easily, .Lback is rare, and small .Lback is even rarer / matters even less. }
sub %edx, %eax { eax = src - dest }
jz .Lquit { exit if src=dest }
{$ifndef FPC_PIC}
push %ebx
{$endif}
mov %eax, %ebx
neg %ebx
cmp %ebx, %ecx
ja .Lback { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
mov %edx, %ebx { remember original dest to write first 16 bytes }
add %edx, %ecx { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be written separately anyway. }
add $16, %edx
and $-16, %edx
sub %edx, %ecx
.LRestAfterNTf: { ecx = bytes remaining past the aligned prologue (re-entered after the NT loops). }
sub $32, %ecx { During the loop, ecx is 32 bytes less than actually remained to allow sub 32+jae .LLoop instead of sub 32+cmp 32+jae .LLoop. }
jbe .LPost32f
cmp $Move_NtThreshold-32, %ecx
jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
.LNtIsNotBetterF:
test $15, %eax
jz .Lalignedloop32f { src - dest is a multiple of 16 => aligned dest implies aligned src: movaps loads are safe. }
.balign 16 { no-op }
.Lloop32f: { Forward loop: 32 bytes per iteration, aligned stores, unaligned loads. }
movups (%eax,%edx), %xmm0 { eax = src - dest, so (eax,edx) addresses the source. }
movaps %xmm0, (%edx)
movups 16(%eax,%edx), %xmm0
movaps %xmm0, 16(%edx)
add $32, %edx
sub $32, %ecx
ja .Lloop32f
.LPost32f: { +32 fixup not applied after the loop, ecx = remaining - 32 here. }
movups %xmm3, (%edx, %ecx)
movups %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
movups %xmm4, (%ebx) { Important for <16-byte step between src and dest. }
pop %ebx
ret
.balign 16
.Lalignedloop32f: { Same as above starting from .Lloop32f but with MOVAPSes. }
movaps (%eax,%edx), %xmm0
movaps %xmm0, (%edx)
movaps 16(%eax,%edx), %xmm0
movaps %xmm0, 16(%edx)
add $32, %edx
sub $32, %ecx
ja .Lalignedloop32f
.LalignedPost32f:
movups %xmm3, (%edx, %ecx)
movups %xmm5, 16(%edx,%ecx)
movups %xmm4, (%ebx)
pop %ebx
ret
.Lntf: { Non-temporal forward path for huge counts. }
cmp $Move_NtThreshold, %eax { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
jb .LNtIsNotBetterF { (this check is performed here to not stand in the way of smaller counts) }
sub $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
test $15, %eax
jz .Lalignedntloop64f
.balign 16
.Lntloop64f: { 64 bytes per iteration, streaming stores, prefetching ahead. }
prefetchnta 0+PrefetchDistance(%eax,%edx,1)
movups (%eax,%edx,1), %xmm0
movntps %xmm0, (%edx)
movups 16(%eax,%edx,1), %xmm0
movntps %xmm0, 16(%edx)
movups 32(%eax,%edx,1), %xmm0
movntps %xmm0, 32(%edx)
movups 48(%eax,%edx,1), %xmm0
movntps %xmm0, 48(%edx)
add $64, %edx
sub $64, %ecx
jae .Lntloop64f
sfence { Order the non-temporal stores before anything that follows. }
add $PrefetchDistance+64, %ecx
jmp .LRestAfterNTf { go handle remaining bytes }
.balign 16
.Lalignedntloop64f: { Same as above starting from .Lntloop64f but with MOVAPSes. }
prefetchnta 0+PrefetchDistance(%eax,%edx,1)
movaps (%eax,%edx,1), %xmm0
movntps %xmm0, (%edx)
movaps 16(%eax,%edx,1), %xmm0
movntps %xmm0, 16(%edx)
movaps 32(%eax,%edx,1), %xmm0
movntps %xmm0, 32(%edx)
movaps 48(%eax,%edx,1), %xmm0
movntps %xmm0, 48(%edx)
add $64, %edx
sub $64, %ecx
jae .Lalignedntloop64f
sfence
add $PrefetchDistance+64, %ecx
jmp .LRestAfterNTf
.byte {$ifdef FPC_PIC}102,{$endif}102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
Move_8OrMore_SSE_CancelERMSBackwards:
{ Adapt from Move_8OrMore_SSE_ERMS.Lback where eax = src, edx = dest - src, xmm4 and xmm5 aren't read, ebx isn't pushed if not FPC_PIC. }
{$ifndef FPC_PIC}
push %ebx
{$endif}
add %eax, %edx { Recover edx = dest. }
movups (%eax), %xmm4
movups -16(%eax,%ecx), %xmm5
sub %edx, %eax { eax = src - dest, as .Lback expects. }
{ backwards move }
.Lback:
movups 16(%eax,%edx), %xmm3 { Second vector from the start. }
lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 16 bytes }
mov %ebx, %ecx { move dest to the previous 16-byte boundary... }
and $-16, %ecx
sub %edx, %ecx
add %ecx, %edx
.LRestAfterNTb:
sub $32, %ecx { Same -32 bias as the forward path. }
jbe .LPost32b
cmp $Move_NtThreshold-32, %ecx
jae .Lntb
.balign 16 { no-op }
.Lloop32b: { Backward loop: 32 bytes per iteration, walking down. }
sub $32, %edx
movups 16(%eax,%edx), %xmm0
movaps %xmm0, 16(%edx)
movups (%eax,%edx), %xmm0
movaps %xmm0, (%edx)
sub $32, %ecx
ja .Lloop32b
.LPost32b: { ecx <= 0 here; back out the remainder so edx - 32 = dest. }
sub %ecx, %edx
movups %xmm3, -16(%edx) { Second 16 bytes from the start. }
movups %xmm4, -32(%edx) { First 16 bytes. }
movups %xmm5, -15(%ebx) { Last 16 bytes (ebx = end of dest - 1). }
pop %ebx
ret
.Lntb: { Non-temporal backward path. }
cmp $-Move_NtThreshold, %eax { eax = src - dest < 0 here; NT pays off only when dest - src >= NtThreshold, i.e. eax <= -NtThreshold (unsigned compare). }
ja .Lloop32b
sub $PrefetchDistance+32, %ecx
.balign 16
.Lntloop64b:
prefetchnta -PrefetchDistance(%eax,%edx,1)
sub $64, %edx
movups 48(%eax,%edx,1), %xmm0
movntps %xmm0, 48(%edx)
movups 32(%eax,%edx,1), %xmm0
movntps %xmm0, 32(%edx)
movups 16(%eax,%edx,1), %xmm0
movntps %xmm0, 16(%edx)
movups (%eax,%edx,1), %xmm0
movntps %xmm0, (%edx)
sub $64, %ecx
jae .Lntloop64b
sfence
add $PrefetchDistance+64, %ecx
jmp .LRestAfterNTb
end;
procedure Move_8OrMore_SSE_ERMS; assembler; nostackframe;
{ Move for CPUs with Enhanced REP MOVSB: uses rep movsb for medium and large
copies, delegating to Move_8OrMore_SSE for small counts and for huge counts
where its non-temporal path wins.
eax = source, edx = dest, ecx = count (ecx >= 8).
If FPC_PIC: ebx pushed. }
const
ErmsThreshold = 1536; { Counts below this take the plain SSE path; rep movsb from here up. }
asm
cmp $15, %ecx
jle Move_8OrMore_SSE_9to15
cmp $ErmsThreshold, %ecx
jae .LRepMovs
movups (%eax), %xmm4 { Same as in Move_8OrMore_SSE. }
movups -16(%eax,%ecx), %xmm5
cmp $32, %ecx
jg Move_8OrMore_SSE_33OrMore
movups %xmm4, (%edx) { 16..32 bytes }
movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
pop %ebx
{$endif}
ret
.LRepMovs:
sub %eax, %edx { edx = dest - src }
jz .Lquit { exit if src=dest }
cmp %edx, %ecx { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
ja .Lback
cmp $Move_NtThreshold+16, %ecx
jae .LNtF { Even enhanced REP MOV does not seem to use NT so falls behind on huge moves. So prioritize NT done by Move_8OrMore_SSE. }
.LNtIsNotBetterF:
push %esi
push %edi
mov %eax, %esi { esi = source }
lea (%edx,%eax), %edi { edi = (dest - src) + src = dest }
rep movsb
pop %edi
pop %esi
.Lquit:
{$ifdef FPC_PIC}
pop %ebx
{$endif}
ret
.LNtF:
cmp $-Move_NtThreshold, %edx { Check move distance. Bad case for forward NT is 0 < src - dest < NtThreshold => unsigned(dest - src) > unsigned(-NtThreshold). }
ja .LNtIsNotBetterF { NT is not better and Move_8OrMore_SSE won't perform it either. Use REP MOVSB. }
add %eax, %edx { Recover edx = dest. }
jmp Move_8OrMore_SSE { Will perform NT. }
.Lback:
{ dst = 3
v
Move(abcdefghijXXX, count=10)
^
src = 0
= abcABCDEFGHIJ
can be moved right to left in non-overlapping groups of "dst - src":
abcdefghijHIJ
^^^
abcdefgEFGhij
^^^
abcdBCDefghij
^^^
abcAbcdefghij <- tail is handled by restarting the Move with corresponding count instead, as it can have 0 to dst - src - 1 bytes.
^
Only REP MOVs with DF=0 are fast with ERMS, in case you're wondering why not just use DF=1. }
cmp $ErmsThreshold, %edx
jb Move_8OrMore_SSE_CancelERMSBackwards { Unfortunately this branch can't benefit the common case of small distance (like inserting 1 array element into the beginning). :( }
cmp $Move_NtThreshold+16, %ecx
jae .LNtB
.LNtIsNotBetterB:
{$ifndef FPC_PIC}
push %ebx
{$endif}
mov %ecx, %ebx { ebx = remaining }
sub %edx, %ebx { edx = dst - src = step; remaining -= step. }
add %ecx, %eax { eax = one past the end of source }
push %esi
push %edi
.LRepMovsNextPieceB: { At least 1 iteration is always performed. }
mov %eax, %edi { edi = src before subtracting step = dst = rep movsb dest }
sub %edx, %eax { src -= step }
mov %eax, %esi { esi = src = rep movsb source }
mov %edx, %ecx { ecx = step = rep movsb count }
rep movsb
sub %edx, %ebx { remaining -= step }
jnc .LRepMovsNextPieceB { CF=1 after remaining -= step means that remaining became strictly negative and the loop must be stopped/not performed. }
pop %edi
pop %esi
lea (%edx,%ebx), %ecx { ecx = remaining }
sub %ecx, %eax { eax = src }
add %eax, %edx { edx = dest }
pop %ebx
jmp Move { Remaining piece ("a" in the example above). Can save a bit of jumps by doing more checks and jumping to more specific places, but whatever. }
.LNtB:
cmp $Move_NtThreshold, %edx { Check move distance. Bad case for backward NT is dest - src < NtThreshold; src is always < dest. }
jb .LNtIsNotBetterB { NT is not better and Move_8OrMore_SSE won't perform it either. Use REP MOVSB. }
add %eax, %edx { Recover edx = dest. }
jmp Move_8OrMore_SSE { Will perform NT. }
end;
{$endif ndef FASTMOVE_DISABLE_SSE}
procedure Move_8OrMore_Dispatch; forward;
var
{ Pointer to the Move implementation used for counts > 8. Starts at the
dispatcher, which replaces it with the best CPU-specific routine once
CPU detection has run (see Move_8OrMore_HumanFriendlyDispatch). }
fastmoveproc : pointer = @Move_8OrMore_Dispatch;
{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
{ Set by the RTL startup code when running under Valgrind. }
valgrind_used : boolean;external name '__fpc_valgrind';
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
function Move_8OrMore_HumanFriendlyDispatch: pointer;
{ Picks the best Move implementation for the current environment, caches it
in fastmoveproc once CPU feature detection is complete, and returns it so
the caller can invoke it immediately. Priority: Valgrind workaround, then
ERMS (rep movsb), then SSE, then MMX, then plain IA-32. }
begin
{ workaround valgrind bug }
{$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
if EntryInformation.valgrind_used then
{$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
if valgrind_used then
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
result:=@Move_8OrMore_Valgrind
{$ifndef FASTMOVE_DISABLE_SSE}
else if fast_large_repmovstosb then
result:=@Move_8OrMore_SSE_ERMS
else {$ifdef fastmove_has_ia32_and_mmx} if has_sse_support then {$endif}
result:=@Move_8OrMore_SSE
{$endif ndef FASTMOVE_DISABLE_SSE}
{$ifdef fastmove_has_ia32_and_mmx}
else if has_mmx_support then
result:=@Move_8OrMore_MMX
else
result:=@Move_8OrMore_IA32
{$endif fastmove_has_ia32_and_mmx};
{ Cache the choice only after fpc_cpucodeinit has run; until then keep
dispatching on every call so a final decision is not made on incomplete
feature information. }
if fpc_cpucodeinit_performed then
fastmoveproc:=result;
end;
procedure Move_8OrMore_Dispatch; assembler; nostackframe;
{ Initial target of fastmoveproc: selects an implementation via
Move_8OrMore_HumanFriendlyDispatch (preserving the Move arguments around
the Pascal call), then transfers control to it.
eax = source, edx = dest, ecx = count (ecx >= 8).
If FPC_PIC: ebx pushed. }
asm
{$ifndef FPC_PIC}
push %ebx { ebx is clobbered below; in PIC mode Move already pushed it. }
{$endif}
push %eax { Preserve Move's register arguments across the Pascal call. }
push %edx
push %ecx
call Move_8OrMore_HumanFriendlyDispatch
mov %eax, %ebx { ebx = chosen implementation }
pop %ecx
pop %edx
pop %eax
{$ifdef FPC_PIC}
jmp %ebx { Tail-jump: the target pops the ebx that Move pushed. }
{$else}
call %ebx { Non-PIC targets save/restore ebx themselves... }
pop %ebx { ...so restore our own saved ebx afterwards. }
{$endif}
end;
procedure Move(const source;var dest;count:SizeInt); [public, alias: 'FPC_MOVE']; assembler; nostackframe;
{ Overlap-safe memory move (memmove semantics).
eax = source, edx = dest, ecx = count.
Counts > 8 are dispatched through fastmoveproc to the best CPU-specific
implementation; counts 1..8 are handled inline with a pair of possibly
overlapping loads followed by the stores (loads complete before any store,
so overlapping regions are copied correctly). count <= 0 does nothing. }
asm
cmp $8, %ecx
jle .L8OrLess
{$ifdef FPC_PIC}
push %ebx { Per the Move_8OrMore_* contract: ebx is pushed when FPC_PIC. }
call fpc_geteipasebx
addl $_GLOBAL_OFFSET_TABLE_, %ebx
movl fastmoveproc@GOT(%ebx), %ebx
jmp (%ebx)
{$else}
jmp fastmoveproc
{$endif}
.L8OrLess: { 4..8 bytes: two possibly overlapping dwords. }
cmp $3, %ecx
jle .L3OrLess
push %ebx
mov (%eax), %ebx { Both loads first... }
mov -4(%eax,%ecx), %eax
mov %ebx, (%edx) { ...then both stores, for overlap safety. }
mov %eax, -4(%edx,%ecx)
pop %ebx
ret
.L3OrLess:
cmp $1, %ecx
jl .LZero { count <= 0: nothing to do. }
push %ebx
movzbl (%eax), %ebx { First byte; mov does not change the flags from "cmp $1". }
je .LOne { count = 1: store the single byte only. }
movzwl -2(%eax,%ecx), %eax { 2..3 bytes: last two bytes (may overlap the first). }
mov %ax, -2(%edx,%ecx)
.LOne:
mov %bl, (%edx)
pop %ebx
.LZero:
end;
{$endif FPC_SYSTEM_HAS_MOVE}