{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}

{ At least valgrind up to 3.3 has a bug which prevents the default code from working, so we use a rather simple implementation here. }
procedure Move_8OrMore_Valgrind; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
asm
    sub %eax, %edx { edx = dest - src }
    cmp %edx, %ecx
    ja .LBack { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    push %esi
    push %edi
    mov %eax, %esi
    lea (%edx,%eax), %edi
    rep movsb
    pop %edi
    pop %esi
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret

.LBack:
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    add %ecx, %eax
.LNextb:
    dec %eax
    mov (%eax), %bl
    mov %bl, (%edx,%eax)
    dec %ecx
    jnz .LNextb
    pop %ebx
end;

{$if not defined(CPUX86_HAS_SSEUNIT) or defined(FASTMOVE_DISABLE_SSE)}
{$define fastmove_has_ia32_and_mmx}
procedure Move_8OrMore_IA32; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
asm
    fildq (%eax) { First and last 8 bytes, used both in .L33OrMore and ladder ending (.L9to16). }
    fildq -8(%eax,%ecx)
    cmp $16, %ecx
    jle .L9to16
    cmp $32, %ecx
    jg .L33OrMore
    fildq 8(%eax)
    fildq -16(%eax,%ecx)
    fistpq -16(%edx,%ecx)
    fistpq 8(%edx)
.L9to16:
    fistpq -8(%edx,%ecx) { 9–16 bytes }
    fistpq (%edx)
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret

.Lcancel:
    fstp %st(0) { Pop the "second int64 from the end" .L33OrMore loads. }
    fucompp { Pop two elements loaded at the beginning. }
    pop %ebx
    ret

    .byte 102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
.L33OrMore:
    fildq -16(%eax,%ecx) { Second int64 from the end. }
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    sub %edx, %eax { eax = src - dest }
    jz .Lcancel { exit if src=dest }
    mov %eax, %ebx
    neg %ebx
    cmp %ebx, %ecx
    ja .Lback { count (ecx) > unsigned(dest - src) (edx) if regions overlap }

    mov %edx, %ebx { remember original dest to write first 8 bytes }
    add %edx, %ecx { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
    add $8, %edx
    and $-8, %edx
    sub %edx, %ecx
    sub $16, %ecx
    jbe .LPost16f

    .balign 16 { no-op }
.Lloop16f:
    fildq (%eax,%edx)
    fistpq (%edx)
    fildq 8(%eax,%edx)
    fistpq 8(%edx)
    add $16, %edx
    sub $16, %ecx
    ja .Lloop16f

.LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
    fistpq (%edx,%ecx)
    fistpq 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
    fistpq (%ebx) { Important for <8-byte step between src and dest. }
    pop %ebx
    ret

    .byte 102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }

{ backwards move }
.Lback:
    fstp %st(0)
    fildq 8(%eax,%edx) { Second int64 from the start. }
    lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
    mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
    and $-8, %ecx
    sub %edx, %ecx
    add %ecx, %edx
    sub $16, %ecx
    jbe .LPost16b

    .balign 16 { no-op }
.Lloop16b:
    sub $16, %edx
    fildq 8(%eax,%edx)
    fistpq 8(%edx)
    fildq (%eax,%edx)
    fistpq (%edx)
    sub $16, %ecx
    ja .Lloop16b

.LPost16b:
    sub %ecx, %edx
    fistpq -8(%edx)
    fistpq -7(%ebx)
    fistpq -16(%edx)
    pop %ebx
end;

procedure Move_8OrMore_MMX; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
asm
    cmp $72, %ecx { Size at which using MMX becomes worthwhile. }
    jl Move_8OrMore_IA32
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    movq (%eax), %mm4 { First and last 8 bytes. }
    movq -8(%eax,%ecx), %mm5
    movq -16(%eax,%ecx), %mm3 { Second vector from the end. }
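    { The rest mirrors Move_8OrMore_IA32 above, with MMX registers instead of the x87 stack:
      the main loop copies 16 bytes per iteration through mm0, while mm4 (first 8 bytes),
      mm5 (last 8 bytes) and mm3 (second quadword from the end; reloaded as second-from-start
      on the backwards path) are stored only after the loop to cover the misaligned head and tail. }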
    sub %edx, %eax { eax = src - dest }
    jz .Lquit { exit if src=dest }
    mov %eax, %ebx
    neg %ebx
    cmp %ebx, %ecx
    ja .Lback { count (ecx) > unsigned(dest - src) (edx) if regions overlap }

    mov %edx, %ebx { remember original dest to write first 8 bytes }
    add %edx, %ecx { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
    add $8, %edx
    and $-8, %edx
    sub %edx, %ecx
    sub $16, %ecx
    jbe .LPost16f

    .balign 16
.Lloop16f:
    movq (%eax,%edx), %mm0
    movq %mm0, (%edx)
    movq 8(%eax,%edx), %mm0
    movq %mm0, 8(%edx)
    add $16, %edx
    sub $16, %ecx
    ja .Lloop16f

.LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
    movq %mm3, (%edx,%ecx)
    movq %mm5, 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
    movq %mm4, (%ebx) { Important for <8-byte step between src and dest. }
.Lquit:
    emms
    pop %ebx
    ret

    .byte 144 { Turns .balign 16 before .Lloop16b into a no-op. }

{ backwards move }
.Lback:
    movq 8(%eax,%edx), %mm3 { Second vector from the start. }
    lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
    mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
    and $-8, %ecx
    sub %edx, %ecx
    add %ecx, %edx
    sub $16, %ecx
    jbe .LPost16b

    .balign 16 { no-op }
.Lloop16b:
    sub $16, %edx
    movq 8(%eax,%edx), %mm0
    movq %mm0, 8(%edx)
    movq (%eax,%edx), %mm0
    movq %mm0, (%edx)
    sub $16, %ecx
    ja .Lloop16b

.LPost16b:
    sub %ecx, %edx
    movq %mm3, -8(%edx)
    movq %mm4, -16(%edx)
    movq %mm5, -7(%ebx)
    emms
    pop %ebx
end;
{$endif need IA32 and MMX versions}

{$ifndef FASTMOVE_DISABLE_SSE}
label
  Move_8OrMore_SSE_9to15, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;

const
  Move_NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }

procedure Move_8OrMore_SSE; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
const
  PrefetchDistance = 512;
asm
    cmp $15, %ecx
    jle Move_8OrMore_SSE_9to15
    movups (%eax), %xmm4 { First and last 16 bytes, used both in Move_8OrMore_SSE_33OrMore and the 16–32 branch. }
    movups -16(%eax,%ecx), %xmm5
    cmp $32, %ecx
    jg Move_8OrMore_SSE_33OrMore
    movups %xmm4, (%edx) { 16–32 bytes }
    movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret

Move_8OrMore_SSE_9to15:
    movlps (%eax), %xmm0
    movlps -8(%eax,%ecx), %xmm1
    movlps %xmm0, (%edx)
    movlps %xmm1, -8(%edx,%ecx)
.Lquit:
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret

    .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
Move_8OrMore_SSE_33OrMore:
    movups -32(%eax,%ecx), %xmm3 { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
                                 { but -32(%eax,%ecx) is about to become not so easily accessible, .Lback is rare, and small .Lback is even rarer / matters even less. }
    sub %edx, %eax { eax = src - dest }
    jz .Lquit { exit if src=dest }
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    mov %eax, %ebx
    neg %ebx
    cmp %ebx, %ecx
    ja .Lback { count (ecx) > unsigned(dest - src) (edx) if regions overlap }

    mov %edx, %ebx { remember original dest to write first 16 bytes }
    add %edx, %ecx { Move dest to the next 16-byte boundary. +16 if already aligned, as the first 16 bytes will be written separately anyway. }
    add $16, %edx
    and $-16, %edx
    sub %edx, %ecx

.LRestAfterNTf:
    sub $32, %ecx { During the N× loop, ecx is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
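    { Whatever the loops below leave over (1..32 bytes) is covered by the overlapping stores of
      xmm3 and xmm5 at .LPost32f, and the possibly misaligned first 16 bytes by the deferred
      store of xmm4, so no byte-granular tail handling is needed. }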
    jbe .LPost32f
    cmp $Move_NtThreshold-32, %ecx
    jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }

.LNtIsNotBetterF:
    test $15, %eax
    jz .Lalignedloop32f

    .balign 16 { no-op }
.Lloop32f:
    movups (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
    movups 16(%eax,%edx), %xmm0
    movaps %xmm0, 16(%edx)
    add $32, %edx
    sub $32, %ecx
    ja .Lloop32f

.LPost32f: { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
    movups %xmm3, (%edx,%ecx)
    movups %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
    movups %xmm4, (%ebx) { Important for <16-byte step between src and dest. }
    pop %ebx
    ret

    .balign 16
.Lalignedloop32f: { Same as above starting from .Lloop32f but with MOVAPSes. }
    movaps (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
    movaps 16(%eax,%edx), %xmm0
    movaps %xmm0, 16(%edx)
    add $32, %edx
    sub $32, %ecx
    ja .Lalignedloop32f

.LalignedPost32f:
    movups %xmm3, (%edx,%ecx)
    movups %xmm5, 16(%edx,%ecx)
    movups %xmm4, (%ebx)
    pop %ebx
    ret

.Lntf:
    cmp $Move_NtThreshold, %eax { Maybe change mind: don't bother bypassing the cache if src and dest are close to each other }
    jb .LNtIsNotBetterF { (this check is performed here to not stand in the way of smaller counts) }
    sub $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
    test $15, %eax
    jz .Lalignedntloop64f

    .balign 16
.Lntloop64f:
    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
    movups (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    movups 16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movups 32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movups 48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    add $64, %edx
    sub $64, %ecx
    jae .Lntloop64f

    sfence
    add $PrefetchDistance+64, %ecx
    jmp .LRestAfterNTf { go handle remaining bytes }

    .balign 16
.Lalignedntloop64f: { Same as above starting from .Lntloop64f but with MOVAPSes. }
    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
    movaps (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    movaps 16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movaps 32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movaps 48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    add $64, %edx
    sub $64, %ecx
    jae .Lalignedntloop64f

    sfence
    add $PrefetchDistance+64, %ecx
    jmp .LRestAfterNTf

    .byte {$ifdef FPC_PIC}102,{$endif}102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
Move_8OrMore_SSE_CancelERMSBackwards:
    { Adapt from Move_8OrMore_SSE_ERMS.Lback, where eax = src, edx = dest - src, xmm4 and xmm5 aren't read, and ebx isn't pushed if not FPC_PIC. }
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    add %eax, %edx
    movups (%eax), %xmm4
    movups -16(%eax,%ecx), %xmm5
    sub %edx, %eax

{ backwards move }
.Lback:
    movups 16(%eax,%edx), %xmm3 { Second vector from the start. }
    lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 16 bytes }
    mov %ebx, %ecx { move dest to the previous 16-byte boundary... }
    and $-16, %ecx
    sub %edx, %ecx
    add %ecx, %edx

.LRestAfterNTb:
    sub $32, %ecx
    jbe .LPost32b
    cmp $Move_NtThreshold-32, %ecx
    jae .Lntb

    .balign 16 { no-op }
.Lloop32b:
    sub $32, %edx
    movups 16(%eax,%edx), %xmm0
    movaps %xmm0, 16(%edx)
    movups (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
    sub $32, %ecx
    ja .Lloop32b

.LPost32b:
    sub %ecx, %edx
    movups %xmm3, -16(%edx)
    movups %xmm4, -32(%edx)
    movups %xmm5, -15(%ebx)
    pop %ebx
    ret

.Lntb:
    cmp $-Move_NtThreshold, %eax { Maybe change mind: don't bother bypassing the cache if src and dest are close to each other. }
    ja .Lloop32b
    sub $PrefetchDistance+32, %ecx

    .balign 16
.Lntloop64b:
    prefetchnta -PrefetchDistance(%eax,%edx,1)
    sub $64, %edx
    movups 48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    movups 32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movups 16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movups (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    sub $64, %ecx
    jae .Lntloop64b

    sfence
    add $PrefetchDistance+64, %ecx
    jmp .LRestAfterNTb
end;

procedure Move_8OrMore_SSE_ERMS; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
const
  ErmsThreshold = 1536;
asm
    cmp $15, %ecx
    jle Move_8OrMore_SSE_9to15
    cmp $ErmsThreshold, %ecx
    jae .LRepMovs
    movups (%eax), %xmm4 { Same as in Move_8OrMore_SSE. }
    movups -16(%eax,%ecx), %xmm5
    cmp $32, %ecx
    jg Move_8OrMore_SSE_33OrMore
    movups %xmm4, (%edx) { 16–32 bytes }
    movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret

.LRepMovs:
    sub %eax, %edx { edx = dest - src }
    jz .Lquit { exit if src=dest }
    cmp %edx, %ecx { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
    ja .Lback
    cmp $Move_NtThreshold+16, %ecx
    jae .LNtF { Even enhanced REP MOV does not seem to use NT, so it falls behind on huge moves. So prioritize NT done by Move_8OrMore_SSE. }

.LNtIsNotBetterF:
    push %esi
    push %edi
    mov %eax, %esi
    lea (%edx,%eax), %edi
    rep movsb
    pop %edi
    pop %esi
.Lquit:
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret

.LNtF:
    cmp $-Move_NtThreshold, %edx { Check move distance. Bad case for forward NT is 0 < src - dest < NtThreshold => unsigned(dest - src) > unsigned(-NtThreshold). }
    ja .LNtIsNotBetterF { NT is not better and Move_8OrMore_SSE won't perform it either. Use REP MOVSB. }
    add %eax, %edx { Recover edx = dest. }
    jmp Move_8OrMore_SSE { Will perform NT. }

.Lback:
    {   dst = 3
              v
      Move(abcdefghijXXX, count=10)
           ^
           src = 0

      = abcABCDEFGHIJ

      can be moved right to left in non-overlapping groups of "dst - src":

      abcdefghijHIJ
                ^^^
      abcdefgEFGhij
             ^^^
      abcdBCDefghij
          ^^^
      abcAbcdefghij <- tail is handled by restarting the Move with the corresponding count instead, as it can have 0 to dst - src - 1 bytes.
         ^

      Only REP MOVs with DF=0 are fast with ERMS, in case you're wondering why not just use DF=1. }
    cmp $ErmsThreshold, %edx
    jb Move_8OrMore_SSE_CancelERMSBackwards { Unfortunately this branch can't benefit the common case of small distance (like inserting 1 array element into the beginning). :( }
    cmp $Move_NtThreshold+16, %ecx
    jae .LNtB

.LNtIsNotBetterB:
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    mov %ecx, %ebx { ebx = remaining }
    sub %edx, %ebx { edx = dst - src = step; remaining -= step. }
    add %ecx, %eax
    push %esi
    push %edi
.LRepMovsNextPieceB: { At least 1 iteration is always performed. }
    mov %eax, %edi { edi = src before subtracting step = dst = rep movsb dest }
    sub %edx, %eax { src -= step }
    mov %eax, %esi { esi = src = rep movsb source }
    mov %edx, %ecx { ecx = step = rep movsb count }
    rep movsb
    sub %edx, %ebx { remaining -= step }
    jnc .LRepMovsNextPieceB { CF=1 after remaining -= step means that remaining became strictly negative and the loop must be stopped/not performed. }
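    { Loop exit: fewer than "step" bytes remain. The code below recovers src, dest and the
      remaining count from eax/edx/ebx and restarts Move for that final short piece. }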
    pop %edi
    pop %esi
    lea (%edx,%ebx), %ecx { ecx = remaining }
    sub %ecx, %eax { eax = src }
    add %eax, %edx { edx = dest }
    pop %ebx
    jmp Move { Remaining piece ("a" in the example above). Can save a bit of jumps by doing more checks and jumping to more specific places, but whatever. }

.LNtB:
    cmp $Move_NtThreshold, %edx { Check move distance. Bad case for backward NT is dest - src < NtThreshold; src is always < dest. }
    jb .LNtIsNotBetterB { NT is not better and Move_8OrMore_SSE won't perform it either. Use REP MOVSB. }
    add %eax, %edx { Recover edx = dest. }
    jmp Move_8OrMore_SSE { Will perform NT. }
end;
{$endif ndef FASTMOVE_DISABLE_SSE}

procedure Move_8OrMore_Dispatch; forward;

var
  fastmoveproc : pointer = @Move_8OrMore_Dispatch;
{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  valgrind_used : boolean; external name '__fpc_valgrind';
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}

function Move_8OrMore_HumanFriendlyDispatch: pointer;
begin
  { workaround valgrind bug }
{$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  if EntryInformation.valgrind_used then
{$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  if valgrind_used then
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    result:=@Move_8OrMore_Valgrind
{$ifndef FASTMOVE_DISABLE_SSE}
  else if fast_large_repmovstosb then
    result:=@Move_8OrMore_SSE_ERMS
  else
{$ifdef fastmove_has_ia32_and_mmx}
  if has_sse_support then
{$endif}
    result:=@Move_8OrMore_SSE
{$endif ndef FASTMOVE_DISABLE_SSE}
{$ifdef fastmove_has_ia32_and_mmx}
  else if has_mmx_support then
    result:=@Move_8OrMore_MMX
  else
    result:=@Move_8OrMore_IA32
{$endif fastmove_has_ia32_and_mmx};
  if fpc_cpucodeinit_performed then
    fastmoveproc:=result;
end;

{ The first large Move lands here; Move_8OrMore_HumanFriendlyDispatch picks the best
  implementation and, once fpc_cpucodeinit has run, caches it in fastmoveproc. }
procedure Move_8OrMore_Dispatch; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
asm
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    push %eax
    push %edx
    push %ecx
    call Move_8OrMore_HumanFriendlyDispatch
    mov %eax, %ebx
    pop %ecx
    pop %edx
    pop %eax
{$ifdef FPC_PIC}
    jmp %ebx
{$else}
    call %ebx
    pop %ebx
{$endif}
end;

procedure Move(const source;var dest;count:SizeInt); [public, alias: 'FPC_MOVE']; assembler; nostackframe;
asm
    cmp $8, %ecx
    jle .L8OrLess
{$ifdef FPC_PIC}
    push %ebx
    call fpc_geteipasebx
    addl $_GLOBAL_OFFSET_TABLE_, %ebx
    movl fastmoveproc@GOT(%ebx), %ebx
    jmp (%ebx)
{$else}
    jmp fastmoveproc
{$endif}

.L8OrLess: { 4–8 bytes: copy the first and last dwords; for counts below 8 they overlap. }
    cmp $3, %ecx
    jle .L3OrLess
    push %ebx
    mov (%eax), %ebx
    mov -4(%eax,%ecx), %eax
    mov %ebx, (%edx)
    mov %eax, -4(%edx,%ecx)
    pop %ebx
    ret

.L3OrLess: { 1–3 bytes (counts <= 0 just exit): load everything first so overlap is safe. }
    cmp $1, %ecx
    jl .LZero
    push %ebx
    movzbl (%eax), %ebx
    je .LOne
    movzwl -2(%eax,%ecx), %eax
    mov %ax, -2(%edx,%ecx)
.LOne:
    mov %bl, (%edx)
    pop %ebx
.LZero:
end;
{$endif FPC_SYSTEM_HAS_MOVE}
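{ Illustrative usage sketch only (not part of this include; "buf" is a hypothetical variable,
  and an initialized variable in mode objfpc/delphi is assumed). It mirrors the "abcdefghij"
  example in Move_8OrMore_SSE_ERMS: overlapping regions are handled by the backwards paths.

      var
        buf: array[0..12] of AnsiChar = 'abcdefghijXXX';
      begin
        Move(buf[0], buf[3], 10);  // buf now holds 'abcabcdefghij'
      end.
}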