{

    This file is part of the Free Pascal run time library.
    Copyright (c) 2002 by Florian Klaempfl and Sergei Gorelkin
    Members of the Free Pascal development team

    Processor dependent implementation for the system unit for
    the x86-64 architecture

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}

{$asmmode GAS}

{****************************************************************************
                               Primitives
****************************************************************************}

{$ifndef win64}
  {$define use_fast_repmovstos} { REP STOS uses nonvolatile RDI and would require a stack frame on Win64 to be SEH-compliant. }
{$endif}

{$ifdef use_fast_repmovstos}
var
  fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
{$endif}

{$define FPC_SYSTEM_HAS_SPTR}
Function Sptr : Pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
    movq   %rsp,%rax
end;

{$IFNDEF INTERNAL_BACKTRACE}
{$define FPC_SYSTEM_HAS_GET_FRAME}
function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
    movq   %rbp,%rax
end;
{$ENDIF not INTERNAL_BACKTRACE}

{$define FPC_SYSTEM_HAS_GET_PC_ADDR}
function get_pc_addr:pointer;assembler;nostackframe;
asm
    movq   (%rsp),%rax
end;

{$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  get_caller_addr:=framebp;
  if assigned(framebp) then
    get_caller_addr:=PPointer(framebp)[1];
end;

{$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;{$ifdef SYSTEMINLINE}inline;{$endif}
begin
  get_caller_frame:=framebp;
  if assigned(framebp) then
    get_caller_frame:=PPointer(framebp)^;
end;
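(* With the standard rbp-based frame layout, [rbp] holds the caller's frame
   pointer and [rbp+8] the return address, which is exactly what
   get_caller_frame and get_caller_addr read above. A minimal, hypothetical
   backtrace sketch built only on these helpers (illustration, not RTL code;
   real code must expect frames to be omitted by the optimizer and should
   validate the pointers before dereferencing them):

     procedure DumpCallers;
     var
       frame : pointer;
     begin
       frame:=get_frame;
       while assigned(frame) do
         begin
           writeln(hexstr(get_caller_addr(frame)));
           frame:=get_caller_frame(frame);
         end;
     end;
*)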
// The following assembler procedures are disabled for FreeBSD due to
// multiple issues with its old GNU assembler (Mantis #19188).
// Even after fixing them, it can be enabled only for the trunk version,
// otherwise bootstrapping won't be possible.
// Modified to use oldbinutils as in cpu.pp source, to allow easier use for other targets.
{$ifdef freebsd}
  {$ifndef overridebinutils}
    {$define oldbinutils}
  {$endif}
{$endif freebsd}

{$ifndef oldbinutils}

{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
{ Linux: rdi source, rsi dest, rdx count
  win64: rcx source, rdx dest, r8 count }
asm
{$ifndef win64}
    mov    %rdx, %r8
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}
    cmp    $3, %r8
    jle    .L3OrLess
    cmp    $8, %r8
    jle    .L4to8
    cmp    $16, %r8
    jle    .L9to16
    movdqu (%rcx), %xmm4           { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
    movdqu -16(%rcx,%r8), %xmm5
    cmp    $32, %r8
    jg     .L33OrMore
    movdqu %xmm4, (%rdx)           { 17–32 bytes }
    movdqu %xmm5, -16(%rdx,%r8)
    ret

    .balign 16
.L3OrLess:
    cmp    $1, %r8
    jl     .LZero
    movzbl (%rcx), %eax
    je     .LOne
    movzwl -2(%rcx,%r8), %r9d
    mov    %r9w, -2(%rdx,%r8)
.LOne:
    mov    %al, (%rdx)
.LZero:
    ret

.L4to8:
    mov    (%rcx), %eax
    mov    -4(%rcx,%r8), %r9d
    mov    %eax, (%rdx)
    mov    %r9d, -4(%rdx,%r8)
    ret

.L9to16:
    mov    (%rcx), %rax
    mov    -8(%rcx,%r8), %r9
    mov    %rax, (%rdx)
    mov    %r9, -8(%rdx,%r8)
.Lquit:
    ret
    .byte  102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }

.L33OrMore:
    movdqu -32(%rcx,%r8), %xmm3    { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
                                   { but -32(%rcx,%r8) is about to become harder to address, .Lback is rare, and small .Lback is even rarer / matters even less. }
    sub    %rdx, %rcx              { rcx = src - dest }
    jz     .Lquit                  { exit if src=dest }

    mov    %rcx, %rax
    neg    %rax
    cmp    %rax, %r8
    ja     .Lback                  { count (r8) > unsigned(dest - src) (rax) if regions overlap }

    mov    %rdx, %r9               { remember original dest to write first 16 bytes }
    add    %rdx, %r8               { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be written separately anyway. }
    add    $16, %rdx
    and    $-16, %rdx
    sub    %rdx, %r8

.LRestAfterNTf:
    sub    $32, %r8                { During the N× loop, r8 is N bytes less than actually remained, to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
    jbe    .LPost32f
    cmp    $0x40000, %r8           { this limit must be processor-specific (1/2 L2 cache size) }
    jae    .Lntf                   { might jump back right away after more checks, but the branch is taken only on huge moves, so it's better to take these checks out of here... }

    .balign 16                     { no-op }
.Lloop32f:
    movdqu (%rcx,%rdx), %xmm0
    movdqa %xmm0, (%rdx)
    movdqu 16(%rcx,%rdx), %xmm0
    movdqa %xmm0, 16(%rdx)
    add    $32, %rdx
    sub    $32, %r8
    ja     .Lloop32f

.LPost32f:                         { +32 fixup not applied after 32× loop, r8 = remaining - 32 here. }
    movdqu %xmm3, (%rdx, %r8)
    movdqu %xmm5, 16(%rdx,%r8)     { Write first and last 16 bytes after everything else. }
    movdqu %xmm4, (%r9)            { Important for <16-byte step between src and dest. }
    ret

.Lntf:
    cmp    $0x1000, %rcx           { Maybe change mind: don't bother bypassing the cache if src and dest are close to each other }
    jb     .Lloop32f               { (this check is performed here to not stand in the way of smaller counts). }
    sub    $0xFE0, %r8             { r8 = remaining - 0x1000, but 32 was subtracted already, so must subtract only (0x1000 - 32) = 0xFE0. }

.Lntloopf:
    mov    $32, %eax

    .balign 16
.Lpref:
    prefetchnta (%rcx,%rdx,1)
    prefetchnta 0x40(%rcx,%rdx,1)
    add    $0x80, %rdx
    dec    %eax
    jnz    .Lpref

    sub    $0x1000, %rdx
    mov    $64, %eax

    .balign 16
.Lntloop64f:
    add    $64, %rdx
    movdqu -64(%rcx,%rdx,1), %xmm0
    movntdq %xmm0, -64(%rdx)
    movdqu -48(%rcx,%rdx,1), %xmm0
    movntdq %xmm0, -48(%rdx)
    movdqu -32(%rcx,%rdx,1), %xmm0
    movntdq %xmm0, -32(%rdx)
    movdqu -16(%rcx,%rdx,1), %xmm0
    movntdq %xmm0, -16(%rdx)
    dec    %eax
    jnz    .Lntloop64f

    sub    $0x1000, %r8
    jae    .Lntloopf

    mfence
    add    $0x1000, %r8
    jmpq   .LRestAfterNTf          { go handle remaining bytes }
    .byte  102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }

{ backwards move }
.Lback:
    movdqu 16(%rcx,%rdx), %xmm3    { Second vector from the start. }
    lea    (%rdx,%r8), %r9         { points to the end of dest; remember to write last 16 bytes }
    lea    -1(%r9), %r8            { move dest to the previous 16-byte boundary... }
    and    $-16, %r8
    sub    %rdx, %r8
    add    %r8, %rdx

.LRestAfterNTb:
    sub    $32, %r8
    jbe    .LPost32b
    cmp    $0x40000, %r8
    jae    .Lntb

    .balign 16                     { no-op }
.Lloop32b:
    sub    $32, %rdx
    movdqu 16(%rcx,%rdx), %xmm0
    movdqa %xmm0, 16(%rdx)
    movdqu (%rcx,%rdx), %xmm0
    movdqa %xmm0, (%rdx)
    sub    $32, %r8
    ja     .Lloop32b

.LPost32b:
    sub    %r8, %rdx
    movdqu %xmm3, -16(%rdx)
    movdqu %xmm4, -32(%rdx)
    movdqu %xmm5, -16(%r9)
    ret

.Lntb:
    cmp    $0xfffffffffffff000,%rcx
    jnb    .Lloop32b
    sub    $0xFE0, %r8

.Lntloopb:
    mov    $32, %eax

    .balign 16
.Lprefb:
    sub    $0x80, %rdx
    prefetchnta (%rcx,%rdx,1)
    prefetchnta 0x40(%rcx,%rdx,1)
    dec    %eax
    jnz    .Lprefb

    add    $0x1000, %rdx
    mov    $0x40, %eax

    .balign 16
.Lntloop64b:
    sub    $64, %rdx
    movdqu 48(%rcx,%rdx,1), %xmm0
    movntdq %xmm0, 48(%rdx)
    movdqu 32(%rcx,%rdx,1), %xmm0
    movntdq %xmm0, 32(%rdx)
    movdqu 16(%rcx,%rdx,1), %xmm0
    movntdq %xmm0, 16(%rdx)
    movdqu (%rcx,%rdx,1), %xmm0
    movntdq %xmm0, (%rdx)
    dec    %eax
    jnz    .Lntloop64b

    sub    $0x1000, %r8
    jae    .Lntloopb

    mfence
    add    $0x1000, %r8
    jmpq   .LRestAfterNTb
end;
{$endif FPC_SYSTEM_HAS_MOVE}
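(* The forward/backward dispatch above in Pascal terms (a sketch of the same
   unsigned trick, not RTL code): with d = PtrUInt(@dest) - PtrUInt(@source)
   computed in modular arithmetic, a forward copy is unsafe exactly when dest
   lies inside [source, source+count), i.e. when SizeUInt(count) > d:

     if SizeUInt(count) > PtrUInt(@dest) - PtrUInt(@source) then
       CopyBackward(source, dest, count)   { hypothetical helper }
     else
       CopyForward(source, dest, count);   { hypothetical helper }

   When dest < source the subtraction wraps around to a huge value, so the
   comparison fails and the forward path is correctly taken. *)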
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR) or not defined(FPC_SYSTEM_HAS_FILLWORD) or not defined(FPC_SYSTEM_HAS_FILLDWORD) or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
procedure FillXxxx_MoreThanTwoXmms; assembler; nostackframe;
{ Input:
  rcx = 'x'
  rdx = byte count
  xmm0 = pattern for unaligned writes
  xmm1 = pattern for aligned writes }
const
{$ifdef use_fast_repmovstos}
  ErmsThreshold = 1536;
{$endif}
  NtThreshold = 4 * 1024 * 1024;
asm
    { x can start and end misaligned on the vector boundary:

      x = ~~][H1][H2][...][T2][T1]~
          [UH]                 [UT]

      UH (“unaligned head”) potentially overlaps with H1 and is already
      written with 'movdqu' by the caller. At least 1 of its bytes is
      exclusive to it, i.e. if x is already aligned, H1 starts at byte 16.

      H1 and so on are called “aligned heads” or just “heads”.
      T1 and so on are called “aligned tails” or just “tails”.

      UT (“unaligned tail”) is written with another 'movdqu' after the loop.
      At least 1 of its bytes is exclusive to it as well, that’s why 65 is
      subtracted below instead of 64. }

    lea    -65(%rcx,%rdx), %r8     { r8 = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
    and    $-16, %rcx              { align rcx to the LEFT (so needs to be offset by an additional +16 for a while). }
    movdqa %xmm1, 16(%rcx)         { Write H1. }
    mov    %r8, %rax
    and    $-16, %rax              { rax = “T4” (possibly fictive) = aligned r8 = loop bound. }
    cmp    $49, %rdx               { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
    jle    .LOneAlignedTailWrite
    movdqa %xmm1, 32(%rcx)         { Write H2. }
    cmp    $81, %rdx               { 50~81 bytes might contain 2~4 heads+tails; write as H1–2 and T2–1. }
    jle    .LTwoAlignedTailWrites
    cmp    $113, %rdx              { 82~113 bytes might contain 4~6 heads+tails; write as H1–2 and T4–1. }
    jle    .LFourAlignedTailWrites

    add    $48, %rcx
{$ifdef use_fast_repmovstos}
    cmp    $ErmsThreshold, %rdx
    jae    .LRepStos
{$else}
    cmp    $NtThreshold, %rdx
    jae    .L64xNT_Body
{$endif}

    .balign 16
.L64x_Body:
    movdqa %xmm1, (%rcx)
    movdqa %xmm1, 16(%rcx)
    movdqa %xmm1, 32(%rcx)
    movdqa %xmm1, 48(%rcx)
    add    $64, %rcx
    cmp    %rax, %rcx
    jb     .L64x_Body

.LFourAlignedTailWrites:
    movdqa %xmm1, (%rax)           { T4 }
    movdqa %xmm1, 16(%rax)         { T3 }
.LTwoAlignedTailWrites:
    movdqa %xmm1, 32(%rax)         { T2 }
.LOneAlignedTailWrite:
    movdqa %xmm1, 48(%rax)         { T1 }
    movdqu %xmm0, 65-16(%r8)       { UT }
    ret

{$ifdef use_fast_repmovstos}
.LRepStos:
{$ifdef FPC_PIC}
    movq   fast_large_repmovstosb@GOTPCREL(%rip), %r9
    cmpb   $1, (%r9)
{$else FPC_PIC}
    cmpb   $1, fast_large_repmovstosb(%rip)
{$endif FPC_PIC}
    jne    .LRepStosIsNotBetter
{$ifdef win64}
    push   %rdi                    { For tests on Windows; however this is SEH incompliant, so the entire use_fast_repmovstos branch is disabled by default! }
{$endif}
    mov    %rcx, %rdi              { rdi = REP STOS destination. }
    lea    65-16+8-1(%r8), %rcx
    sub    %rdi, %rcx
    shr    $3, %rcx                { rcx = count of REP STOSQ blocks before UT. }
    movq   %xmm1, %rax             { recover pattern for aligned writes back to GPR :) }
    rep stosq
    movdqu %xmm0, 65-16(%r8)       { UT }
{$ifdef win64}
    pop    %rdi
{$endif}
    ret
{$endif}

.LRepStosIsNotBetter:
    cmp    $NtThreshold, %rdx
    jb     .L64x_Body

    .balign 16
.L64xNT_Body:
    movntdq %xmm1, (%rcx)
    movntdq %xmm1, 16(%rcx)
    movntdq %xmm1, 32(%rcx)
    movntdq %xmm1, 48(%rcx)
    add    $64, %rcx
    cmp    %rax, %rcx
    jb     .L64xNT_Body
    sfence
    jmp    .LFourAlignedTailWrites
end;
{$endif FPC_SYSTEM_HAS_FILLxxxx}
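(* A worked example of the head/tail scheme above (an illustration, not RTL
   code): take x = $1007 (misaligned by 7) and count = 50. The caller has
   already stored UH at $1007..$1016; rcx is aligned down to $1000, so H1
   covers $1010..$101F and H2 covers $1020..$102F. r8 = $1007 + 50 - 65 =
   $FF8, hence rax (“T4”) = $FF0; since 50 <= 81 the code takes
   .LTwoAlignedTailWrites, storing T2 at $1010 and T1 at $1020 (coinciding
   with H1 and H2 here), and UT lands at 65-16(%r8) = $1029..$1038. Together
   these cover exactly $1007..$1038, i.e. all 50 bytes, with overlapping
   stores but no overrun. *)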
{$ifndef FPC_SYSTEM_HAS_FILLCHAR}
{$define FPC_SYSTEM_HAS_FILLCHAR}
Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
{ win64: rcx dest, rdx count, r8b value
  linux: rdi dest, rsi count, rdx value }
    movzbl {$ifdef win64} %r8b {$else} %dl {$endif}, %eax
    imul   $0x01010101, %eax
{$ifndef win64}
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}

    cmp    $3, %rdx
    jle    .L3OrLess
    cmp    $16, %rdx
    jl     .L4to15

    movd   %eax, %xmm0
    pshufd $0, %xmm0, %xmm0
    movdqu %xmm0, (%rcx)
    movdqa %xmm0, %xmm1
    cmp    $32, %rdx
    jg     FillXxxx_MoreThanTwoXmms
    movdqu %xmm0, -16(%rcx,%rdx)
    ret

.L4to15:
    mov    %eax, (%rcx)
    cmp    $8, %edx
    jle    .LLast4
    mov    %eax, 4(%rcx)
    mov    %eax, -8(%rcx,%rdx)
.LLast4:
    mov    %eax, -4(%rcx,%rdx)
    ret

.L3OrLess:
    test   %rdx, %rdx
    jle    .LQuit
    mov    %al, (%rcx)
    mov    %al, -1(%rcx,%rdx)
    shr    $1, %edx
    mov    %al, (%rcx,%rdx)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLCHAR}

{$ifndef FPC_SYSTEM_HAS_FILLWORD}
{$define FPC_SYSTEM_HAS_FILLWORD}
procedure FillWord(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
{$ifdef win64}
    movzwl %r8w, %eax
    shl    $16, %r8d
    or     %r8d, %eax
{$else}
    movzwl %dx, %eax
    shl    $16, %edx
    or     %edx, %eax
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif}

    cmp    $3, %rdx
    jle    .L3OrLess
    cmp    $8, %rdx
    jle    .L4to8

    movd   %eax, %xmm0
    pshufd $0, %xmm0, %xmm0        { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%rcx)
    cmp    $16, %rdx
    jle    .LTail

    shl    $1, %rdx                { rdx = byte count }
    mov    %rcx, %r8
    shl    $3, %ecx
    rol    %cl, %eax               { misalign the pattern by the misalignment of x }
    mov    %r8, %rcx
    movd   %eax, %xmm1
    pshufd $0, %xmm1, %xmm1        { xmm1 = pattern for aligned writes }
    jmp    FillXxxx_MoreThanTwoXmms

.LTail:
    movdqu %xmm0, -16(%rcx,%rdx,2)
    ret

.L4to8:
    mov    %eax, %r8d
    shl    $32, %r8
    or     %r8, %rax
    mov    %rax, (%rcx)
    mov    %rax, -8(%rcx,%rdx,2)
    ret

.L3OrLess:
    test   %rdx, %rdx
    jle    .LQuit
    mov    %ax, (%rcx)
    mov    %ax, -2(%rcx,%rdx,2)
    shr    $1, %edx
    mov    %ax, (%rcx,%rdx,2)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLWORD}
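(* The 'shl $3, %ecx; rol %cl, %eax' pair above rotates the 32-bit pattern
   left by 8*(address and 3) bits (rol only uses cl mod 32), so that the
   pattern, stored at an aligned address, still lines up with the value
   boundaries of the misaligned destination; FillQWord below does the same
   with a 64-bit rol, i.e. by 8*(address and 7). A Pascal sketch of the same
   idea (illustration only, not RTL code):

     alignedPattern := RolDWord(pattern, 8 * (PtrUInt(@x) and 3));

   e.g. filling words of $1234 at an address with misalignment 1 requires
   aligned dword stores of $34123412 so that each word of x reads back
   $1234. *)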
{$ifndef FPC_SYSTEM_HAS_FILLDWORD}
{$define FPC_SYSTEM_HAS_FILLDWORD}
procedure FillDWord(var x;count:SizeInt;value:DWord);assembler;nostackframe;
asm
{$ifdef win64}
    mov    %r8d, %eax
{$else}
    mov    %edx, %eax
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}

    cmp    $3, %rdx
    jle    .L3OrLess
    cmp    $8, %rdx
    jle    .L4to8

    movd   %eax, %xmm0
    pshufd $0, %xmm0, %xmm0        { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%rcx)
    shl    $2, %rdx                { rdx = byte count }
    mov    %rcx, %r8
    shl    $3, %ecx
    rol    %cl, %eax               { misalign the pattern by the misalignment of x }
    mov    %r8, %rcx
    movd   %eax, %xmm1
    pshufd $0, %xmm1, %xmm1        { xmm1 = pattern for aligned writes }
    jmp    FillXxxx_MoreThanTwoXmms

.L4to8:
{$ifndef win64}                    { on win64, eax = r8d already. }
    mov    %eax, %r8d
{$endif}
    shl    $32, %r8
    or     %r8, %rax
    mov    %rax, (%rcx)
    mov    %rax, 8(%rcx)
    mov    %rax, -16(%rcx,%rdx,4)
    mov    %rax, -8(%rcx,%rdx,4)
    ret

.L3OrLess:
    test   %rdx, %rdx
    jle    .LQuit
    mov    %eax, (%rcx)
    mov    %eax, -4(%rcx,%rdx,4)
    shr    $1, %edx
    mov    %eax, (%rcx,%rdx,4)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLDWORD}

{$ifndef FPC_SYSTEM_HAS_FILLQWORD}
{$define FPC_SYSTEM_HAS_FILLQWORD}
procedure FillQWord(var x;count:SizeInt;value:QWord);assembler;nostackframe;
asm
{$ifdef win64}
    mov    %r8, %rax
{$else}
    mov    %rdx, %rax
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}

    cmp    $2, %rdx
    jle    .L2OrLess
    cmp    $6, %rdx
    jle    .L3to6

    movq   %rax, %xmm0
    pshufd $0b01000100, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
    movdqu %xmm0, (%rcx)
    shl    $3, %rdx                { rdx = byte count }
    mov    %rcx, %r8
    shl    $3, %ecx
    rol    %cl, %rax               { misalign the pattern by the misalignment of x }
    mov    %r8, %rcx
    movq   %rax, %xmm1
    pshufd $0b01000100, %xmm1, %xmm1 { xmm1 = pattern for aligned writes }
    jmp    FillXxxx_MoreThanTwoXmms

.L3to6:
    mov    %rax, (%rcx)
    mov    %rax, 8(%rcx)
    mov    %rax, 16(%rcx)
    mov    %rax, -24(%rcx,%rdx,8)
    mov    %rax, -16(%rcx,%rdx,8)
    mov    %rax, -8(%rcx,%rdx,8)
    ret

.L2OrLess:
    test   %rdx, %rdx
    jle    .LQuit
    mov    %rax, (%rcx)
    mov    %rax, -8(%rcx,%rdx,8)
.LQuit:
end;
{$endif FPC_SYSTEM_HAS_FILLQWORD}
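(* Reference semantics of the Fill* family in plain Pascal (illustration
   only; the assembler above implements exactly this, minus vectorization):

     procedure RefFillWord(var x; count: SizeInt; value: word);
     var
       p : PWord;
     begin
       p:=PWord(@x);
       while count>0 do
         begin
           p^:=value;
           inc(p);
           dec(count);
         end;
     end;

   FillDWord/FillQWord differ only in the element type; counts <= 0 are
   no-ops, as in the 'test %rdx, %rdx; jle .LQuit' paths above. *)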
{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
{$define FPC_SYSTEM_HAS_INDEXBYTE}
function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ win64: rcx buf, rdx len, r8b byte
  linux: rdi buf, rsi len, rdx byte }
asm
    test   {$ifdef win64} %rdx, %rdx {$else} %rsi, %rsi {$endif}
    jz     .Lnotfound              { exit if len=0 }

{$ifdef win64}
    movd   %r8d, %xmm1
{$else}
    movd   %edx, %xmm1
    movq   %rdi, %rcx
    movq   %rsi, %rdx
{$endif}
    mov    %rcx, %r8
    punpcklbw %xmm1, %xmm1
    and    $-0x10, %rcx            { highest aligned address before buf }
    punpcklbw %xmm1, %xmm1
    add    $16, %rcx               { first aligned address after buf }
    pshufd $0, %xmm1, %xmm1
    movdqa -16(%rcx), %xmm0        { Fetch first 16 bytes (up to 15 bytes before target) }
    sub    %r8, %rcx               { rcx=number of valid bytes, r8=original ptr }
    pcmpeqb %xmm1, %xmm0           { compare with pattern and get bitmask }
    pmovmskb %xmm0, %eax

    shl    %cl, %eax               { shift valid bits into high word }
    and    $0xffff0000, %eax       { clear low word containing invalid bits }
    shr    %cl, %eax               { shift back }
    jmp    .Lcontinue

    .balign 16
.Lloop:
    movdqa (%r8,%rcx), %xmm0       { r8 and rcx may have any values, }
    add    $16, %rcx               { but their sum is evenly divisible by 16. }
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
.Lcontinue:
    test   %eax, %eax
    jnz    .Lmatch
    cmp    %rcx, %rdx
    ja     .Lloop

.Lnotfound:
    or     $-1, %rax
    retq

.Lmatch:
    bsf    %eax, %eax
    lea    -16(%rcx,%rax), %rax
    cmp    %rax, %rdx              { check against the buffer length }
    jbe    .Lnotfound
end;
{$endif FPC_SYSTEM_HAS_INDEXBYTE}
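(* The shl/and/shr triple above is a branch-free way of discarding match bits
   that precede buf in the over-read first vector. With
   valid = 16 - (PtrUInt(@buf) and 15), it computes (sketch, not RTL code):

     mask := mask and ($ffff shl (16 - valid));

   i.e. only the top 'valid' bits of the 16-bit pcmpeqb mask survive, so a
   “match” among the bytes fetched from before buf can never be reported. *)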
{$ifndef FPC_SYSTEM_HAS_INDEXWORD}
{$define FPC_SYSTEM_HAS_INDEXWORD}
function IndexWord(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
{ win64: rcx buf, rdx len, r8w word
  linux: rdi buf, rsi len, rdx word }
asm
    test   {$ifdef win64} %rdx, %rdx {$else} %rsi, %rsi {$endif}
    jz     .Lnotfound              { exit if len=0 }

{$ifdef win64}
    movd   %r8d, %xmm1
{$else}
    movd   %edx, %xmm1
    movq   %rdi, %rcx
    movq   %rsi, %rdx
{$endif}
    mov    %rcx, %r8
    punpcklwd %xmm1, %xmm1
    and    $-0x10, %rcx
    pshufd $0, %xmm1, %xmm1
    add    $16, %rcx
    movdqa -16(%rcx), %xmm0        { Fetch first 16 bytes (up to 14 bytes before target) }
    sub    %r8, %rcx               { rcx=number of valid bytes }

    test   $1, %r8b                { if buffer isn't aligned to word boundary, }
    jnz    .Lunaligned             { use a different algorithm }

    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    shl    %cl, %eax
    and    $0xffff0000, %eax
    shr    %cl, %eax
    shr    $1, %ecx                { bytes->words }
    jmp    .Lcontinue

    .balign 16
.Lloop:
    movdqa (%r8,%rcx,2), %xmm0
    add    $8, %rcx
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
.Lcontinue:
    test   %eax, %eax
    jnz    .Lmatch
    cmp    %rcx, %rdx
    ja     .Lloop

.Lnotfound:
    or     $-1, %rax
    retq

.Lmatch:
    bsf    %eax, %eax
    shr    $1, %eax                { in words }
    lea    -8(%rcx,%rax), %rax
    cmp    %rax, %rdx
    jbe    .Lnotfound              { if match is after the specified length, ignore it }
    retq

.Lunaligned:
    movdqa %xmm1, %xmm2            { (mis)align the pattern (in this particular case: }
    psllw  $8, %xmm1               {  swap bytes of each word of pattern) }
    psrlw  $8, %xmm2
    por    %xmm2, %xmm1

    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    shl    %cl, %eax
    and    $0xffff0000, %eax
    shr    %cl, %eax

    add    %rdx, %rdx              { length words -> bytes }
    xor    %r10d, %r10d            { nothing to merge yet }
    jmp    .Lcontinue_u

    .balign 16
.Lloop_u:
    movdqa (%r8,%rcx), %xmm0
    add    $16, %rcx
    pcmpeqb %xmm1, %xmm0           { compare by bytes }
    shr    $16, %r10d              { bit 16 shifts into 0 }
    pmovmskb %xmm0, %eax
.Lcontinue_u:
    shl    $1, %eax                { 15:0 -> 16:1 }
    or     %r10d, %eax             { merge bit 0 from previous round }
    mov    %eax, %r10d
    shr    $1, %eax                { now AND together adjacent pairs of bits }
    and    %r10d, %eax
    and    $0x5555, %eax           { also reset odd bits }
    jnz    .Lmatch_u
    cmpq   %rcx, %rdx
    ja     .Lloop_u

.Lnotfound_u:
    or     $-1, %rax
    retq

.Lmatch_u:
    bsf    %eax, %eax
    lea    -16(%rcx,%rax), %rax
    cmp    %rax, %rdx
    jbe    .Lnotfound_u            { if match is after the specified length, ignore it }
    sar    $1, %rax                { in words }
end;
{$endif FPC_SYSTEM_HAS_INDEXWORD}
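(* In the .Lunaligned path a word match appears as two adjacent byte matches
   of the byte-swapped pattern, starting at an even byte offset. The
   shl/or/shr/and sequence merges bit 0 carried over from the previous
   vector, ANDs every bit with its neighbour and keeps even positions only.
   A sketch with 32-bit integers (illustration, not RTL code):

     merged := (byteMask shl 1) or carryFromPrevious;  { 15:0 -> 16:1 }
     pair   := merged and (merged shr 1) and $5555;    { both bytes match, even offset }

   pair <> 0 signals a complete word match; otherwise bit 16 of 'merged'
   becomes the next round's carry. *)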
{$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
{$define FPC_SYSTEM_HAS_INDEXDWORD}
function IndexDWord(Const buf;len:SizeInt;b:dword):SizeInt; assembler; nostackframe;
asm
{$ifdef win64}
    mov    %rcx, %rax
{$else}
    mov    %rdx, %r8
    mov    %rsi, %rdx
    mov    %rdi, %rax
{$endif}
    cmp    $4, %rdx
    jle    .LDwordwise_Prepare
    sub    $4, %rdx
    movd   %r8d, %xmm1
    pshufd $0, %xmm1, %xmm1

    .balign 16
.L4x_Body:
    movdqu (%rax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %r8d
    test   %r8d, %r8d
    jnz    .LFoundAtMask
    add    $16, %rax
    sub    $4, %rdx
    jg     .L4x_Body

    lea    (%rax,%rdx,4), %rax
    movdqu (%rax), %xmm0
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %r8d
    test   %r8d, %r8d
    jnz    .LFoundAtMask
    or     $-1, %rax
    ret

    .balign 16                     { no-op }
.LDwordwise_Body:
    cmp    (%rax), %r8d
    je     .LFoundAtRax
    add    $4, %rax
.LDwordwise_Prepare:
    sub    $1, %rdx
    jae    .LDwordwise_Body
    or     $-1, %rax
    ret

.LFoundAtMask:
    bsf    %r8d, %r8d
    add    %r8, %rax
.LFoundAtRax:
    sub    {$ifdef win64} %rcx {$else} %rdi {$endif}, %rax
    shr    $2, %rax
end;
{$endif FPC_SYSTEM_HAS_INDEXDWORD}

{$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
{$define FPC_SYSTEM_HAS_INDEXQWORD}
function IndexQWord(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ win64: rcx=buf, rdx=len, r8=b
  else:  rdi=buf, rsi=len, rdx=b }
asm
    mov    {$ifdef win64} %rcx {$else} %rdi {$endif}, %rax
    sub    $8, %rax

    .balign 16
.LQwordwise_Next:
    add    $8, %rax
    sub    $1, {$ifdef win64} %rdx {$else} %rsi {$endif}
    jb     .LNothing
    cmp    {$ifdef win64} %r8 {$else} %rdx {$endif}, (%rax)
    jne    .LQwordwise_Next
    sub    {$ifdef win64} %rcx {$else} %rdi {$endif}, %rax
    shr    $3, %rax
    ret
.LNothing:
    mov    $-1, %rax
end;
{$endif FPC_SYSTEM_HAS_INDEXQWORD}

{$endif freebsd}
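(* Reference semantics of the Index* family in plain Pascal (illustration
   only): the result is the zero-based element index of the first occurrence
   of b, or -1 when it is absent; len is in elements, not bytes.

     function RefIndexQWord(const buf; len: SizeInt; b: QWord): SizeInt;
     var
       p : PQWord;
       i : SizeInt;
     begin
       p:=PQWord(@buf);
       for i:=0 to len-1 do
         begin
           if p^=b then
             exit(i);
           inc(p);
         end;
       result:=-1;
     end;
*)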
{$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
{$define FPC_SYSTEM_HAS_COMPAREBYTE}
function CompareByte(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
{ win64: rcx buf1, rdx buf2, r8 len
  linux: rdi buf1, rsi buf2, rdx len }
asm
{$ifndef win64}
    mov    %rdx, %r8
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}
    { rcx = buf1, rdx = buf2, r8 = len }
    cmp    $1, %r8
    jle    .L1OrLess

    cmp    $16, %r8
    jae    .LVecOrMore

    { 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but is faster. }
    mov    %ecx, %eax
    or     %edx, %eax
    and    $4095, %eax
    cmp    $4080, %eax
    ja     .LCantOverReadBoth

    { Over-read both as XMMs. }
    movdqu (%rcx), %xmm0
    movdqu (%rdx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jz     .LNothing
    bsf    %eax, %eax
    cmp    %r8d, %eax              { Ignore garbage beyond 'len'. }
    jae    .LNothing
    movzbl (%rdx,%rax), %edx
    movzbl (%rcx,%rax), %eax
    sub    %rdx, %rax
    ret

    .balign 16
.LNothing:
    xor    %eax, %eax
    ret

.LAligned32xLoop_TwoVectorsDiffer:
    add    %rcx, %rdx              { restore rdx = buf2 }
    pmovmskb %xmm0, %r8d           { Is there a difference in the first vector? }
    inc    %r8w
    jz     .LVec1Differs           { No difference in the first vector, xmm0 is all ones, eax = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
    mov    %r8d, %eax
.LVec0Differs:
    bsf    %eax, %eax
    movzbl (%rdx,%rax), %edx
    movzbl (%rcx,%rax), %eax
    sub    %rdx, %rax
    ret
    .byte  0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }

.LVecOrMore:
    { Compare first vectors. }
    movdqu (%rcx), %xmm0
    movdqu (%rdx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVec0Differs

    sub    $32, %r8
    jbe    .LLastVec

    { Compare second vectors. }
    movdqu 16(%rcx), %xmm0
    movdqu 16(%rdx), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVec1Differs

    cmp    $32, %r8
    jbe    .LLastTwoVectors

    { More than four vectors: aligned loop. }
    lea    -32(%rcx,%r8), %r8      { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32). }
    sub    %rcx, %rdx              { rdx = buf2 - buf1 }
    and    $-16, %rcx              { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
    sub    %rcx, %r8               { r8 = count to be handled with loop }

    .balign 16                     { no-op }
.LAligned32xLoop_Body:
    add    $32, %rcx
    { Compare two XMMs, reduce the result with 'and'. }
    movdqu (%rdx,%rcx), %xmm0
    pcmpeqb (%rcx), %xmm0          { xmm0 = pcmpeqb(buf1, buf2) }
    movdqu 16(%rdx,%rcx), %xmm1
    pcmpeqb 16(%rcx), %xmm1
    pand   %xmm0, %xmm1            { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
    pmovmskb %xmm1, %eax
    inc    %ax
    jnz    .LAligned32xLoop_TwoVectorsDiffer
    sub    $32, %r8
    ja     .LAligned32xLoop_Body

    add    %rcx, %rdx              { restore rdx = buf2 }
    add    $32, %r8

.LLastTwoVectors:
    movdqu (%rcx,%r8), %xmm0
    movdqu (%rdx,%r8), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVecEm2Differs
.LLastVec:
    movdqu 16(%rcx,%r8), %xmm0
    movdqu 16(%rdx,%r8), %xmm1
    pcmpeqb %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVecEm1Differs
    xor    %eax, %eax
    ret

.LVec1Differs:
    xor    %r8d, %r8d
.LVecEm1Differs:
    add    $16, %r8
.LVecEm2Differs:
    bsf    %eax, %eax
    add    %r8, %rax
    movzbl (%rdx,%rax), %edx
    movzbl (%rcx,%rax), %eax
    sub    %rdx, %rax
    ret

.LCantOverReadBoth:
    cmp    $8, %r8d
    ja     .L9to15
    cmp    $3, %r8d
    jle    .L2to3
    mov    (%rcx), %eax
    mov    (%rdx), %r9d
    cmp    %r9d, %eax
    jne    .L4xOr8xDiffer
    mov    -4(%rcx,%r8), %eax
    mov    -4(%rdx,%r8), %r9d
    cmp    %r9d, %eax
    jne    .L4xOr8xDiffer
    xor    %eax, %eax
    ret

.L9to15:
    mov    (%rcx), %rax
    mov    (%rdx), %r9
    cmp    %r9, %rax
    jne    .L4xOr8xDiffer
    mov    -8(%rcx,%r8), %rax
    mov    -8(%rdx,%r8), %r9
    cmp    %r9, %rax
    jne    .L4xOr8xDiffer
    xor    %eax, %eax
    ret

.L4xOr8xDiffer:
    bswap  %r9
    bswap  %rax
    cmp    %r9, %rax
    sbb    %rax, %rax
    or     $1, %rax
    ret

.L2to3:
    movzwl (%rcx), %eax
    bswap  %eax
    shr    $1, %eax
    mov    -1(%rcx,%r8), %al
    movzwl (%rdx), %ecx
    bswap  %ecx
    shr    $1, %ecx
    mov    -1(%rdx,%r8), %cl
    sub    %rcx, %rax
    ret

.L1OrLess:
    jl     .LUnbounded_Prepare
    movzbl (%rcx), %eax
    movzbl (%rdx), %edx
    sub    %rdx, %rax
    ret

.LUnbounded_Prepare:
    sub    %rcx, %rdx              { rdx = buf2 - buf1 }
    test   %r8, %r8
    jnz    .LUnbounded_Body
    xor    %eax, %eax
    ret

    .balign 16
.LUnbounded_Next:
    add    $1, %rcx
.LUnbounded_Body:
    movzbl (%rdx,%rcx), %eax
    cmp    %al, (%rcx)
    je     .LUnbounded_Next
    sbb    %rax, %rax
    or     $1, %rax
end;
{$endif FPC_SYSTEM_HAS_COMPAREBYTE}
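(* The page-cross test above rests on 4 KB pages: a 16-byte movdqu starting
   at offset p within a page stays inside it iff (p and 4095) <= 4080.
   Checking '(buf1 or buf2) and 4095 > 4080' ORs the two page offsets first,
   so it can fire spuriously (false positives), but it never misses a real
   page cross, which is all that safety requires. The same predicate as a
   sketch (illustration, not RTL code):

     mayOverRead := ((PtrUInt(@buf1) or PtrUInt(@buf2)) and 4095) <= 4080;
*)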
{$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
{$define FPC_SYSTEM_HAS_COMPAREWORD}
function CompareWord(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
{$ifndef win64}
    mov    %rdx, %r8
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}
    sub    %rcx, %rdx              { rdx = buf2 - buf1 }
    cmp    $1, %r8
    jle    .LWordwise_Prepare
    mov    %r8, %rax
    shr    $62, %rax
    jnz    .LWordwise_Prepare
    cmp    $8, %r8
    jge    .LVecOrMore
    lea    (%rdx,%rcx), %eax
    or     %ecx, %eax
    and    $4095, %eax
    cmp    $4080, %eax
    ja     .LWordwise_Prepare

    movdqu (%rdx,%rcx), %xmm0
    movdqu (%rcx), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    shl    $1, %r8                 { convert to bytes }
    inc    %ax
    jz     .LNothing
    bsf    %eax, %eax
    cmp    %r8d, %eax
    jb     .LSubtractWords
.LNothing:
    xor    %eax, %eax
    ret

    .balign 16
.LWordwise_Body:
    movzwl (%rdx,%rcx), %eax
    cmp    %ax, (%rcx)
    jne    .LDoSbb
    add    $2, %rcx
.LWordwise_Prepare:
    sub    $1, %r8
    jae    .LWordwise_Body
    xor    %eax, %eax
    ret

.LDoSbb:
    sbb    %rax, %rax
    or     $1, %rax
    ret

.LVec0Differs:
    bsf    %eax, %eax
.LSubtractWords:
    add    %rcx, %rdx              { recover rdx = buf2 }
    movzwl (%rdx,%rax), %edx
    movzwl (%rcx,%rax), %eax
    sub    %rdx, %rax
    ret

.LVecOrMore:
    movdqu (%rdx,%rcx), %xmm0      { Compare first vectors. }
    movdqu (%rcx), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVec0Differs

    shl    $1, %r8                 { convert to bytes }
    sub    $32, %r8                { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle    .LLastVec

    mov    %rcx, %r9               { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add    %rcx, %r8
    and    $-16, %rcx              { align buf1; +16 is performed by the loop. }
    sub    %rcx, %r8

    .balign 16
.LAligned8xLoop_Body:
    add    $16, %rcx
    movdqu (%rdx,%rcx), %xmm0
    pcmpeqb (%rcx), %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LAligned8xLoop_VecDiffers
    sub    $16, %r8
    ja     .LAligned8xLoop_Body

.LLastVec:
    lea    16(%rcx,%r8), %rcx      { point to the last 16 bytes }
    movdqu (%rdx,%rcx), %xmm0
    movdqu (%rcx), %xmm1
    pcmpeqw %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVec0Differs
    xor    %eax, %eax
    ret

.LAligned8xLoop_VecDiffers:
    bsf    %eax, %eax
    add    %rax, %rcx
    sub    %r9, %rcx
    and    $-2, %rcx
    add    %r9, %rcx
    movzwl (%rdx,%rcx), %edx
    movzwl (%rcx), %eax
    sub    %rdx, %rax
end;
{$endif FPC_SYSTEM_HAS_COMPAREWORD}

{$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
{$define FPC_SYSTEM_HAS_COMPAREDWORD}
function CompareDWord(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
{$ifndef win64}
    mov    %rdx, %r8
    mov    %rsi, %rdx
    mov    %rdi, %rcx
{$endif win64}
    sub    %rcx, %rdx              { rdx = buf2 - buf1 }
    cmp    $4, %r8
    jle    .LDwordwise_Prepare
    mov    %r8, %rax
    shr    $61, %rax
    jnz    .LDwordwise_Prepare

    movdqu (%rdx,%rcx), %xmm0      { Compare first vectors. }
    movdqu (%rcx), %xmm1
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVec0Differs

    shl    $2, %r8                 { convert to bytes }
    sub    $32, %r8                { first 16 bytes already analyzed + last 16 bytes analyzed separately }
    jle    .LLastVec

    mov    %rcx, %r9               { save original buf1 to recover dword position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
    add    %rcx, %r8
    and    $-16, %rcx              { align buf1; +16 is performed by the loop. }
    sub    %rcx, %r8

    .balign 16
.LAligned4xLoop_Body:
    add    $16, %rcx
    movdqu (%rdx,%rcx), %xmm0
    pcmpeqb (%rcx), %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LAligned4xLoop_VecDiffers
    sub    $16, %r8
    ja     .LAligned4xLoop_Body

.LLastVec:
    lea    16(%rcx,%r8), %rcx      { point to the last 16 bytes }
    movdqu (%rdx,%rcx), %xmm0
    movdqu (%rcx), %xmm1
    pcmpeqd %xmm1, %xmm0
    pmovmskb %xmm0, %eax
    inc    %ax
    jnz    .LVec0Differs
    xor    %eax, %eax
    ret

.LVec0Differs:
    bsf    %eax, %eax
    add    %rcx, %rdx              { recover rdx = buf2 }
    mov    (%rdx,%rax), %edx
    cmp    %edx, (%rcx,%rax)
    sbb    %rax, %rax
    or     $1, %rax
    ret

.LAligned4xLoop_VecDiffers:
    bsf    %eax, %eax
    add    %rax, %rcx
    sub    %r9, %rcx
    and    $-4, %rcx
    add    %r9, %rcx
    mov    (%rdx,%rcx), %edx
    cmp    %edx, (%rcx)
.LDoSbb:
    sbb    %rax, %rax
    or     $1, %rax
    ret

    .balign 16
.LDwordwise_Body:
    mov    (%rdx,%rcx), %eax
    cmp    %eax, (%rcx)
    jne    .LDoSbb
    add    $4, %rcx
.LDwordwise_Prepare:
    sub    $1, %r8
    jae    .LDwordwise_Body
    xor    %eax, %eax
end;
{$endif FPC_SYSTEM_HAS_COMPAREDWORD}
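(* The 'sbb %rax, %rax; or $1, %rax' idiom after an unsigned compare turns
   the carry flag into -1 or +1: sbb of a register from itself yields
   0 - borrow, i.e. all-ones when the first operand was below the second and
   zero otherwise, and 'or $1' maps those to -1 and +1. In Pascal terms
   (illustration, not RTL code):

     if a < b then result:=-1 else result:=1;

   The bswap in CompareByte's .L4xOr8xDiffer path exists so that this single
   unsigned comparison agrees with bytewise (memcmp-style) ordering. *)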
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
{ does a thread-safe inc/dec }
function declocked(var l : longint) : boolean;assembler; nostackframe;
asm
    { this check should be done because a lock takes a lot }
    { of time!                                             }
{$ifdef FPC_PIC}
    movq   IsMultithread@GOTPCREL(%rip),%rax
    cmpl   $0,(%rax)
{$else FPC_PIC}
    cmpl   $0,IsMultithread(%rip)
{$endif FPC_PIC}
    jz     .Ldeclockedskiplock
    .byte  0xF0 // LOCK prefix.
.Ldeclockedskiplock:
    decl   {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    setzb  %al
end;

{$define FPC_SYSTEM_HAS_DECLOCKED_INT64}
function declocked(var l : int64) : boolean;assembler; nostackframe;
asm
    { this check should be done because a lock takes a lot }
    { of time!                                             }
{$ifdef FPC_PIC}
    movq   IsMultithread@GOTPCREL(%rip),%rax
    cmpl   $0,(%rax)
{$else FPC_PIC}
    cmpl   $0,IsMultithread(%rip)
{$endif FPC_PIC}
    jz     .Ldeclockedskiplock
    .byte  0xF0 // LOCK prefix.
.Ldeclockedskiplock:
    decq   {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    setzb  %al
end;

{$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
procedure inclocked(var l : longint);assembler; nostackframe;
asm
    { this check should be done because a lock takes a lot }
    { of time!                                             }
{$ifdef FPC_PIC}
    movq   IsMultithread@GOTPCREL(%rip),%rax
    cmpl   $0,(%rax)
{$else FPC_PIC}
    cmpl   $0,IsMultithread(%rip)
{$endif FPC_PIC}
    jz     .Linclockedskiplock
    .byte  0xF0 // LOCK prefix.
.Linclockedskiplock:
    incl   {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
end;

{$define FPC_SYSTEM_HAS_INCLOCKED_INT64}
procedure inclocked(var l : int64);assembler; nostackframe;
asm
    { this check should be done because a lock takes a lot }
    { of time!                                             }
{$ifdef FPC_PIC}
    movq   IsMultithread@GOTPCREL(%rip),%rax
    cmpl   $0,(%rax)
{$else FPC_PIC}
    cmpl   $0,IsMultithread(%rip)
{$endif FPC_PIC}
    jz     .Linclockedskiplock
    .byte  0xF0 // LOCK prefix.
.Linclockedskiplock:
    incq   {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
end;

function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
asm
    movl   $-1,%eax
    lock xaddl %eax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    decl   %eax
end;

function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
asm
    movl   $1,%eax
    lock xaddl %eax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    incl   %eax
end;

function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
{$ifdef win64}
    xchgl  (%rcx),%edx
    movl   %edx,%eax
{$else win64}
    xchgl  (%rdi),%esi
    movl   %esi,%eax
{$endif win64}
end;

function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
{$ifdef win64}
    lock xaddl %edx, (%rcx)
    movl   %edx,%eax
{$else win64}
    lock xaddl %esi, (%rdi)
    movl   %esi,%eax
{$endif win64}
end;

function InterLockedCompareExchange(var Target: longint; NewValue, Comperand : longint): longint; assembler; nostackframe;
asm
{$ifdef win64}
    movl   %r8d,%eax
    lock cmpxchgl %edx,(%rcx)
{$else win64}
    movl   %edx,%eax
    lock cmpxchgl %esi,(%rdi)
{$endif win64}
end;

function InterLockedDecrement64 (var Target: int64) : int64; assembler; nostackframe;
asm
    movq   $-1,%rax
    lock xaddq %rax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    decq   %rax
end;

function InterLockedIncrement64 (var Target: int64) : int64; assembler; nostackframe;
asm
    movq   $1,%rax
    lock xaddq %rax, {$ifdef win64} (%rcx) {$else} (%rdi) {$endif}
    incq   %rax
end;

function InterLockedExchange64 (var Target: int64;Source : int64) : int64; assembler; nostackframe;
asm
{$ifdef win64}
    xchgq  (%rcx),%rdx
    movq   %rdx,%rax
{$else win64}
    xchgq  (%rdi),%rsi
    movq   %rsi,%rax
{$endif win64}
end;

function InterLockedExchangeAdd64 (var Target: int64;Source : int64) : int64; assembler; nostackframe;
asm
{$ifdef win64}
    lock xaddq %rdx, (%rcx)
    movq   %rdx,%rax
{$else win64}
    lock xaddq %rsi, (%rdi)
    movq   %rsi,%rax
{$endif win64}
end;

function InterLockedCompareExchange64(var Target: int64; NewValue, Comperand : int64): int64; assembler; nostackframe;
asm
{$ifdef win64}
    movq   %r8,%rax
    lock cmpxchgq %rdx,(%rcx)
{$else win64}
    movq   %rdx,%rax
    lock cmpxchgq %rsi,(%rdi)
{$endif win64}
end;
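(* A typical lock-free usage sketch for InterLockedCompareExchange
   (illustration only, not part of the RTL): atomically apply an arbitrary
   update to a shared longint by retrying until no other thread intervened.

     procedure AtomicAddClamped(var shared: longint; delta, maxv: longint);
     var
       old, new : longint;
     begin
       repeat
         old:=shared;
         new:=old+delta;
         if new>maxv then
           new:=maxv;
       until InterLockedCompareExchange(shared, new, old)=old;
     end;

   The function returns the previous value of Target; the exchange took
   place exactly when that value equals the comparand passed in. *)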
{****************************************************************************
                                    FPU
****************************************************************************}

const
  { Internal constants for use in system unit }
  FPU_Invalid = 1;
  FPU_Denormal = 2;
  FPU_DivisionByZero = 4;
  FPU_Overflow = 8;
  FPU_Underflow = $10;
  FPU_StackUnderflow = $20;
  FPU_StackOverflow = $40;
  FPU_ExceptionMask = $ff;

  MM_Invalid = 1;
  MM_Denormal = 2;
  MM_DivisionByZero = 4;
  MM_Overflow = 8;
  MM_Underflow = $10;
  MM_Precicion = $20;
  MM_ExceptionMask = $3f;

  MM_MaskInvalidOp = %0000000010000000;
  MM_MaskDenorm    = %0000000100000000;
  MM_MaskDivZero   = %0000001000000000;
  MM_MaskOverflow  = %0000010000000000;
  MM_MaskUnderflow = %0000100000000000;
  MM_MaskPrecision = %0001000000000000;

{$define FPC_SYSTEM_HAS_FPC_CPUINIT}
procedure fpc_cpuinit;
  var
    _eax,cpuid7_ebx,cpuid1_ecx : dword;
  begin
    { don't let libraries influence the FPU cw set by the host program }
    if IsLibrary then
      begin
        Default8087CW:=Get8087CW;
        DefaultMXCSR:=GetMXCSR;
      end;
    SysResetFPU;
    asm
      xorl %eax,%eax
      cpuid
      movl %eax,_eax
    end;
    if _eax>=7 then
      begin
        asm
          movl $1,%eax
          xorl %ecx,%ecx
          cpuid
          movl %ecx,cpuid1_ecx
          movl $7,%eax
          xorl %ecx,%ecx
          cpuid
          movl %ebx,cpuid7_ebx
        end;
{$ifdef use_fast_repmovstos}
        fast_large_repmovstosb:=cpuid7_ebx and (1 shl 9)<>0;
{$endif}
        { XGETBV support? }
        if (cpuid1_ecx and $8000000)<>0 then
          begin
            asm
              xorl %ecx,%ecx
              .byte 0x0f,0x01,0xd0 { xgetbv }
              movl %eax,_eax
            end;
            if (_eax and 6)=6 then
              begin
                has_avx_support:=(cpuid1_ecx and $10000000)<>0;
                has_avx2_support:=(cpuid7_ebx and $20)<>0;
              end;
          end;
      end;
  end;

{$define FPC_SYSTEM_HAS_SYSINITFPU}
Procedure SysInitFPU;
  begin
  end;

{$define FPC_SYSTEM_HAS_SYSRESETFPU}
Procedure SysResetFPU;
  var
    { these locals are so we don't have to hack pic code in the assembler }
    localmxcsr: dword;
    localfpucw: word;
  begin
    localfpucw:=Default8087CW;
    localmxcsr:=DefaultMXCSR;
    asm
      fninit
      fwait
      fldcw localfpucw
      ldmxcsr localmxcsr
    end;
  end;

{$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
{$define FPC_SYSTEM_HAS_MEM_BARRIER}

procedure ReadBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
    lfence
end;

procedure ReadDependencyBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
    { reads imply barrier on earlier reads depended on }
end;

procedure ReadWriteBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
    mfence
end;

procedure WriteBarrier;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
    sfence
end;

{$endif}

{****************************************************************************
                               Math Routines
****************************************************************************}

{$define FPC_SYSTEM_HAS_SWAPENDIAN}

{ SwapEndian(<16 Bit>) being inlined is faster than using assembler }
function SwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    { the extra Word type cast is necessary because the "AValue shr 8" }
    { is turned into "longint(AValue) shr 8", so if AValue < 0 then    }
    { the sign bits from the upper 16 bits are shifted in rather than  }
    { zeroes.                                                          }
    Result := SmallInt(((Word(AValue) shr 8) or (Word(AValue) shl 8)) and $ffff);
  end;

function SwapEndian(const AValue: Word): Word;{$ifdef SYSTEMINLINE}inline;{$endif}
  begin
    Result := ((AValue shr 8) or (AValue shl 8)) and $ffff;
  end;

function SwapEndian(const AValue: LongInt): LongInt; assembler; nostackframe;
asm
{$ifdef win64}
    movl   %ecx, %eax
{$else win64}
    movl   %edi, %eax
{$endif win64}
    bswap  %eax
end;

function SwapEndian(const AValue: DWord): DWord; assembler; nostackframe;
asm
{$ifdef win64}
    movl   %ecx, %eax
{$else win64}
    movl   %edi, %eax
{$endif win64}
    bswap  %eax
end;

function SwapEndian(const AValue: Int64): Int64; assembler; nostackframe;
asm
{$ifdef win64}
    movq   %rcx, %rax
{$else win64}
    movq   %rdi, %rax
{$endif win64}
    bswap  %rax
end;

function SwapEndian(const AValue: QWord): QWord; assembler; nostackframe;
asm
{$ifdef win64}
    movq   %rcx, %rax
{$else win64}
    movq   %rdi, %rax
{$endif win64}
    bswap  %rax
end;

{$ifndef win64}
{$define FPC_SYSTEM_HAS_U128_DIV_U64_TO_U64}
function u128_div_u64_to_u64( const xh, xl: qword; const y: qword; out quotient, remainder: qword ): boolean;nostackframe;assembler;
{ SysV:
  xh: RDI
  xl: RSI
  y: RDX
  quotient: RCX
  remainder: R8 }
label
  dodiv;
asm
    cmpq %rdi,%rdx    { y must be > xh, else the quotient would not fit in 64 bits (this also covers y = 0) }
    ja   dodiv
    xorl %eax,%eax
    ret
dodiv:
    movq %rdx,%r9
    movq %rsi,%rax
    movq %rdi,%rdx
    divq %r9
    movq %rax,(%rcx)
    movq %rdx,(%r8)
    movl $1,%eax
end;
{$endif win64}
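(* Usage sketch for u128_div_u64_to_u64 (illustration only): dividing the
   128-bit value xh:xl = 1:0 (i.e. 2^64) by y = 3 returns true with
   quotient = $5555555555555555 and remainder = 1. The function returns
   false, leaving the out parameters untouched, whenever y <= xh, since the
   quotient would then not fit in 64 bits (and this also rejects y = 0). *)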