fpc/rtl/i386/i386.inc
Rika Ichinose d1db5d2104 Darwin: re-enable new assembler fill*word variants
Work around with an extra jump to an extra function.
2024-11-23 19:06:47 +03:00

2905 lines
84 KiB
PHP
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
This file is part of the Free Pascal run time library.
Copyright (c) 1999-2000 by the Free Pascal development team.
Processor dependent implementation for the system unit for
intel i386+
See the file COPYING.FPC, included in this distribution,
for details about the copyright.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
**********************************************************************}
{$if defined(linux)}
{$define FPC_SYSTEM_STACKALIGNMENT16}
{$endif defined(linux)}
{****************************************************************************
Primitives
****************************************************************************}
var
{ Set by OS-specific startup code when the OS is known to save/restore SSE state. }
os_supports_sse : boolean;
{ this variable is set to true, if currently an sse check is executed and no sig ill should be generated }
sse_check : boolean;
fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. }
has_sse41_support : boolean; { SSE4.1 available; consulted by the IndexQWord dispatcher. }
fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. }
{$asmmode ATT}
function cpuid_support : boolean;assembler;nostackframe;
{
Check if the ID-flag can be changed, if changed then CpuID is supported.
Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)
}
asm
pushfl { save original EFLAGS on the stack }
movl (%esp),%eax
xorl $0x200000,%eax { toggle bit 21, the ID flag }
pushl %eax
popfl { attempt to load the modified EFLAGS }
pushfl
popl %eax { read back what the CPU actually accepted }
xorl (%esp),%eax { diff against the saved original still on the stack }
popfl { restore original EFLAGS }
testl $0x200000,%eax { non-zero iff the ID bit could be toggled }
setnz %al { result := ID bit is writable => CPUID exists }
end;
{$define FPC_SYSTEM_HAS_FPC_CPUINIT}
{ Intentionally empty on i386: feature detection is deferred to the
  OS-dependent fpc_cpucodeinit, as explained by the retained comment below. }
procedure fpc_cpuinit;
begin
{ because of the brain dead sse detection on x86, this test is post poned to fpc_cpucodeinit which
must be implemented OS dependend (FK)
has_sse_support:=sse_support;
has_mmx_support:=mmx_support;
}
end;
{$ifndef darwin}
{ Loads the caller's return address (EIP at the call site) into ebx.
  Used by position-independent code sequences — TODO confirm exact callers. }
procedure fpc_geteipasebx; [public, alias: 'fpc_geteipasebx'];assembler; nostackframe;
asm
movl (%esp),%ebx { (%esp) holds the return address pushed by call }
end;
{ Loads the caller's return address (EIP at the call site) into ecx. }
procedure fpc_geteipasecx; [public, alias: 'fpc_geteipasecx'];assembler; nostackframe;
asm
movl (%esp),%ecx { (%esp) holds the return address pushed by call }
end;
{$endif}
{$if not defined(FPC_SYSTEM_HAS_MOVE)
and not defined(OLD_ASSEMBLER)
and not defined(darwin)}
{$i fastmove.inc}
{$endif}
{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}
{ FPC_MOVE: overlap-safe copy of count bytes from source to dest.
  Register convention: eax = @source, edx = @dest, ecx = count.
  Chooses forward or backward copy direction depending on overlap. }
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
var
saveesi,saveedi : longint; { esi/edi are preserved across the routine }
asm
movl %edi,saveedi
movl %esi,saveesi
movl %eax,%esi { esi = source }
movl %edx,%edi { edi = dest }
movl %ecx,%edx { edx = count }
movl %edi,%eax
{ check for zero or negative count }
cmpl $0,%edx
jle .LMoveEnd
{ Check for back or forward }
sub %esi,%eax { eax = dest - source }
jz .LMoveEnd { Do nothing when source=dest }
jc .LFMove { Do forward, dest<source }
cmp %edx,%eax
jb .LBMove { Dest is in range of move, do backward }
{ Forward Copy }
.LFMove:
{$ifdef FPC_ENABLED_CLD}
cld
{$endif FPC_ENABLED_CLD}
cmpl $15,%edx
jl .LFMove1 { short copies: bytewise only }
movl %edi,%ecx { Align on 32bits }
negl %ecx
andl $3,%ecx { ecx = bytes needed to align dest }
subl %ecx,%edx
rep
movsb
movl %edx,%ecx
andl $3,%edx { edx = trailing bytes after the dword copy }
shrl $2,%ecx { ecx = dword count }
rep
movsl
.LFMove1:
movl %edx,%ecx
rep
movsb
jmp .LMoveEnd
{ Backward Copy }
.LBMove:
std { copy downwards so overlapping dest>source is safe }
addl %edx,%esi
addl %edx,%edi
movl %edi,%ecx { ecx = one-past-end of dest, used below for end alignment }
decl %esi
decl %edi
cmpl $15,%edx
jl .LBMove1
negl %ecx { Align on 32bits }
andl $3,%ecx { ecx = misalignment of the dest end }
subl %ecx,%edx
rep
movsb
movl %edx,%ecx
andl $3,%edx
shrl $2,%ecx
subl $3,%esi { point at the last full dword }
subl $3,%edi
rep
movsl
addl $3,%esi
addl $3,%edi
.LBMove1:
movl %edx,%ecx
rep
movsb
cld { restore direction flag for the rest of the program }
.LMoveEnd:
movl saveedi,%edi
movl saveesi,%esi
end;
{$endif FPC_SYSTEM_HAS_MOVE}
{ Darwin uses Clang to assemble. Recent Clang versions (rightly) give an error when you add global labels in
the middle of .cfi_startproc / .cfi_endproc pairs, since this means you could jump into it from other code
whose CFI state is completely different without the compiler even having the theoretical ability to analyse
all code and generate balanced information.
Since FPC does not attempt it even for local labels, this kind of code is inherently unsafe.
}
{$ifndef darwin}
{$define can_jump_into_the_middle_of_a_procedure}
{$endif darwin}
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
or not defined(FPC_SYSTEM_HAS_FILLWORD)
or not defined(FPC_SYSTEM_HAS_FILLDWORD)
or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
or not defined(FPC_SYSTEM_HAS_FILLWORD)
or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
const
{ Byte-count thresholds above which "rep stos" is preferred over the SSE2
  path; much lower when the CPU advertises ERMSB. }
FillXxxx_RepStosThreshold_ERMS = 1024;
FillXxxx_RepStosThreshold_NoERMS = 512 * 1024;
{ Large-fill helper: writes the dword pattern with rep stosl, covering the
  unaligned head/tail with two unaligned dword stores. }
procedure FillXxxx_U32Pattern_RepStos_8OrMore; assembler; nostackframe;
{ eax x, ecx uint32 pattern, edx byte count >= 8 (preferably >= FillXxxx_RepStosThreshold_(No)ERMS, depending on fast_large_repmovstosb). }
asm
{$ifdef FPC_ENABLED_CLD}
cld
{$endif FPC_ENABLED_CLD}
mov %ecx, (%eax) { Write first 4 bytes unaligned. }
push %ecx { pattern }
push %edi
mov %eax, %edi { Move x to edi, as expected by rep stosl. }
xchg %eax, %ecx { now eax = pattern (as expected by rep stosl) and ecx = x (to rotate the pattern by its misalignment) }
shl $3, %ecx { ecx = misalignment of x in bits. }
rol %cl, %eax { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
add %edi, %edx { edx = x end }
lea -1(%edx), %ecx { ecx = x end - 1. }
add $4, %edi
and $-4, %edi { edi = 4-byte aligned pointer strictly to the right of the start. }
and $-4, %ecx { ecx = 4-byte aligned pointer strictly to the left of the end. }
sub %edi, %ecx { ecx = byte count between them. }
shr $2, %ecx { ecx = uint32 count, as expected by rep stosl. }
rep stosl
pop %edi
pop %ecx
mov %ecx, -4(%edx) { Write last 4 bytes unaligned. }
end;
{$endif FillChar/Word/DWord required.}
{ On targets that allow a global label inside a procedure, the aligned-fill
  tail is entered by jumping to the FillXxxx_MoreThanTwoXMMs label; on Darwin
  (Clang assembler) it is a separate procedure reached via an extra jmp. }
{$ifdef can_jump_into_the_middle_of_a_procedure}
label
FillXxxx_MoreThanTwoXMMs;
{$else can_jump_into_the_middle_of_a_procedure}
procedure FillXxxx_MoreThanTwoXMMs; forward;
{$endif can_jump_into_the_middle_of_a_procedure}
{ SSE2 fill: writes first/last 16 bytes unaligned, then the bulk via the
  shared aligned-store tail (FillXxxx_MoreThanTwoXMMs). }
procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
{ eax x, ecx uint32 pattern, edx byte count >= 16 (preferably > 16). }
asm
movd %ecx, %xmm0
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
movdqu %xmm0, (%eax)
movdqu %xmm0, -16(%eax,%edx)
cmp $32, %edx
ja .LMoreThanTwoVectors
ret { <= 32 bytes: the two unaligned stores above covered everything }
.byte 144 { Turn .balign 16 before .L64x_Body into a no-op. }
{ x can start and end misaligned on the vector boundary:
x = ~~][H1][H2][...][T2][T1]~
[UH] [UT]
UH/UT stands for “unaligned head/tail”, both have 1~16 bytes. }
.LMoreThanTwoVectors:
push %esi
mov %ecx, %esi { esi = pattern }
mov %eax, %ecx
shl $3, %ecx { ecx = misalignment of x in bits }
rol %cl, %esi { misalign the pattern }
movd %esi, %xmm0
pshufd $0, %xmm0, %xmm0
pop %esi
{$ifdef can_jump_into_the_middle_of_a_procedure}
{ FillChar (to skip the misaligning above) and FillQWord jump here.
eax x, edx byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
FillXxxx_MoreThanTwoXMMs:
{$else can_jump_into_the_middle_of_a_procedure}
jmp FillXxxx_MoreThanTwoXMMs
end;
procedure FillXxxx_MoreThanTwoXMMs; assembler; nostackframe;
asm
{$endif can_jump_into_the_middle_of_a_procedure}
lea -65(%eax,%edx), %ecx
and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
mov %ecx, %edx { Remember T4 to edx. }
and $-16, %eax { eax = H1 16. }
sub %eax, %ecx { ecx = aligned byte count 48. }
movdqa %xmm0, 16(%eax) { Write H1. }
cmp $32-48, %ecx
jle .LOneAlignedTailWrite
movdqa %xmm0, 32(%eax) { Write H2. }
cmp $64-48, %ecx
jle .LTwoAlignedTailWrites
sub $48, %ecx { ecx = aligned byte count 96 (32 bytes already written + 64 bytes written after loop). }
jle .LFourAlignedTailWrites { ecx was 9648 }
add $48, %eax { eax = H3. }
cmp $4 * 1024 * 1024, %ecx { Non-temporal fill threshold. }
jae .L64xNT_Body
.balign 16 { no-op }
.L64x_Body: { main loop: 64 aligned bytes per iteration }
movdqa %xmm0, (%eax)
movdqa %xmm0, 16(%eax)
movdqa %xmm0, 32(%eax)
movdqa %xmm0, 48(%eax)
add $64, %eax
sub $64, %ecx
ja .L64x_Body
.LFourAlignedTailWrites:
movdqa %xmm0, (%edx) { T4 }
movdqa %xmm0, 16(%edx) { T3 }
.LTwoAlignedTailWrites:
movdqa %xmm0, 32(%edx) { T2 }
.LOneAlignedTailWrite:
movdqa %xmm0, 48(%edx) { T1 }
ret
.balign 16
.L64xNT_Body: { non-temporal variant for very large fills, bypasses the cache }
movntdq %xmm0, (%eax)
movntdq %xmm0, 16(%eax)
movntdq %xmm0, 32(%eax)
movntdq %xmm0, 48(%eax)
add $64, %eax
sub $64, %ecx
ja .L64xNT_Body
sfence { order the non-temporal stores before returning }
jmp .LFourAlignedTailWrites
end;
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
or not defined(FPC_SYSTEM_HAS_FILLWORD)
or not defined(FPC_SYSTEM_HAS_FILLDWORD)}
{$ifndef CPUX86_HAS_SSE2}
{ Pre-SSE2 fallback fill: unaligned head/tail dword stores plus an aligned
  8-bytes-per-iteration dword loop. }
procedure FillXxxx_U32Pattern_Plain_16OrMore; assembler; nostackframe;
{ eax x, ecx uint32 pattern, edx byte count >= 12 (preferably >= 16). }
asm
mov %ecx, (%eax) { Write first 4 bytes. }
lea -9(%eax,%edx), %edx { edx = x + count - 9 }
mov %ecx, 5(%edx) { Write last 4 bytes. }
and $-4, %edx { edx = loop bound. }
push %esi
mov %ecx, %esi { esi = pattern }
mov %eax, %ecx
shl $3, %ecx { ecx = misalignment of x in bits }
rol %cl, %esi { misalign the pattern }
add $4, %eax
and $-4, %eax { eax = first 4-aligned address after x }
.balign 16
.L8xLoop:
mov %esi, (%eax)
mov %esi, 4(%eax)
add $8, %eax
cmp %edx, %eax
jb .L8xLoop
mov %esi, (%edx) { final 8 bytes, may overlap earlier stores }
mov %esi, 4(%edx)
pop %esi
end;
{$endif ndef CPUX86_HAS_SSE2 (need Fill*_Plain)}
{ Small fills (4..16 bytes): up to four possibly-overlapping dword stores,
  branch-light "ladder" pattern. }
procedure FillXxxx_U32Pattern_Ladder_4to16; assembler; nostackframe;
{ eax x, ecx uint32 pattern, edx byte count, 4 <= edx <= 16. }
asm
mov %ecx, (%eax)
cmp $8, %edx
jle .LLast4
mov %ecx, 4(%eax)
mov %ecx, -8(%eax,%edx)
.LLast4:
mov %ecx, -4(%eax,%edx) { overlaps earlier stores when count is not a multiple of 4 }
end;
{$endif FillChar/Word/DWord required.}
{$endif FillChar/Word/DWord/QWord required.}
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)}
{$define FPC_SYSTEM_HAS_FILLCHAR}
{ Fills 0..3 bytes branchlessly after the initial count check:
  first byte, last byte, middle byte (stores may coincide). }
procedure FillChar_3OrLess; assembler; nostackframe;
{ eax x, cl value, edx byte count, Low(int32) <= edx <= 3. }
asm
test %edx, %edx
jle .LQuit { nothing to do for count <= 0 }
mov %cl, (%eax)
mov %cl, -1(%eax,%edx)
shr $1, %edx
mov %cl, (%eax,%edx) { middle byte; duplicates a prior store for count < 3 }
.LQuit:
end;
{$ifndef CPUX86_HAS_SSE2}
{ Pre-SSE2 FillChar. eax = @x, edx = count, cl = value.
  Dispatches by size via tail jumps to the shared helpers. }
procedure FillChar_Plain(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
cmp $3, %edx
jle FillChar_3OrLess { tiny or non-positive counts }
movzbl %cl, %ecx
imul $0x01010101, %ecx { replicate the byte into all 4 lanes of ecx }
cmp $16, %edx
jbe FillXxxx_U32Pattern_Ladder_4to16
jmp FillXxxx_U32Pattern_Plain_16OrMore { tail call }
end;
{$endif ndef CPUX86_HAS_SSE2}
{ SSE2 FillChar. eax = @x, edx = count, cl = value. }
procedure FillChar_SSE2(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
cmp $3, %edx
jle FillChar_3OrLess
movzbl %cl, %ecx
imul $0x01010101, %ecx { replicate value into a dword pattern }
cmp $16, %edx
jbe FillXxxx_U32Pattern_Ladder_4to16
cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
jae FillXxxx_U32Pattern_RepStos_8OrMore { very large fills: rep stosl wins }
movd %ecx, %xmm0
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
movdqu %xmm0, (%eax)
movdqu %xmm0, -16(%eax,%edx)
cmp $32, %edx
ja FillXxxx_MoreThanTwoXMMs { otherwise the two stores above covered it all }
end;
{ SSE2 FillChar for CPUs with ERMSB: identical to FillChar_SSE2 except the
  much lower rep-stos threshold. eax = @x, edx = count, cl = value. }
procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
asm
cmp $3, %edx
jle FillChar_3OrLess
movzbl %cl, %ecx
imul $0x01010101, %ecx { replicate value into a dword pattern }
cmp $16, %edx
jbe FillXxxx_U32Pattern_Ladder_4to16
cmp $FillXxxx_RepStosThreshold_ERMS, %edx
jae FillXxxx_U32Pattern_RepStos_8OrMore
movd %ecx, %xmm0
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
movdqu %xmm0, (%eax)
movdqu %xmm0, -16(%eax,%edx)
cmp $32, %edx
ja FillXxxx_MoreThanTwoXMMs
end;
{ One-shot dispatcher: once fpc_cpucodeinit has run, picks the best FillChar
  variant for this CPU, caches it in FillChar_Impl and forwards the call. }
procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;
var
FillChar_Impl: procedure(var x;count:SizeInt;value:byte) = @FillChar_Dispatch;
procedure FillChar_Dispatch(var x;count:SizeInt;value:byte);
begin
if fpc_cpucodeinit_performed then
begin
{ Feature flags are valid: rebind the pointer so later calls go direct. }
if fast_large_repmovstosb then
FillChar_Impl := @FillChar_SSE2_ERMS
else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
FillChar_Impl := @FillChar_SSE2
{$ifndef CPUX86_HAS_SSE2}
else
FillChar_Impl := @FillChar_Plain
{$endif ndef CPUX86_HAS_SSE2};
FillChar_Impl(x, count, value);
end
else
{ Too early to trust feature flags: use a safe default, keep dispatching. }
{$ifdef CPUX86_HAS_SSE2} FillChar_SSE2 {$else} FillChar_Plain {$endif} (x, count, value);
end;
{ Public FillChar: forwards through FillChar_Impl, which starts at
  FillChar_Dispatch and is rebound to the best implementation later. }
procedure FillChar(var x;count:SizeInt;value:byte);
begin
FillChar_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLCHAR}
{$if not defined(FPC_SYSTEM_HAS_FILLWORD)}
{$define FPC_SYSTEM_HAS_FILLWORD}
{ Fills 0..3 words: first, last and middle word (stores may coincide).
  eax = @x, cx = value, edx = word count. }
procedure FillWord_3OrLess; assembler; nostackframe;
asm
test %edx, %edx
jle .LQuit { nothing to do for count <= 0 }
mov %cx, (%eax)
mov %cx, -2(%eax,%edx,2)
shr $1, %edx
mov %cx, (%eax,%edx,2) { middle word; duplicates a prior store for count < 3 }
.LQuit:
end;
{$ifndef CPUX86_HAS_SSE2}
{ Pre-SSE2 FillWord. eax = @x, edx = word count, cx = value. }
procedure FillWord_Plain(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
cmp $3, %edx
jle FillWord_3OrLess
shl $1, %edx { edx = byte count }
movzwl %cx, %ecx
imul $0x00010001, %ecx { replicate the word into both halves of ecx }
cmp $16, %edx
jbe FillXxxx_U32Pattern_Ladder_4to16
jmp FillXxxx_U32Pattern_Plain_16OrMore { tail call }
end;
{$endif ndef CPUX86_HAS_SSE2}
{ SSE2 FillWord. eax = @x, edx = word count, cx = value. }
procedure FillWord_SSE2(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
cmp $3, %edx
jle FillWord_3OrLess
shl $1, %edx { edx = byte count }
movzwl %cx, %ecx
imul $0x00010001, %ecx { replicate the word into a dword pattern }
cmp $16, %edx
jbe FillXxxx_U32Pattern_Ladder_4to16
cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
jb FillXxxx_U32Pattern_SSE2_16OrMore
jmp FillXxxx_U32Pattern_RepStos_8OrMore { very large fills }
end;
{ SSE2 FillWord for ERMSB CPUs: same as FillWord_SSE2 with a lower
  rep-stos threshold. eax = @x, edx = word count, cx = value. }
procedure FillWord_SSE2_ERMS(var x;count:SizeInt;value:word);assembler;nostackframe;
asm
cmp $3, %edx
jle FillWord_3OrLess
shl $1, %edx { edx = byte count }
movzwl %cx, %ecx
imul $0x00010001, %ecx
cmp $16, %edx
jbe FillXxxx_U32Pattern_Ladder_4to16
cmp $FillXxxx_RepStosThreshold_ERMS, %edx
jb FillXxxx_U32Pattern_SSE2_16OrMore
jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
{ One-shot dispatcher: once fpc_cpucodeinit has run, picks the best FillWord
  variant for this CPU, caches it in FillWord_Impl and forwards the call. }
procedure FillWord_Dispatch(var x;count:SizeInt;value:word); forward;
var
FillWord_Impl: procedure(var x;count:SizeInt;value:word) = @FillWord_Dispatch;
procedure FillWord_Dispatch(var x;count:SizeInt;value:word);
begin
if fpc_cpucodeinit_performed then
begin
{ Feature flags are valid: rebind the pointer so later calls go direct. }
if fast_large_repmovstosb then
FillWord_Impl := @FillWord_SSE2_ERMS
else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
FillWord_Impl := @FillWord_SSE2
{$ifndef CPUX86_HAS_SSE2}
else
FillWord_Impl := @FillWord_Plain
{$endif ndef CPUX86_HAS_SSE2};
FillWord_Impl(x, count, value);
end
else
{ Too early to trust feature flags: use a safe default, keep dispatching. }
{$ifdef CPUX86_HAS_SSE2} FillWord_SSE2 {$else} FillWord_Plain {$endif} (x, count, value);
end;
{ Public FillWord: forwards through the rebindable FillWord_Impl pointer. }
procedure FillWord(var x;count:SizeInt;value:word);
begin
FillWord_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLWORD}
{$if not defined(FPC_SYSTEM_HAS_FILLDWORD)}
{$define FPC_SYSTEM_HAS_FILLDWORD}
{ Fills 0..4 dwords with possibly-overlapping stores.
  eax = @x, ecx = value, edx = dword count. }
procedure FillDWord_4OrLess; assembler; nostackframe;
asm
cmp $1, %edx
jl .LQuit { count <= 0 }
mov %ecx, (%eax)
je .LQuit { count = 1: done }
mov %ecx, 4(%eax)
mov %ecx, -8(%eax,%edx,4) { overlaps for count 2..3 }
mov %ecx, -4(%eax,%edx,4)
.LQuit:
end;
{$ifndef CPUX86_HAS_SSE2}
{ Pre-SSE2 FillDWord. eax = @x, edx = dword count, ecx = value
  (already a full dword pattern, no replication needed). }
procedure FillDWord_Plain(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
cmp $4, %edx
jle FillDWord_4OrLess
shl $2, %edx { edx = byte count }
jmp FillXxxx_U32Pattern_Plain_16OrMore { tail call }
end;
{$endif ndef CPUX86_HAS_SSE2}
{ SSE2 FillDWord. eax = @x, edx = dword count, ecx = value. }
procedure FillDWord_SSE2(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
cmp $4, %edx
jle FillDWord_4OrLess
shl $2, %edx { edx = byte count }
cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
jb FillXxxx_U32Pattern_SSE2_16OrMore
jmp FillXxxx_U32Pattern_RepStos_8OrMore { very large fills }
end;
{ SSE2 FillDWord for ERMSB CPUs: lower rep-stos threshold.
  eax = @x, edx = dword count, ecx = value. }
procedure FillDWord_SSE2_ERMS(var x;count:SizeInt;value:dword);assembler;nostackframe;
asm
cmp $4, %edx
jle FillDWord_4OrLess
shl $2, %edx { edx = byte count }
cmp $FillXxxx_RepStosThreshold_ERMS, %edx
jb FillXxxx_U32Pattern_SSE2_16OrMore
jmp FillXxxx_U32Pattern_RepStos_8OrMore
end;
{ One-shot dispatcher: once fpc_cpucodeinit has run, picks the best FillDWord
  variant for this CPU, caches it in FillDWord_Impl and forwards the call. }
procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword); forward;
var
FillDWord_Impl: procedure(var x;count:SizeInt;value:dword) = @FillDWord_Dispatch;
procedure FillDWord_Dispatch(var x;count:SizeInt;value:dword);
begin
if fpc_cpucodeinit_performed then
begin
{ Feature flags are valid: rebind the pointer so later calls go direct. }
if fast_large_repmovstosb then
FillDWord_Impl := @FillDWord_SSE2_ERMS
else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
FillDWord_Impl := @FillDWord_SSE2
{$ifndef CPUX86_HAS_SSE2}
else
FillDWord_Impl := @FillDWord_Plain
{$endif ndef CPUX86_HAS_SSE2};
FillDWord_Impl(x, count, value);
end
else
{ Too early to trust feature flags: use a safe default, keep dispatching. }
{$ifdef CPUX86_HAS_SSE2} FillDWord_SSE2 {$else} FillDWord_Plain {$endif}(x, count, value);
end;
{ Public FillDWord: forwards through the rebindable FillDWord_Impl pointer. }
procedure FillDWord(var x;count:SizeInt;value:dword);
begin
FillDWord_Impl(x, count, value);
end;
{$endif FPC_SYSTEM_HAS_FILLDWORD}
{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
{$define FPC_SYSTEM_HAS_FILLQWORD}
{$ifndef CPUX86_HAS_SSE2}
{ Pre-SSE2 FillQWord: simple two-dword-stores-per-qword loop. }
procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value }
asm
test %edx, %edx { Don't care about clever things like separate small branches or aligning writes by misaligning the pattern, the function is dead. }
jle .LQuit
push %esi
mov 4+4(%esp), %esi { esi = value[0:31] }
mov 4+8(%esp), %ecx { ecx = value[32:63] }
.balign 16
.LLoop:
mov %esi, (%eax)
mov %ecx, 4(%eax)
add $8, %eax
sub $1, %edx
jnz .LLoop
pop %esi
.LQuit:
end;
{$endif ndef CPUX86_HAS_SSE2}
{ SSE2 FillQWord: broadcasts the qword into xmm0, writes unaligned head/tail,
  then reuses the shared aligned tail FillXxxx_MoreThanTwoXMMs.
  Note: the qword value is passed on the stack and popped via ret $8. }
procedure {$ifdef CPUX86_HAS_SSE2} FillQWord {$else} FillQWord_SSE2 {$endif}(var x;count:SizeInt;value:QWord);assembler;nostackframe;
{ eax = x, edx = count, [esp + 4] = value }
asm
cmp $4, %edx
jle .L4OrLess
movq 4(%esp), %xmm0
punpcklqdq %xmm0, %xmm0 { xmm0 = value:value }
{ Stack is 12 bytes:
[esp] = return address, [esp + 4] = value (not required anymore).
Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
[esp] = return address. }
mov (%esp), %ecx
add $8, %esp
mov %ecx, (%esp)
shl $3, %edx { edx = byte count }
movdqu %xmm0, (%eax)
movdqu %xmm0, -16(%eax,%edx)
test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
jz FillXxxx_MoreThanTwoXMMs
mov %eax, %ecx { Misalign uint64s in xmm0 by the misalignment of x. }
shl $3, %ecx
and $63, %ecx { ecx = misalignment in bits, mod 64 }
movd %ecx, %xmm2
movdqa %xmm0, %xmm1
psllq %xmm2, %xmm1
neg %ecx { Can also do not ecx; and $63, %ecx; inc ecx to support ecx = 0, as SSE shifts correctly handle count >= bitsizeof. }
and $63, %ecx { But in the definitely unaligned case, ecx = 0 is impossible. }
movd %ecx, %xmm2
psrlq %xmm2, %xmm0
por %xmm1, %xmm0 { xmm0 = rotated pattern for aligned writes }
jmp FillXxxx_MoreThanTwoXMMs
.L4OrLess: { Doing this with 64-bit half-XMM MOVQs is a lot simpler but 2x slower (Coffee Lake). :\ }
cmp $1, %edx
jl .LQuit
mov 4(%esp), %ecx { low dword of value }
mov %ecx, (%eax)
je .LSecondHalfOf1
mov %ecx, 8(%eax)
mov %ecx, -16(%eax,%edx,8)
mov %ecx, -8(%eax,%edx,8)
mov 8(%esp), %ecx { high dword of value }
mov %ecx, 4(%eax)
mov %ecx, 12(%eax)
mov %ecx, -12(%eax,%edx,8)
mov %ecx, -4(%eax,%edx,8)
.LQuit:
ret $8 { pop the 8-byte value parameter }
.LSecondHalfOf1:
mov 8(%esp), %ecx
mov %ecx, 4(%eax)
end;
{$ifndef CPUX86_HAS_SSE2}
{ One-shot dispatcher: once fpc_cpucodeinit has run, picks the best FillQWord
  variant for this CPU, caches it in FillQWord_Impl and forwards the call. }
procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword); forward;
var
FillQWord_Impl: procedure(var x;count:SizeInt;value:qword) = @FillQWord_Dispatch;
procedure FillQWord_Dispatch(var x;count:SizeInt;value:qword);
begin
if fpc_cpucodeinit_performed then
begin
if has_sse2_support then
FillQWord_Impl := @FillQWord_SSE2
else
FillQWord_Impl := @FillQWord_Plain;
FillQWord_Impl(x, count, value);
end
else
{ CPU detection pending: fall back without caching a choice. }
FillQWord_Plain(x, count, value);
end;
{ Public FillQWord: forwards through the rebindable FillQWord_Impl pointer. }
procedure FillQWord(var x;count:SizeInt;value:qword);
begin
FillQWord_Impl(x, count, value);
end;
{$endif ndef CPUX86_HAS_SSE2 (need FillQWord dispatcher)}
{$endif FPC_SYSTEM_HAS_FILLQWORD}
{$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
{$define FPC_SYSTEM_HAS_INDEXBYTE}
{$ifndef CPUX86_HAS_SSE2}
{ Pre-SSE2 IndexByte: returns the index of the first byte equal to b within
  buf[0..len-1], or -1 if absent. Aligns to 8 bytes, then scans two dwords
  per iteration using the classic zero-byte test
  (x - 0x01010101) and (not x) and 0x80808080. }
function IndexByte_Plain(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, cl = b }
asm
test %edx,%edx
jz .Lnothing0
push %eax { save initial value of 'buf' }
test $3,%al
jz .Laligned4
.Lalignloop: { align to 4 bytes }
cmp %cl,(%eax)
je .Lfoundateax
inc %eax
dec %edx
jz .Lnothing1
test $3,%al
jnz .Lalignloop
.Laligned4: { align to 8 bytes }
push %esi
push %edi
mov %cl,%ch { prepare pattern }
movzwl %cx,%esi
shl $16,%ecx
or %esi,%ecx { ecx = b replicated into all 4 bytes }
test $7,%al
jz .Lloop
test %edx,%edx { Adjust buf+len and reuse the second unroll from the loop body. Careful with len < 0 (esp. len = 1). }
jl .Ldontfixuplen
add $4,%edx
.Ldontfixuplen:
sub $4,%eax
jmp .Lalignfrom4to8
.balign 16
.Lloop: { Requires 8-byte alignment of eax, to safely over-read up to 7 bytes on last iteration. }
mov (%eax),%esi { load dword }
xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
lea -0x01010101(%esi),%edi
not %esi
and $0x80808080,%esi
and %edi,%esi { (x-0x01010101) and (not x) and 0x80808080 }
jnz .Lfound0 { one of the bytes matches }
.Lalignfrom4to8:
mov 4(%eax),%esi { second unroll: next dword }
xor %ecx,%esi
lea -0x01010101(%esi),%edi
not %esi
and $0x80808080,%esi
and %edi,%esi
jnz .Lfound1
add $8,%eax
sub $8,%edx
ja .Lloop
.Lnothing3:
pop %edi
pop %esi
.Lnothing1:
pop %edx
.Lnothing0:
or $-1,%eax
ret
.Lfound1:
sub $4,%edx
jbe .Lnothing3
add $4,%eax
.Lfound0:
bsf %esi,%esi { bit index of the first matching byte's marker bit }
shr $3,%esi { convert to byte index within the dword }
cmp %edx,%esi { Garbage after remaining length? }
jae .Lnothing3
add %esi,%eax
pop %edi
pop %esi
.Lfoundateax:
pop %ecx { ecx = original buf }
sub %ecx,%eax { result = match position - buf }
end;
{$endif ndef CPUX86_HAS_SSE2}
{ SSE2 IndexByte: eax = @buf, edx = len, cl = b; result = index or -1.
  Over-reads up to 15 bytes but takes the .LCrossPage path when the first
  unaligned 16-byte load could cross a 4K page boundary. }
function {$ifdef CPUX86_HAS_SSE2} IndexByte {$else} IndexByte_SSE2 {$endif} (const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
asm
test %edx, %edx
jz .Lnotfound { exit if len=0 }
movd %ecx, %xmm1
mov %eax, %ecx
punpcklbw %xmm1, %xmm1
punpcklbw %xmm1, %xmm1
and $4095, %ecx { ecx = offset of buf within its 4K page }
pshufd $0, %xmm1, %xmm1 { xmm1 = b broadcast to 16 bytes }
cmp $4080, %ecx
ja .LCrossPage
movdqu (%eax), %xmm0 { Analyze first 16 bytes, unaligned. }
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ecx
test %ecx, %ecx
jz .LContinueAligned
bsf %ecx, %eax { match inside the first 16 bytes }
cmp %edx, %eax
jae .Lnotfound { ...unless it lies beyond len }
ret
.byte 144 { Make .balign 16 before .Lloop a no-op. }
.LContinueAligned:
cmp $16, %edx { Length might be explicitly set to 16 or less; if so, skip a bit of work. }
jbe .Lnotfound { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) }
push %ebx
lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
and $-0x10, %ecx { first aligned address after buf }
sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
.balign 16
.Lloop:
movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, }
add $16, %ecx { but their sum is evenly divisible by 16. }
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
test %ebx, %ebx
jnz .Lmatch
.Lcontinue:
cmp %ecx, %edx
ja .Lloop
pop %ebx
.Lnotfound:
or $-1, %eax
ret
.LCrossPage:
push %ebx
lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. }
and $-0x10, %ecx { first aligned address after buf }
movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) }
sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr }
pcmpeqb %xmm1, %xmm0 { compare with pattern and get bitmask }
pmovmskb %xmm0, %ebx
shl %cl, %ebx { shift valid bits into high word }
and $0xffff0000, %ebx { clear low word containing invalid bits }
shr %cl, %ebx { shift back }
jz .Lcontinue
.Lmatch:
bsf %ebx, %ebx
lea -16(%ecx,%ebx), %eax { index = bytes consumed - 16 + bit position }
pop %ebx
cmp %eax, %edx { check against the buffer length }
jbe .Lnotfound
end;
{$ifndef CPUX86_HAS_SSE2}
{ One-shot dispatcher: once fpc_cpucodeinit has run, binds IndexByte_Impl to
  the best implementation for this CPU and forwards the call. }
function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt; forward;
var
IndexByte_Impl: function(const buf;len:SizeInt;b:byte):SizeInt = @IndexByte_Dispatch;
function IndexByte_Dispatch(const buf;len:SizeInt;b:byte):SizeInt;
begin
if fpc_cpucodeinit_performed then
begin
if has_sse2_support then
IndexByte_Impl:=@IndexByte_SSE2
else
IndexByte_Impl:=@IndexByte_Plain;
result:=IndexByte_Impl(buf,len,b);
end
else
{ CPU detection pending: fall back without caching a choice. }
result:=IndexByte_Plain(buf,len,b);
end;
{ Public IndexByte: forwards through the rebindable IndexByte_Impl pointer. }
function IndexByte(const buf;len:SizeInt;b:byte):SizeInt;
begin
result:=IndexByte_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE2}
{$endif FPC_SYSTEM_HAS_INDEXBYTE}
{$ifndef FPC_SYSTEM_HAS_INDEXWORD}
{$define FPC_SYSTEM_HAS_INDEXWORD}
{$ifndef CPUX86_HAS_SSE2}
{ Pre-SSE2 IndexWord: straightforward word-by-word scan.
  eax = @buf, edx = len (in words), cx = b; result = index or -1. }
function IndexWord_Plain(Const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
test %edx, %edx
jz .LNotFound
push %eax { remember original buf }
.LWordwise_Body: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
cmp %cx, (%eax)
je .LFound
add $2, %eax
dec %edx
jnz .LWordwise_Body
pop %edx
.LNotFound:
or $-1, %eax
ret
.LFound:
pop %edx { edx = original buf }
sub %edx, %eax
shr $1, %eax { byte offset -> word index }
end;
{$endif ndef CPUX86_HAS_SSE2}
{ SSE2 IndexWord: eax = @buf, edx = len (in words), cx = b; result = index
  or -1. Word-aligned buffers use pcmpeqw directly; odd addresses take the
  .Lunaligned path, which byte-compares against a byte-swapped pattern and
  ANDs adjacent mask bits (carrying one bit between iterations) to detect a
  full word match. }
function {$ifdef CPUX86_HAS_SSE2} IndexWord {$else} IndexWord_SSE2 {$endif} (const buf;len:SizeInt;b:word):SizeInt; assembler; nostackframe;
asm
test %edx, %edx { exit if len=0 }
je .Lnotfound
push %ebx
movd %ecx, %xmm1
punpcklwd %xmm1, %xmm1
pshufd $0, %xmm1, %xmm1 { xmm1 = b broadcast to 8 words }
lea 16(%eax), %ecx
and $-16, %ecx
movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 14 bytes before target) }
sub %eax, %ecx
test $1, %eax { if buffer isn't aligned to word boundary, }
jnz .Lunaligned { use a different algorithm }
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
shl %cl, %ebx { discard bits belonging to bytes before buf }
and $0xffff0000, %ebx
shr %cl, %ebx
shr $1, %ecx { ecx=number of valid bytes }
test %ebx, %ebx
jz .Lcontinue
.Lmatch:
bsf %ebx, %ebx
shr $1, %ebx { in words }
lea -8(%ecx,%ebx), %eax
pop %ebx
cmp %eax, %edx
jbe .Lnotfound { if match is after the specified length, ignore it }
ret
.balign 16
.Lloop:
movdqa (%eax,%ecx,2), %xmm0
add $8, %ecx
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
test %ebx, %ebx
jnz .Lmatch
.Lcontinue:
cmp %ecx, %edx
ja .Lloop
pop %ebx
.Lnotfound:
or $-1, %eax
ret
.Lunaligned:
push %esi
movdqa %xmm1, %xmm2 { (mis)align the pattern (in this particular case: }
psllw $8, %xmm1 { swap bytes of each word of pattern) }
psrlw $8, %xmm2
por %xmm2, %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
shl %cl, %ebx
and $0xffff0000, %ebx
shr %cl, %ebx
xor %esi, %esi { nothing to merge yet }
add %edx, %edx { length words -> bytes }
jmp .Lcontinue_u
.balign 16
.Lloop_u:
movdqa (%eax,%ecx), %xmm0
add $16, %ecx
pcmpeqb %xmm1, %xmm0 { compare by bytes }
shr $16, %esi { bit 16 shifts into 0 }
pmovmskb %xmm0, %ebx
.Lcontinue_u:
shl $1, %ebx { 15:0 -> 16:1 }
or %esi, %ebx { merge bit 0 from previous round }
mov %ebx, %esi
shr $1, %ebx { now AND together adjacent pairs of bits }
and %esi, %ebx
and $0x5555, %ebx { also reset odd bits }
jnz .Lmatch_u
cmp %ecx, %edx
ja .Lloop_u
.Lnotfound_u:
pop %esi
pop %ebx
or $-1, %eax
ret
.Lmatch_u:
bsf %ebx, %ebx
lea -16(%ecx,%ebx), %eax { byte index of the match }
cmp %eax, %edx
jbe .Lnotfound_u { if match is after the specified length, ignore it }
sar $1, %eax { in words }
pop %esi
pop %ebx
end;
{$ifndef CPUX86_HAS_SSE2}
{ One-shot dispatcher: once fpc_cpucodeinit has run, binds IndexWord_Impl to
  the best implementation for this CPU and forwards the call. }
function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt; forward;
var
IndexWord_Impl: function(const buf;len:SizeInt;b:word):SizeInt = @IndexWord_Dispatch;
function IndexWord_Dispatch(const buf;len:SizeInt;b:word):SizeInt;
begin
if fpc_cpucodeinit_performed then
begin
if has_sse2_support then
IndexWord_Impl:=@IndexWord_SSE2
else
IndexWord_Impl:=@IndexWord_Plain;
result:=IndexWord_Impl(buf,len,b);
end
else
{ CPU detection pending: fall back without caching a choice. }
result:=IndexWord_Plain(buf,len,b);
end;
{ Public IndexWord: forwards through the rebindable IndexWord_Impl pointer. }
function IndexWord(const buf;len:SizeInt;b:word):SizeInt; inline;
begin
result:=IndexWord_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE2}
{$endif FPC_SYSTEM_HAS_INDEXWORD}
{$ifndef FPC_SYSTEM_HAS_INDEXDWORD}
{$define FPC_SYSTEM_HAS_INDEXDWORD}
{$ifndef CPUX86_HAS_SSE2}
{ Pre-SSE2 IndexDWord: straightforward dword-by-dword scan.
  eax = @buf, edx = len (in dwords), ecx = b; result = index or -1. }
function IndexDWord_Plain(Const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
push %eax { remember original buf }
sub $4, %eax { pre-decrement; loop adds 4 before each compare }
.LDWordwise_Next: { Loop does not cross cache line if the function entry is aligned on 16 bytes. }
add $4, %eax
sub $1, %edx
jb .LNotFound { also covers len <= 0 }
cmp %ecx, (%eax)
jne .LDWordwise_Next
pop %edx { edx = original buf }
sub %edx, %eax
shr $2, %eax { byte offset -> dword index }
ret
.LNotFound:
pop %edx
mov $-1, %eax
end;
{$endif ndef CPUX86_HAS_SSE2}
{ SSE2 IndexDWord: eax = @buf, edx = len (in dwords), ecx = b; result = index
  or -1. Scans 4 dwords per iteration with unaligned loads; the final
  (possibly overlapping) vector is re-read at buf + 4*(len-4). Lengths of 4
  dwords or fewer use the scalar .LDwordwise path. }
function {$ifdef CPUX86_HAS_SSE2} IndexDWord {$else} IndexDWord_SSE2 {$endif} (const buf;len:SizeInt;b:DWord):SizeInt; assembler; nostackframe;
asm
push %eax { remember original buf }
sub $4, %edx
jle .LDwordwise_Prepare { len <= 4: scalar scan }
movd %ecx, %xmm1
pshufd $0, %xmm1, %xmm1 { xmm1 = b broadcast to 4 dwords }
.balign 16 { 1-byte NOP. }
.L4x_Body:
movdqu (%eax), %xmm0
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm0, %ecx
test %ecx, %ecx
jnz .LFoundAtMask
add $16, %eax
sub $4, %edx
jg .L4x_Body
lea (%eax,%edx,4), %eax { rewind to the last 16 bytes of the buffer }
movdqu (%eax), %xmm0
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm0, %ecx
test %ecx, %ecx
jz .LNothing
.LFoundAtMask:
bsf %ecx, %ecx { byte offset of the match within the vector }
add %ecx, %eax
.LFoundAtEax:
pop %edx { edx = original buf }
sub %edx, %eax
shr $2, %eax { byte offset -> dword index }
ret
nop { Turns .balign 16 before .LDwordwise_Body into a no-op. }
.LDwordwise_Prepare:
add $3, %edx { edx = len - 1 }
cmp $-1, %edx
je .LNothing { len = 0 }
.balign 16 { no-op }
.LDwordwise_Body:
cmp (%eax), %ecx
je .LFoundAtEax
add $4, %eax
sub $1, %edx
jae .LDwordwise_Body
.LNothing:
pop %edx
or $-1, %eax
end;
{$ifndef CPUX86_HAS_SSE2}
{ One-shot dispatcher: once fpc_cpucodeinit has run, binds IndexDWord_Impl to
  the best implementation for this CPU and forwards the call. }
function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt; forward;
var
IndexDWord_Impl: function(const buf;len:SizeInt;b:DWord):SizeInt = @IndexDWord_Dispatch;
function IndexDWord_Dispatch(const buf;len:SizeInt;b:DWord):SizeInt;
begin
if fpc_cpucodeinit_performed then
begin
if has_sse2_support then
IndexDWord_Impl:=@IndexDWord_SSE2
else
IndexDWord_Impl:=@IndexDWord_Plain;
result:=IndexDWord_Impl(buf,len,b);
end
else
{ CPU detection pending: fall back without caching a choice. }
result:=IndexDWord_Plain(buf,len,b);
end;
{ Public IndexDWord: forwards through the rebindable IndexDWord_Impl pointer. }
function IndexDWord(const buf;len:SizeInt;b:DWord):SizeInt;
begin
result:=IndexDWord_Impl(buf,len,b);
end;
{$endif CPUX86_HAS_SSE2}
{$endif FPC_SYSTEM_HAS_INDEXDWORD}
{$ifndef FPC_SYSTEM_HAS_INDEXQWORD}
{$define FPC_SYSTEM_HAS_INDEXQWORD}
{ Scalar IndexQWord: compares low and high dwords of each element.
  Returns the index of the first qword equal to b, or -1.
  Note ret $8: the qword value parameter is popped by the callee. }
function IndexQWord_Plain(const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
asm
push %ebx
mov 8(%esp), %ecx { ecx = b[0:31] }
mov 12(%esp), %ebx { ebx = b[32:63] }
mov %eax, 8(%esp) { remember original buf }
sub $8, %eax { pre-decrement; loop adds 8 before each compare }
.balign 16 { no-op }
.LQWordwise_Next:
add $8, %eax
sub $1, %edx
jb .LNotFound { also covers len <= 0 }
cmp %ecx, (%eax)
jne .LQWordwise_Next
cmp %ebx, 4(%eax)
jne .LQWordwise_Next
sub 8(%esp), %eax { byte offset from buf }
pop %ebx
shr $3, %eax { -> qword index }
ret $8 { pop the value parameter }
.LNotFound:
pop %ebx
mov $-1, %eax
end;
{ SSE4.1 IndexQWord: scans 6 qwords (3 XMM vectors) per iteration with
  pcmpeqq/ptest; short buffers (len <= 6) tail-jump to IndexQWord_Plain.
  eax = @buf, edx = len, [esp+4] = b; result = index or -1; ret $8 pops b. }
function {$ifdef CPUX86_HAS_SSE4_1} IndexQWord {$else} IndexQWord_SSE41 {$endif}(Const buf;len:SizeInt;b:QWord):SizeInt; assembler; nostackframe;
{ eax = buf, edx = len, [esp+4] = b }
asm
cmp $6, len
jle IndexQWord_Plain { tail call; stack layout is identical }
movddup 4(%esp), %xmm0 { xmm0 = pattern of 'b's. }
mov %eax, %ecx { ecx = original buf }
sub $6, len
.balign 16
.L6x_Loop:
movdqu (%eax), %xmm1
pcmpeqq %xmm0, %xmm1 { xmm1 = cmpeq(vec 0, pattern) }
movdqu 16(%eax), %xmm2
pcmpeqq %xmm0, %xmm2
por %xmm1, %xmm2 { xmm2 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) }
movdqu 32(%eax), %xmm3
pcmpeqq %xmm0, %xmm3
por %xmm2, %xmm3 { xmm3 = cmpeq(vec 0, pattern) or cmpeq(vec 1, pattern) or cmpeq(vec 2, pattern) }
ptest %xmm3, %xmm3
jnz .LFound
add $48, %eax
sub $6, len
jge .L6x_Loop
lea (%eax,%edx,8), %eax { Point to last 3 vectors. }
cmp $-5, len
jge .L6x_Loop { Reuse .L6x_Loop to compare last 3 vectors, if not compared already. }
mov $-1, %eax
ret $8 { pop the value parameter }
.LFound:
sub %ecx, %eax { eax = byte offset of the matching 48-byte group }
ptest %xmm1, %xmm1
jnz .LFoundAtXmm1
ptest %xmm2, %xmm2
jnz .LFoundAtXmm2
add $16, %eax { match was in vec 2 }
movdqa %xmm3, %xmm2
.LFoundAtXmm2:
add $16, %eax
movdqa %xmm2, %xmm1
.LFoundAtXmm1:
pmovmskb %xmm1, %ecx
bsf %ecx, %ecx { byte position of the match within the vector }
add %ecx, %eax
shr $3, %eax { byte offset -> qword index }
end;
{$ifndef CPUX86_HAS_SSE4_1}
{ One-shot dispatcher: once fpc_cpucodeinit has run, binds IndexQWord_Impl to
  the best implementation (SSE4.1 when available) and forwards the call. }
function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt; forward;
var
IndexQWord_Impl: function(const buf;len:SizeInt;b:QWord):SizeInt = @IndexQWord_Dispatch;
function IndexQWord_Dispatch(const buf;len:SizeInt;b:QWord):SizeInt;
begin
if fpc_cpucodeinit_performed then
begin
if has_sse41_support then
IndexQWord_Impl:=@IndexQWord_SSE41
else
IndexQWord_Impl:=@IndexQWord_Plain;
result:=IndexQWord_Impl(buf,len,b);
end
else
{ CPU detection pending: fall back without caching a choice. }
result:=IndexQWord_Plain(buf,len,b);
end;
{ Public IndexQWord: forwards through the rebindable IndexQWord_Impl pointer. }
function IndexQWord(const buf;len:SizeInt;b:QWord):SizeInt;
begin
result:=IndexQWord_Impl(buf,len,b);
end;
{$endif ndef CPUX86_HAS_SSE4_1}
{$endif FPC_SYSTEM_HAS_INDEXQWORD}
{$ifndef FPC_SYSTEM_HAS_COMPAREBYTE}
{$define FPC_SYSTEM_HAS_COMPAREBYTE}
{$ifndef CPUX86_HAS_SSE2}
{ Pre-SSE2 CompareByte: lexicographic byte comparison of buf1/buf2 over len
  bytes; returns <0, 0 or >0. Compares a dword at a time; on a differing
  dword, both values are byte-reversed (bswap or the rol sequence) so an
  unsigned compare yields the order of the first differing byte. }
function CompareByte_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
{ eax = buf1, edx = buf2, ecx = len }
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
cmp $3, %ecx
jle .LBytewise_Prepare
{ Align buf1 on 4 bytes. }
mov (%edx,%eax), %ebx { ebx = dword of buf2 at the current position }
cmp (%eax), %ebx
jne .L4xDiffer
lea -4(%eax,%ecx), %ecx { ecx = buf1 end - (4 + buf1 and -4) = count remaining }
and $-4, %eax
sub %eax, %ecx
.balign 16
.L4x_Next:
add $4, %eax
sub $4, %ecx { at .LLast4, ecx is 4 less than remaining bytes }
jle .LLast4
mov (%edx,%eax), %ebx
cmp (%eax), %ebx
je .L4x_Next
.L4xDiffer:
mov (%eax), %edx { edx = differing dword of buf1 }
{$ifdef CPUX86_HAS_BSWAP}
bswap %ebx
bswap %edx
{$else}
rol $8, %bx { byte-reverse ebx without bswap }
rol $16, %ebx
rol $8, %bx
rol $8, %dx
rol $16, %edx
rol $8, %dx
{$endif}
cmp %ebx, %edx
.LDoSbb:
sbb %eax, %eax { eax = 0 if buf1 >= buf2, -1 otherwise }
or $1, %eax { -> +1 or -1 }
pop %ebx
ret
.LLast4:
add %ecx, %eax { re-check the final, possibly overlapping dword }
mov (%edx,%eax), %ebx
cmp (%eax), %ebx
jne .L4xDiffer
xor %eax, %eax { equal }
pop %ebx
ret
.LBytewise_Prepare:
sub $1, %ecx
jb .LNothing { len <= 0: equal }
.balign 16 { no-op }
.LBytewise_Body:
movzbl (%edx,%eax), %ebx
cmp %bl, (%eax)
jne .LDoSbb
add $1, %eax
sub $1, %ecx
jae .LBytewise_Body
.LNothing:
xor %eax, %eax
pop %ebx
end;
{$endif ndef CPUX86_HAS_SSE2}
{ Pascal-level (global) labels: CompareByte_AVX2 jumps into the middle of
  CompareByte_SSE2 to share the tiny-length and page-boundary fallbacks. }
label
CompareByte_1OrLess, CompareByte_CantOverReadBoth_AVX2;
{ CompareByte, SSE2 version.
  In:  eax = buf1, edx = buf2, ecx = len.
  Out: eax < 0 / = 0 / > 0 (bytewise unsigned memcmp ordering).
  len < 0 scans without bound until the first difference (.LUnbounded paths).
  May over-read up to a 16-byte vector, guarded by the page-cross check. }
function CompareByte_SSE2(const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
asm
{ eax = buf1, edx = buf2, ecx = len }
cmp $1, %ecx
jle CompareByte_1OrLess
push %ebx
cmp $16, %ecx
jae .LVecOrMore
{ 2 to 15 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
mov %eax, %ebx
or %edx, %ebx
and $4095, %ebx
cmp $4080, %ebx
ja .LCantOverReadBoth
{ Over-read both as XMMs. }
movdqu (%eax), %xmm0
movdqu (%edx), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx { Here and below, 2-byte 'inc r16' after 'pmovmskb' can be replaced with 5-byte 'add $1, r16' or 6-byte 'xor $65535, r32'. }
jz .LNothing
bsf %ebx, %ebx
cmp %ecx, %ebx { Ignore garbage beyond 'len'. }
jae .LNothing
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret
.LNothing:
pop %ebx
xor %eax, %eax
ret
.LAligned32xLoop_TwoVectorsDiffer:
add %eax, %edx { restore edx = buf2 }
pmovmskb %xmm0, %ecx { Is there a difference in the first vector? }
inc %cx
jz .LVec1Differs { No difference in the first vector, xmm0 is all ones, ebx = pmovmskb(pcmpeqb(buf1 + 16, buf2 + 16)) from the loop body. }
mov %ecx, %ebx
.LVec0Differs:
bsf %ebx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret
.byte 144 { Turn .balign 16 before .LAligned32xLoop_Body into a no-op. }
CompareByte_CantOverReadBoth_AVX2:
{ Entered from CompareByte_AVX2 with 2..31 bytes near a page end. }
cmp $16, %ecx
jb .LCantOverReadBoth
.LVecOrMore:
{ Compare first vectors. }
movdqu (%eax), %xmm0
movdqu (%edx), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
sub $32, %ecx { now ecx is len - 32. }
jbe .LLastVec
{ Compare second vectors. }
movdqu 16(%eax), %xmm0
movdqu 16(%edx), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec1Differs
cmp $32, %ecx
jbe .LLastTwoVectors
{ More than four vectors: aligned loop. }
lea -32(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 32) }
sub %eax, %edx { edx = buf2 - buf1 }
and $-16, %eax { Align buf1. First two vectors already analyzed are skipped by +32 on the first loop iteration. }
sub %eax, %ecx { ecx = count to be handled with loop }
.balign 16 { No-op. }
.LAligned32xLoop_Body:
add $32, %eax
{ Compare two XMMs, reduce the result with 'and'. }
movdqu (%edx,%eax), %xmm0
pcmpeqb (%eax), %xmm0 { xmm0 = pcmpeqb(buf1, buf2) }
movdqu 16(%edx,%eax), %xmm1
pcmpeqb 16(%eax), %xmm1
pand %xmm0, %xmm1 { xmm1 = xmm0 and pcmpeqb(buf1 + 16, buf2 + 16) }
pmovmskb %xmm1, %ebx
inc %bx
jnz .LAligned32xLoop_TwoVectorsDiffer
sub $32, %ecx
ja .LAligned32xLoop_Body
add %eax, %edx { restore edx = buf2 }
add $32, %ecx
.LLastTwoVectors:
{ ecx = remaining count - 32; the two tail vectors may overlap data
  already compared by the loop. }
movdqu (%eax,%ecx), %xmm0
movdqu (%edx,%ecx), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVecEm2Differs
.LLastVec:
movdqu 16(%eax,%ecx), %xmm0
movdqu 16(%edx,%ecx), %xmm1
pcmpeqb %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVecEm1Differs
pop %ebx
xor %eax, %eax
ret
.LVec1Differs:
xor %ecx, %ecx
.LVecEm1Differs:
add $16, %ecx
.LVecEm2Differs:
bsf %ebx, %ebx
add %ecx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret
.LCantOverReadBoth:
{ 2..15 bytes too close to a page end to over-read: compare with scalar
  dwords (possibly overlapping), big-endianized for memcmp ordering. }
cmp $3, %ecx
jle .L2to3
push %esi
mov (%eax), %ebx
mov (%edx), %esi
cmp %esi, %ebx
jne .L4xDiffer
cmp $8, %ecx
jbe .LLast4x
mov 4(%eax), %ebx
mov 4(%edx), %esi
cmp %esi, %ebx
jne .L4xDiffer
mov -8(%eax,%ecx), %ebx
mov -8(%edx,%ecx), %esi
cmp %esi, %ebx
jne .L4xDiffer
.LLast4x:
mov -4(%eax,%ecx), %ebx
mov -4(%edx,%ecx), %esi
cmp %esi, %ebx
jne .L4xDiffer
pop %esi
pop %ebx
xor %eax, %eax
ret
.L4xDiffer:
bswap %ebx
bswap %esi
cmp %esi, %ebx
pop %esi
sbb %eax, %eax
or $1, %eax
pop %ebx
ret
.L2to3:
{ 2 or 3 bytes: build big-endian 24-bit values (first two bytes shifted,
  last byte merged into the low 8 bits) and subtract. }
movzwl (%edx), %ebx
bswap %ebx
shr $1, %ebx
mov -1(%edx,%ecx), %bl
movzwl (%eax), %edx
bswap %edx
shr $1, %edx
mov -1(%eax,%ecx), %dl
mov %edx, %eax
sub %ebx, %eax
pop %ebx
ret
CompareByte_1OrLess:
jl .LUnbounded_Prepare
movzbl (%eax), %eax
movzbl (%edx), %edx
sub %edx, %eax
ret
.LUnbounded_Prepare:
sub %eax, %edx { edx = buf2 - buf1 }
test %ecx, %ecx
jnz .LUnbounded_Body
xor %eax, %eax
ret
.balign 16
.LUnbounded_Next:
add $1, %eax
.LUnbounded_Body:
movzbl (%edx,%eax), %ecx
cmp %cl, (%eax)
je .LUnbounded_Next
sbb %eax, %eax
or $1, %eax
end;
{ CompareByte, AVX2 version (compiled as plain CompareByte when the whole
  RTL targets BMI1-capable CPUs). Same contract as CompareByte_SSE2.
  Uses tzcnt (BMI1) and a hand-encoded bzhi (BMI2); assumes these ship
  together with AVX2, which holds for all known CPUs -- dispatch is gated
  on has_avx2_support only. }
function {$ifdef CPUX86_HAS_BMI1} CompareByte {$else} CompareByte_AVX2 {$endif} (const buf1, buf2; len: SizeInt): SizeInt; assembler; nostackframe;
asm
{ eax = buf1, edx = buf2, ecx = len }
cmp $1, %ecx
jle CompareByte_1OrLess
push %ebx
cmp $32, %ecx
jae .LVecOrMore
{ 2 to 31 bytes: check for page cross. Pessimistic variant that has false positives, but uses 1 less register and 2 less instructions. }
mov %eax, %ebx
or %edx, %ebx
and $4095, %ebx
cmp $4064, %ebx
ja CompareByte_CantOverReadBoth_AVX2
{ Over-read both as YMMs. }
vmovdqu (%eax), %ymm0
vpcmpeqb (%edx), %ymm0, %ymm0
vpmovmskb %ymm0, %ebx
inc %ebx
{ bzhi %ecx, %ebx, %ecx }
.byte 0xc4,0xe2,0x70,0xf5,0xcb { bootstrap compiler doesn't know bzhi }
{ bzhi zeroes mask bits >= len and sets ZF from its result, so garbage
  beyond 'len' is ignored; a set bit below 'len' survives in ebx too. }
jnz .LVec0Differs
vzeroupper
pop %ebx
xor %eax, %eax
ret
.byte 144 { Turn .balign 16 before .LAligned64xLoop_Body into a no-op. }
.LAligned64xLoop_TwoVectorsDiffer:
add %eax, %edx { restore edx = buf2 }
vpmovmskb %ymm0, %ecx { Is there a difference in the first vector? }
inc %ecx
jz .LVec1Differs { No difference in the first vector, ymm0 is all ones, ebx = vpmovmskb(vpcmpeqb(buf1 + 32, buf2 + 32)) from the loop body. }
mov %ecx, %ebx
.LVec0Differs:
vzeroupper
tzcnt %ebx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret
.LVecOrMore:
{ Compare first vectors. }
vmovdqu (%eax), %ymm0
vpcmpeqb (%edx), %ymm0, %ymm0
vpmovmskb %ymm0, %ebx
inc %ebx
jnz .LVec0Differs
sub $64, %ecx { now ecx is len - 64. }
jbe .LLastVec
{ Compare second vectors. }
vmovdqu 32(%eax), %ymm0
vpcmpeqb 32(%edx), %ymm0, %ymm0
vpmovmskb %ymm0, %ebx
inc %ebx
jnz .LVec1Differs
cmp $64, %ecx
jbe .LLastTwoVectors
{ More than four vectors: aligned loop. }
lea -64(%eax,%ecx), %ecx { buffer end - last two vectors handled separately - first two vectors already analyzed (by the fact ecx was still len - 64) }
sub %eax, %edx { edx = buf2 - buf1 }
and $-32, %eax { Align buf1. First two vectors already analyzed are skipped by +64 on the first loop iteration. }
sub %eax, %ecx { ecx = count to be handled with loop }
.balign 16 { No-op. }
.LAligned64xLoop_Body:
add $64, %eax
{ Compare two YMMs, reduce the result with 'and'. }
vmovdqu (%edx,%eax), %ymm0
vpcmpeqb (%eax), %ymm0, %ymm0 { ymm0 = vpcmpeqb(buf1, buf2) }
vmovdqu 32(%edx,%eax), %ymm1
vpcmpeqb 32(%eax), %ymm1, %ymm1
vpand %ymm0, %ymm1, %ymm1 { ymm1 = ymm0 and vpcmpeqb(buf1 + 32, buf2 + 32) }
vpmovmskb %ymm1, %ebx
inc %ebx
jnz .LAligned64xLoop_TwoVectorsDiffer
sub $64, %ecx
ja .LAligned64xLoop_Body
add %eax, %edx { restore edx = buf2 }
add $64, %ecx
.LLastTwoVectors:
{ Tail vectors may overlap data already compared by the loop. }
vmovdqu (%eax,%ecx), %ymm0
vpcmpeqb (%edx,%ecx), %ymm0, %ymm0
vpmovmskb %ymm0, %ebx
inc %ebx
jnz .LVecEm2Differs
.LLastVec:
vmovdqu 32(%eax,%ecx), %ymm0
vpcmpeqb 32(%edx,%ecx), %ymm0, %ymm0
vpmovmskb %ymm0, %ebx
inc %ebx
jnz .LVecEm1Differs
vzeroupper
pop %ebx
xor %eax, %eax
ret
.LVec1Differs:
xor %ecx, %ecx
.LVecEm1Differs:
add $32, %ecx
.LVecEm2Differs:
vzeroupper
tzcnt %ebx, %ebx
add %ecx, %ebx
movzbl (%eax,%ebx), %eax
movzbl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
end;
{$ifndef CPUX86_HAS_BMI1}
{ Runtime dispatch for CompareByte: CompareByte_Impl initially points at the
  dispatcher and commits to AVX2 / SSE2 / plain on the first call made after
  fpc_cpucodeinit has detected CPU features. }
function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
var
CompareByte_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareByte_Dispatch;
function CompareByte_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
begin
if not fpc_cpucodeinit_performed then
exit({$ifdef CPUX86_HAS_SSE2} CompareByte_SSE2 {$else} CompareByte_Plain {$endif} (buf1, buf2, len));
if has_avx2_support then
CompareByte_Impl:=@CompareByte_AVX2
else {$ifndef CPUX86_HAS_SSE2} if has_sse2_support then {$endif}
CompareByte_Impl:=@CompareByte_SSE2
{$ifndef CPUX86_HAS_SSE2}
else
CompareByte_Impl:=@CompareByte_Plain
{$endif};
result:=CompareByte_Impl(buf1, buf2, len);
end;
{ Public CompareByte: one indirect call through the dispatch pointer. }
function CompareByte(const buf1, buf2; len: SizeInt): SizeInt;
begin
result:=CompareByte_Impl(buf1, buf2, len);
end;
{$endif ndef CPUX86_HAS_BMI1 (need CompareByte dispatcher)}
{$endif FPC_SYSTEM_HAS_COMPAREBYTE}
{$ifndef FPC_SYSTEM_HAS_COMPAREWORD}
{$define FPC_SYSTEM_HAS_COMPAREWORD}
{$ifndef CPUX86_HAS_SSE2}
{ CompareWord for CPUs without guaranteed SSE2.
  In:  eax = buf1, edx = buf2, ecx = len in 2-byte words.
  Out: eax < 0 / = 0 / > 0 comparing unsigned word by word; only the sign
       is meaningful. len <= 0 compares equal.
  Buffers compared dword-at-a-time once buf1 is 2-aligned; a differing
  dword is re-examined word by word in .LPtrUintsDiffer. }
function CompareWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
lea -4(%ecx), %ebx { Go wordwise if ecx <= 3 or ecx > 1073741823 (High(int32) div 2) ==> uint32(ecx - 4) > 1073741819. }
cmp $1073741819, %ebx
ja .LWordwise_Prepare
test $2, %al
je .LAlignedToPtrUintOrNaturallyMisaligned
{ buf1 is odd-word-aligned: compare one word to reach dword alignment. }
movzwl (%edx,%eax), %ebx
cmp %bx, (%eax)
jne .LDoSbb
add $2, %eax
sub $1, %ecx
.LAlignedToPtrUintOrNaturallyMisaligned:
sub $2, %ecx
.balign 16
.LPtrUintWise_Next:
mov (%edx,%eax), %ebx
cmp %ebx, (%eax)
jne .LPtrUintsDiffer
add $4, %eax
sub $2, %ecx
jg .LPtrUintWise_Next
{ Tail: step back to compare the last dword, overlapping if necessary. }
lea (%eax,%ecx,2), %eax
mov (%edx,%eax), %ebx
cmp %ebx, (%eax)
jne .LPtrUintsDiffer
pop %ebx
xor %eax, %eax
ret
.LPtrUintsDiffer:
{ ebx = differing dword of buf2; decide by its low word, then high word. }
cmp %bx, (%eax)
jne .LDoSbb
shr $16, %ebx
cmp %bx, 2(%eax)
.LDoSbb:
sbb %eax, %eax
or $1, %eax { eax = -1 if buf1 < buf2, +1 otherwise }
pop %ebx
ret
.balign 16
.LWordwise_Body:
movzwl (%edx,%eax), %ebx
cmp %bx, (%eax)
jne .LDoSbb
add $2, %eax
.LWordwise_Prepare:
sub $1, %ecx
jnb .LWordwise_Body
pop %ebx
xor %eax, %eax
end;
{$endif ndef CPUX86_HAS_SSE2}
{ CompareWord, SSE2 version (compiled as plain CompareWord when SSE2 is a
  baseline). In: eax = buf1, edx = buf2, ecx = len in 2-byte words.
  Out: eax < 0 / = 0 / > 0 (wordwise unsigned ordering).
  May over-read up to 16 bytes, guarded by the page-cross check. }
function {$ifdef CPUX86_HAS_SSE2} CompareWord {$else} CompareWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
lea -2(%ecx), %ebx { Go wordwise if ecx <= 1 or uint32(ecx) > 1073741823 (High(int32) div 2) ==> uint32(ecx - 2) > 1073741821. }
cmp $1073741821, %ebx
ja .LWordwise_Prepare
cmp $8, %ecx
jge .LVecOrMore
{ 2 to 7 words: page-cross check before over-reading 16 bytes. }
lea (%edx,%eax), %ebx
or %eax, %ebx
and $4095, %ebx
cmp $4080, %ebx
ja .LWordwise_Prepare
movdqu (%edx,%eax), %xmm0
movdqu (%eax), %xmm1
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jz .LNothing
shl $1, %ecx { convert to bytes }
bsf %ebx, %ebx
cmp %ecx, %ebx { Ignore garbage beyond 'len'. }
jb .LSubtractWords
.LNothing:
pop %ebx
xor %eax, %eax
ret
.balign 16
.LWordwise_Body:
movzwl (%edx,%eax), %ebx
cmp %bx, (%eax)
jne .LDoSbb
add $2, %eax
.LWordwise_Prepare:
sub $1, %ecx
jae .LWordwise_Body
xor %eax, %eax
pop %ebx
ret
.LDoSbb:
sbb %eax, %eax
or $1, %eax
pop %ebx
ret
.LVecOrMore:
movdqu (%edx,%eax), %xmm0 { Compare first vectors. }
movdqu (%eax), %xmm1
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
shl $1, %ecx { convert to bytes }
sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec
push %eax { save original buf1 to recover word position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %eax, %ecx
and $-16, %eax { align buf1; +16 is performed by the loop. }
sub %eax, %ecx
.balign 16
.LAligned8xLoop_Body:
add $16, %eax
movdqu (%edx,%eax), %xmm0
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LAligned8xLoop_VecDiffers
sub $16, %ecx
ja .LAligned8xLoop_Body
pop %ebx { drop original buf1 }
.LLastVec:
lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
movdqu (%edx,%eax), %xmm0
movdqu (%eax), %xmm1
pcmpeqw %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
pop %ebx
xor %eax, %eax
ret
.LVec0Differs:
bsf %ebx, %ebx
.LSubtractWords:
{ ebx = byte offset of first difference (even, since pcmpeqw mismatches
  come in byte pairs); return word difference. }
add %eax, %edx
movzwl (%eax,%ebx), %eax
movzwl (%edx,%ebx), %edx
sub %edx, %eax
pop %ebx
ret
.LAligned8xLoop_VecDiffers:
{ Byte-level mismatch inside the aligned loop: round the mismatch address
  down to the word grid of the ORIGINAL (possibly misaligned) buf1. }
bsf %ebx, %ebx
add %ebx, %eax
pop %ecx
sub %ecx, %eax
and $-2, %eax
add %ecx, %eax
movzwl (%edx,%eax), %edx
movzwl (%eax), %eax
sub %edx, %eax
pop %ebx
end;
{$ifndef CPUX86_HAS_SSE2}
{ Runtime dispatch for CompareWord: pointer starts at the dispatcher and is
  rewritten on the first call made after fpc_cpucodeinit has run. }
function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
var
CompareWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareWord_Dispatch;
function CompareWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
begin
if not fpc_cpucodeinit_performed then
exit(CompareWord_Plain(buf1, buf2, len));
if has_sse2_support then
CompareWord_Impl:=@CompareWord_SSE2
else
CompareWord_Impl:=@CompareWord_Plain;
result:=CompareWord_Impl(buf1, buf2, len);
end;
{ Public CompareWord: one indirect call through the dispatch pointer. }
function CompareWord(const buf1, buf2; len: SizeInt): SizeInt;
begin
result:=CompareWord_Impl(buf1, buf2, len);
end;
{$endif ndef CPUX86_HAS_SSE2 (need CompareWord dispatcher)}
{$endif FPC_SYSTEM_HAS_COMPAREWORD}
{$ifndef FPC_SYSTEM_HAS_COMPAREDWORD}
{$define FPC_SYSTEM_HAS_COMPAREDWORD}
{$ifndef CPUX86_HAS_SSE2}
{ CompareDWord for CPUs without guaranteed SSE2: simple dword loop.
  In:  eax = buf1, edx = buf2, ecx = len in 4-byte units.
  Out: eax < 0 / = 0 / > 0 (dwordwise unsigned ordering); len <= 0 equal. }
function CompareDWord_Plain(Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
sub $1, %ecx
jb .LNothing
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
.balign 16
.LDwordwise_Body:
mov (%edx,%eax), %ebx
cmp %ebx, (%eax)
jne .LDoSbb
add $4, %eax
sub $1, %ecx
jnb .LDwordwise_Body
pop %ebx
.LNothing:
xor %eax, %eax
ret
.LDoSbb:
pop %ebx
sbb %eax, %eax
or $1, %eax { eax = -1 if buf1 < buf2, +1 otherwise }
end;
{$endif}
{ CompareDWord, SSE2 version (compiled as plain CompareDWord when SSE2 is a
  baseline). In: eax = buf1, edx = buf2, ecx = len in 4-byte units.
  Out: eax < 0 / = 0 / > 0 (dwordwise unsigned ordering). }
function {$ifdef CPUX86_HAS_SSE2} CompareDWord {$else} CompareDWord_SSE2 {$endif} (Const buf1,buf2;len:SizeInt):SizeInt; assembler; nostackframe;
asm
push %ebx
sub %eax, %edx { edx = buf2 - buf1 }
lea -5(%ecx), %ebx { Go dwordwise if ecx <= 4 or ecx > 536870911 (High(int32) div 4) ==> uint32(ecx - 5) > 536870906. }
cmp $536870906, %ebx
ja .LDwordwise_Prepare
shl $2, %ecx { convert to bytes }
movdqu (%edx,%eax), %xmm1 { Compare first vectors. }
movdqu (%eax), %xmm0
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
sub $32, %ecx { first 16 bytes already analyzed + last 16 bytes analyzed separately }
jle .LLastVec
push %eax { save original buf1 to recover uint32 position if byte mismatch found (aligned loop works in bytes to support misaligned buf1). }
add %eax, %ecx
and $-16, %eax { align buf1; +16 is performed by the loop. }
sub %eax, %ecx
.balign 16
.LAligned4xLoop_Body:
add $16, %eax
movdqu (%eax,%edx), %xmm0
pcmpeqb (%eax), %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LAligned4xLoop_VecDiffers
sub $16, %ecx
ja .LAligned4xLoop_Body
pop %ebx { drop original buf1 }
.LLastVec:
lea 16(%eax,%ecx), %eax { point to the last 16 bytes }
movdqu (%edx,%eax), %xmm1
movdqu (%eax), %xmm0
pcmpeqd %xmm1, %xmm0
pmovmskb %xmm0, %ebx
inc %bx
jnz .LVec0Differs
pop %ebx
xor %eax, %eax
ret
.LVec0Differs:
{ ebx's lowest set bit lies inside the differing dword (pcmpeqd mismatch
  bits come in groups of 4); an unsigned dword compare decides the sign. }
bsf %ebx, %ebx
add %eax, %edx { recover edx = buf2 }
mov (%edx,%ebx), %edx
cmp %edx, (%eax,%ebx)
sbb %eax, %eax
or $1, %eax
pop %ebx
ret
.LAligned4xLoop_VecDiffers:
{ Byte-level mismatch inside the aligned loop: round the mismatch address
  down to the dword grid of the ORIGINAL (possibly misaligned) buf1. }
bsf %ebx, %ebx
add %ebx, %eax
pop %ecx
sub %ecx, %eax
and $-4, %eax
add %ecx, %eax
mov (%edx,%eax), %edx
cmp %edx, (%eax)
.LDoSbb:
sbb %eax, %eax
or $1, %eax
pop %ebx
ret
.balign 16
.LDwordwise_Body:
mov (%edx,%eax), %ebx
cmp %ebx, (%eax)
jne .LDoSbb
add $4, %eax
.LDwordwise_Prepare:
sub $1, %ecx
jnb .LDwordwise_Body
pop %ebx
xor %eax, %eax
end;
{$ifndef CPUX86_HAS_SSE2}
{ Runtime dispatch for CompareDWord: pointer starts at the dispatcher and is
  rewritten on the first call made after fpc_cpucodeinit has run. }
function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt; forward;
var
CompareDWord_Impl: function(const buf1, buf2; len: SizeInt): SizeInt = @CompareDWord_Dispatch;
function CompareDWord_Dispatch(const buf1, buf2; len: SizeInt): SizeInt;
begin
if not fpc_cpucodeinit_performed then
exit(CompareDWord_Plain(buf1, buf2, len));
if has_sse2_support then
CompareDWord_Impl:=@CompareDWord_SSE2
else
CompareDWord_Impl:=@CompareDWord_Plain;
result:=CompareDWord_Impl(buf1, buf2, len);
end;
{ Public CompareDWord: one indirect call through the dispatch pointer. }
function CompareDWord(const buf1, buf2; len: SizeInt): SizeInt;
begin
result:=CompareDWord_Impl(buf1, buf2, len);
end;
{$endif ndef CPUX86_HAS_SSE2 (need CompareDWord dispatcher)}
{$endif FPC_SYSTEM_HAS_COMPAREDWORD}
{$ifndef FPC_SYSTEM_HAS_INDEXCHAR0}
{$define FPC_SYSTEM_HAS_INDEXCHAR0}
{ Returns the index of the first occurrence of b within the first len bytes
  of buf, stopping early at a #0 byte; returns -1 when b is neither found
  nor a terminator reached within len.
  In: eax = buf, edx = len, ecx = b (in cl).
  esi/ebx are preserved through stack-frame locals (no nostackframe here).
  NOTE(review): for len = 0 the 'je .LFound' path returns whatever the full
  ecx register holds (only cl is defined as b), i.e. an unspecified value
  rather than -1 or 0 -- verify against the generic IndexChar0. }
function IndexChar0(Const buf;len:SizeInt;b:AnsiChar):SizeInt; assembler;
var
saveesi,saveebx : longint;
asm
movl %esi,saveesi
movl %ebx,saveebx
// Can't use scasb, or will have to do it twice, think this
// is faster for small "len"
movl %eax,%esi // Load address
movzbl %cl,%ebx // Load searchpattern
testl %edx,%edx
je .LFound
xorl %ecx,%ecx // zero index in Buf
xorl %eax,%eax // To make DWord compares possible
.balign 4
.LLoop:
movb (%esi),%al // Load byte
cmpb %al,%bl
je .LFound // byte the same?
incl %ecx
incl %esi
cmpl %edx,%ecx // Maximal distance reached?
je .LNotFound
testl %eax,%eax // Nullchar = end of search?
jne .LLoop
.LNotFound:
movl $-1,%ecx // Not found return -1
.LFound:
movl %ecx,%eax
movl saveesi,%esi
movl saveebx,%ebx
end;
{$endif FPC_SYSTEM_HAS_INDEXCHAR0}
{****************************************************************************
String
****************************************************************************}
{$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
{$define FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
{ Copies sstr into res, truncating the length to high(res), then copies the
  body via Move (tail call when not profiling).
  In: eax = res, edx = high(res), ecx = sstr. }
procedure fpc_shortstr_to_shortstr(out res:shortstring; const sstr: shortstring);assembler;[public,alias:'FPC_SHORTSTR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
{ eax = res, edx = high(res), ecx = sstr }
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
cmp (%ecx), %dl { length(sstr) fits into res? }
jbe .LEdxIsLen { use high(res) if length(sstr) does not fit }
movzbl (%ecx), %edx { use length(sstr) }
.LEdxIsLen:
mov %dl, (%eax) { store length to res[0] }
xchg %ecx, %edx { ecx = length = Move count, edx = sstr }
xchg %eax, %edx { eax = sstr = Move src, edx = res = Move dest }
inc %eax { skip the length bytes of both strings }
inc %edx
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
lea -8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
lea 8(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
{$else FPC_PROFILE}
jmp Move { tail call }
{$endif FPC_PROFILE}
end;
{ Copies the shortstring at sstr to dstr, truncating to len = high(dstr).
  Stores the (possibly clamped) length byte, then copies the body with
  rep movsb / movsl, dword-aligning the destination for counts >= 7. }
procedure fpc_shortstr_assign(len:longint;sstr,dstr:pointer);[public,alias:'FPC_SHORTSTR_ASSIGN'];
begin
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
pushl %eax
pushl %ecx
{$ifdef FPC_ENABLED_CLD}
cld
{$endif FPC_ENABLED_CLD}
movl dstr,%edi
movl sstr,%esi
xorl %eax,%eax
movl len,%ecx
lodsb { al = length(sstr), esi advances to the body }
cmpl %ecx,%eax
jbe .LStrCopy1
movl %ecx,%eax { clamp length to high(dstr) }
.LStrCopy1:
stosb { store length byte, edi advances to the body }
cmpl $7,%eax
jl .LStrCopy2
movl %edi,%ecx { Align on 32bits }
negl %ecx
andl $3,%ecx
subl %ecx,%eax
rep
movsb
movl %eax,%ecx
andl $3,%eax
shrl $2,%ecx
rep
movsl
.LStrCopy2:
movl %eax,%ecx { remaining 0..3 tail bytes }
rep
movsb
popl %ecx
popl %eax
end ['ESI','EDI'];
end;
{$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_ASSIGN}
{$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
{$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
{ Three-way shortstring comparison: compares min(length(left),length(right))
  body bytes via CompareByte; when the common prefix is equal, the result is
  length(left) - length(right). Returns < 0 / 0 / > 0.
  In: eax = left, edx = right. }
function fpc_shortstr_compare(const left,right:shortstring): longint;assembler; [public,alias:'FPC_SHORTSTR_COMPARE']; compilerproc;
{ eax = left, edx = right }
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
push %ebx
movzbl (%eax), %ecx { ecx = len(left) }
movzbl (%edx), %ebx { ebx = len(right) }
cmp %ebx, %ecx
{$ifdef CPUX86_HAS_CMOV}
cmovg %ebx, %ecx
{$else}
jle .LEcxIsLen
mov %ebx, %ecx
.LEcxIsLen:
{$endif}
{ ecx = min(len(left), len(right)) = CompareByte.len }
push %eax { save left }
inc %eax
inc %edx
{ stack is already aligned to 16 bytes if required: return address + push ebp + push ebx + push eax. }
{$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
call CompareByte
{$else}
call CompareByte_Impl { manually inline CompareByte }
{$endif}
pop %edx { restore left }
test %eax, %eax
jnz .LReturn
{ Common prefix equal: compare by length difference (ebx survived the call). }
movzbl (%edx), %eax
sub %ebx, %eax
.LReturn:
pop %ebx
end;
{$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE}
{$ifndef FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
{$define FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
{ Equality-only shortstring comparison: returns 0 when left = right,
  non-zero otherwise (callers only test against zero).
  In: eax = left, edx = right. Lengths are compared first; the bodies are
  only examined (via a CompareByte tail call) when the lengths match.
  The previously duplicated 'nostackframe' directive is declared once. }
function fpc_shortstr_compare_equal(const left,right:shortstring) : longint;assembler;nostackframe; [public,alias:'FPC_SHORTSTR_COMPARE_EQUAL']; compilerproc;
{ eax = left, edx = right }
asm
movzbl (%eax), %ecx { ecx = length(left), reused as CompareByte.len }
cmp (%edx), %cl { different lengths => not equal }
jne .LNotEqual
inc %eax { skip length bytes }
inc %edx
{$if defined(FPC_PIC) or not declared(CompareByte_Impl)}
jmp CompareByte { tail call: 0 iff the bodies match }
{$else}
jmp CompareByte_Impl { manually inline CompareByte }
{$endif}
.LNotEqual:
or $-1, %eax
end;
{$endif FPC_SYSTEM_HAS_FPC_SHORTSTR_COMPARE_EQUAL}
{$ifndef FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
{$define FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
{ Converts a null-terminated PAnsiChar to a shortstring, truncating at
  high(res). Uses IndexByte to locate the terminator (bounded by high(res)),
  then Move for the body. A nil p yields the empty string.
  In: eax = res, edx = high(res), ecx = p. }
procedure fpc_pchar_to_shortstr(out res : shortstring;p:PAnsiChar);assembler;[public,alias:'FPC_PCHAR_TO_SHORTSTR']; compilerproc;
{$ifndef FPC_PROFILE}
nostackframe;
{$endif}
// eax = res, edx = high(res), ecx = p
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
test %ecx, %ecx
jz .LEmpty
push %eax { save res }
push %ecx { save p }
push %edx { save high(res) }
mov %ecx, %eax { eax = IndexByte.buf }
{ edx is already high(res) = IndexByte.count.
Careful: using high(res) instead of -1 limits the scan by high(res) which is a good thing,
but assumes that IndexByte is “safe” and wont read potentially invalid memory past the searched byte even if formally (and wrongly) allowed by count.
Generic and x86 versions are “safe”. }
xor %ecx, %ecx { ecx = 0 = IndexByte.value }
{ Stack is already aligned on 16 bytes if the function is nostackframe: return address + push eax + push ecx + push edx.
With a stack frame, there is an additional push ebp and need 12 more bytes to align. }
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
leal -12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
{$if defined(FPC_PIC) or not declared(IndexByte_Impl)}
call IndexByte
{$else}
call IndexByte_Impl { manually inline IndexByte }
{$endif}
{$if defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
leal 12(%esp), %esp
{$endif defined(FPC_SYSTEM_STACKALIGNMENT16) and defined(FPC_PROFILE)}
pop %ecx { ecx = high(res) = Move.len }
test %eax, %eax { If IndexByte result (eax) is non-negative (terminator is among first high(res) characters), use it, otherwise keep high(res). }
{$ifdef CPUX86_HAS_CMOV}
cmovns %eax, %ecx
{$else}
js .LEcxIsLen
mov %eax, %ecx
.LEcxIsLen:
{$endif}
pop %eax { pop p to eax = Move.src }
pop %edx { pop res to edx }
mov %cl, (%edx) { res[0] := len }
inc %edx { res[1] = Move.dst }
{$ifdef FPC_PROFILE}
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
leal -12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
call Move
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
leal 12(%esp), %esp
{$endif FPC_SYSTEM_STACKALIGNMENT16}
jmp .LReturn
{$else FPC_PROFILE}
jmp Move { can perform a tail call }
{$endif FPC_PROFILE}
.LEmpty:
movb $0, (%eax) { nil source: res := '' }
{$ifdef FPC_PROFILE}
.LReturn:
{$endif}
end;
{$endif FPC_SYSTEM_HAS_FPC_PCHAR_TO_SHORTSTR}
{$IFNDEF INTERNAL_BACKTRACE}
{$define FPC_SYSTEM_HAS_GET_FRAME}
{ Returns the caller's frame pointer (current ebp). }
function get_frame:pointer;assembler;nostackframe;{$ifdef SYSTEMINLINE}inline;{$endif}
asm
movl %ebp,%eax
end;
{$ENDIF not INTERNAL_BACKTRACE}
{$define FPC_SYSTEM_HAS_GET_PC_ADDR}
{ Returns the caller's program counter, i.e. this call's return address. }
Function Get_pc_addr : Pointer;assembler;nostackframe;
asm
movl (%esp),%eax
end;
{$define FPC_SYSTEM_HAS_GET_CALLER_ADDR}
{ Returns the return address stored in the stack frame framebp, or nil for
  a nil (or, on win32, out-of-stack-range) frame pointer. 'addr' is unused
  on i386; it exists for targets that need it to walk frames. }
function get_caller_addr(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
Result:=PPointer(framebp+4)^
else
Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
orl %eax,%eax
jz .Lg_a_null
movl 4(%eax),%eax { [ebp+4] = saved return address }
.Lg_a_null:
end;
{$endif defined(win32)}
{$define FPC_SYSTEM_HAS_GET_CALLER_FRAME}
{ Returns the previous frame pointer stored in the stack frame framebp, or
  nil for a nil (or, on win32, out-of-stack-range) frame pointer. }
function get_caller_frame(framebp:pointer;addr:pointer=nil):pointer;
{$if defined(win32)}
{ Windows has StackTop always properly set }
begin
if assigned(framebp) and (framebp<=StackTop) and (framebp>=Sptr) then
Result:=PPointer(framebp)^
else
Result:=nil;
end;
{$else defined(win32)}
nostackframe;assembler;
asm
orl %eax,%eax
jz .Lgnf_null
movl (%eax),%eax { [ebp] = saved previous ebp }
.Lgnf_null:
end;
{$endif defined(win32)}
{$define FPC_SYSTEM_HAS_SPTR}
{ Returns the current stack pointer. }
Function Sptr : Pointer;assembler;nostackframe;
asm
movl %esp,%eax
end;
{****************************************************************************
Str()
****************************************************************************}
{ Disabled legacy integer-to-string conversion (compiled out: the guard
  requires the symbol 'disabled'). Kept for reference.
  The longword overload sets edx (sign) to 0 and falls into the shared
  str_int_shortcut body of the longint overload. }
{$if defined(disabled) and defined(regcall) }
{$define FPC_SYSTEM_HAS_INT_STR_LONGWORD}
{$define FPC_SYSTEM_HAS_INT_STR_LONGINT}
label str_int_shortcut;
procedure int_str(l:longword;out s:shortstring);assembler;nostackframe;
asm
pushl %esi
pushl %edi
pushl %ebx
mov %edx,%edi
xor %edx,%edx { unsigned: no sign }
jmp str_int_shortcut
end;
procedure int_str(l:longint;out s:shortstring);assembler;nostackframe;
{Optimized for speed, but balanced with size.}
const digits:array[0..9] of cardinal=(0,10,100,1000,10000,
100000,1000000,10000000,
100000000,1000000000);
asm
{$ifdef FPC_PROFILE}
push %eax
push %edx
push %ecx
call mcount
pop %ecx
pop %edx
pop %eax
{$endif FPC_PROFILE}
push %esi
push %edi
push %ebx
movl %edx,%edi
{ Calculate absolute value and put sign in edx}
cltd
xorl %edx,%eax
subl %edx,%eax
negl %edx
str_int_shortcut:
movl %ecx,%esi
{Calculate amount of digits in ecx.}
xorl %ecx,%ecx
bsrl %eax,%ecx
incl %ecx
imul $1233,%ecx { ecx = bit count * 1233 / 4096 ~ digits10 estimate }
shr $12,%ecx
{$ifdef FPC_PIC}
call fpc_geteipasebx
{$ifdef darwin}
movl digits-.Lpic(%ebx),%ebx
{$else}
addl $_GLOBAL_OFFSET_TABLE_,%ebx
movl digits@GOT(%ebx),%ebx
{$endif}
cmpl (%ebx,%ecx,4),%eax
{$else}
cmpl digits(,%ecx,4),%eax
{$endif}
cmc
adcl $0,%ecx {Nr. digits ready in ecx.}
{Write length & sign.}
lea (%edx,%ecx),%ebx
movb $45,%bh {movb $'-,%bh Not supported by our ATT reader.}
movw %bx,(%edi)
addl %edx,%edi
subl %edx,%esi
{Skip digits beyond string length.}
movl %eax,%edx
subl %ecx,%esi
jae .Lloop_write
.balign 4
.Lloop_skip:
movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
mull %edx
shrl $3,%edx
decl %ecx
jz .Ldone {If (l<0) and (high(s)=1) this jump is taken.}
incl %esi
jnz .Lloop_skip
{Write out digits.}
.balign 4
.Lloop_write:
movl $0xcccccccd,%eax {Divide by 10 using mul+shr}
{Pre-add '0'}
leal 48(%edx),%ebx {leal $'0(,%edx),%ebx Not supported by our ATT reader.}
mull %edx
shrl $3,%edx
leal (%edx,%edx,8),%eax {x mod 10 = x-10*(x div 10)}
subl %edx,%ebx
subl %eax,%ebx
movb %bl,(%edi,%ecx)
decl %ecx
jnz .Lloop_write
.Ldone:
popl %ebx
popl %edi
popl %esi
end;
{$endif}
{****************************************************************************
Bounds Check
****************************************************************************}
{ do a thread-safe inc/dec }
{$define FPC_SYSTEM_HAS_DECLOCKED_LONGINT}
{ Atomically decrements l; returns True when the result is zero. }
function cpudeclocked(var l : longint) : boolean;assembler;nostackframe;
asm
lock
decl (%eax)
setzb %al { al = 1 iff the decrement produced zero }
end;
{$define FPC_SYSTEM_HAS_INCLOCKED_LONGINT}
{ Atomically increments l. }
procedure cpuinclocked(var l : longint);assembler;nostackframe;
asm
lock
incl (%eax)
end;
// inline SMP check and normal lock.
// the locked one is so slow, inlining doesn't matter.
{ Decrements l and reports whether it reached zero. A locked decrement is
  only paid for when the program is actually multithreaded. }
function declocked(var l : longint) : boolean; inline;
begin
if ismultithread then
result:=cpudeclocked(l)
else
begin
dec(l);
result:=l=0;
end;
end;
{ Increments l; uses a locked increment only when multithreaded. }
procedure inclocked(var l : longint); inline;
begin
if ismultithread then
cpuinclocked(l)
else
inc(l);
end;
{ Atomically decrements Target and returns the NEW value.
  xadd leaves the old value in edx; lea subtracts 1 without touching flags. }
function InterLockedDecrement (var Target: longint) : longint; assembler; nostackframe;
asm
movl $-1,%edx
lock
xaddl %edx, (%eax)
lea -1(%edx),%eax
end;
{ Atomically increments Target and returns the NEW value. }
function InterLockedIncrement (var Target: longint) : longint; assembler; nostackframe;
asm
movl $1,%edx
lock
xaddl %edx, (%eax)
lea 1(%edx),%eax
end;
{ Atomically stores Source into Target and returns the previous value.
  xchg with a memory operand is implicitly locked. }
function InterLockedExchange (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
xchgl (%eax),%edx
movl %edx,%eax
end;
{ Atomically adds Source to Target and returns the PREVIOUS value. }
function InterLockedExchangeAdd (var Target: longint;Source : longint) : longint; assembler; nostackframe;
asm
lock
xaddl %edx, (%eax)
movl %edx,%eax
end;
{ Atomically: if Target = Comperand then Target := NewValue. Returns the
  previous value of Target. cmpxchg needs the comparand in eax, hence the
  initial xchg of eax (Target address) with ecx (Comperand). }
function InterlockedCompareExchange(var Target: longint; NewValue: longint; Comperand: longint): longint; assembler; nostackframe;
asm
xchgl %eax,%ecx
lock
cmpxchgl %edx, (%ecx)
end;
{ 64-bit compare-exchange via cmpxchg8b: expects the comparand in edx:eax
  and the new value in ecx:ebx; returns the previous Target in edx:eax.
  ebx and edi are callee-saved, hence the pushes. }
function InterlockedCompareExchange64(var Target: int64; NewValue: int64; Comperand: int64): int64; assembler;
asm
pushl %ebx
pushl %edi
movl %eax,%edi { edi = @Target }
movl Comperand+4,%edx
movl Comperand+0,%eax
movl NewValue+4,%ecx
movl NewValue+0,%ebx
lock cmpxchg8b (%edi)
pop %edi
pop %ebx
end;
{****************************************************************************
FPU
****************************************************************************}
const
{ Internal constants for use in system unit }
{ x87 FPU status-word exception flag bits. }
FPU_Invalid = 1;
FPU_Denormal = 2;
FPU_DivisionByZero = 4;
FPU_Overflow = 8;
FPU_Underflow = $10;
FPU_StackUnderflow = $20;
FPU_StackOverflow = $40;
FPU_ExceptionMask = $ff;
{ SSE MXCSR exception flag bits (low 6 bits). }
MM_Invalid = 1;
MM_Denormal = 2;
MM_DivisionByZero = 4;
MM_Overflow = 8;
MM_Underflow = $10;
MM_Precicion = $20;
MM_ExceptionMask = $3f;
{ SSE MXCSR exception MASK bits (set bit = exception masked/suppressed). }
MM_MaskInvalidOp = %0000000010000000;
MM_MaskDenorm = %0000000100000000;
MM_MaskDivZero = %0000001000000000;
MM_MaskOverflow = %0000010000000000;
MM_MaskUnderflow = %0000100000000000;
MM_MaskPrecision = %0001000000000000;
{$define FPC_SYSTEM_HAS_SYSINITFPU}
{ Intentionally empty on i386: FPU state is set up by SysResetFPU, which is
  called from fpc_cpucodeinit. }
Procedure SysInitFPU;
begin
end;
{$define FPC_SYSTEM_HAS_SYSRESETFPU}
{ Reinitializes the x87 FPU (fninit) and loads Default8087CW; when SSE is
  available, also loads DefaultMXCSR into the SSE control register. }
Procedure SysResetFPU;
var
{ these locals are so we don't have to hack pic code in the assembler }
localmxcsr: dword;
localfpucw: word;
begin
localfpucw:=Default8087CW;
asm
fninit
fwait
fldcw localfpucw
end;
if has_sse_support then
begin
localmxcsr:=DefaultMXCSR;
asm
{ setup sse exceptions }
{$ifndef OLD_ASSEMBLER}
ldmxcsr localmxcsr
{$else OLD_ASSEMBLER}
mov localmxcsr,%eax
subl $4,%esp
mov %eax,(%esp)
//ldmxcsr (%esp)
.byte 0x0f,0xae,0x14,0x24
addl $4,%esp
{$endif OLD_ASSEMBLER}
end;
end;
end;
{ because of the brain dead sse detection on x86, this test is post poned }
{ Detects CPU features via CPUID/XGETBV, fills the has_* globals, resets the
  FPU, and finally sets fpc_cpucodeinit_performed so that the runtime
  dispatchers (CompareByte_Impl etc.) may commit to an implementation. }
procedure fpc_cpucodeinit;
var
_eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint;
begin
if cpuid_support then
begin
asm
movl $1,%eax
xorl %ecx,%ecx
cpuid
movl %edx,_edx_cpuid1
movl %ecx,_ecx_cpuid1
end ['ebx'];
{ CPUID(1).EDX bit 23 = MMX. }
has_mmx_support:=(_edx_cpuid1 and $800000)<>0;
{ CPUID(1).EDX bit 25 = SSE; also probe OS support by executing an SSE
  instruction under sse_check (the #UD handler clears os_supports_sse). }
if ((_edx_cpuid1 and $2000000)<>0) then
begin
os_supports_sse:=true;
sse_check:=true;
asm
{ force an sse exception if no sse is supported, the exception handler sets
os_supports_sse to false then }
{ don't change this instruction, the code above depends on its size }
{$ifdef OLD_ASSEMBLER}
.byte 0x0f,0x28,0xf7
{$else}
movaps %xmm7, %xmm6
{$endif not EMX}
end;
sse_check:=false;
has_sse_support:=os_supports_sse;
end;
if has_sse_support then
begin
{ CPUID(1).EDX bit 26 = SSE2. }
has_sse2_support:=((_edx_cpuid1 and $4000000)<>0);
{ NOTE(review): $200 is CPUID(1).ECX bit 9, i.e. SSSE3; plain SSE3 is
  bit 0 ($1). Conservative (SSSE3 implies SSE3) but mislabeled --
  confirm intent before changing. }
has_sse3_support:=((_ecx_cpuid1 and $200)<>0);
{ CPUID(1).ECX bit 19 = SSE4.1. }
has_sse41_support:=boolean(_ecx_cpuid1 shr 19 and 1);
{ now avx }
asm
xorl %eax,%eax
cpuid
movl %eax,_eax { _eax = highest supported standard CPUID leaf }
end;
if _eax>=7 then
begin
asm
movl $7,%eax
xorl %ecx,%ecx
cpuid
movl %ebx,_ebx_cpuid7
end;
{ CPUID(7,0).EBX bit 9 = Enhanced REP MOVSB/STOSB (ERMSB). }
fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0;
{ CPUID(1).ECX bit 27 = OSXSAVE: XGETBV is available. }
if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then
begin
asm
xorl %ecx,%ecx
.byte 0x0f,0x01,0xd0 { xgetbv }
movl %eax,_eax
end;
{ XCR0 bits 1+2: OS enabled both XMM and YMM state saving. }
if (_eax and 6)=6 then
begin
{ CPUID(1).ECX bit 28 = AVX. }
has_avx_support:=(_ecx_cpuid1 and $10000000)<>0;
{ CPUID(7,0).EBX bit 5 = AVX2. }
has_avx2_support:=(_ebx_cpuid7 and $20)<>0;
end;
end;
end;
end;
end;
{ don't let libraries influence the FPU cw set by the host program }
if IsLibrary then
begin
Default8087CW:=Get8087CW;
if has_sse_support then
DefaultMXCSR:=GetMXCSR;
end;
SysResetFPU;
fpc_cpucodeinit_performed:=true;
end;
{$if not defined(darwin) and defined(regcall) }
{ darwin requires that the stack is aligned to 16 bytes when calling another function }
{$ifdef FPC_HAS_FEATURE_ANSISTRINGS}
{$define FPC_SYSTEM_HAS_ANSISTR_DECR_REF}
{ Decrements the reference count of ansistring S and frees the allocation
  when it reaches zero; always sets S to nil. Negative refcounts (constant
  strings) are left alone. The decrement is only LOCK-prefixed when
  IsMultiThread is true (the prefix byte is jumped over otherwise).
  In: eax = @S. }
Procedure fpc_AnsiStr_Decr_Ref (Var S : Pointer); [Public,Alias:'FPC_ANSISTR_DECR_REF']; compilerproc; nostackframe; assembler;
asm
movl (%eax),%edx
testl %edx,%edx
jz .Lquit
movl $0,(%eax) // s:=nil
cmpl $0,-8(%edx) // exit if refcount<0
jl .Lquit
{$ifdef FPC_PIC}
call fpc_geteipasecx
addl $_GLOBAL_OFFSET_TABLE_,%ecx
movl ismultithread@GOT(%ecx),%ecx
cmpl $0,(%ecx)
{$else FPC_PIC}
cmpl $0,ismultithread
{$endif FPC_PIC}
je .Lskiplock
.byte 0xF0 // LOCK prefix, jumped over if IsMultiThread = false. FPC assembler does not accept disjoint LOCK mnemonic.
.Lskiplock:
decl -8(%edx)
jz .Lfree
.Lquit:
ret
.Lfree:
leal -12(%edx),%eax // points to start of allocation
{ freemem is not an assembler leaf function like fpc_geteipasecx, so it
needs to be called with proper stack alignment }
{$ifdef FPC_SYSTEM_STACKALIGNMENT16}
leal -12(%esp),%esp
call FPC_FREEMEM
leal 12(%esp),%esp
{$else FPC_SYSTEM_STACKALIGNMENT16}
jmp FPC_FREEMEM // can perform a tail call
{$endif FPC_SYSTEM_STACKALIGNMENT16}
end;
{ Slow path, defined elsewhere: copies the string so S holds a refcount-1 instance. }
function fpc_truely_ansistr_unique(Var S : Pointer): Pointer; forward;
{$define FPC_SYSTEM_HAS_ANSISTR_UNIQUE}
{ Fast path of UniqueString: returns S unchanged when it is nil or already has
  refcount = 1; otherwise tail-jumps to fpc_truely_ansistr_unique to copy it
  (constant strings have a negative refcount and therefore also take the copy
  path). %eax = @S on entry and is still intact for the tail-called function. }
Function fpc_ansistr_Unique(Var S : Pointer): Pointer; [Public,Alias : 'FPC_ANSISTR_UNIQUE']; compilerproc; nostackframe;assembler;
asm
movl (%eax),%edx // %edx := S
testl %edx,%edx
jz .Lunchanged // S = nil: already "unique"
cmpl $1,-8(%edx) // refcount at data pointer - 8
jne fpc_truely_ansistr_unique // shared or constant: tail call the copying path
.Lunchanged:
movl %edx,%eax // result := S
end;
{$endif FPC_HAS_FEATURE_ANSISTRINGS}
{$endif ndef darwin and defined(regcall) }
{$ifndef FPC_SYSTEM_HAS_MEM_BARRIER}
{$define FPC_SYSTEM_HAS_MEM_BARRIER}
{ Memory barrier ordering reads before the barrier with reads after it.
  With SSE2 available this is a single LFENCE; otherwise a LOCKed RMW on the
  stack top is used, which acts as a full barrier on x86. }
procedure ReadBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
lfence
{$else CPUX86_HAS_SSE2}
lock // LOCKed no-op add: serializes memory accesses on pre-SSE2 CPUs
addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
{ Intentionally empty: on x86 a load that depends on the result of an earlier
  load is always ordered after it, so no instruction is required. }
procedure ReadDependencyBarrier;
begin
{ reads imply barrier on earlier reads depended on }
end;
{ Full memory barrier: orders all loads and stores before the barrier with all
  loads and stores after it. MFENCE with SSE2; otherwise a LOCKed RMW on the
  stack top, which is a full barrier on x86. }
procedure ReadWriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSE2}
mfence
{$else CPUX86_HAS_SSE2}
lock // LOCKed no-op add serves as a full barrier on pre-SSE2 CPUs
addl $0,0(%esp)
{$endif CPUX86_HAS_SSE2}
end;
{ Store barrier: orders stores before the barrier with stores after it.
  Plain x86 stores are already ordered with respect to each other, so without
  an SSE unit (hence no SFENCE, needed only for non-temporal stores) this is
  deliberately a no-op. }
procedure WriteBarrier;assembler;nostackframe;
asm
{$ifdef CPUX86_HAS_SSEUNIT}
sfence
{$endif CPUX86_HAS_SSEUNIT}
end;
{$endif}
{$ifndef FPC_SYSTEM_HAS_BSF_QWORD}
{$define FPC_SYSTEM_HAS_BSF_QWORD}
{ Returns the index (0..63) of the lowest set bit of AValue, or 255 when
  AValue = 0. AValue is on the stack: Lo at 4(%esp), Hi at 8(%esp); the
  'ret $8' instructions pop it (callee-clears convention). Result in %eax. }
function BsfQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
{$ifdef CPUX86_HINT_BSX_DEST_UNCHANGED_ON_ZF_1}
mov $255-32,%eax { On AMD, BSF/R are documented to not change the destination on zero input. }
bsfl 8(%esp),%eax { On Intel, destination is formally undefined on zero input, but in practice the behavior is the same. }
add $32,%eax { %eax = bsf(Hi)+32, or 255 when Hi = 0. }
bsfl 4(%esp),%eax { Lo <> 0 overrides with bsf(Lo); Lo = 0 keeps the value above. }
{$else}
bsfl 4(%esp),%eax { branchy variant: try Lo first }
jz .L1
ret $8
.L1:
bsfl 8(%esp),%eax { Lo = 0: try Hi }
jz .L2
add $32,%eax
ret $8
.L2:
movl $255,%eax { whole qword is zero; falls through to the implicit epilogue }
{$endif}
end;
{$endif FPC_SYSTEM_HAS_BSF_QWORD}
{$ifndef FPC_SYSTEM_HAS_BSR_QWORD}
{$define FPC_SYSTEM_HAS_BSR_QWORD}
{ Returns the index (0..63) of the highest set bit of AValue, or 255 when
  AValue = 0. AValue is on the stack: Lo at 4(%esp), Hi at 8(%esp); the
  'ret $8' instructions pop it (callee-clears convention). Result in %eax. }
function BsrQWord(Const AValue : QWord): cardinal; assembler; nostackframe;
asm
{$ifdef CPUX86_HINT_BSX_DEST_UNCHANGED_ON_ZF_1}
mov $255,%eax { branchless: BSR leaves destination unchanged on zero input (see BsfQWord) }
bsrl 4(%esp),%eax { %eax = bsr(Lo), or 255 when Lo = 0 }
sub $32,%eax { compensate the +32 below for the Lo/zero cases }
bsrl 8(%esp),%eax { Hi <> 0 overrides with bsr(Hi) }
add $32,%eax { Hi<>0: bsr(Hi)+32; else bsr(Lo) or 255 restored }
{$else}
mov 8(%esp),%eax
test %eax,%eax
jnz .L1 { Speculate Hi(q) = 0. }
bsrl 4(%esp),%eax
jz .L2
ret $8
.L1:
bsrl %eax,%eax { Hi <> 0: result is bsr(Hi)+32 }
add $32,%eax
ret $8
.L2:
movl $255,%eax { whole qword is zero; falls through to the implicit epilogue }
{$endif}
end;
{$endif FPC_SYSTEM_HAS_BSR_QWORD}
{$ifndef FPC_SYSTEM_HAS_SAR_QWORD}
{$define FPC_SYSTEM_HAS_SAR_QWORD}
{ 64-bit arithmetic shift right: result := AValue sar (Shift and 63).
  AValue is on the stack (Lo at 4(%esp), Hi at 8(%esp), popped by 'ret $8');
  Shift arrives in %al. Result in %edx:%eax.
  Shift < 32: SHRD shifts Hi bits into Lo while SAR shifts Hi.
  Shift >= 32: Lo := Hi sar (Shift-32 via the 5-bit masking of SAR),
  Hi := sign extension of Hi. }
function fpc_SarInt64(Const AValue : Int64;const Shift : Byte): Int64; [Public,Alias:'FPC_SARINT64']; compilerproc; assembler; nostackframe;
asm
movl 8(%esp),%edx // %edx := Hi(AValue)
movzbl %al,%ecx // shift count for the cl-operand shifts below
cmpb $32,%al
jnb .L1 // Shift >= 32: take the long-shift path
movl 4(%esp),%eax // %eax := Lo(AValue)
shrdl %cl,%edx,%eax // Lo := (Hi:Lo) shr cl (bits flow from Hi into Lo)
sarl %cl,%edx // Hi := Hi sar cl (arithmetic: keeps sign)
ret $8
.L1:
movl %edx,%eax // Lo := Hi ...
sarl $31,%edx // Hi := sign bits of the original Hi
sarl %cl,%eax // uses 5 lower bits of cl, i.e. shifts by Shift-32
end;
{$endif FPC_SYSTEM_HAS_SAR_QWORD}