fpc/rtl/x86_64/set.inc

{
    This file is part of the Free Pascal run time library.
    Copyright (c) 2002 by the Free Pascal development team

    Include file with set operations called by the compiler

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}

{$asmmode intel}

{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
{ Set union: stores the bytewise OR of set1 and set2 into dest, "size" bytes long.

  Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size

  Scratch: xmm0, xmm1, eax, flags. "size" is used as a running byte offset and is destroyed.

  size >= 16: 16-byte chunks are processed from the end of the sets towards the start, followed by one chunk at offset 0.
  size <  16: bytes are processed one at a time, from the last byte down to the first.

  NOTE(review): the code looks like it assumes size >= 1; with size <= 0 the bytewise loop would start at offset -1.
  Confirm that callers never pass such a size. }
asm
    sub    size, 16 { "size" is now the offset of the last 16-byte chunk. }
    jl     @Bytewise_Prepare { Signed: taken if fewer than 16 bytes. Probably dead branch... }

@16x_Loop:
    movdqu xmm0, xmmword ptr [set1 + size]
    movdqu xmm1, xmmword ptr [set2 + size]
    por    xmm0, xmm1
    movdqu xmmword ptr [dest + size], xmm0
    sub    size, 16
    ja     @16x_Loop { Unsigned: continue only while the new offset is still > 0 (no borrow and not zero). }

    { Chunk at offset 0. It may overlap bytes already written by the loop; applying OR a second time yields the same bytes,
      so the overlap is harmless even when dest is the same buffer as set1 or set2. }
    movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
    movdqu xmm1, xmmword ptr [set2]
    por    xmm0, xmm1
    movdqu xmmword ptr [dest], xmm0
    ret

@Bytewise_Prepare:
    add    size, 15 { (size - 16) + 15 = index of the last byte. }
@Bytewise_Loop:
    movzx  eax, byte ptr [set1 + size]
    or     al, byte ptr [set2 + size]
    mov    byte ptr [dest + size], al
    sub    size, 1
    jae    @Bytewise_Loop { Loop until the index borrows below zero, i.e. byte 0 has been processed. }
    { No explicit ret on this path: execution falls through to the end of the routine. }
end;
{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}

{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
{ Set intersection: stores the bytewise AND of set1 and set2 into dest, "size" bytes long.
  Same as fpc_varset_add_sets but with 'and' instead of 'or'.

  Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size

  Scratch: xmm0, xmm1, eax, flags. "size" is used as a running byte offset and is destroyed.

  NOTE(review): the code looks like it assumes size >= 1; with size <= 0 the bytewise loop would start at offset -1.
  Confirm that callers never pass such a size. }
asm
    sub    size, 16 { "size" is now the offset of the last 16-byte chunk. }
    jl     @Bytewise_Prepare { Signed: taken if fewer than 16 bytes. Probably dead branch... }

@16x_Loop: { Walks 16-byte chunks from the end of the sets towards the start. }
    movdqu xmm0, xmmword ptr [set1 + size]
    movdqu xmm1, xmmword ptr [set2 + size]
    pand   xmm0, xmm1
    movdqu xmmword ptr [dest + size], xmm0
    sub    size, 16
    ja     @16x_Loop { Unsigned: continue only while the new offset is still > 0 (no borrow and not zero). }

    { Chunk at offset 0. It may overlap bytes already written by the loop; applying AND a second time yields the same bytes,
      so the overlap is harmless even when dest is the same buffer as set1 or set2. }
    movdqu xmm0, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
    movdqu xmm1, xmmword ptr [set2]
    pand   xmm0, xmm1
    movdqu xmmword ptr [dest], xmm0
    ret

@Bytewise_Prepare:
    add    size, 15 { (size - 16) + 15 = index of the last byte. }
@Bytewise_Loop:
    movzx  eax, byte ptr [set1 + size]
    and    al, byte ptr [set2 + size]
    mov    byte ptr [dest + size], al
    sub    size, 1
    jae    @Bytewise_Loop { Loop until the index borrows below zero, i.e. byte 0 has been processed. }
    { No explicit ret on this path: execution falls through to the end of the routine. }
end;
{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}

{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
{ Set difference: stores "set1 and not set2" (bytewise) into dest, "size" bytes long.

  Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size

  Scratch: xmm0, xmm1, xmm2, eax, flags. "size" is used as a running byte offset and is destroyed.

  size >= 16: the chunk at offset 0 is computed first and kept in xmm2, then 16-byte chunks are processed from the end
              of the sets towards the start, and finally the precalculated chunk is stored at offset 0.
  size <  16: bytes are processed one at a time, from the last byte down to the first.

  NOTE(review): the code looks like it assumes size >= 1; with size <= 0 the bytewise loop would start at offset -1.
  Confirm that callers never pass such a size. }
asm
    sub    size, 16 { "size" is now the offset of the last 16-byte chunk. }
    jl     @Bytewise_Prepare { Signed: taken if fewer than 16 bytes. Probably dead branch... }

    movdqu xmm1, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
    movdqu xmm2, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
    pandn  xmm2, xmm1 { xmm2 := (not xmm2) and xmm1 = set1 and not set2, for bytes 0..15. }

@16x_Loop:
    movdqu xmm1, xmmword ptr [set1 + size]
    movdqu xmm0, xmmword ptr [set2 + size]
    pandn  xmm0, xmm1 { xmm0 := set1 chunk and not set2 chunk. }
    movdqu xmmword ptr [dest + size], xmm0
    sub    size, 16
    ja     @16x_Loop { Unsigned: continue only while the new offset is still > 0 (no borrow and not zero). }

    movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
    ret

@Bytewise_Prepare:
    add    size, 15 { (size - 16) + 15 = index of the last byte. }
@Bytewise_Loop:
    movzx  eax, byte ptr [set2 + size]
    not    eax
    and    al, byte ptr [set1 + size] { al := set1 byte and not set2 byte. }
    mov    byte ptr [dest + size], al
    sub    size, 1
    jae    @Bytewise_Loop { Loop until the index borrows below zero, i.e. byte 0 has been processed. }
    { No explicit ret on this path: execution falls through to the end of the routine. }
end;
{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}

{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; assembler; nostackframe;
{ Symmetric difference: stores the bytewise XOR of set1 and set2 into dest, "size" bytes long.
  Same as fpc_varset_sub_sets but with 'xor' instead of 'and not'.

  Windows: rcx = set1, rdx = set2, r8 = dest, r9 = size
  Linux:   rdi = set1, rsi = set2, rdx = dest, rcx = size

  Scratch: xmm0, xmm1, xmm2, eax, flags. "size" is used as a running byte offset and is destroyed.

  NOTE(review): the code looks like it assumes size >= 1; with size <= 0 the bytewise loop would start at offset -1.
  Confirm that callers never pass such a size. }
asm
    sub    size, 16 { "size" is now the offset of the last 16-byte chunk. }
    jl     @Bytewise_Prepare { Signed: taken if fewer than 16 bytes. Probably dead branch... }

    movdqu xmm2, xmmword ptr [set1] { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
    movdqu xmm1, xmmword ptr [set2] { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
    pxor   xmm2, xmm1 { xmm2 := set1 xor set2, for bytes 0..15. }

@16x_Loop: { Walks 16-byte chunks from the end of the sets towards the start. }
    movdqu xmm0, xmmword ptr [set1 + size]
    movdqu xmm1, xmmword ptr [set2 + size]
    pxor   xmm0, xmm1
    movdqu xmmword ptr [dest + size], xmm0
    sub    size, 16
    ja     @16x_Loop { Unsigned: continue only while the new offset is still > 0 (no borrow and not zero). }

    movdqu xmmword ptr [dest], xmm2 { Write precalculated tail. }
    ret

@Bytewise_Prepare:
    add    size, 15 { (size - 16) + 15 = index of the last byte. }
@Bytewise_Loop:
    movzx  eax, byte ptr [set2 + size]
    xor    al, byte ptr [set1 + size]
    mov    byte ptr [dest + size], al
    sub    size, 1
    jae    @Bytewise_Loop { Loop until the index borrows below zero, i.e. byte 0 has been processed. }
    { No explicit ret on this path: execution falls through to the end of the routine. }
end;
{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}

{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; assembler; nostackframe;
{ Returns true (al = 1) if "set1 and not set2" is zero in all "size" bytes, i.e. every bit set in set1 is also set in set2;
  otherwise returns false (eax = 0).

  Windows: rcx = set1, rdx = set2, r8 = size
  Linux:   rdi = set1, rsi = set2, rdx = size

  Scratch: xmm0, xmm1, xmm2, eax, ecx, flags. "size" is destroyed; the bytewise path also modifies set1 and set2.

  NOTE(review): the code looks like it assumes size >= 1; with size = 0 the bytewise loop would read a byte and then
  keep going because "inc size" makes the index non-zero. Confirm that callers never pass such a size. }
asm
    sub    size, 16 { "size" is now the offset of the last 16-byte chunk. }
    jl     @Bytewise_Prepare { Signed: taken if fewer than 16 bytes. Probably dead branch... }

{$if false}
{ Scans 16 bytes at a time left to right with early exits.
  Would be better for large enough sets (maybe around 64 bytes or even more) — if they existed, but worse for actually existing 32.
  Kept for the future. }
    pxor   xmm2, xmm2 { xmm2 = 0 }
    add    set1, size
    add    set2, size
    neg    size { Now "size" = -(orig.size - 16), "set1" points to orig.set1 + orig.size - 16, "set2" points to orig.set2 + orig.size - 16.
                  Loop ends on "size" >= 0, leaving up to 16 tail bytes. }

@16x_Loop:
    movdqu xmm1, xmmword ptr [set1 + size]
    movdqu xmm0, xmmword ptr [set2 + size]
    pandn  xmm0, xmm1
    pcmpeqb xmm0, xmm2
    pmovmskb eax, xmm0
    inc    ax
    jnz    @No
    add    size, 16
    js     @16x_Loop

    movdqu xmm1, xmmword ptr [set1]
    movdqu xmm0, xmmword ptr [set2]
    pandn  xmm0, xmm1
{$else}
{ Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
    movdqu xmm1, xmmword ptr [set1]
    movdqu xmm2, xmmword ptr [set2]
    pandn  xmm2, xmm1 { xmm2 := set1 and not set2 for bytes 0..15; this is the accumulator. }

@16x_Loop: { Walks 16-byte chunks from the end of the sets towards the start. }
    movdqu xmm1, xmmword ptr [set1 + size]
    movdqu xmm0, xmmword ptr [set2 + size]
    pandn  xmm0, xmm1 { xmm0 := set1 chunk and not set2 chunk. }
    por    xmm2, xmm0 { Accumulate offending bits. }
    sub    size, 16
    ja     @16x_Loop { Unsigned: continue only while the new offset is still > 0 (no borrow and not zero). }

    pxor   xmm0, xmm0 { xmm0 = 0, to compare the accumulator against. }
{$endif}
    pcmpeqb xmm0, xmm2 { Each byte of xmm0 becomes $FF where the corresponding xmm2 byte equals the xmm0 byte, else 0. }
    pmovmskb ecx, xmm0 { ecx = 16-bit mask of those bytes; set1 (rcx on Windows) is no longer needed at this point. }
    xor    eax, eax
    inc    cx { $FFFF wraps to 0, so ZF is set only if all 16 mask bits were set. }
    setz   al
    ret

@No: { Early exit with result false. In the enabled build it is reached only from the bytewise loop. }
    xor    eax, eax
    ret

@Bytewise_Prepare:
    add    size, 16 { Restore the original size. }
    neg    size
    sub    set1, size { Subtracting the negated size: set1 and set2 now point just past their last byte, }
    sub    set2, size { and "size" is a negative index that counts up to 0. }
@Bytewise_Loop:
    movzx  eax, byte ptr [set2 + size]
    not    eax
    test   byte ptr [set1 + size], al { Non-zero if the set1 byte has a bit that the set2 byte lacks. }
    jnz    @No
    inc    size
    jnz    @Bytewise_Loop
    mov    eax, $1 { All bytes passed: result is true. No explicit ret here; execution falls through to the end of the routine. }
end;
{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}

{$asmmode att}