{
    This file is part of the Free Pascal run time library.
    Copyright (c) 1999-2000 by the Free Pascal development team

    Include file with set operations called by the compiler

    See the file COPYING.FPC, included in this distribution,
    for details about the copyright.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.

 **********************************************************************}
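
{ The routines below implement the operations on variable-size sets that the
  compiler calls through the fpc_varset_* compilerprocs: union ('or'),
  intersection ('and'), difference ('and not'), symmetric difference ('xor')
  and the containment test. As a reference for what the assembly computes, a
  minimal pure-Pascal sketch of the union helper (an illustration under the
  assumption that size counts bytes; the name is hypothetical, and the real
  routines below work on 4- or 16-byte chunks rather than bytewise):

    procedure VarSetAddSetsReference(const set1,set2; var dest; size : ptrint);
      var
        i : ptrint;
      begin
        for i:=0 to size-1 do
          pbyte(@dest)[i]:=pbyte(@set1)[i] or pbyte(@set2)[i];
      end;
}
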
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm
        push %ebx
        push %esi
        mov 12(%esp), %esi { esi = size }
        sub $4, %esi
        jl .LBytewise_Prepare { probably dead branch... }

.L4x_Loop:
        mov (%eax,%esi), %ebx
        or (%edx,%esi), %ebx
        mov %ebx, (%ecx,%esi)
        sub $4, %esi
        ja .L4x_Loop

        mov (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
        or (%edx), %ebx
        mov %ebx, (%ecx)
        pop %esi
        pop %ebx
        ret $4

.LBytewise_Prepare:
        add $3, %esi
.LBytewise_Loop:
        movzbl (%eax,%esi), %ebx
        or (%edx,%esi), %bl
        mov %bl, (%ecx,%esi)
        sub $1, %esi
        jae .LBytewise_Loop
        pop %esi
        pop %ebx
end;
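
{ Calling convention note: with the i386 'register' convention used by these
  compilerprocs, set1/set2/dest arrive in eax/edx/ecx and size is passed on
  the stack; the callee pops it, hence the 'ret $4' above. }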

procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_add_sets {$else} fpc_varset_add_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm
        push %ebx
        mov 8(%esp), %ebx
        sub $16, %ebx { ebx = position }
        jl .LFallback { Hopefully dead branch... }

.L16x_Loop:
        movups (%eax,%ebx), %xmm0
        movups (%edx,%ebx), %xmm1
        orps %xmm1, %xmm0
        movups %xmm0, (%ecx,%ebx)
        sub $16, %ebx
        ja .L16x_Loop

        movups (%eax), %xmm0 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
        movups (%edx), %xmm1
        orps %xmm1, %xmm0
        movups %xmm0, (%ecx)
        pop %ebx
        ret $4

.LFallback:
        pop %ebx
        jmp fpc_varset_add_sets_plain
end;
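
{ Runtime dispatch, used when the compile-time target does not guarantee an
  SSE unit: fpc_varset_add_sets is an inline wrapper around a procedure
  variable that initially points at a dispatcher. The first call tests
  has_sse_support, repoints the variable at the SSE or plain implementation
  and forwards the call; all later calls jump straight to the chosen routine
  without rechecking. The same pattern repeats for the other operations below. }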

{$ifndef CPUX86_HAS_SSEUNIT}
procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;

var
  fpc_varset_add_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_add_sets_dispatch;

procedure fpc_varset_add_sets_dispatch(const set1,set2; var dest;size : ptrint);
begin
  if has_sse_support then
    fpc_varset_add_sets_impl:=@fpc_varset_add_sets_sse
  else
    fpc_varset_add_sets_impl:=@fpc_varset_add_sets_plain;
  fpc_varset_add_sets_impl(set1,set2,dest,size);
end;

procedure fpc_varset_add_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
begin
  fpc_varset_add_sets_impl(set1,set2,dest,size);
end;
{$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_add_sets dispatcher)}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}

{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. }
asm
        push %ebx
        push %esi
        mov 12(%esp), %esi { esi = size }
        sub $4, %esi
        jl .LBytewise_Prepare { probably dead branch... }

.L4x_Loop:
        mov (%eax,%esi), %ebx
        and (%edx,%esi), %ebx
        mov %ebx, (%ecx,%esi)
        sub $4, %esi
        ja .L4x_Loop

        mov (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
        and (%edx), %ebx
        mov %ebx, (%ecx)
        pop %esi
        pop %ebx
        ret $4

.LBytewise_Prepare:
        add $3, %esi
.LBytewise_Loop:
        movzbl (%eax,%esi), %ebx
        and (%edx,%esi), %bl
        mov %bl, (%ecx,%esi)
        sub $1, %esi
        jae .LBytewise_Loop
        pop %esi
        pop %ebx
end;

procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_mul_sets {$else} fpc_varset_mul_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
{ Same as fpc_varset_add_sets_sse but with 'and' instead of 'or'. }
asm
        push %ebx
        mov 8(%esp), %ebx
        sub $16, %ebx { ebx = position }
        jl .LFallback { Hopefully dead branch... }

.L16x_Loop:
        movups (%eax,%ebx), %xmm0
        movups (%edx,%ebx), %xmm1
        andps %xmm1, %xmm0
        movups %xmm0, (%ecx,%ebx)
        sub $16, %ebx
        ja .L16x_Loop

        movups (%eax), %xmm0 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
        movups (%edx), %xmm1
        andps %xmm1, %xmm0
        movups %xmm0, (%ecx)
        pop %ebx
        ret $4

.LFallback:
        pop %ebx
        jmp fpc_varset_mul_sets_plain
end;

{$ifndef CPUX86_HAS_SSEUNIT}
procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;

var
  fpc_varset_mul_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_mul_sets_dispatch;

procedure fpc_varset_mul_sets_dispatch(const set1,set2; var dest;size : ptrint);
begin
  if has_sse_support then
    fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_sse
  else
    fpc_varset_mul_sets_impl:=@fpc_varset_mul_sets_plain;
  fpc_varset_mul_sets_impl(set1,set2,dest,size);
end;

procedure fpc_varset_mul_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
begin
  fpc_varset_mul_sets_impl(set1,set2,dest,size);
end;
{$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_mul_sets dispatcher)}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}

{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm
        push %ebx
        push %esi
        mov 12(%esp), %esi { esi = size }
        sub $4, %esi
        jl .LBytewise_Prepare { probably dead branch... }

        mov (%edx), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
        not %ebx { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
        and (%eax), %ebx
        push %ebx
.L4x_Loop:
        mov (%edx,%esi), %ebx
        not %ebx
        and (%eax,%esi), %ebx
        mov %ebx, (%ecx,%esi)
        sub $4, %esi
        ja .L4x_Loop

        pop %ebx
        mov %ebx, (%ecx) { Write precalculated tail. }
        pop %esi
        pop %ebx
        ret $4

.LBytewise_Prepare:
        add $3, %esi
.LBytewise_Loop:
        movzbl (%edx,%esi), %ebx
        not %ebx
        and (%eax,%esi), %bl
        mov %bl, (%ecx,%esi)
        sub $1, %esi
        jae .LBytewise_Loop
        pop %esi
        pop %ebx
end;
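
{ Why the tail is precalculated above: the 4x loop and the final 4-byte store
  overlap whenever size is not a multiple of 4 (e.g. size = 6 handles offsets
  2..5 in the loop and 0..3 in the tail). For 'or' and 'and' the overlapped
  bytes are merely recombined to the same value, but for 'and not' (and 'xor'
  below) rereading result bytes already stored through an aliased dest would
  change the answer, so the tail is computed up front and written last. }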

procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_sub_sets {$else} fpc_varset_sub_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm
        push %ebx
        mov 8(%esp), %ebx
        sub $16, %ebx { ebx = position }
        jl .LFallback { Hopefully dead branch... }

        movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
        movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
        andnps %xmm1, %xmm2

.L16x_Loop:
        movups (%eax,%ebx), %xmm1
        movups (%edx,%ebx), %xmm0
        andnps %xmm1, %xmm0
        movups %xmm0, (%ecx,%ebx)
        sub $16, %ebx
        ja .L16x_Loop

        movups %xmm2, (%ecx) { Write precalculated tail. }
        pop %ebx
        ret $4

.LFallback:
        pop %ebx
        jmp fpc_varset_sub_sets_plain
end;

{$ifndef CPUX86_HAS_SSEUNIT}
procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;

var
  fpc_varset_sub_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_sub_sets_dispatch;

procedure fpc_varset_sub_sets_dispatch(const set1,set2; var dest;size : ptrint);
begin
  if has_sse_support then
    fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_sse
  else
    fpc_varset_sub_sets_impl:=@fpc_varset_sub_sets_plain;
  fpc_varset_sub_sets_impl(set1,set2,dest,size);
end;

procedure fpc_varset_sub_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
begin
  fpc_varset_sub_sets_impl(set1,set2,dest,size);
end;
{$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_sub_sets dispatcher)}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}

{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'.
  eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm
        push %ebx
        push %esi
        mov 12(%esp), %esi { esi = size }
        sub $4, %esi
        jl .LBytewise_Prepare { probably dead branch... }

        mov (%eax), %ebx { Tail, just in case (if size is always divisible by 4, 4x_Loop can be altered to handle everything instead). }
        xor (%edx), %ebx { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
        push %ebx
.L4x_Loop:
        mov (%eax,%esi), %ebx
        xor (%edx,%esi), %ebx
        mov %ebx, (%ecx,%esi)
        sub $4, %esi
        ja .L4x_Loop

        pop %ebx
        mov %ebx, (%ecx) { Write precalculated tail. }
        pop %esi
        pop %ebx
        ret $4

.LBytewise_Prepare:
        add $3, %esi
.LBytewise_Loop:
        movzbl (%eax,%esi), %ebx
        xor (%edx,%esi), %bl
        mov %bl, (%ecx,%esi)
        sub $1, %esi
        jae .LBytewise_Loop
        pop %esi
        pop %ebx
end;

procedure {$ifdef CPUX86_HAS_SSEUNIT} fpc_varset_symdif_sets {$else} fpc_varset_symdif_sets_sse {$endif} (const set1,set2; var dest;size : ptrint); assembler; nostackframe; {$ifdef CPUX86_HAS_SSEUNIT} compilerproc; {$endif}
{ Same as fpc_varset_sub_sets_sse but with 'xor' instead of 'and not'.
  eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm
        push %ebx
        mov 8(%esp), %ebx
        sub $16, %ebx { ebx = position }
        jl .LFallback { Hopefully dead branch... }

        movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
        movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
        xorps %xmm1, %xmm2

.L16x_Loop:
        movups (%eax,%ebx), %xmm1
        movups (%edx,%ebx), %xmm0
        xorps %xmm1, %xmm0
        movups %xmm0, (%ecx,%ebx)
        sub $16, %ebx
        ja .L16x_Loop

        movups %xmm2, (%ecx) { Write precalculated tail. }
        pop %ebx
        ret $4

.LFallback:
        pop %ebx
        jmp fpc_varset_symdif_sets_plain
end;

{$ifndef CPUX86_HAS_SSEUNIT}
procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint); forward;

var
  fpc_varset_symdif_sets_impl: procedure(const set1,set2; var dest;size : ptrint) = @fpc_varset_symdif_sets_dispatch;

procedure fpc_varset_symdif_sets_dispatch(const set1,set2; var dest;size : ptrint);
begin
  if has_sse_support then
    fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_sse
  else
    fpc_varset_symdif_sets_impl:=@fpc_varset_symdif_sets_plain;
  fpc_varset_symdif_sets_impl(set1,set2,dest,size);
end;

procedure fpc_varset_symdif_sets(const set1,set2; var dest;size : ptrint); compilerproc; inline;
begin
  fpc_varset_symdif_sets_impl(set1,set2,dest,size);
end;
{$endif ndef CPUX86_HAS_SSEUNIT (need fpc_varset_symdif_sets dispatcher)}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}

{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
{$define FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}
function fpc_varset_contains_sets_plain(const set1,set2;size : ptrint):boolean; assembler; nostackframe;
{ eax = set1, edx = set2, ecx = size }
asm
        push %ebx
        sub $4, %ecx
        jl .LBytewise_Prepare { probably dead branch... }
        add %ecx, %eax
        add %ecx, %edx
        neg %ecx { Now ecx = -(size - 4), eax points to set1 + size - 4, edx points to set2 + size - 4. Loop ends on size >= 0, leaving up to 4 tail bytes. }

.L4x_Loop:
        mov (%edx,%ecx), %ebx
        not %ebx
        test %ebx, (%eax,%ecx)
        jnz .LNo
        add $4, %ecx
        js .L4x_Loop

        mov (%edx), %ebx { Tail. }
        not %ebx
        mov %eax, %ecx { eax value is still required to access set1 tail, but eax is going to be xor-zeroed for setz. }
        xor %eax, %eax
        test %ebx, (%ecx)
        setz %al
        pop %ebx
        ret

.LNo:
        xor %eax, %eax
        pop %ebx
        ret

.LBytewise_Prepare:
        add $4, %ecx
        neg %ecx
        sub %ecx, %eax
        sub %ecx, %edx
.LBytewise_Loop:
        movzbl (%edx,%ecx), %ebx
        not %ebx
        test %bl, (%eax,%ecx)
        jnz .LNo
        inc %ecx
        jnz .LBytewise_Loop
        mov $1, %eax
        pop %ebx
end;
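
{ The containment test answers "set1 <= set2", i.e. whether set1 and not set2
  is empty. A minimal pure-Pascal sketch of the predicate (illustration only,
  with a hypothetical name and a bytewise loop in place of the 4-byte chunks
  used above):

    function VarSetContainsSetsReference(const set1,set2; size : ptrint) : boolean;
      var
        i : ptrint;
      begin
        for i:=0 to size-1 do
          if pbyte(@set1)[i] and not pbyte(@set2)[i] <> 0 then
            exit(false);
        result:=true;
      end;
}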

function {$ifdef CPUX86_HAS_SSE2} fpc_varset_contains_sets {$else} fpc_varset_contains_sets_sse2 {$endif} (const set1,set2;size : ptrint):boolean; assembler; nostackframe; {$ifdef CPUX86_HAS_SSE2} compilerproc; {$endif}
{ eax = set1, edx = set2, ecx = size }
asm
        sub $16, %ecx
        jl .LFallback { probably dead branch... }

        { Folds all 16-byte "set1 and not set2" chunks with OR and checks the final result for zero. Better for small enough sets. }
        movdqu (%eax), %xmm1
        movdqu (%edx), %xmm2
        pandn %xmm1, %xmm2

.L16x_Loop:
        movdqu (%eax,%ecx), %xmm1
        movdqu (%edx,%ecx), %xmm0
        pandn %xmm1, %xmm0
        por %xmm0, %xmm2
        sub $16, %ecx
        ja .L16x_Loop

        pxor %xmm0, %xmm0
        pcmpeqb %xmm2, %xmm0
        pmovmskb %xmm0, %ecx
        xor %eax, %eax
        inc %cx
        setz %al
        ret

.LFallback:
        add $16, %ecx
        jmp fpc_varset_contains_sets_plain
end;
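
{ Zero test above: pcmpeqb sets each byte of xmm0 to $FF where the folded
  "set1 and not set2" byte is zero, pmovmskb packs those 16 flags into cx,
  and cx = $FFFF (all bytes empty) is detected by letting 'inc %cx' wrap to
  zero before setz. Note this variant requires SSE2 for the integer
  pandn/pcmpeqb/pmovmskb instructions, while the or/and/xor versions above
  only need the SSE unit. }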

{$ifndef CPUX86_HAS_SSE2}
function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean; forward;

var
  fpc_varset_contains_sets_impl: function(const set1,set2;size : ptrint):boolean = @fpc_varset_contains_sets_dispatch;

function fpc_varset_contains_sets_dispatch(const set1,set2;size : ptrint):boolean;
begin
  if has_sse2_support then
    fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_sse2
  else
    fpc_varset_contains_sets_impl:=@fpc_varset_contains_sets_plain;
  result:=fpc_varset_contains_sets_impl(set1,set2,size);
end;

function fpc_varset_contains_sets(const set1,set2;size : ptrint):boolean; compilerproc; inline;
begin
  result:=fpc_varset_contains_sets_impl(set1,set2,size);
end;
{$endif ndef CPUX86_HAS_SSE2 (need fpc_varset_contains_sets dispatcher)}
{$endif ndef FPC_SYSTEM_HAS_FPC_VARSET_CONTAINS_SET}