Darwin: re-enable new assembler fill*word variants

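Jumping to a label in the middle of another procedure is not usable on Darwin, so work around it there: the code after the label becomes a separate procedure (FillXxxx_MoreThanTwoXMMs) that is reached with an extra jump.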
This commit is contained in:
Rika Ichinose 2024-11-22 03:18:18 +03:00
parent 26b6d0223e
commit d1db5d2104
2 changed files with 49 additions and 38 deletions

View File

@ -168,12 +168,14 @@ end;
Since FPC does not attempt it even for local labels, this kind of code is inherently unsafe. Since FPC does not attempt it even for local labels, this kind of code is inherently unsafe.
} }
{$if not defined(darwin) and {$ifndef darwin}
(not defined(FPC_SYSTEM_HAS_FILLCHAR) {$define can_jump_into_the_middle_of_a_procedure}
{$endif darwin}
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
or not defined(FPC_SYSTEM_HAS_FILLWORD) or not defined(FPC_SYSTEM_HAS_FILLWORD)
or not defined(FPC_SYSTEM_HAS_FILLDWORD) or not defined(FPC_SYSTEM_HAS_FILLDWORD)
or not defined(FPC_SYSTEM_HAS_FILLQWORD) or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
)}
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR) {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
or not defined(FPC_SYSTEM_HAS_FILLWORD) or not defined(FPC_SYSTEM_HAS_FILLWORD)
@ -209,13 +211,15 @@ asm
end; end;
{$endif FillChar/Word/DWord required.} {$endif FillChar/Word/DWord required.}
{$ifdef can_jump_into_the_middle_of_a_procedure}
label label
FillXxxx_MoreThanTwoXMMs; FillXxxx_MoreThanTwoXMMs;
{$else can_jump_into_the_middle_of_a_procedure}
procedure FillXxxx_MoreThanTwoXMMs; forward;
{$endif can_jump_into_the_middle_of_a_procedure}
procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe; procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
{ eax x, ecx uint32 pattern, edx byte count >= 16 (preferably > 16). } { eax x, ecx uint32 pattern, edx byte count >= 16 (preferably > 16). }
const
NtThreshold = 4 * 1024 * 1024;
asm asm
movd %ecx, %xmm0 movd %ecx, %xmm0
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes } pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
@ -240,10 +244,17 @@ asm
movd %esi, %xmm0 movd %esi, %xmm0
pshufd $0, %xmm0, %xmm0 pshufd $0, %xmm0, %xmm0
pop %esi pop %esi
{$ifdef can_jump_into_the_middle_of_a_procedure}
{ FillChar (to skip the misaligning above) and FillQWord jump here. { FillChar (to skip the misaligning above) and FillQWord jump here.
eax x, edx byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. } eax x, edx byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
FillXxxx_MoreThanTwoXMMs: FillXxxx_MoreThanTwoXMMs:
{$else can_jump_into_the_middle_of_a_procedure}
jmp FillXxxx_MoreThanTwoXMMs
end;
procedure FillXxxx_MoreThanTwoXMMs; assembler; nostackframe;
asm
{$endif can_jump_into_the_middle_of_a_procedure}
lea -65(%eax,%edx), %ecx lea -65(%eax,%edx), %ecx
and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. } and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
mov %ecx, %edx { Remember T4 to edx. } mov %ecx, %edx { Remember T4 to edx. }
@ -259,7 +270,7 @@ FillXxxx_MoreThanTwoXMMs:
jle .LFourAlignedTailWrites { ecx was 9648 } jle .LFourAlignedTailWrites { ecx was 9648 }
add $48, %eax { eax = H3. } add $48, %eax { eax = H3. }
cmp $NtThreshold, %ecx cmp $4 * 1024 * 1024, %ecx { Non-temporal fill threshold. }
jae .L64xNT_Body jae .L64xNT_Body
.balign 16 { no-op } .balign 16 { no-op }
@ -339,8 +350,7 @@ end;
{$endif FillChar/Word/DWord/QWord required.} {$endif FillChar/Word/DWord/QWord required.}
{$if not defined(darwin) and {$if not defined(FPC_SYSTEM_HAS_FILLCHAR)}
not defined(FPC_SYSTEM_HAS_FILLCHAR)}
{$define FPC_SYSTEM_HAS_FILLCHAR} {$define FPC_SYSTEM_HAS_FILLCHAR}
procedure FillChar_3OrLess; assembler; nostackframe; procedure FillChar_3OrLess; assembler; nostackframe;
{ cl x, edx byte count, Low(int32) <= edx <= 3. } { cl x, edx byte count, Low(int32) <= edx <= 3. }
@ -438,8 +448,7 @@ end;
{$endif FPC_SYSTEM_HAS_FILLCHAR} {$endif FPC_SYSTEM_HAS_FILLCHAR}
{$if not defined(darwin) and {$if not defined(FPC_SYSTEM_HAS_FILLWORD)}
not defined(FPC_SYSTEM_HAS_FILLWORD)}
{$define FPC_SYSTEM_HAS_FILLWORD} {$define FPC_SYSTEM_HAS_FILLWORD}
procedure FillWord_3OrLess; assembler; nostackframe; procedure FillWord_3OrLess; assembler; nostackframe;
asm asm
@ -527,8 +536,7 @@ end;
{$endif FPC_SYSTEM_HAS_FILLWORD} {$endif FPC_SYSTEM_HAS_FILLWORD}
{$if not defined(darwin) and {$if not defined(FPC_SYSTEM_HAS_FILLDWORD)}
not defined(FPC_SYSTEM_HAS_FILLDWORD)}
{$define FPC_SYSTEM_HAS_FILLDWORD} {$define FPC_SYSTEM_HAS_FILLDWORD}
procedure FillDWord_4OrLess; assembler; nostackframe; procedure FillDWord_4OrLess; assembler; nostackframe;
asm asm
@ -602,8 +610,7 @@ end;
{$endif FPC_SYSTEM_HAS_FILLDWORD} {$endif FPC_SYSTEM_HAS_FILLDWORD}
{$if not defined(darwin) and {$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
not defined(FPC_SYSTEM_HAS_FILLQWORD)}
{$define FPC_SYSTEM_HAS_FILLQWORD} {$define FPC_SYSTEM_HAS_FILLQWORD}
{$ifndef CPUX86_HAS_SSE2} {$ifndef CPUX86_HAS_SSE2}
procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe; procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;

View File

@ -15,14 +15,10 @@
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS} {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS} {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
label
fpc_varset_add_sets_plain_fallback;
procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe; procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size } { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm asm
push %ebx push %ebx
fpc_varset_add_sets_plain_fallback:
push %esi push %esi
mov 12(%esp), %esi { esi = size } mov 12(%esp), %esi { esi = size }
sub $4, %esi sub $4, %esi
@ -60,7 +56,7 @@ asm
push %ebx push %ebx
mov 8(%esp), %ebx mov 8(%esp), %ebx
sub $16, %ebx { ebx = position } sub $16, %ebx { ebx = position }
jl fpc_varset_add_sets_plain_fallback { probably dead branch... } jl .LFallback { Hopefully dead branch... }
.L16x_Loop: .L16x_Loop:
movups (%eax,%ebx), %xmm0 movups (%eax,%ebx), %xmm0
@ -75,6 +71,11 @@ asm
orps %xmm1, %xmm0 orps %xmm1, %xmm0
movups %xmm0, (%ecx) movups %xmm0, (%ecx)
pop %ebx pop %ebx
ret $4
.LFallback:
pop %ebx
jmp fpc_varset_add_sets_plain
end; end;
{$ifndef CPUX86_HAS_SSEUNIT} {$ifndef CPUX86_HAS_SSEUNIT}
@ -101,14 +102,10 @@ end;
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS} {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS} {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
label
fpc_varset_mul_sets_plain_fallback;
procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe; procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. } { Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. }
asm asm
push %ebx push %ebx
fpc_varset_mul_sets_plain_fallback:
push %esi push %esi
mov 12(%esp), %esi { esi = size } mov 12(%esp), %esi { esi = size }
sub $4, %esi sub $4, %esi
@ -146,7 +143,7 @@ asm
push %ebx push %ebx
mov 8(%esp), %ebx mov 8(%esp), %ebx
sub $16, %ebx { ebx = position } sub $16, %ebx { ebx = position }
jl fpc_varset_mul_sets_plain_fallback { probably dead branch... } jl .LFallback { Hopefully dead branch... }
.L16x_Loop: .L16x_Loop:
movups (%eax,%ebx), %xmm0 movups (%eax,%ebx), %xmm0
@ -161,6 +158,11 @@ asm
andps %xmm1, %xmm0 andps %xmm1, %xmm0
movups %xmm0, (%ecx) movups %xmm0, (%ecx)
pop %ebx pop %ebx
ret $4
.LFallback:
pop %ebx
jmp fpc_varset_mul_sets_plain
end; end;
{$ifndef CPUX86_HAS_SSEUNIT} {$ifndef CPUX86_HAS_SSEUNIT}
@ -187,14 +189,10 @@ end;
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS} {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS} {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
label
fpc_varset_sub_sets_plain_fallback;
procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe; procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size } { eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm asm
push %ebx push %ebx
fpc_varset_sub_sets_plain_fallback:
push %esi push %esi
mov 12(%esp), %esi { esi = size } mov 12(%esp), %esi { esi = size }
sub $4, %esi sub $4, %esi
@ -237,7 +235,7 @@ asm
push %ebx push %ebx
mov 8(%esp), %ebx mov 8(%esp), %ebx
sub $16, %ebx { ebx = position } sub $16, %ebx { ebx = position }
jl fpc_varset_sub_sets_plain_fallback { probably dead branch... } jl .LFallback { Hopefully dead branch... }
movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). } movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. } movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
@ -253,6 +251,11 @@ asm
movups %xmm2, (%ecx) { Write precalculated tail. } movups %xmm2, (%ecx) { Write precalculated tail. }
pop %ebx pop %ebx
ret $4
.LFallback:
pop %ebx
jmp fpc_varset_sub_sets_plain
end; end;
{$ifndef CPUX86_HAS_SSEUNIT} {$ifndef CPUX86_HAS_SSEUNIT}
@ -279,15 +282,11 @@ end;
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS} {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS} {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
label
fpc_varset_symdif_sets_plain_fallback;
procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe; procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
{ Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'. { Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'.
eax = set1, edx = set2, ecx = dest, [esp + 4] = size } eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
asm asm
push %ebx push %ebx
fpc_varset_symdif_sets_plain_fallback:
push %esi push %esi
mov 12(%esp), %esi { esi = size } mov 12(%esp), %esi { esi = size }
sub $4, %esi sub $4, %esi
@ -328,7 +327,7 @@ asm
push %ebx push %ebx
mov 8(%esp), %ebx mov 8(%esp), %ebx
sub $16, %ebx { ebx = position } sub $16, %ebx { ebx = position }
jl fpc_varset_symdif_sets_plain_fallback { probably dead branch... } jl .LFallback { Hopefully dead branch... }
movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). } movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. } movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
@ -344,6 +343,11 @@ asm
movups %xmm2, (%ecx) { Write precalculated tail. } movups %xmm2, (%ecx) { Write precalculated tail. }
pop %ebx pop %ebx
ret $4
.LFallback:
pop %ebx
jmp fpc_varset_symdif_sets_plain
end; end;
{$ifndef CPUX86_HAS_SSEUNIT} {$ifndef CPUX86_HAS_SSEUNIT}