mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-08-22 19:49:23 +02:00
Darwin: re-enable new assembler fill*word variants
Darwin cannot jump into the middle of another procedure, so work around this with an extra jump to a separate procedure (FillXxxx_MoreThanTwoXMMs).
This commit is contained in:
parent
26b6d0223e
commit
d1db5d2104
@ -168,12 +168,14 @@ end;
|
|||||||
|
|
||||||
Since FPC does not attempt it even for local labels, this kind of code is inherently unsafe.
|
Since FPC does not attempt it even for local labels, this kind of code is inherently unsafe.
|
||||||
}
|
}
|
||||||
{$if not defined(darwin) and
|
{$ifndef darwin}
|
||||||
(not defined(FPC_SYSTEM_HAS_FILLCHAR)
|
{$define can_jump_into_the_middle_of_a_procedure}
|
||||||
|
{$endif darwin}
|
||||||
|
|
||||||
|
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
|
||||||
or not defined(FPC_SYSTEM_HAS_FILLWORD)
|
or not defined(FPC_SYSTEM_HAS_FILLWORD)
|
||||||
or not defined(FPC_SYSTEM_HAS_FILLDWORD)
|
or not defined(FPC_SYSTEM_HAS_FILLDWORD)
|
||||||
or not defined(FPC_SYSTEM_HAS_FILLQWORD)
|
or not defined(FPC_SYSTEM_HAS_FILLQWORD)}
|
||||||
)}
|
|
||||||
|
|
||||||
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
|
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)
|
||||||
or not defined(FPC_SYSTEM_HAS_FILLWORD)
|
or not defined(FPC_SYSTEM_HAS_FILLWORD)
|
||||||
@ -209,13 +211,15 @@ asm
|
|||||||
end;
|
end;
|
||||||
{$endif FillChar/Word/DWord required.}
|
{$endif FillChar/Word/DWord required.}
|
||||||
|
|
||||||
|
{$ifdef can_jump_into_the_middle_of_a_procedure}
|
||||||
label
|
label
|
||||||
FillXxxx_MoreThanTwoXMMs;
|
FillXxxx_MoreThanTwoXMMs;
|
||||||
|
{$else can_jump_into_the_middle_of_a_procedure}
|
||||||
|
procedure FillXxxx_MoreThanTwoXMMs; forward;
|
||||||
|
{$endif can_jump_into_the_middle_of_a_procedure}
|
||||||
|
|
||||||
procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
|
procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
|
||||||
{ eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
|
{ eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
|
||||||
const
|
|
||||||
NtThreshold = 4 * 1024 * 1024;
|
|
||||||
asm
|
asm
|
||||||
movd %ecx, %xmm0
|
movd %ecx, %xmm0
|
||||||
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
|
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
|
||||||
@ -240,10 +244,17 @@ asm
|
|||||||
movd %esi, %xmm0
|
movd %esi, %xmm0
|
||||||
pshufd $0, %xmm0, %xmm0
|
pshufd $0, %xmm0, %xmm0
|
||||||
pop %esi
|
pop %esi
|
||||||
|
{$ifdef can_jump_into_the_middle_of_a_procedure}
|
||||||
{ FillChar (to skip the misaligning above) and FillQWord jump here.
|
{ FillChar (to skip the misaligning above) and FillQWord jump here.
|
||||||
eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
|
eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. }
|
||||||
FillXxxx_MoreThanTwoXMMs:
|
FillXxxx_MoreThanTwoXMMs:
|
||||||
|
{$else can_jump_into_the_middle_of_a_procedure}
|
||||||
|
jmp FillXxxx_MoreThanTwoXMMs
|
||||||
|
end;
|
||||||
|
|
||||||
|
procedure FillXxxx_MoreThanTwoXMMs; assembler; nostackframe;
|
||||||
|
asm
|
||||||
|
{$endif can_jump_into_the_middle_of_a_procedure}
|
||||||
lea -65(%eax,%edx), %ecx
|
lea -65(%eax,%edx), %ecx
|
||||||
and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
|
and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
|
||||||
mov %ecx, %edx { Remember T4 to edx. }
|
mov %ecx, %edx { Remember T4 to edx. }
|
||||||
@ -259,7 +270,7 @@ FillXxxx_MoreThanTwoXMMs:
|
|||||||
jle .LFourAlignedTailWrites { ecx was ≤ 96−48 }
|
jle .LFourAlignedTailWrites { ecx was ≤ 96−48 }
|
||||||
|
|
||||||
add $48, %eax { eax = H3. }
|
add $48, %eax { eax = H3. }
|
||||||
cmp $NtThreshold, %ecx
|
cmp $4 * 1024 * 1024, %ecx { Non-temporal fill threshold. }
|
||||||
jae .L64xNT_Body
|
jae .L64xNT_Body
|
||||||
|
|
||||||
.balign 16 { no-op }
|
.balign 16 { no-op }
|
||||||
@ -339,8 +350,7 @@ end;
|
|||||||
{$endif FillChar/Word/DWord/QWord required.}
|
{$endif FillChar/Word/DWord/QWord required.}
|
||||||
|
|
||||||
|
|
||||||
{$if not defined(darwin) and
|
{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)}
|
||||||
not defined(FPC_SYSTEM_HAS_FILLCHAR)}
|
|
||||||
{$define FPC_SYSTEM_HAS_FILLCHAR}
|
{$define FPC_SYSTEM_HAS_FILLCHAR}
|
||||||
procedure FillChar_3OrLess; assembler; nostackframe;
|
procedure FillChar_3OrLess; assembler; nostackframe;
|
||||||
{ cl — x, edx — byte count, Low(int32) <= edx <= 3. }
|
{ cl — x, edx — byte count, Low(int32) <= edx <= 3. }
|
||||||
@ -438,8 +448,7 @@ end;
|
|||||||
{$endif FPC_SYSTEM_HAS_FILLCHAR}
|
{$endif FPC_SYSTEM_HAS_FILLCHAR}
|
||||||
|
|
||||||
|
|
||||||
{$if not defined(darwin) and
|
{$if not defined(FPC_SYSTEM_HAS_FILLWORD)}
|
||||||
not defined(FPC_SYSTEM_HAS_FILLWORD)}
|
|
||||||
{$define FPC_SYSTEM_HAS_FILLWORD}
|
{$define FPC_SYSTEM_HAS_FILLWORD}
|
||||||
procedure FillWord_3OrLess; assembler; nostackframe;
|
procedure FillWord_3OrLess; assembler; nostackframe;
|
||||||
asm
|
asm
|
||||||
@ -527,8 +536,7 @@ end;
|
|||||||
{$endif FPC_SYSTEM_HAS_FILLWORD}
|
{$endif FPC_SYSTEM_HAS_FILLWORD}
|
||||||
|
|
||||||
|
|
||||||
{$if not defined(darwin) and
|
{$if not defined(FPC_SYSTEM_HAS_FILLDWORD)}
|
||||||
not defined(FPC_SYSTEM_HAS_FILLDWORD)}
|
|
||||||
{$define FPC_SYSTEM_HAS_FILLDWORD}
|
{$define FPC_SYSTEM_HAS_FILLDWORD}
|
||||||
procedure FillDWord_4OrLess; assembler; nostackframe;
|
procedure FillDWord_4OrLess; assembler; nostackframe;
|
||||||
asm
|
asm
|
||||||
@ -602,8 +610,7 @@ end;
|
|||||||
{$endif FPC_SYSTEM_HAS_FILLDWORD}
|
{$endif FPC_SYSTEM_HAS_FILLDWORD}
|
||||||
|
|
||||||
|
|
||||||
{$if not defined(darwin) and
|
{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
|
||||||
not defined(FPC_SYSTEM_HAS_FILLQWORD)}
|
|
||||||
{$define FPC_SYSTEM_HAS_FILLQWORD}
|
{$define FPC_SYSTEM_HAS_FILLQWORD}
|
||||||
{$ifndef CPUX86_HAS_SSE2}
|
{$ifndef CPUX86_HAS_SSE2}
|
||||||
procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
|
procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe;
|
||||||
|
@ -15,14 +15,10 @@
|
|||||||
|
|
||||||
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
|
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
|
||||||
{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
|
{$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS}
|
||||||
label
|
|
||||||
fpc_varset_add_sets_plain_fallback;
|
|
||||||
|
|
||||||
procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
|
procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
|
||||||
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
|
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
|
||||||
asm
|
asm
|
||||||
push %ebx
|
push %ebx
|
||||||
fpc_varset_add_sets_plain_fallback:
|
|
||||||
push %esi
|
push %esi
|
||||||
mov 12(%esp), %esi { esi = size }
|
mov 12(%esp), %esi { esi = size }
|
||||||
sub $4, %esi
|
sub $4, %esi
|
||||||
@ -60,7 +56,7 @@ asm
|
|||||||
push %ebx
|
push %ebx
|
||||||
mov 8(%esp), %ebx
|
mov 8(%esp), %ebx
|
||||||
sub $16, %ebx { ebx = position }
|
sub $16, %ebx { ebx = position }
|
||||||
jl fpc_varset_add_sets_plain_fallback { probably dead branch... }
|
jl .LFallback { Hopefully dead branch... }
|
||||||
|
|
||||||
.L16x_Loop:
|
.L16x_Loop:
|
||||||
movups (%eax,%ebx), %xmm0
|
movups (%eax,%ebx), %xmm0
|
||||||
@ -75,6 +71,11 @@ asm
|
|||||||
orps %xmm1, %xmm0
|
orps %xmm1, %xmm0
|
||||||
movups %xmm0, (%ecx)
|
movups %xmm0, (%ecx)
|
||||||
pop %ebx
|
pop %ebx
|
||||||
|
ret $4
|
||||||
|
|
||||||
|
.LFallback:
|
||||||
|
pop %ebx
|
||||||
|
jmp fpc_varset_add_sets_plain
|
||||||
end;
|
end;
|
||||||
|
|
||||||
{$ifndef CPUX86_HAS_SSEUNIT}
|
{$ifndef CPUX86_HAS_SSEUNIT}
|
||||||
@ -101,14 +102,10 @@ end;
|
|||||||
|
|
||||||
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
|
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
|
||||||
{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
|
{$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS}
|
||||||
label
|
|
||||||
fpc_varset_mul_sets_plain_fallback;
|
|
||||||
|
|
||||||
procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
|
procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
|
||||||
{ Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. }
|
{ Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. }
|
||||||
asm
|
asm
|
||||||
push %ebx
|
push %ebx
|
||||||
fpc_varset_mul_sets_plain_fallback:
|
|
||||||
push %esi
|
push %esi
|
||||||
mov 12(%esp), %esi { esi = size }
|
mov 12(%esp), %esi { esi = size }
|
||||||
sub $4, %esi
|
sub $4, %esi
|
||||||
@ -146,7 +143,7 @@ asm
|
|||||||
push %ebx
|
push %ebx
|
||||||
mov 8(%esp), %ebx
|
mov 8(%esp), %ebx
|
||||||
sub $16, %ebx { ebx = position }
|
sub $16, %ebx { ebx = position }
|
||||||
jl fpc_varset_mul_sets_plain_fallback { probably dead branch... }
|
jl .LFallback { Hopefully dead branch... }
|
||||||
|
|
||||||
.L16x_Loop:
|
.L16x_Loop:
|
||||||
movups (%eax,%ebx), %xmm0
|
movups (%eax,%ebx), %xmm0
|
||||||
@ -161,6 +158,11 @@ asm
|
|||||||
andps %xmm1, %xmm0
|
andps %xmm1, %xmm0
|
||||||
movups %xmm0, (%ecx)
|
movups %xmm0, (%ecx)
|
||||||
pop %ebx
|
pop %ebx
|
||||||
|
ret $4
|
||||||
|
|
||||||
|
.LFallback:
|
||||||
|
pop %ebx
|
||||||
|
jmp fpc_varset_mul_sets_plain
|
||||||
end;
|
end;
|
||||||
|
|
||||||
{$ifndef CPUX86_HAS_SSEUNIT}
|
{$ifndef CPUX86_HAS_SSEUNIT}
|
||||||
@ -187,14 +189,10 @@ end;
|
|||||||
|
|
||||||
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
|
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
|
||||||
{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
|
{$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS}
|
||||||
label
|
|
||||||
fpc_varset_sub_sets_plain_fallback;
|
|
||||||
|
|
||||||
procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
|
procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
|
||||||
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
|
{ eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
|
||||||
asm
|
asm
|
||||||
push %ebx
|
push %ebx
|
||||||
fpc_varset_sub_sets_plain_fallback:
|
|
||||||
push %esi
|
push %esi
|
||||||
mov 12(%esp), %esi { esi = size }
|
mov 12(%esp), %esi { esi = size }
|
||||||
sub $4, %esi
|
sub $4, %esi
|
||||||
@ -237,7 +235,7 @@ asm
|
|||||||
push %ebx
|
push %ebx
|
||||||
mov 8(%esp), %ebx
|
mov 8(%esp), %ebx
|
||||||
sub $16, %ebx { ebx = position }
|
sub $16, %ebx { ebx = position }
|
||||||
jl fpc_varset_sub_sets_plain_fallback { probably dead branch... }
|
jl .LFallback { Hopefully dead branch... }
|
||||||
|
|
||||||
movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
|
movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
|
||||||
movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
|
movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
|
||||||
@ -253,6 +251,11 @@ asm
|
|||||||
|
|
||||||
movups %xmm2, (%ecx) { Write precalculated tail. }
|
movups %xmm2, (%ecx) { Write precalculated tail. }
|
||||||
pop %ebx
|
pop %ebx
|
||||||
|
ret $4
|
||||||
|
|
||||||
|
.LFallback:
|
||||||
|
pop %ebx
|
||||||
|
jmp fpc_varset_sub_sets_plain
|
||||||
end;
|
end;
|
||||||
|
|
||||||
{$ifndef CPUX86_HAS_SSEUNIT}
|
{$ifndef CPUX86_HAS_SSEUNIT}
|
||||||
@ -279,15 +282,11 @@ end;
|
|||||||
|
|
||||||
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
|
{$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
|
||||||
{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
|
{$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS}
|
||||||
label
|
|
||||||
fpc_varset_symdif_sets_plain_fallback;
|
|
||||||
|
|
||||||
procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
|
procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe;
|
||||||
{ Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'.
|
{ Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'.
|
||||||
eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
|
eax = set1, edx = set2, ecx = dest, [esp + 4] = size }
|
||||||
asm
|
asm
|
||||||
push %ebx
|
push %ebx
|
||||||
fpc_varset_symdif_sets_plain_fallback:
|
|
||||||
push %esi
|
push %esi
|
||||||
mov 12(%esp), %esi { esi = size }
|
mov 12(%esp), %esi { esi = size }
|
||||||
sub $4, %esi
|
sub $4, %esi
|
||||||
@ -328,7 +327,7 @@ asm
|
|||||||
push %ebx
|
push %ebx
|
||||||
mov 8(%esp), %ebx
|
mov 8(%esp), %ebx
|
||||||
sub $16, %ebx { ebx = position }
|
sub $16, %ebx { ebx = position }
|
||||||
jl fpc_varset_symdif_sets_plain_fallback { probably dead branch... }
|
jl .LFallback { Hopefully dead branch... }
|
||||||
|
|
||||||
movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
|
movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). }
|
||||||
movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
|
movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. }
|
||||||
@ -344,6 +343,11 @@ asm
|
|||||||
|
|
||||||
movups %xmm2, (%ecx) { Write precalculated tail. }
|
movups %xmm2, (%ecx) { Write precalculated tail. }
|
||||||
pop %ebx
|
pop %ebx
|
||||||
|
ret $4
|
||||||
|
|
||||||
|
.LFallback:
|
||||||
|
pop %ebx
|
||||||
|
jmp fpc_varset_symdif_sets_plain
|
||||||
end;
|
end;
|
||||||
|
|
||||||
{$ifndef CPUX86_HAS_SSEUNIT}
|
{$ifndef CPUX86_HAS_SSEUNIT}
|
||||||
|
Loading…
Reference in New Issue
Block a user