diff --git a/rtl/i386/i386.inc b/rtl/i386/i386.inc index 76b9e99026..e52788c5e2 100644 --- a/rtl/i386/i386.inc +++ b/rtl/i386/i386.inc @@ -168,12 +168,14 @@ end; Since FPC does not attempt it even for local labels, this kind of code is inherently unsafe. } -{$if not defined(darwin) and - (not defined(FPC_SYSTEM_HAS_FILLCHAR) - or not defined(FPC_SYSTEM_HAS_FILLWORD) - or not defined(FPC_SYSTEM_HAS_FILLDWORD) - or not defined(FPC_SYSTEM_HAS_FILLQWORD) -)} +{$ifndef darwin} + {$define can_jump_into_the_middle_of_a_procedure} +{$endif darwin} + +{$if not defined(FPC_SYSTEM_HAS_FILLCHAR) + or not defined(FPC_SYSTEM_HAS_FILLWORD) + or not defined(FPC_SYSTEM_HAS_FILLDWORD) + or not defined(FPC_SYSTEM_HAS_FILLQWORD)} {$if not defined(FPC_SYSTEM_HAS_FILLCHAR) or not defined(FPC_SYSTEM_HAS_FILLWORD) @@ -209,13 +211,15 @@ asm end; {$endif FillChar/Word/DWord required.} +{$ifdef can_jump_into_the_middle_of_a_procedure} label FillXxxx_MoreThanTwoXMMs; +{$else can_jump_into_the_middle_of_a_procedure} +procedure FillXxxx_MoreThanTwoXMMs; forward; +{$endif can_jump_into_the_middle_of_a_procedure} procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe; { eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). } -const - NtThreshold = 4 * 1024 * 1024; asm movd %ecx, %xmm0 pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes } @@ -240,10 +244,17 @@ asm movd %esi, %xmm0 pshufd $0, %xmm0, %xmm0 pop %esi - +{$ifdef can_jump_into_the_middle_of_a_procedure} { FillChar (to skip the misaligning above) and FillQWord jump here. eax — x, edx — byte count > 32, xmm0 = pattern for ALIGNED writes, first and last 16 bytes written. } FillXxxx_MoreThanTwoXMMs: +{$else can_jump_into_the_middle_of_a_procedure} + jmp FillXxxx_MoreThanTwoXMMs +end; + +procedure FillXxxx_MoreThanTwoXMMs; assembler; nostackframe; +asm +{$endif can_jump_into_the_middle_of_a_procedure} lea -65(%eax,%edx), %ecx and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. } mov %ecx, %edx { Remember T4 to edx. } @@ -259,7 +270,7 @@ FillXxxx_MoreThanTwoXMMs: jle .LFourAlignedTailWrites { ecx was ≤ 96−48 } add $48, %eax { eax = H3. } - cmp $NtThreshold, %ecx + cmp $4 * 1024 * 1024, %ecx { Non-temporal fill threshold. } jae .L64xNT_Body .balign 16 { no-op } @@ -339,8 +350,7 @@ end; {$endif FillChar/Word/DWord/QWord required.} -{$if not defined(darwin) and - not defined(FPC_SYSTEM_HAS_FILLCHAR)} +{$if not defined(FPC_SYSTEM_HAS_FILLCHAR)} {$define FPC_SYSTEM_HAS_FILLCHAR} procedure FillChar_3OrLess; assembler; nostackframe; { cl — x, edx — byte count, Low(int32) <= edx <= 3. } @@ -438,8 +448,7 @@ end; {$endif FPC_SYSTEM_HAS_FILLCHAR} -{$if not defined(darwin) and - not defined(FPC_SYSTEM_HAS_FILLWORD)} +{$if not defined(FPC_SYSTEM_HAS_FILLWORD)} {$define FPC_SYSTEM_HAS_FILLWORD} procedure FillWord_3OrLess; assembler; nostackframe; asm @@ -527,8 +536,7 @@ end; {$endif FPC_SYSTEM_HAS_FILLWORD} -{$if not defined(darwin) and - not defined(FPC_SYSTEM_HAS_FILLDWORD)} +{$if not defined(FPC_SYSTEM_HAS_FILLDWORD)} {$define FPC_SYSTEM_HAS_FILLDWORD} procedure FillDWord_4OrLess; assembler; nostackframe; asm @@ -602,8 +610,7 @@ end; {$endif FPC_SYSTEM_HAS_FILLDWORD} -{$if not defined(darwin) and - not defined(FPC_SYSTEM_HAS_FILLQWORD)} +{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)} {$define FPC_SYSTEM_HAS_FILLQWORD} {$ifndef CPUX86_HAS_SSE2} procedure FillQWord_Plain(var x;count:SizeInt;value:QWord);assembler;nostackframe; diff --git a/rtl/i386/set.inc b/rtl/i386/set.inc index c94bc93af9..bb482d524a 100644 --- a/rtl/i386/set.inc +++ b/rtl/i386/set.inc @@ -15,14 +15,10 @@ {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS} {$define FPC_SYSTEM_HAS_FPC_VARSET_ADD_SETS} -label - fpc_varset_add_sets_plain_fallback; - procedure fpc_varset_add_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe; { eax = set1, edx = set2, ecx = dest, [esp + 4] = size } asm push %ebx -fpc_varset_add_sets_plain_fallback: push %esi mov 12(%esp), %esi { esi = size } sub $4, %esi @@ -60,7 +56,7 @@ asm push %ebx mov 8(%esp), %ebx sub $16, %ebx { ebx = position } - jl fpc_varset_add_sets_plain_fallback { probably dead branch... } + jl .LFallback { Hopefully dead branch... } .L16x_Loop: movups (%eax,%ebx), %xmm0 @@ -75,6 +71,11 @@ asm orps %xmm1, %xmm0 movups %xmm0, (%ecx) pop %ebx + ret $4 + +.LFallback: + pop %ebx + jmp fpc_varset_add_sets_plain end; {$ifndef CPUX86_HAS_SSEUNIT} @@ -101,14 +102,10 @@ end; {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS} {$define FPC_SYSTEM_HAS_FPC_VARSET_MUL_SETS} -label - fpc_varset_mul_sets_plain_fallback; - procedure fpc_varset_mul_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe; { Same as fpc_varset_add_sets_plain but with 'and' instead of 'or'. } asm push %ebx -fpc_varset_mul_sets_plain_fallback: push %esi mov 12(%esp), %esi { esi = size } sub $4, %esi @@ -146,7 +143,7 @@ asm push %ebx mov 8(%esp), %ebx sub $16, %ebx { ebx = position } - jl fpc_varset_mul_sets_plain_fallback { probably dead branch... } + jl .LFallback { Hopefully dead branch... } .L16x_Loop: movups (%eax,%ebx), %xmm0 @@ -161,6 +158,11 @@ asm andps %xmm1, %xmm0 movups %xmm0, (%ecx) pop %ebx + ret $4 + +.LFallback: + pop %ebx + jmp fpc_varset_mul_sets_plain end; {$ifndef CPUX86_HAS_SSEUNIT} @@ -187,14 +189,10 @@ end; {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS} {$define FPC_SYSTEM_HAS_FPC_VARSET_SUB_SETS} -label - fpc_varset_sub_sets_plain_fallback; - procedure fpc_varset_sub_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe; { eax = set1, edx = set2, ecx = dest, [esp + 4] = size } asm push %ebx -fpc_varset_sub_sets_plain_fallback: push %esi mov 12(%esp), %esi { esi = size } sub $4, %esi @@ -237,7 +235,7 @@ asm push %ebx mov 8(%esp), %ebx sub $16, %ebx { ebx = position } - jl fpc_varset_sub_sets_plain_fallback { probably dead branch... } + jl .LFallback { Hopefully dead branch... } movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). } movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. } @@ -253,6 +251,11 @@ asm movups %xmm2, (%ecx) { Write precalculated tail. } pop %ebx + ret $4 + +.LFallback: + pop %ebx + jmp fpc_varset_sub_sets_plain end; {$ifndef CPUX86_HAS_SSEUNIT} @@ -279,15 +282,11 @@ end; {$ifndef FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS} {$define FPC_SYSTEM_HAS_FPC_VARSET_SYMDIF_SETS} -label - fpc_varset_symdif_sets_plain_fallback; - procedure fpc_varset_symdif_sets_plain(const set1,set2; var dest;size : ptrint); assembler; nostackframe; { Same as fpc_varset_sub_sets_plain but with 'xor' instead of 'and not'. eax = set1, edx = set2, ecx = dest, [esp + 4] = size } asm push %ebx -fpc_varset_symdif_sets_plain_fallback: push %esi mov 12(%esp), %esi { esi = size } sub $4, %esi @@ -328,7 +327,7 @@ asm push %ebx mov 8(%esp), %ebx sub $16, %ebx { ebx = position } - jl fpc_varset_symdif_sets_plain_fallback { probably dead branch... } + jl .LFallback { Hopefully dead branch... } movups (%eax), %xmm1 { Tail, just in case (if size is always divisible by 16, 16x_Loop can be altered to handle everything instead). } movups (%edx), %xmm2 { Precalculated because operation is not idempotent and dest can be equal to set1/set2. } @@ -344,6 +343,11 @@ asm movups %xmm2, (%ecx) { Write precalculated tail. } pop %ebx + ret $4 + +.LFallback: + pop %ebx + jmp fpc_varset_symdif_sets_plain end; {$ifndef CPUX86_HAS_SSEUNIT}