Mirror of https://gitlab.com/freepascal.org/fpc/source.git (synced 2025-04-14 18:19:54 +02:00).
Don’t misalign FillChar pattern.
This commit is contained in:
Parent: 755d221230
Commit: a35577593b
@ -199,10 +199,8 @@ asm
|
||||
end;
|
||||
{$endif FillChar/Word/DWord required.}
|
||||
|
||||
{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
|
||||
label
|
||||
FillXxxx_MoreThanTwoXMMs;
|
||||
{$endif FillQWord required.}
|
||||
|
||||
procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
|
||||
{ eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
|
||||
@ -212,11 +210,11 @@ asm
|
||||
movd %ecx, %xmm0
|
||||
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
|
||||
movdqu %xmm0, (%eax)
|
||||
movdqu %xmm0, -16(%eax,%edx)
|
||||
cmp $32, %edx
|
||||
ja .LMoreThanTwoVectors
|
||||
movdqu %xmm0, -16(%eax,%edx)
|
||||
ret
|
||||
.byte 102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }
|
||||
.byte 102,102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }
|
||||
|
||||
{ x can start and end misaligned on the vector boundary:
|
||||
x = ~~][H1][H2][...][T2][T1]~
|
||||
@ -228,22 +226,18 @@ asm
|
||||
mov %ecx, %esi { esi = pattern }
|
||||
mov %eax, %ecx
|
||||
shl $3, %ecx { ecx = misalignment of x in bits }
|
||||
rol %cl, %esi { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
|
||||
rol %cl, %esi { misalign the pattern }
|
||||
movd %esi, %xmm1
|
||||
pshufd $0, %xmm1, %xmm1
|
||||
pop %esi
|
||||
|
||||
{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
|
||||
{ FillQWord jumps here.
|
||||
eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes.
|
||||
Expects first 16 bytes written...
|
||||
...and ESI pushed! }
|
||||
{ FillChar (to skip the misaligning above) and FillQWord jump here.
|
||||
eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes, first and last 16 bytes written. }
|
||||
FillXxxx_MoreThanTwoXMMs:
|
||||
{$endif FillQWord required.}
|
||||
lea -65(%eax,%edx), %ecx { ecx = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
|
||||
lea -65(%eax,%edx), %ecx
|
||||
and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
|
||||
and $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
|
||||
movdqa %xmm1, 16(%eax) { Write H1. }
|
||||
mov %ecx, %esi
|
||||
and $-16, %esi { esi = “T4” (possibly fictive) = aligned ecx = loop bound. }
|
||||
cmp $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
|
||||
jle .LOneAlignedTailWrite
|
||||
movdqa %xmm1, 32(%eax) { Write H2. }
|
||||
@ -256,24 +250,22 @@ FillXxxx_MoreThanTwoXMMs:
|
||||
cmp $NtThreshold, %edx
|
||||
jae .L64xNT_Body
|
||||
|
||||
.balign 16
|
||||
.balign 16 { no-op }
|
||||
.L64x_Body:
|
||||
movdqa %xmm1, (%eax)
|
||||
movdqa %xmm1, 16(%eax)
|
||||
movdqa %xmm1, 32(%eax)
|
||||
movdqa %xmm1, 48(%eax)
|
||||
add $64, %eax
|
||||
cmp %esi, %eax
|
||||
cmp %ecx, %eax
|
||||
jb .L64x_Body
|
||||
.LFourAlignedTailWrites:
|
||||
movdqa %xmm1, (%esi) { T4 }
|
||||
movdqa %xmm1, 16(%esi) { T3 }
|
||||
movdqa %xmm1, (%ecx) { T4 }
|
||||
movdqa %xmm1, 16(%ecx) { T3 }
|
||||
.LTwoAlignedTailWrites:
|
||||
movdqa %xmm1, 32(%esi) { T2 }
|
||||
movdqa %xmm1, 32(%ecx) { T2 }
|
||||
.LOneAlignedTailWrite:
|
||||
movdqa %xmm1, 48(%esi) { T1 }
|
||||
movdqu %xmm0, 49(%ecx) { UT }
|
||||
pop %esi
|
||||
movdqa %xmm1, 48(%ecx) { T1 }
|
||||
ret
|
||||
|
||||
.balign 16
|
||||
@ -283,7 +275,7 @@ FillXxxx_MoreThanTwoXMMs:
|
||||
movntdq %xmm1, 32(%eax)
|
||||
movntdq %xmm1, 48(%eax)
|
||||
add $64, %eax
|
||||
cmp %esi, %eax
|
||||
cmp %ecx, %eax
|
||||
jb .L64xNT_Body
|
||||
sfence
|
||||
jmp .LFourAlignedTailWrites
|
||||
@ -369,8 +361,15 @@ asm
|
||||
cmp $16, %edx
|
||||
jbe FillXxxx_U32Pattern_Ladder_4to16
|
||||
cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
|
||||
jb FillXxxx_U32Pattern_SSE2_16OrMore
|
||||
jmp FillXxxx_U32Pattern_RepStos_8OrMore
|
||||
jae FillXxxx_U32Pattern_RepStos_8OrMore
|
||||
|
||||
movd %ecx, %xmm0
|
||||
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
|
||||
movdqu %xmm0, (%eax)
|
||||
movdqu %xmm0, -16(%eax,%edx)
|
||||
movdqa %xmm0, %xmm1
|
||||
cmp $32, %edx
|
||||
ja FillXxxx_MoreThanTwoXMMs
|
||||
end;
|
||||
|
||||
procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
|
||||
@ -383,8 +382,15 @@ asm
|
||||
cmp $16, %edx
|
||||
jbe FillXxxx_U32Pattern_Ladder_4to16
|
||||
cmp $FillXxxx_RepStosThreshold_ERMS, %edx
|
||||
jb FillXxxx_U32Pattern_SSE2_16OrMore
|
||||
jmp FillXxxx_U32Pattern_RepStos_8OrMore
|
||||
jae FillXxxx_U32Pattern_RepStos_8OrMore
|
||||
|
||||
movd %ecx, %xmm0
|
||||
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
|
||||
movdqu %xmm0, (%eax)
|
||||
movdqu %xmm0, -16(%eax,%edx)
|
||||
movdqa %xmm0, %xmm1
|
||||
cmp $32, %edx
|
||||
ja FillXxxx_MoreThanTwoXMMs
|
||||
end;
|
||||
|
||||
procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;
|
||||
@ -599,14 +605,14 @@ asm
|
||||
punpcklqdq %xmm0, %xmm0
|
||||
{ Stack is 12 bytes:
|
||||
[esp] = return address, [esp + 4] = value (not required anymore).
|
||||
Convert to 8 bytes expected by FillXxxx_MoreThanTwoXMMs:
|
||||
[esp] = esi, [esp + 4] = return address. }
|
||||
Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
|
||||
[esp] = return address. }
|
||||
mov (%esp), %ecx
|
||||
add $4, %esp
|
||||
mov %esi, (%esp)
|
||||
mov %ecx, 4(%esp)
|
||||
add $8, %esp
|
||||
mov %ecx, (%esp)
|
||||
shl $3, %edx
|
||||
movdqu %xmm0, (%eax)
|
||||
movdqu %xmm0, -16(%eax,%edx)
|
||||
movdqa %xmm0, %xmm1
|
||||
test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
|
||||
jz FillXxxx_MoreThanTwoXMMs
|
||||
|
Loading…
Reference in New Issue
Block a user