Don’t misalign FillChar pattern.

Rika Ichinose 2024-03-05 12:43:29 +03:00 committed by FPK
parent 755d221230
commit a35577593b

@@ -199,10 +199,8 @@ asm
 end;
 {$endif FillChar/Word/DWord required.}

-{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
 label
   FillXxxx_MoreThanTwoXMMs;
-{$endif FillQWord required.}

 procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
 { eax x, ecx uint32 pattern, edx byte count >= 16 (preferably > 16). }
@@ -212,11 +210,11 @@ asm
     movd   %ecx, %xmm0
     pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
     movdqu %xmm0, (%eax)
+    movdqu %xmm0, -16(%eax,%edx)
     cmp    $32, %edx
     ja     .LMoreThanTwoVectors
-    movdqu %xmm0, -16(%eax,%edx)
     ret
-    .byte  102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }
+    .byte  102,102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }

 { x can start and end misaligned on the vector boundary:
   x = ~~][H1][H2][...][T2][T1]~
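
The hoisted second movdqu means the first and last 16 bytes are now written unconditionally, before any branching; this is the convention the rest of the commit builds on, letting callers jump straight to FillXxxx_MoreThanTwoXMMs. For counts of 16..32 the two overlapping stores already cover every byte. A minimal Pascal sketch of that coverage claim (illustrative only, not part of the commit):

program OverlapDemo;
{ Model the two unaligned 16-byte stores as byte loops and check that any
  count in 16..32 is fully covered by the overlapping head and tail. }
var
  buf: array[0..31] of byte;
  count, i: integer;
  ok: boolean;
begin
  ok := true;
  for count := 16 to 32 do
  begin
    FillChar(buf, SizeOf(buf), 0);
    for i := 0 to 15 do
      buf[i] := 1;                       { movdqu %xmm0, (%eax) }
    for i := count - 16 to count - 1 do
      buf[i] := 1;                       { movdqu %xmm0, -16(%eax,%edx) }
    for i := 0 to count - 1 do
      if buf[i] <> 1 then
        ok := false;
  end;
  writeln('counts 16..32 fully covered: ', ok);
end.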
@@ -228,22 +226,18 @@ asm
     mov    %ecx, %esi { esi = pattern }
     mov    %eax, %ecx
     shl    $3, %ecx { ecx = misalignment of x in bits }
-    rol    %cl, %esi { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
+    rol    %cl, %esi { misalign the pattern }
     movd   %esi, %xmm1
     pshufd $0, %xmm1, %xmm1
+    pop    %esi

-{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
-{ FillQWord jumps here.
-  eax x, edx byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes.
-  Expects first 16 bytes written...
-  ...and ESI pushed! }
+{ FillChar (to skip the misaligning above) and FillQWord jump here.
+  eax x, edx byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes, first and last 16 bytes written. }
 FillXxxx_MoreThanTwoXMMs:
-{$endif FillQWord required.}
-    lea    -65(%eax,%edx), %ecx { ecx = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
+    lea    -65(%eax,%edx), %ecx
+    and    $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
     and    $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
     movdqa %xmm1, 16(%eax) { Write H1. }
-    mov    %ecx, %esi
-    and    $-16, %esi { esi = “T4” (possibly fictive) = aligned ecx = loop bound. }
     cmp    $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
     jle    .LOneAlignedTailWrite
     movdqa %xmm1, 32(%eax) { Write H2. }
@@ -256,24 +250,22 @@ FillXxxx_MoreThanTwoXMMs:
     cmp    $NtThreshold, %edx
     jae    .L64xNT_Body

-.balign 16
+.balign 16 { no-op }
 .L64x_Body:
     movdqa %xmm1, (%eax)
     movdqa %xmm1, 16(%eax)
     movdqa %xmm1, 32(%eax)
     movdqa %xmm1, 48(%eax)
     add    $64, %eax
-    cmp    %esi, %eax
+    cmp    %ecx, %eax
     jb     .L64x_Body
 .LFourAlignedTailWrites:
-    movdqa %xmm1, (%esi) { T4 }
-    movdqa %xmm1, 16(%esi) { T3 }
+    movdqa %xmm1, (%ecx) { T4 }
+    movdqa %xmm1, 16(%ecx) { T3 }
 .LTwoAlignedTailWrites:
-    movdqa %xmm1, 32(%esi) { T2 }
+    movdqa %xmm1, 32(%ecx) { T2 }
 .LOneAlignedTailWrite:
-    movdqa %xmm1, 48(%esi) { T1 }
-    movdqu %xmm0, 49(%ecx) { UT }
-    pop    %esi
+    movdqa %xmm1, 48(%ecx) { T1 }
     ret

 .balign 16
@@ -283,7 +275,7 @@ FillXxxx_MoreThanTwoXMMs:
     movntdq %xmm1, 32(%eax)
     movntdq %xmm1, 48(%eax)
     add    $64, %eax
-    cmp    %esi, %eax
+    cmp    %ecx, %eax
     jb     .L64xNT_Body
     sfence
     jmp    .LFourAlignedTailWrites
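
The -65 in the lea above is what keeps the tail writes safe: with ecx = (x + count - 65) and -16, the lowest guaranteed tail T1 at 48(%ecx) ends no later than x + count - 1 and, for count > 32, starts no earlier than x + 1. T4..T2 may be “fictive” (overlapping the head writes), but T1 never leaves the buffer, and the loop exit condition eax >= ecx guarantees the loop range and the tail range [ecx, ecx + 64) always meet; the non-temporal variant differs only in movntdq plus the closing sfence. A brute-force check of the T1 bounds (hypothetical values, not RTL code):

program TailBoundDemo;
{ Verify: for count > 32 and any alignment of x, T1 = ((x + count - 65)
  and not 15) + 48 satisfies x <= T1 and T1 + 16 <= x + count. }
var
  x, count, t4: qword;
  ok: boolean;
begin
  ok := true;
  for x := 64 to 79 do                 { covers all 16 values of x mod 16 }
    for count := 33 to 400 do
    begin
      t4 := (x + count - 65) and not qword(15);
      if (t4 + 48 < x) or (t4 + 64 > x + count) then
        ok := false;
    end;
  writeln('T1 always inside the fill range: ', ok);
end.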
@@ -369,8 +361,15 @@ asm
     cmp    $16, %edx
     jbe    FillXxxx_U32Pattern_Ladder_4to16
     cmp    $FillXxxx_RepStosThreshold_NoERMS, %edx
-    jb     FillXxxx_U32Pattern_SSE2_16OrMore
-    jmp    FillXxxx_U32Pattern_RepStos_8OrMore
+    jae    FillXxxx_U32Pattern_RepStos_8OrMore
+
+    movd   %ecx, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
+    movdqu %xmm0, (%eax)
+    movdqu %xmm0, -16(%eax,%edx)
+    movdqa %xmm0, %xmm1
+    cmp    $32, %edx
+    ja     FillXxxx_MoreThanTwoXMMs
 end;

 procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
@@ -383,8 +382,15 @@ asm
     cmp    $16, %edx
     jbe    FillXxxx_U32Pattern_Ladder_4to16
     cmp    $FillXxxx_RepStosThreshold_ERMS, %edx
-    jb     FillXxxx_U32Pattern_SSE2_16OrMore
-    jmp    FillXxxx_U32Pattern_RepStos_8OrMore
+    jae    FillXxxx_U32Pattern_RepStos_8OrMore
+
+    movd   %ecx, %xmm0
+    pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
+    movdqu %xmm0, (%eax)
+    movdqu %xmm0, -16(%eax,%edx)
+    movdqa %xmm0, %xmm1
+    cmp    $32, %edx
+    ja     FillXxxx_MoreThanTwoXMMs
 end;

 procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;
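
Both dispatchers can now expand the pattern inline and enter FillXxxx_MoreThanTwoXMMs with a plain movdqa %xmm0, %xmm1, because FillChar's dword pattern consists of four identical bytes, and such a value is unchanged by any whole-byte rotation; the rol-based misaligning step was pure overhead for FillChar, which is the point of this commit. A one-check sketch (illustrative only):

program UniformPatternDemo;
{ A byte-uniform dword, like FillChar's replicated value, is invariant
  under rotation by whole bytes, so the aligned-store pattern can simply
  equal the unaligned one. }
var
  m: integer;
  ok: boolean;
begin
  ok := true;
  for m := 0 to 3 do
    if RolDWord($2A2A2A2A, m * 8) <> $2A2A2A2A then
      ok := false;
  writeln('byte-uniform pattern unchanged by rol: ', ok);
end.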
@@ -599,14 +605,14 @@ asm
     punpcklqdq %xmm0, %xmm0
     { Stack is 12 bytes:
       [esp] = return address, [esp + 4] = value (not required anymore).
-      Convert to 8 bytes expected by FillXxxx_MoreThanTwoXMMs:
-      [esp] = esi, [esp + 4] = return address. }
+      Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
+      [esp] = return address. }
     mov    (%esp), %ecx
-    add    $4, %esp
-    mov    %esi, (%esp)
-    mov    %ecx, 4(%esp)
+    add    $8, %esp
+    mov    %ecx, (%esp)
     shl    $3, %edx
     movdqu %xmm0, (%eax)
+    movdqu %xmm0, -16(%eax,%edx)
     movdqa %xmm0, %xmm1
     test   $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
     jz     FillXxxx_MoreThanTwoXMMs
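
FillQWord keeps the test $7 shortcut because its 16-byte pattern is the qword value replicated by punpcklqdq: when x is 8-byte aligned, x mod 16 is either 0 or 8, and rotating a [q, q] pattern by 0 or 64 bits is an identity, so no misaligning is needed before taking the aligned-store path. A byte-level sketch of that argument (little-endian, illustrative only):

program QWordAlignDemo;
{ pat16 is the qword value twice. For m = 0 or m = 8, the only
  misalignments mod 16 an 8-aligned x can have, byte (k - m) mod 16 of
  the pattern equals byte k, so the unrotated pattern already matches. }
var
  value: qword;
  pat16: array[0..15] of byte;
  m, k: integer;
  ok: boolean;
begin
  value := $1122334455667788;
  Move(value, pat16[0], 8);
  Move(value, pat16[8], 8);
  ok := true;
  m := 0;
  while m <= 8 do
  begin
    for k := 0 to 15 do
      if pat16[(k - m + 16) mod 16] <> pat16[k] then
        ok := false;
    inc(m, 8);
  end;
  writeln('8-aligned x needs no pattern rotation: ', ok);
end.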