mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-08-18 15:29:11 +02:00
Don’t misalign FillChar pattern.
This commit is contained in:
parent
755d221230
commit
a35577593b
@ -199,10 +199,8 @@ asm
|
|||||||
end;
|
end;
|
||||||
{$endif FillChar/Word/DWord required.}
|
{$endif FillChar/Word/DWord required.}
|
||||||
|
|
||||||
{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
|
|
||||||
label
|
label
|
||||||
FillXxxx_MoreThanTwoXMMs;
|
FillXxxx_MoreThanTwoXMMs;
|
||||||
{$endif FillQWord required.}
|
|
||||||
|
|
||||||
procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
|
procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
|
||||||
{ eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
|
{ eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
|
||||||
@ -212,11 +210,11 @@ asm
|
|||||||
movd %ecx, %xmm0
|
movd %ecx, %xmm0
|
||||||
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
|
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
|
||||||
movdqu %xmm0, (%eax)
|
movdqu %xmm0, (%eax)
|
||||||
|
movdqu %xmm0, -16(%eax,%edx)
|
||||||
cmp $32, %edx
|
cmp $32, %edx
|
||||||
ja .LMoreThanTwoVectors
|
ja .LMoreThanTwoVectors
|
||||||
movdqu %xmm0, -16(%eax,%edx)
|
|
||||||
ret
|
ret
|
||||||
.byte 102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }
|
.byte 102,102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }
|
||||||
|
|
||||||
{ x can start and end misaligned on the vector boundary:
|
{ x can start and end misaligned on the vector boundary:
|
||||||
x = ~~][H1][H2][...][T2][T1]~
|
x = ~~][H1][H2][...][T2][T1]~
|
||||||
@ -228,22 +226,18 @@ asm
|
|||||||
mov %ecx, %esi { esi = pattern }
|
mov %ecx, %esi { esi = pattern }
|
||||||
mov %eax, %ecx
|
mov %eax, %ecx
|
||||||
shl $3, %ecx { ecx = misalignment of x in bits }
|
shl $3, %ecx { ecx = misalignment of x in bits }
|
||||||
rol %cl, %esi { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
|
rol %cl, %esi { misalign the pattern }
|
||||||
movd %esi, %xmm1
|
movd %esi, %xmm1
|
||||||
pshufd $0, %xmm1, %xmm1
|
pshufd $0, %xmm1, %xmm1
|
||||||
|
pop %esi
|
||||||
|
|
||||||
{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
|
{ FillChar (to skip the misaligning above) and FillQWord jump here.
|
||||||
{ FillQWord jumps here.
|
eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes, first and last 16 bytes written. }
|
||||||
eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes.
|
|
||||||
Expects first 16 bytes written...
|
|
||||||
...and ESI pushed! }
|
|
||||||
FillXxxx_MoreThanTwoXMMs:
|
FillXxxx_MoreThanTwoXMMs:
|
||||||
{$endif FillQWord required.}
|
lea -65(%eax,%edx), %ecx
|
||||||
lea -65(%eax,%edx), %ecx { ecx = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
|
and $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
|
||||||
and $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
|
and $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
|
||||||
movdqa %xmm1, 16(%eax) { Write H1. }
|
movdqa %xmm1, 16(%eax) { Write H1. }
|
||||||
mov %ecx, %esi
|
|
||||||
and $-16, %esi { esi = “T4” (possibly fictive) = aligned ecx = loop bound. }
|
|
||||||
cmp $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
|
cmp $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
|
||||||
jle .LOneAlignedTailWrite
|
jle .LOneAlignedTailWrite
|
||||||
movdqa %xmm1, 32(%eax) { Write H2. }
|
movdqa %xmm1, 32(%eax) { Write H2. }
|
||||||
@ -256,24 +250,22 @@ FillXxxx_MoreThanTwoXMMs:
|
|||||||
cmp $NtThreshold, %edx
|
cmp $NtThreshold, %edx
|
||||||
jae .L64xNT_Body
|
jae .L64xNT_Body
|
||||||
|
|
||||||
.balign 16
|
.balign 16 { no-op }
|
||||||
.L64x_Body:
|
.L64x_Body:
|
||||||
movdqa %xmm1, (%eax)
|
movdqa %xmm1, (%eax)
|
||||||
movdqa %xmm1, 16(%eax)
|
movdqa %xmm1, 16(%eax)
|
||||||
movdqa %xmm1, 32(%eax)
|
movdqa %xmm1, 32(%eax)
|
||||||
movdqa %xmm1, 48(%eax)
|
movdqa %xmm1, 48(%eax)
|
||||||
add $64, %eax
|
add $64, %eax
|
||||||
cmp %esi, %eax
|
cmp %ecx, %eax
|
||||||
jb .L64x_Body
|
jb .L64x_Body
|
||||||
.LFourAlignedTailWrites:
|
.LFourAlignedTailWrites:
|
||||||
movdqa %xmm1, (%esi) { T4 }
|
movdqa %xmm1, (%ecx) { T4 }
|
||||||
movdqa %xmm1, 16(%esi) { T3 }
|
movdqa %xmm1, 16(%ecx) { T3 }
|
||||||
.LTwoAlignedTailWrites:
|
.LTwoAlignedTailWrites:
|
||||||
movdqa %xmm1, 32(%esi) { T2 }
|
movdqa %xmm1, 32(%ecx) { T2 }
|
||||||
.LOneAlignedTailWrite:
|
.LOneAlignedTailWrite:
|
||||||
movdqa %xmm1, 48(%esi) { T1 }
|
movdqa %xmm1, 48(%ecx) { T1 }
|
||||||
movdqu %xmm0, 49(%ecx) { UT }
|
|
||||||
pop %esi
|
|
||||||
ret
|
ret
|
||||||
|
|
||||||
.balign 16
|
.balign 16
|
||||||
@ -283,7 +275,7 @@ FillXxxx_MoreThanTwoXMMs:
|
|||||||
movntdq %xmm1, 32(%eax)
|
movntdq %xmm1, 32(%eax)
|
||||||
movntdq %xmm1, 48(%eax)
|
movntdq %xmm1, 48(%eax)
|
||||||
add $64, %eax
|
add $64, %eax
|
||||||
cmp %esi, %eax
|
cmp %ecx, %eax
|
||||||
jb .L64xNT_Body
|
jb .L64xNT_Body
|
||||||
sfence
|
sfence
|
||||||
jmp .LFourAlignedTailWrites
|
jmp .LFourAlignedTailWrites
|
||||||
@ -369,8 +361,15 @@ asm
|
|||||||
cmp $16, %edx
|
cmp $16, %edx
|
||||||
jbe FillXxxx_U32Pattern_Ladder_4to16
|
jbe FillXxxx_U32Pattern_Ladder_4to16
|
||||||
cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
|
cmp $FillXxxx_RepStosThreshold_NoERMS, %edx
|
||||||
jb FillXxxx_U32Pattern_SSE2_16OrMore
|
jae FillXxxx_U32Pattern_RepStos_8OrMore
|
||||||
jmp FillXxxx_U32Pattern_RepStos_8OrMore
|
|
||||||
|
movd %ecx, %xmm0
|
||||||
|
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
|
||||||
|
movdqu %xmm0, (%eax)
|
||||||
|
movdqu %xmm0, -16(%eax,%edx)
|
||||||
|
movdqa %xmm0, %xmm1
|
||||||
|
cmp $32, %edx
|
||||||
|
ja FillXxxx_MoreThanTwoXMMs
|
||||||
end;
|
end;
|
||||||
|
|
||||||
procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
|
procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
|
||||||
@ -383,8 +382,15 @@ asm
|
|||||||
cmp $16, %edx
|
cmp $16, %edx
|
||||||
jbe FillXxxx_U32Pattern_Ladder_4to16
|
jbe FillXxxx_U32Pattern_Ladder_4to16
|
||||||
cmp $FillXxxx_RepStosThreshold_ERMS, %edx
|
cmp $FillXxxx_RepStosThreshold_ERMS, %edx
|
||||||
jb FillXxxx_U32Pattern_SSE2_16OrMore
|
jae FillXxxx_U32Pattern_RepStos_8OrMore
|
||||||
jmp FillXxxx_U32Pattern_RepStos_8OrMore
|
|
||||||
|
movd %ecx, %xmm0
|
||||||
|
pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
|
||||||
|
movdqu %xmm0, (%eax)
|
||||||
|
movdqu %xmm0, -16(%eax,%edx)
|
||||||
|
movdqa %xmm0, %xmm1
|
||||||
|
cmp $32, %edx
|
||||||
|
ja FillXxxx_MoreThanTwoXMMs
|
||||||
end;
|
end;
|
||||||
|
|
||||||
procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;
|
procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;
|
||||||
@ -599,14 +605,14 @@ asm
|
|||||||
punpcklqdq %xmm0, %xmm0
|
punpcklqdq %xmm0, %xmm0
|
||||||
{ Stack is 12 bytes:
|
{ Stack is 12 bytes:
|
||||||
[esp] = return address, [esp + 4] = value (not required anymore).
|
[esp] = return address, [esp + 4] = value (not required anymore).
|
||||||
Convert to 8 bytes expected by FillXxxx_MoreThanTwoXMMs:
|
Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
|
||||||
[esp] = esi, [esp + 4] = return address. }
|
[esp] = return address. }
|
||||||
mov (%esp), %ecx
|
mov (%esp), %ecx
|
||||||
add $4, %esp
|
add $8, %esp
|
||||||
mov %esi, (%esp)
|
mov %ecx, (%esp)
|
||||||
mov %ecx, 4(%esp)
|
|
||||||
shl $3, %edx
|
shl $3, %edx
|
||||||
movdqu %xmm0, (%eax)
|
movdqu %xmm0, (%eax)
|
||||||
|
movdqu %xmm0, -16(%eax,%edx)
|
||||||
movdqa %xmm0, %xmm1
|
movdqa %xmm0, %xmm1
|
||||||
test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
|
test $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
|
||||||
jz FillXxxx_MoreThanTwoXMMs
|
jz FillXxxx_MoreThanTwoXMMs
|
||||||
|
Loading…
Reference in New Issue
Block a user