Don’t misalign FillChar pattern.

2025-04-14 18:19:54 +02:00 · 2024-03-05 12:43:29 +03:00 · 2024-03-05 12:43:29 +03:00 · a35577593b
commit a35577593b
parent 755d221230
1 changed file with 38 additions and 32 deletions
--- a/rtl/i386/i386.inc
+++ b/rtl/i386/i386.inc
@@ -199,10 +199,8 @@ asm
 end;
 {$endif FillChar/Word/DWord required.}

-{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
 label
  FillXxxx_MoreThanTwoXMMs;
-{$endif FillQWord required.}

 procedure FillXxxx_U32Pattern_SSE2_16OrMore; assembler; nostackframe;
 { eax — x, ecx — uint32 pattern, edx — byte count >= 16 (preferably > 16). }
@@ -212,11 +210,11 @@ asm
        movd   %ecx, %xmm0
        pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
        movdqu %xmm0, (%eax)
+        movdqu %xmm0, -16(%eax,%edx)
        cmp    $32, %edx
        ja     .LMoreThanTwoVectors
-        movdqu %xmm0, -16(%eax,%edx)
        ret
-        .byte  102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }
+        .byte  102,102,102,102,144 { Turn .balign 16 before .L64x_Body into a no-op. }

      { x can start and end misaligned on the vector boundary:
        x = ~~][H1][H2][...][T2][T1]~
@@ -228,22 +226,18 @@ asm
        mov    %ecx, %esi { esi = pattern }
        mov    %eax, %ecx
        shl    $3, %ecx { ecx = misalignment of x in bits }
-        rol    %cl, %esi { misalign the pattern; no-op for FillChar, but handles misaligned cases of FillWord+. }
+        rol    %cl, %esi { misalign the pattern }
        movd   %esi, %xmm1
        pshufd $0, %xmm1, %xmm1
+        pop    %esi

-{$if not defined(FPC_SYSTEM_HAS_FILLQWORD)}
-{ FillQWord jumps here.
-  eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes.
-  Expects first 16 bytes written...
-  ...and ESI pushed! }
+{ FillChar (to skip the misaligning above) and FillQWord jump here.
+  eax — x, edx — byte count > 32, xmm0 = pattern for unaligned writes, xmm1 = pattern for aligned writes, first and last 16 bytes written. }
 FillXxxx_MoreThanTwoXMMs:
-{$endif FillQWord required.}
-        lea    -65(%eax,%edx), %ecx { ecx = end of x - 65, to get the loop bound and to write UT later (why not write it right away though...). }
+        lea    -65(%eax,%edx), %ecx
+        and    $-16, %ecx { ecx = “T4” (possibly fictive) = loop bound. }
        and    $-16, %eax { align eax to the LEFT (so needs to be offset by an additional +16 for a while). }
        movdqa %xmm1, 16(%eax) { Write H1. }
-        mov    %ecx, %esi
-        and    $-16, %esi { esi = “T4” (possibly fictive) = aligned ecx = loop bound. }
        cmp    $49, %edx { 33~49 bytes might contain 1~2 heads+tails; write as H1 and T1. }
        jle    .LOneAlignedTailWrite
        movdqa %xmm1, 32(%eax) { Write H2. }
@@ -256,24 +250,22 @@ FillXxxx_MoreThanTwoXMMs:
        cmp    $NtThreshold, %edx
        jae    .L64xNT_Body

-.balign 16
+.balign 16 { no-op }
 .L64x_Body:
        movdqa %xmm1, (%eax)
        movdqa %xmm1, 16(%eax)
        movdqa %xmm1, 32(%eax)
        movdqa %xmm1, 48(%eax)
        add    $64,  %eax
-        cmp    %esi, %eax
+        cmp    %ecx, %eax
        jb     .L64x_Body
 .LFourAlignedTailWrites:
-        movdqa %xmm1, (%esi) { T4 }
-        movdqa %xmm1, 16(%esi) { T3 }
+        movdqa %xmm1, (%ecx) { T4 }
+        movdqa %xmm1, 16(%ecx) { T3 }
 .LTwoAlignedTailWrites:
-        movdqa %xmm1, 32(%esi) { T2 }
+        movdqa %xmm1, 32(%ecx) { T2 }
 .LOneAlignedTailWrite:
-        movdqa %xmm1, 48(%esi) { T1 }
-        movdqu %xmm0, 49(%ecx) { UT }
-        pop    %esi
+        movdqa %xmm1, 48(%ecx) { T1 }
        ret

 .balign 16
@@ -283,7 +275,7 @@ FillXxxx_MoreThanTwoXMMs:
        movntdq %xmm1, 32(%eax)
        movntdq %xmm1, 48(%eax)
        add    $64, %eax
-        cmp    %esi, %eax
+        cmp    %ecx, %eax
        jb     .L64xNT_Body
        sfence
        jmp    .LFourAlignedTailWrites
@@ -369,8 +361,15 @@ asm
        cmp     $16, %edx
        jbe     FillXxxx_U32Pattern_Ladder_4to16
        cmp     $FillXxxx_RepStosThreshold_NoERMS, %edx
-        jb      FillXxxx_U32Pattern_SSE2_16OrMore
-        jmp     FillXxxx_U32Pattern_RepStos_8OrMore
+        jae     FillXxxx_U32Pattern_RepStos_8OrMore
+
+        movd   %ecx, %xmm0
+        pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
+        movdqu %xmm0, (%eax)
+        movdqu %xmm0, -16(%eax,%edx)
+        movdqa %xmm0, %xmm1
+        cmp    $32, %edx
+        ja     FillXxxx_MoreThanTwoXMMs
 end;

 procedure FillChar_SSE2_ERMS(var x;count:SizeInt;value:byte);assembler;nostackframe;
@@ -383,8 +382,15 @@ asm
        cmp     $16, %edx
        jbe     FillXxxx_U32Pattern_Ladder_4to16
        cmp     $FillXxxx_RepStosThreshold_ERMS, %edx
-        jb      FillXxxx_U32Pattern_SSE2_16OrMore
-        jmp     FillXxxx_U32Pattern_RepStos_8OrMore
+        jae     FillXxxx_U32Pattern_RepStos_8OrMore
+
+        movd   %ecx, %xmm0
+        pshufd $0, %xmm0, %xmm0 { xmm0 = pattern for unaligned writes }
+        movdqu %xmm0, (%eax)
+        movdqu %xmm0, -16(%eax,%edx)
+        movdqa %xmm0, %xmm1
+        cmp    $32, %edx
+        ja     FillXxxx_MoreThanTwoXMMs
 end;

 procedure FillChar_Dispatch(var x;count:SizeInt;value:byte); forward;
@@ -599,14 +605,14 @@ asm
        punpcklqdq %xmm0, %xmm0
        { Stack is 12 bytes:
          [esp] = return address, [esp + 4] = value (not required anymore).
-          Convert to 8 bytes expected by FillXxxx_MoreThanTwoXMMs:
-          [esp] = esi, [esp + 4] = return address. }
+          Convert to 4 bytes expected by FillXxxx_MoreThanTwoXMMs:
+          [esp] = return address. }
        mov     (%esp), %ecx
-        add     $4, %esp
-        mov     %esi, (%esp)
-        mov     %ecx, 4(%esp)
+        add     $8, %esp
+        mov     %ecx, (%esp)
        shl     $3, %edx
        movdqu  %xmm0, (%eax)
+        movdqu  %xmm0, -16(%eax,%edx)
        movdqa  %xmm0, %xmm1
        test    $7, %eax { Since misaligning the pattern is not very trivial, shortcut if x is aligned. }
        jz      FillXxxx_MoreThanTwoXMMs