* Did a clean rewrite of IndexByte and IndexWord, resulting in somewhat fewer instructions.

+ IndexWord: the case of an unaligned buffer is now implemented using aligned reads. + tindex.pp: Added tests for the correctness of IndexWord with unlimited length (length=-1). git-svn-id: trunk@17317 -
2025-04-14 18:19:54 +02:00 · 2011-04-14 17:12:04 +00:00 · 2011-04-14 17:12:04 +00:00 · 30f7bff09d
commit 30f7bff09d
parent 9781c0d051
2 changed files with 134 additions and 95 deletions
--- a/rtl/x86_64/x86_64.inc
+++ b/rtl/x86_64/x86_64.inc
@ -459,7 +459,6 @@ Procedure FillChar(var x;count:SizeInt;value:byte);assembler;nostackframe;
 {$endif FPC_SYSTEM_HAS_FILLCHAR}

 {$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
-{ based on libc/sysdeps/x86_64/memchr.S }
 {$define FPC_SYSTEM_HAS_INDEXBYTE}
 function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
 { win64: rcx buf, rdx len, r8b word
@ -472,51 +471,45 @@ asm
    movq   %rdi, %rcx
    movq   %rsi, %rdx
 {$endif}
-    mov    %rcx, %rax                  { duplicate buf }
+    mov    %rcx, %r8
    punpcklbw  %xmm1, %xmm1
-    and    $0xfffffffffffffff0, %rax
+    and    $-0x10, %rcx                { highest aligned address before buf }
    test   %rdx, %rdx
    punpcklbw  %xmm1, %xmm1
-    jz     .L3                         { exit if len=0 }
-    orl    $0xffffffff, %r8d
-    movdqa (%rax), %xmm0               { Fetch first 16 bytes (up to 15 bytes before target) }
+    jz     .Lnotfound                  { exit if len=0 }
+    add    $16, %rcx                   { first aligned address after buf }
    pshufd $0, %xmm1, %xmm1
-    sub    %rax, %rcx                  { rcx=misalignment }
-    pcmpeqb %xmm1, %xmm0
-    add    %rcx, %rdx                  { add misalignment to length }
-    cmovb  %r8, %rdx                   { if it overflows (happens when length=-1), set back to -1, }
-                                       {   otherwise loop will terminate too early }
-    mov    %rcx, %r9                   { and save it, will subtract back in the end }
-    shl    %cl, %r8d
-    pmovmskb %xmm0, %ecx
-    andl   %r8d, %ecx                  { mask away matches before buffer start }
-    movl   $16, %r8d
-    jnz    .L1                         { got a match within buffer -> we're done (almost) }
-    cmpq   %r8, %rdx
-    jbe    .L3
+    movdqa -16(%rcx), %xmm0            { Fetch first 16 bytes (up to 15 bytes before target) }
+    sub    %r8, %rcx                   { rcx=number of valid bytes, r8=original ptr }
+
+    pcmpeqb %xmm1, %xmm0               { compare with pattern and get bitmask }
+    pmovmskb %xmm0, %eax
+
+    shl    %cl, %eax                   { shift valid bits into high word }
+    and    $0xffff0000, %eax           { clear low word containing invalid bits }
+    shr    %cl, %eax                   { shift back }
+    jmp   .Lcontinue

    .balign 16
-.L2:
-    movdqa (%rax,%r8), %xmm0
-    lea    16(%r8), %r8
+.Lloop:
+    movdqa (%r8,%rcx), %xmm0           { r8 and rcx may have any values, }
+    add    $16, %rcx                   { but their sum is evenly divisible by 16. }
    pcmpeqb %xmm1, %xmm0
-    pmovmskb %xmm0, %ecx
-    test   %ecx, %ecx
-    jnz    .L1
-    cmp    %r8, %rdx
-    ja     .L2
+    pmovmskb %xmm0, %eax
+.Lcontinue:
+    test   %eax, %eax
+    jnz    .Lmatch
+    cmp    %rcx, %rdx
+    ja     .Lloop
+.Lnotfound:
+    or     $-1, %rax
+    retq

-.L3:
-    or    $-1, %rax
-    jmp   .Ldone
-
-.L1:
-    bsfl   %ecx, %ecx                  { compute position of the first match }
-    lea    -16(%rcx,%r8), %rax
-    cmp    %rax, %rdx
-    jbe    .L3                         { if it is after the specified length, ignore it }
-    sub    %r9, %rax
-.Ldone:
+.Lmatch:
+    bsf    %eax, %eax
+    lea    -16(%rcx,%rax), %rax
+    cmp    %rax, %rdx                  { check against the buffer length }
+    jbe    .Lnotfound
 end;
 {$endif FPC_SYSTEM_HAS_INDEXBYTE}

@ -533,77 +526,97 @@ asm
    movq   %rdi, %rcx
    movq   %rsi, %rdx
 {$endif}
-    mov    %rcx, %rax                  { duplicate buf }
+    mov    %rcx, %r8
    punpcklwd  %xmm1, %xmm1
-    and    $0xfffffffffffffff0, %rax
+    and    $-0x10, %rcx
    test   %rdx, %rdx
    pshufd $0, %xmm1, %xmm1
-    jz     .L3                         { exit if len=0 }
-    orl    $0xffffffff, %r8d
-    test   $1, %cl                     { if buffer isn't aligned to word boundary, }
-    jnz    .Lunaligned                 { fallback to slower unaligned loop }
+    jz     .Lnotfound                  { exit if len=0 }
+    add    $16, %rcx
+    movdqa -16(%rcx), %xmm0            { Fetch first 16 bytes (up to 14 bytes before target) }
+    sub    %r8, %rcx                   { rcx=number of valid bytes }

-    movdqa (%rax), %xmm0               { Fetch first 16 bytes (up to 14 bytes before target) }
-    sub    %rax, %rcx                  { rcx=misalignment }
-    pcmpeqw %xmm1, %xmm0
+    test   $1, %r8b                    { if buffer isn't aligned to word boundary, }
+    jnz    .Lunaligned                 { use a different algorithm }

-    mov    %rcx, %r9
-    shr    $1, %r9                     { save misalignment in words }
+    pcmpeqw  %xmm1, %xmm0
+    pmovmskb %xmm0, %eax

-    add    %r9, %rdx                   { add misalignment to length }
-    cmovb  %r8, %rdx                   { if it overflows (happens when length=-1), set back to -1, }
-                                       {   otherwise loop will terminate too early }
-    shl    %cl, %r8d
-    pmovmskb %xmm0, %ecx
-    andl   %r8d, %ecx                  { mask away matches before buffer start }
-    movl   $8, %r8d
-    jnz    .L1                         { got a match within buffer -> we're done (almost) }
-    cmpq   %r8, %rdx
-    jbe    .L3
+    shl    %cl, %eax
+    and    $0xffff0000, %eax
+    shr    %cl, %eax
+    shr    $1, %ecx                    { bytes->words }
+    jmp    .Lcontinue

    .balign 16
-.L2:
-    movdqa (%rax,%r8,2), %xmm0
-    lea    8(%r8), %r8
-    pcmpeqw %xmm1, %xmm0
-    pmovmskb %xmm0, %ecx
-    test   %ecx, %ecx
-    jnz    .L1
-    cmp    %r8, %rdx
-    ja     .L2
+.Lloop:
+    movdqa (%r8,%rcx,2), %xmm0
+    add    $8, %rcx
+    pcmpeqw  %xmm1, %xmm0
+    pmovmskb %xmm0, %eax
+.Lcontinue:
+    test   %eax, %eax
+    jnz    .Lmatch
+    cmp    %rcx, %rdx
+    ja     .Lloop

-.L3:
+.Lnotfound:
    or    $-1, %rax
-    jmp   .Ldone
-
-.L1:
-    bsfl   %ecx, %ecx                  { compute position of the first match }
-    shr    $1, %ecx                    { in words }
-    lea    -8(%rcx,%r8), %rax
-    cmp    %rax, %rdx
-    jbe    .L3                         { if it is after the specified length, ignore it }
-    sub    %r9, %rax
-.Ldone:
    retq

-{ TODO: aligned processing is still possible, but for now
-  use the simplest form }
+.Lmatch:
+    bsf    %eax, %eax
+    shr    $1, %eax                    { in words }
+    lea    -8(%rcx,%rax), %rax
+    cmp    %rax, %rdx
+    jbe    .Lnotfound                  { if match is after the specified length, ignore it }
+    retq
+
 .Lunaligned:
-    xor    %r9, %r9
-    xor    %r8, %r8
-    mov    %rcx, %rax
+    movdqa  %xmm1, %xmm2               { (mis)align the pattern (in this particular case: }
+    psllw   $8, %xmm1                  {   swap bytes of each word of pattern) }
+    psrlw   $8, %xmm2
+    por     %xmm2, %xmm1
+
+    pcmpeqb  %xmm1, %xmm0
+    pmovmskb %xmm0, %eax
+
+    shl    %cl, %eax
+    and    $0xffff0000, %eax
+    shr    %cl, %eax
+
+    add    %rdx, %rdx                  { length words -> bytes }
+    xor    %r10d, %r10d                { nothing to merge yet }
+    jmp    .Lcontinue_u

    .balign 16
-.L2u:
-    movdqu (%rax,%r8,2), %xmm0
-    lea    8(%r8), %r8
-    pcmpeqw %xmm1, %xmm0
-    pmovmskb %xmm0, %ecx
-    test   %ecx, %ecx
-    jnz    .L1
-    cmp    %r8, %rdx
-    ja     .L2u
+.Lloop_u:
+    movdqa (%r8,%rcx), %xmm0
+    add    $16, %rcx
+    pcmpeqb %xmm1, %xmm0               { compare by bytes }
+    shr    $16, %r10d                  { bit 16 shifts into 0 }
+    pmovmskb %xmm0, %eax
+.Lcontinue_u:
+    shl    $1, %eax                    { 15:0 -> 16:1 }
+    or     %r10d, %eax                 { merge bit 0 from previous round }
+    mov    %eax, %r10d
+    shr    $1, %eax                    { now AND together adjacent pairs of bits }
+    and    %r10d, %eax
+    and    $0x5555, %eax               { also reset odd bits }
+    jnz    .Lmatch_u
+    cmpq   %rcx, %rdx
+    ja     .Lloop_u
+
+.Lnotfound_u:
    or     $-1, %rax
+    retq
+.Lmatch_u:
+    bsf    %eax, %eax
+    lea    -16(%rcx,%rax), %rax
+    cmp    %rax, %rdx
+    jbe    .Lnotfound_u                { if match is after the specified length, ignore it }
+    sar    $1, %eax                    { in words }
+    retq
 end;
 {$endif FPC_SYSTEM_HAS_INDEXWORD}

--- a/tests/test/tindex.pp
+++ b/tests/test/tindex.pp
@ -1,5 +1,3 @@
-const
-  err: boolean = false;

 var
  a, b: array[0..515] of byte;
@ -128,6 +126,13 @@ begin
              writeln('indexword error 7 for (',i,',',j,',',k,')');
              halt(7);
            end;
+          {same for length=-1}
+          if indexword(b[k+4],-1,0)<>index then
+            begin
+              writeln(indexword(b[k+4],-1,0),' <> ',index);
+              writeln('indexword error 7a for (',i,',',j,',',k,')');
+              halt(27);
+            end;

          if (i=0) then
            index:=0
@ -140,6 +145,13 @@ begin
              writeln('indexword error 8 for (',i,',',j,',',k,')');
              halt(8);
            end;
+          {same for length=-1}
+          if indexword(b[k+4],-1,l)<>index then
+            begin
+              writeln(indexword(b[k+4],-1,l),' <> ',index);
+              writeln('indexword error 8a for (',i,',',j,',',k,')');
+              halt(28);
+            end;

           l:=unaligned(pword(@(b[k+4+((i shr 2) and not 1)-2]))^);
           if (i>=8) then
@ -152,6 +164,14 @@ begin
               writeln('indexword error 9 for (',i,',',j,',',k,')');
               halt(9);
             end;
+           if (i>1) and (index<>-1) then
+             if indexword(b[k+4],-1,l)<>index then
+               begin
+                 writeln(indexword(b[k+4],-1,l),' <> ',index);
+                 writeln('indexword error 9a for (',i,',',j,',',k,')');
+                 halt(29);
+               end;
+
           l:=unaligned(pword(@(b[k+4]))^);
           if (i<2) then
             index:=-1
@ -162,6 +182,12 @@ begin
               writeln('indexword error 10 for (',i,',',j,',',k,')');
               halt(10);
             end;
+           if i>1 then
+             if indexword(b[k+4],-1,l)<>index then
+               begin
+                 writeln('indexword error 10a for (',i,',',j,',',k,')');
+                 halt(30);
+               end;


           if (unaligned(pdword(@b[k+4])^)=0) then