diff --git a/rtl/i386/i386.inc b/rtl/i386/i386.inc index 6e9bd5f3ba..3c9ff0e9d4 100644 --- a/rtl/i386/i386.inc +++ b/rtl/i386/i386.inc @@ -786,13 +786,58 @@ function {$ifdef CPUX86_HAS_SSE2} IndexByte {$else} IndexByte_SSE2 {$endif} (con asm test %edx, %edx jz .Lnotfound { exit if len=0 } - push %ebx + movd %ecx, %xmm1 - lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. } + mov %eax, %ecx punpcklbw %xmm1, %xmm1 - and $-0x10, %ecx { first aligned address after buf } punpcklbw %xmm1, %xmm1 + and $4095, %ecx pshufd $0, %xmm1, %xmm1 + + cmp $4080, %ecx + ja .LCrossPage + + movdqu (%eax), %xmm0 { Analyze first 16 bytes, unaligned. } + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %ecx + test %ecx, %ecx + jz .LContinueAligned + + bsf %ecx, %eax + cmp %edx, %eax + jae .Lnotfound + ret + + .byte 144 { Make .balign 16 before .Lloop a no-op. } +.LContinueAligned: + cmp $16, %edx { Length might be explicitly set to 16 or less; if so, skip a bit of work. } + jbe .Lnotfound { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) } + + push %ebx + lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. } + and $-0x10, %ecx { first aligned address after buf } + sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr } + + .balign 16 +.Lloop: + movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, } + add $16, %ecx { but their sum is evenly divisible by 16. } + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %ebx + test %ebx, %ebx + jnz .Lmatch +.Lcontinue: + cmp %ecx, %edx + ja .Lloop + pop %ebx +.Lnotfound: + or $-1, %eax + ret + +.LCrossPage: + push %ebx + lea 16(%eax), %ecx { eax = original ptr, ecx = buf + 16 for aligning & shifts. } + and $-0x10, %ecx { first aligned address after buf } movdqa -16(%ecx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) } sub %eax, %ecx { ecx=number of valid bytes, eax=original ptr } @@ -809,22 +854,6 @@ asm pop %ebx cmp %eax, %edx { check against the buffer length } jbe .Lnotfound - ret - - .balign 16 -.Lloop: - movdqa (%eax,%ecx), %xmm0 { eax and ecx may have any values, } - add $16, %ecx { but their sum is evenly divisible by 16. } - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %ebx - test %ebx, %ebx - jnz .Lmatch -.Lcontinue: - cmp %ecx, %edx - ja .Lloop - pop %ebx -.Lnotfound: - or $-1, %eax end; {$ifndef CPUX86_HAS_SSE2} diff --git a/rtl/x86_64/x86_64.inc b/rtl/x86_64/x86_64.inc index 224686907d..ee92e2555a 100644 --- a/rtl/x86_64/x86_64.inc +++ b/rtl/x86_64/x86_64.inc @@ -595,17 +595,65 @@ function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackfram asm test len, len jz .Lnotfound { exit if len=0 } + movd {$ifdef win64} %r8d {$else} %edx {$endif}, %xmm1 + mov {$ifdef win64} %ecx {$else} %edi {$endif}, %eax + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + and $4095, %eax + pshufd $0, %xmm1, %xmm1 + + cmp $4080, %eax + ja .LCrossPage + + movdqu ({$ifdef win64} %rcx {$else} %rdi {$endif}), %xmm0 { Analyze first 16 bytes, unaligned. } + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jz .LContinueAligned + + bsf %eax, %eax + cmp len, %rax + jae .Lnotfound + ret + + .byte {$ifndef win64}102,102,102,102,{$endif}102,102,102,102,102,102,102,102,102,144 { Make .balign 16 before .Lloop a no-op. } +.LContinueAligned: + cmp $16, len { Length might be explicitly set to 16 or less; if so, skip a bit of work. } + jbe .Lnotfound { (Or rather, this check is *required* unless jumping to .Lcontinue instead of going directly to .Lloop) } + +{$ifdef win64} + mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. } + add $16, %rcx +{$else} + lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. } +{$endif} + and $-0x10, %rcx { first aligned address after buf } + sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr } + + .balign 16 +.Lloop: + movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, } + add $16, %rcx { but their sum is evenly divisible by 16. } + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz .Lmatch +.Lcontinue: + cmp %rcx, len + ja .Lloop +.Lnotfound: + or $-1, %rax + ret + +.LCrossPage: {$ifdef win64} mov %rcx, %r8 { r8 = original ptr, rcx = buf + 16 for aligning & shifts. } add $16, %rcx {$else} lea 16(%rdi), %rcx { rdi = original ptr, rcx = buf + 16 for aligning & shifts. } {$endif} - punpcklbw %xmm1, %xmm1 and $-0x10, %rcx { first aligned address after buf } - punpcklbw %xmm1, %xmm1 - pshufd $0, %xmm1, %xmm1 movdqa -16(%rcx), %xmm0 { Fetch first 16 bytes (up to 15 bytes before target) } sub {$ifdef win64} %r8 {$else} %rdi {$endif}, %rcx { rcx=number of valid bytes, r8/rdi=original ptr } @@ -621,21 +669,6 @@ asm lea -16(%rcx,%rax), %rax cmp %rax, len { check against the buffer length } jbe .Lnotfound - ret - - .balign 16 -.Lloop: - movdqa ({$ifdef win64} %r8 {$else} %rdi {$endif},%rcx), %xmm0 { r8/rdi and rcx may have any values, } - add $16, %rcx { but their sum is evenly divisible by 16. } - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %eax - test %eax, %eax - jnz .Lmatch -.Lcontinue: - cmp %rcx, len - ja .Lloop -.Lnotfound: - or $-1, %rax end; {$endif FPC_SYSTEM_HAS_INDEXBYTE}