* Replaced the i386 assembler IndexByte with a more sophisticated implementation: it is larger in size, but faster by a factor of 2 (on Athlon X2 L310) to 5 (on Core2Duo E7200) for 512-byte buffers.

git-svn-id: trunk@20188 -
This commit is contained in:
sergei 2012-01-28 17:54:36 +00:00
parent f984a3d74e
commit 6874aa9676

View File

@ -301,35 +301,135 @@ end;
{$ifndef FPC_SYSTEM_HAS_INDEXBYTE} {$ifndef FPC_SYSTEM_HAS_INDEXBYTE}
{$define FPC_SYSTEM_HAS_INDEXBYTE} {$define FPC_SYSTEM_HAS_INDEXBYTE}
{ IndexByte: scans up to len bytes starting at buf for the first byte equal to b. }
{ Returns the zero-based offset of that byte, or -1 when no byte matches. }
{ NOTE(review): from the signature down to the alignment loop each line shows the }
{ removed implementation followed by its replacement; the notes added here describe }
{ the replacement (the nostackframe version). }
{ Register roles in the replacement, as stated by its own comments: }
{   eax = current read pointer (starts as buf), edx = bytes remaining, cl = b. }
{   ecx is turned into b replicated in all four byte lanes; esi and edi are scratch }
{   and are saved on entry and restored on exit. }
{ All length checks use unsigned branches (jb, jae). NOTE(review): a negative len }
{ would therefore be treated as a very large count - confirm callers expect that. }
function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; function IndexByte(Const buf;len:SizeInt;b:byte):SizeInt; assembler; nostackframe;
var
saveedi,saveebx : longint;
asm asm
movl %edi,saveedi push %esi
movl %ebx,saveebx push %edi
movl buf,%edi // Load String push %eax { save initial value of 'buf' }
movb b,%bl
movl len,%ecx // Load len cmp $4,%edx { less than 4 bytes, just test byte by byte. }
xorl %eax,%eax jb .Ltail
testl %ecx,%ecx
jz .Lcharposnotfound mov %cl,%ch { prepare pattern }
cld movzwl %cx,%esi
movl %ecx,%edx // Copy for easy manipulation shl $16,%ecx
movb %bl,%al or %esi,%ecx
repne
scasb .Lalignloop:
jne .Lcharposnotfound test $3,%al { align to 4 bytes if necessary }
incl %ecx je .Laligned
subl %ecx,%edx cmp %cl,(%eax)
movl %edx,%eax je .Lexit
jmp .Lready inc %eax
.Lcharposnotfound: dec %edx
movl $-1,%eax jmp .Lalignloop
.Lready:
movl saveedi,%edi .balign 16 { Main loop, unrolled 4 times for speed }
movl saveebx,%ebx
{ Each of the four copies below tests one dword, at offsets 0, 4, 8 and 12 from eax. }
{ After the final AND, esi has bit 7 set in every byte lane flagged as a match. }
.Lloop:
mov (%eax),%esi { load dword }
xor %ecx,%esi { XOR with pattern, bytes equal to target are now 0 }
lea -0x01010101(%esi),%edi
xor %esi,%edi { (x-0x01010101) xor x }
not %esi
and $0x80808080,%esi
and %edi,%esi { ((x-0x01010101) xor x) and (not x) and 0x80808080 }
jnz .Lfound { one of the bytes matches }
mov 4(%eax),%esi
xor %ecx,%esi
lea -0x01010101(%esi),%edi
xor %esi,%edi
not %esi
and $0x80808080,%esi
and %edi,%esi
jnz .Lfound4
mov 8(%eax),%esi
xor %ecx,%esi
lea -0x01010101(%esi),%edi
xor %esi,%edi
not %esi
and $0x80808080,%esi
and %edi,%esi
jnz .Lfound8
mov 12(%eax),%esi
xor %ecx,%esi
lea -0x01010101(%esi),%edi
xor %esi,%edi
not %esi
and $0x80808080,%esi
and %edi,%esi
jnz .Lfound12
{ no match in these 16 bytes: advance the read pointer past them }
add $16,%eax
{ Loop entry point: reached from the alignment loop once eax is 4-byte aligned. }
.Laligned:
sub $16,%edx
jae .Lloop { At least 16 bytes were remaining before the subtraction }
{ Process remaining bytes (<16 left at this point) }
{ length is offset by -16 at this point }
.Lloop2:
{ edx still carries the -16 bias, hence the comparison against 4-16 rather than 4 }
cmp $4-16,%edx { < 4 bytes left? }
jb .Ltail
mov (%eax),%esi
xor %ecx,%esi
lea -0x01010101(%esi),%edi
xor %esi,%edi
not %esi
and $0x80808080,%esi
and %edi,%esi
jne .Lfound
add $4,%eax
sub $4,%edx
jmp .Lloop2
.Ltail: { Less than 4 bytes remaining, check one by one }
{ Keep only the low two bits of edx: that is the leftover byte count (0..3), }
{ both on the short-buffer path (edx < 4, no bias) and on the biased path, }
{ because the bias of 16 is a multiple of 4. }
and $3, %edx
jz .Lnotfound
.Lloop3:
{ Only cl is compared, so this also works on the short-buffer path where ecx }
{ was never expanded into the four-lane pattern. }
cmp %cl,(%eax)
je .Lexit
inc %eax
dec %edx
jnz .Lloop3
.Lnotfound:
{ result := -1; jump past the pointer-to-offset conversion at .Lexit }
or $-1,%eax
jmp .Lexit1
{ add missing source pointer increments }
.Lfound12:
add $4,%eax
.Lfound8:
add $4,%eax
.Lfound4:
add $4,%eax
{ Locate the lowest flagged lane of esi (lowest address, x86 being little-endian). }
{ Each test that fails moves eax on by one byte; if all three fail, the match is }
{ in the top lane. A borrow in the subtraction can only disturb lanes above a real }
{ match, so the lowest flagged lane is the first matching byte. }
.Lfound:
test $0xff,%esi
jnz .Lexit
inc %eax
test $0xff00,%esi
jnz .Lexit
inc %eax
test $0xff0000,%esi
jnz .Lexit
inc %eax
{ eax points at the matching byte: subtract the saved buf (top of stack) to get the offset }
.Lexit:
sub (%esp),%eax
.Lexit1:
pop %ecx { removes initial 'buf' value }
pop %edi
pop %esi
end; end;
{$endif FPC_SYSTEM_HAS_FILLDWORD} {$endif FPC_SYSTEM_HAS_INDEXBYTE}
{$ifndef FPC_SYSTEM_HAS_INDEXWORD} {$ifndef FPC_SYSTEM_HAS_INDEXWORD}