{$ifndef FPC_SYSTEM_HAS_MOVE}
{$define FPC_SYSTEM_HAS_MOVE}

{ At least valgrind up to 3.3 has a bug which prevents the default code from working, so we use a rather simple implementation here. }
procedure Move_8OrMore_Valgrind; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
asm
    sub %eax, %edx { edx = dest - src }
    cmp %edx, %ecx
    ja .LBack { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
{$ifdef FPC_ENABLED_CLD}
    cld
{$endif FPC_ENABLED_CLD}
    push %esi
    push %edi
    mov %eax, %esi
    lea (%edx,%eax), %edi
    rep movsb
    pop %edi
    pop %esi
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret

.LBack:
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    add %ecx, %eax
.LNextb:
    dec %eax
    mov (%eax), %bl
    mov %bl, (%edx,%eax)
    dec %ecx
    jnz .LNextb
    pop %ebx
end;

{$if not defined(CPUX86_HAS_SSEUNIT) or defined(FASTMOVE_DISABLE_SSE)}
{$define fastmove_has_ia32_and_mmx}
procedure Move_8OrMore_IA32; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
asm
    fildq (%eax) { First and last 8 bytes, used both in .L33OrMore and ladder ending (.L9to16). }
    fildq -8(%eax,%ecx)
    cmp $16, %ecx
    jle .L9to16
    cmp $32, %ecx
    jg .L33OrMore
    fildq 8(%eax)
    fildq -16(%eax,%ecx)
    fistpq -16(%edx,%ecx)
    fistpq 8(%edx)
.L9to16:
    fistpq -8(%edx,%ecx) { 9–16 bytes }
    fistpq (%edx)
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret

.Lcancel:
    fstp %st(0) { Pop the "second int64 from the end" .L33OrMore loads. }
    fucompp { Pop two elements loaded at the beginning. }
    pop %ebx
    ret

    .byte 102,102,144 { Turns .balign 16 before .Lloop16f into a no-op. }
.L33OrMore:
    fildq -16(%eax,%ecx) { Second int64 from the end. }
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    sub %edx, %eax { eax = src - dest }
    jz .Lcancel { exit if src=dest }
    mov %eax, %ebx
    neg %ebx
    cmp %ebx, %ecx
    ja .Lback { count (ecx) > unsigned(dest - src) (edx) if regions overlap }

    mov %edx, %ebx { remember original dest to write first 8 bytes }
    add %edx, %ecx { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
    add $8, %edx
    and $-8, %edx
    sub %edx, %ecx
    sub $16, %ecx
    jbe .LPost16f

    .balign 16 { no-op }
.Lloop16f:
    fildq (%eax,%edx)
    fistpq (%edx)
    fildq 8(%eax,%edx)
    fistpq 8(%edx)
    add $16, %edx
    sub $16, %ecx
    ja .Lloop16f

.LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
    fistpq (%edx,%ecx)
    fistpq 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
    fistpq (%ebx) { Important for <8-byte step between src and dest. }
    pop %ebx
    ret

    .byte 102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop16b into a no-op. }

{ backwards move }
.Lback:
    fstp %st(0)
    fildq 8(%eax,%edx) { Second int64 from the start. }
    lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
    mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
    and $-8, %ecx
    sub %edx, %ecx
    add %ecx, %edx
    sub $16, %ecx
    jbe .LPost16b

    .balign 16 { no-op }
.Lloop16b:
    sub $16, %edx
    fildq 8(%eax,%edx)
    fistpq 8(%edx)
    fildq (%eax,%edx)
    fistpq (%edx)
    sub $16, %ecx
    ja .Lloop16b

.LPost16b:
    sub %ecx, %edx
    fistpq -8(%edx)
    fistpq -7(%ebx)
    fistpq -16(%edx)
    pop %ebx
end;

procedure Move_8OrMore_MMX; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
asm
    cmp $72, %ecx { Size at which using MMX becomes worthwhile. }
    jl Move_8OrMore_IA32
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    movq (%eax), %mm4 { First and last 8 bytes. }
    movq -8(%eax,%ecx), %mm5
    movq -16(%eax,%ecx), %mm3 { Second vector from the end. }
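    { The rest mirrors Move_8OrMore_IA32 above, with MMX registers instead of the x87 stack:
      the main loop copies 16 bytes per iteration through mm0, while mm4 (first 8 bytes),
      mm5 (last 8 bytes) and mm3 (second quadword from the end; reloaded as second-from-start
      on the backwards path) are stored only after the loop to cover the misaligned head and tail. }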
    sub %edx, %eax { eax = src - dest }
    jz .Lquit { exit if src=dest }
    mov %eax, %ebx
    neg %ebx
    cmp %ebx, %ecx
    ja .Lback { count (ecx) > unsigned(dest - src) (edx) if regions overlap }

    mov %edx, %ebx { remember original dest to write first 8 bytes }
    add %edx, %ecx { Move dest to the next 8-byte boundary. +8 if already aligned, as the first 8 bytes will be written separately anyway. }
    add $8, %edx
    and $-8, %edx
    sub %edx, %ecx
    sub $16, %ecx
    jbe .LPost16f

    .balign 16
.Lloop16f:
    movq (%eax,%edx), %mm0
    movq %mm0, (%edx)
    movq 8(%eax,%edx), %mm0
    movq %mm0, 8(%edx)
    add $16, %edx
    sub $16, %ecx
    ja .Lloop16f

.LPost16f: { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
    movq %mm3, (%edx,%ecx)
    movq %mm5, 8(%edx,%ecx) { Write first and last 8 bytes after everything else. }
    movq %mm4, (%ebx) { Important for <8-byte step between src and dest. }
.Lquit:
    emms
    pop %ebx
    ret

    .byte 144 { Turns .balign 16 before .Lloop16b into a no-op. }

{ backwards move }
.Lback:
    movq 8(%eax,%edx), %mm3 { Second vector from the start. }
    lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 8 bytes }
    mov %ebx, %ecx { move dest to the previous 8-byte boundary... }
    and $-8, %ecx
    sub %edx, %ecx
    add %ecx, %edx
    sub $16, %ecx
    jbe .LPost16b

    .balign 16 { no-op }
.Lloop16b:
    sub $16, %edx
    movq 8(%eax,%edx), %mm0
    movq %mm0, 8(%edx)
    movq (%eax,%edx), %mm0
    movq %mm0, (%edx)
    sub $16, %ecx
    ja .Lloop16b

.LPost16b:
    sub %ecx, %edx
    movq %mm3, -8(%edx)
    movq %mm4, -16(%edx)
    movq %mm5, -7(%ebx)
    emms
    pop %ebx
end;
{$endif need IA32 and MMX versions}

{$ifndef FASTMOVE_DISABLE_SSE}
label
  Move_8OrMore_SSE_9to15, Move_8OrMore_SSE_33OrMore, Move_8OrMore_SSE_CancelERMSBackwards;

const
  Move_NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }

procedure Move_8OrMore_SSE; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
const
  PrefetchDistance = 512;
asm
    cmp $15, %ecx
    jle Move_8OrMore_SSE_9to15
    movups (%eax), %xmm4 { First and last 16 bytes, used both in Move_8OrMore_SSE_33OrMore and the 16–32 branch. }
    movups -16(%eax,%ecx), %xmm5
    cmp $32, %ecx
    jg Move_8OrMore_SSE_33OrMore
    movups %xmm4, (%edx) { 16–32 bytes }
    movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret

Move_8OrMore_SSE_9to15:
    movlps (%eax), %xmm0
    movlps -8(%eax,%ecx), %xmm1
    movlps %xmm0, (%edx)
    movlps %xmm1, -8(%edx,%ecx)
.Lquit:
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret

    .byte {$ifndef FPC_PIC}102,{$endif}102,102,102,102,102,102,102,102,102,102,102,102,144 { Turns .balign 16 before .Lloop32f into a no-op. }
Move_8OrMore_SSE_33OrMore:
    movups -32(%eax,%ecx), %xmm3 { Second vector from the end. Wasted read if the .Lback branch is taken (it uses the second vector from the start instead), }
                                 { but -32(%eax,%ecx) is about to become not so easily accessible, .Lback is rare, and small .Lback is even rarer / matters even less. }
    sub %edx, %eax { eax = src - dest }
    jz .Lquit { exit if src=dest }
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    mov %eax, %ebx
    neg %ebx
    cmp %ebx, %ecx
    ja .Lback { count (ecx) > unsigned(dest - src) (edx) if regions overlap }

    mov %edx, %ebx { remember original dest to write first 16 bytes }
    add %edx, %ecx { Move dest to the next 16-byte boundary. +16 if already aligned, as the first 16 bytes will be written separately anyway. }
    add $16, %edx
    and $-16, %edx
    sub %edx, %ecx

.LRestAfterNTf:
    sub $32, %ecx { During the N× loop, ecx is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
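    { Whatever the loops below leave over (1..32 bytes) is covered by the overlapping stores of
      xmm3 and xmm5 at .LPost32f, and the possibly misaligned first 16 bytes by the deferred
      store of xmm4, so no byte-granular tail handling is needed. }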
    jbe .LPost32f
    cmp $Move_NtThreshold-32, %ecx
    jae .Lntf { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }

.LNtIsNotBetterF:
    test $15, %eax
    jz .Lalignedloop32f

    .balign 16 { no-op }
.Lloop32f:
    movups (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
    movups 16(%eax,%edx), %xmm0
    movaps %xmm0, 16(%edx)
    add $32, %edx
    sub $32, %ecx
    ja .Lloop32f

.LPost32f: { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
    movups %xmm3, (%edx,%ecx)
    movups %xmm5, 16(%edx,%ecx) { Write first and last 16 bytes after everything else. }
    movups %xmm4, (%ebx) { Important for <16-byte step between src and dest. }
    pop %ebx
    ret

    .balign 16
.Lalignedloop32f: { Same as above starting from .Lloop32f but with MOVAPSes. }
    movaps (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
    movaps 16(%eax,%edx), %xmm0
    movaps %xmm0, 16(%edx)
    add $32, %edx
    sub $32, %ecx
    ja .Lalignedloop32f

.LalignedPost32f:
    movups %xmm3, (%edx,%ecx)
    movups %xmm5, 16(%edx,%ecx)
    movups %xmm4, (%ebx)
    pop %ebx
    ret

.Lntf:
    cmp $Move_NtThreshold, %eax { Maybe change mind: don't bother bypassing the cache if src and dest are close to each other }
    jb .LNtIsNotBetterF { (this check is performed here to not stand in the way of smaller counts) }
    sub $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
    test $15, %eax
    jz .Lalignedntloop64f

    .balign 16
.Lntloop64f:
    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
    movups (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    movups 16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movups 32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movups 48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    add $64, %edx
    sub $64, %ecx
    jae .Lntloop64f

    sfence
    add $PrefetchDistance+64, %ecx
    jmp .LRestAfterNTf { go handle remaining bytes }

    .balign 16
.Lalignedntloop64f: { Same as above starting from .Lntloop64f but with MOVAPSes. }
    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
    movaps (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    movaps 16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movaps 32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movaps 48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    add $64, %edx
    sub $64, %ecx
    jae .Lalignedntloop64f

    sfence
    add $PrefetchDistance+64, %ecx
    jmp .LRestAfterNTf

    .byte {$ifdef FPC_PIC}102,{$endif}102,102,102,144 { Turns .balign 16 before .Lloop32b into a no-op. }
Move_8OrMore_SSE_CancelERMSBackwards:
    { Adapt from Move_8OrMore_SSE_ERMS.Lback, where eax = src, edx = dest - src, xmm4 and xmm5 aren't read, and ebx isn't pushed if not FPC_PIC. }
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    add %eax, %edx
    movups (%eax), %xmm4
    movups -16(%eax,%ecx), %xmm5
    sub %edx, %eax

{ backwards move }
.Lback:
    movups 16(%eax,%edx), %xmm3 { Second vector from the start. }
    lea -1(%edx,%ecx), %ebx { points to the end of dest; remember to write last 16 bytes }
    mov %ebx, %ecx { move dest to the previous 16-byte boundary... }
    and $-16, %ecx
    sub %edx, %ecx
    add %ecx, %edx

.LRestAfterNTb:
    sub $32, %ecx
    jbe .LPost32b
    cmp $Move_NtThreshold-32, %ecx
    jae .Lntb

    .balign 16 { no-op }
.Lloop32b:
    sub $32, %edx
    movups 16(%eax,%edx), %xmm0
    movaps %xmm0, 16(%edx)
    movups (%eax,%edx), %xmm0
    movaps %xmm0, (%edx)
    sub $32, %ecx
    ja .Lloop32b

.LPost32b:
    sub %ecx, %edx
    movups %xmm3, -16(%edx)
    movups %xmm4, -32(%edx)
    movups %xmm5, -15(%ebx)
    pop %ebx
    ret

.Lntb:
    cmp $-Move_NtThreshold, %eax { Maybe change mind: don't bother bypassing the cache if src and dest are close to each other. }
    ja .Lloop32b
    sub $PrefetchDistance+32, %ecx

    .balign 16
.Lntloop64b:
    prefetchnta -PrefetchDistance(%eax,%edx,1)
    sub $64, %edx
    movups 48(%eax,%edx,1), %xmm0
    movntps %xmm0, 48(%edx)
    movups 32(%eax,%edx,1), %xmm0
    movntps %xmm0, 32(%edx)
    movups 16(%eax,%edx,1), %xmm0
    movntps %xmm0, 16(%edx)
    movups (%eax,%edx,1), %xmm0
    movntps %xmm0, (%edx)
    sub $64, %ecx
    jae .Lntloop64b

    sfence
    add $PrefetchDistance+64, %ecx
    jmp .LRestAfterNTb
end;

procedure Move_8OrMore_SSE_ERMS; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
const
  ErmsThreshold = 1536;
asm
    cmp $15, %ecx
    jle Move_8OrMore_SSE_9to15
    cmp $ErmsThreshold, %ecx
    jae .LRepMovs
    movups (%eax), %xmm4 { Same as in Move_8OrMore_SSE. }
    movups -16(%eax,%ecx), %xmm5
    cmp $32, %ecx
    jg Move_8OrMore_SSE_33OrMore
    movups %xmm4, (%edx) { 16–32 bytes }
    movups %xmm5, -16(%edx,%ecx)
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret

.LRepMovs:
    sub %eax, %edx { edx = dest - src }
    jz .Lquit { exit if src=dest }
    cmp %edx, %ecx { count (ecx) > unsigned(dest - src) (edx) if regions overlap }
    ja .Lback
    cmp $Move_NtThreshold+16, %ecx
    jae .LNtF { Even enhanced REP MOV does not seem to use NT, so it falls behind on huge moves. So prioritize NT done by Move_8OrMore_SSE. }

.LNtIsNotBetterF:
    push %esi
    push %edi
    mov %eax, %esi
    lea (%edx,%eax), %edi
    rep movsb
    pop %edi
    pop %esi
.Lquit:
{$ifdef FPC_PIC}
    pop %ebx
{$endif}
    ret

.LNtF:
    cmp $-Move_NtThreshold, %edx { Check move distance. Bad case for forward NT is 0 < src - dest < NtThreshold => unsigned(dest - src) > unsigned(-NtThreshold). }
    ja .LNtIsNotBetterF { NT is not better and Move_8OrMore_SSE won't perform it either. Use REP MOVSB. }
    add %eax, %edx { Recover edx = dest. }
    jmp Move_8OrMore_SSE { Will perform NT. }

.Lback:
    {   dst = 3
              v
      Move(abcdefghijXXX, count=10)
           ^
           src = 0

      = abcABCDEFGHIJ

      can be moved right to left in non-overlapping groups of "dst - src":

      abcdefghijHIJ
                ^^^
      abcdefgEFGhij
             ^^^
      abcdBCDefghij
          ^^^
      abcAbcdefghij <- tail is handled by restarting the Move with the corresponding count instead, as it can have 0 to dst - src - 1 bytes.
         ^

      Only REP MOVs with DF=0 are fast with ERMS, in case you're wondering why not just use DF=1. }
    cmp $ErmsThreshold, %edx
    jb Move_8OrMore_SSE_CancelERMSBackwards { Unfortunately this branch can't benefit the common case of small distance (like inserting 1 array element into the beginning). :( }
    cmp $Move_NtThreshold+16, %ecx
    jae .LNtB

.LNtIsNotBetterB:
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    mov %ecx, %ebx { ebx = remaining }
    sub %edx, %ebx { edx = dst - src = step; remaining -= step. }
    add %ecx, %eax
    push %esi
    push %edi
.LRepMovsNextPieceB: { At least 1 iteration is always performed. }
    mov %eax, %edi { edi = src before subtracting step = dst = rep movsb dest }
    sub %edx, %eax { src -= step }
    mov %eax, %esi { esi = src = rep movsb source }
    mov %edx, %ecx { ecx = step = rep movsb count }
    rep movsb
    sub %edx, %ebx { remaining -= step }
    jnc .LRepMovsNextPieceB { CF=1 after remaining -= step means that remaining became strictly negative and the loop must be stopped/not performed. }
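    { Loop exit: fewer than "step" bytes remain. The code below recovers src, dest and the
      remaining count from eax/edx/ebx and restarts Move for that final short piece. }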
    pop %edi
    pop %esi
    lea (%edx,%ebx), %ecx { ecx = remaining }
    sub %ecx, %eax { eax = src }
    add %eax, %edx { edx = dest }
    pop %ebx
    jmp Move { Remaining piece ("a" in the example above). Can save a bit of jumps by doing more checks and jumping to more specific places, but whatever. }

.LNtB:
    cmp $Move_NtThreshold, %edx { Check move distance. Bad case for backward NT is dest - src < NtThreshold; src is always < dest. }
    jb .LNtIsNotBetterB { NT is not better and Move_8OrMore_SSE won't perform it either. Use REP MOVSB. }
    add %eax, %edx { Recover edx = dest. }
    jmp Move_8OrMore_SSE { Will perform NT. }
end;
{$endif ndef FASTMOVE_DISABLE_SSE}

procedure Move_8OrMore_Dispatch; forward;

var
  fastmoveproc : pointer = @Move_8OrMore_Dispatch;
{$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  valgrind_used : boolean; external name '__fpc_valgrind';
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}

function Move_8OrMore_HumanFriendlyDispatch: pointer;
begin
  { workaround valgrind bug }
{$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  if EntryInformation.valgrind_used then
{$else FPC_HAS_INDIRECT_ENTRY_INFORMATION}
  if valgrind_used then
{$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION}
    result:=@Move_8OrMore_Valgrind
{$ifndef FASTMOVE_DISABLE_SSE}
  else if fast_large_repmovstosb then
    result:=@Move_8OrMore_SSE_ERMS
  else
{$ifdef fastmove_has_ia32_and_mmx}
  if has_sse_support then
{$endif}
    result:=@Move_8OrMore_SSE
{$endif ndef FASTMOVE_DISABLE_SSE}
{$ifdef fastmove_has_ia32_and_mmx}
  else if has_mmx_support then
    result:=@Move_8OrMore_MMX
  else
    result:=@Move_8OrMore_IA32
{$endif fastmove_has_ia32_and_mmx};
  if fpc_cpucodeinit_performed then
    fastmoveproc:=result;
end;

{ The first large Move lands here; Move_8OrMore_HumanFriendlyDispatch picks the best
  implementation and, once fpc_cpucodeinit has run, caches it in fastmoveproc. }
procedure Move_8OrMore_Dispatch; assembler; nostackframe;
{ eax = source, edx = dest, ecx = count (ecx >= 8). If FPC_PIC: ebx pushed. }
asm
{$ifndef FPC_PIC}
    push %ebx
{$endif}
    push %eax
    push %edx
    push %ecx
    call Move_8OrMore_HumanFriendlyDispatch
    mov %eax, %ebx
    pop %ecx
    pop %edx
    pop %eax
{$ifdef FPC_PIC}
    jmp %ebx
{$else}
    call %ebx
    pop %ebx
{$endif}
end;

procedure Move(const source;var dest;count:SizeInt); [public, alias: 'FPC_MOVE']; assembler; nostackframe;
asm
    cmp $8, %ecx
    jle .L8OrLess
{$ifdef FPC_PIC}
    push %ebx
    call fpc_geteipasebx
    addl $_GLOBAL_OFFSET_TABLE_, %ebx
    movl fastmoveproc@GOT(%ebx), %ebx
    jmp (%ebx)
{$else}
    jmp fastmoveproc
{$endif}

.L8OrLess: { 4–8 bytes: copy the first and last dwords; for counts below 8 they overlap. }
    cmp $3, %ecx
    jle .L3OrLess
    push %ebx
    mov (%eax), %ebx
    mov -4(%eax,%ecx), %eax
    mov %ebx, (%edx)
    mov %eax, -4(%edx,%ecx)
    pop %ebx
    ret

.L3OrLess: { 1–3 bytes (counts <= 0 just exit): load everything first so overlap is safe. }
    cmp $1, %ecx
    jl .LZero
    push %ebx
    movzbl (%eax), %ebx
    je .LOne
    movzwl -2(%eax,%ecx), %eax
    mov %ax, -2(%edx,%ecx)
.LOne:
    mov %bl, (%edx)
    pop %ebx
.LZero:
end;
{$endif FPC_SYSTEM_HAS_MOVE}
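{ Illustrative usage sketch only (not part of this include; "buf" is a hypothetical variable,
  and an initialized variable in mode objfpc/delphi is assumed). It mirrors the "abcdefghij"
  example in Move_8OrMore_SSE_ERMS: overlapping regions are handled by the backwards paths.

      var
        buf: array[0..12] of AnsiChar = 'abcdefghijXXX';
      begin
        Move(buf[0], buf[3], 10);  // buf now holds 'abcabcdefghij'
      end.
}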