diff --git a/rtl/i386/fastmove.inc b/rtl/i386/fastmove.inc
index 78bde37d5a..bfe710694d 100644
--- a/rtl/i386/fastmove.inc
+++ b/rtl/i386/fastmove.inc
@@ -1,907 +1,533 @@
-{
-  Copyright (c) 2004, John O'Harrow (john@almcrest.demon.co.uk)
-
-This software is provided 'as-is', without any express or implied warranty.
-In no event will the authors be held liable for any damages arising from the
-use of this software.
-
-Permission is granted to anyone to use this software for any purpose, including
-commercial applications, and to alter it and redistribute it freely, subject to
-the following restrictions:
-
-1. The origin of this software must not be misrepresented; you must not claim
-   that you wrote the original software. If you use this software in a product,
-   an acknowledgment in the product documentation would be appreciated but is
-   not required.
-
-2. Altered source versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.
-
-3. This notice may not be removed or altered from any source distribution.
-
--------------------------------------------------------------------------------
-
-Version: 1.40 - 16-SEP-2004
-}
-
-{$ifdef USE_FASTMOVE}
-
 {$ifndef FPC_SYSTEM_HAS_MOVE}
 {$define FPC_SYSTEM_HAS_MOVE}
-{$asmmode intel}
+{ at least valgrind up to 3.3 has a bug which prevents the default code from
+  working, so we use a rather simple implementation here }
+procedure Move_8OrMore_Valgrind; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). Pops the pushed ebx on exit. Copies forward with rep movsb, or backward one byte at a time when src < dest and the regions overlap. }
+asm
+    sub    %edx, %eax            { eax = src - dest }
+    jae    .LForward
+    mov    %ecx, %ebx
+    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
+    jb     .LBack                { if no overlap, still do forward move }
+
+.LForward:
+{$ifdef FPC_ENABLED_CLD}
+    cld
+{$endif FPC_ENABLED_CLD}
+    push   %esi
+    push   %edi
+    lea    (%eax,%edx), %esi     { esi = src, rebuilt from (src - dest) + dest }
+    mov    %edx, %edi
+    rep movsb
+    pop    %edi
+    pop    %esi
+    pop    %ebx
+    ret
+
+.LBack:
+    add    %ecx, %edx            { edx = one past the end of dest }
+.LNextb:
+    dec    %edx
+    mov    (%eax,%edx), %bl      { (eax,edx) = matching source byte, as eax = src - dest }
+    mov    %bl, (%edx)
+    dec    %ecx
+    jnz    .LNextb
+    pop    %ebx
+end;
+
+procedure Move_8OrMore_IA32; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). Pops the pushed ebx on exit. Copies 8 bytes at a time through the x87 stack (fildq/fistpq); uses up to 4 x87 stack slots and pops everything it pushed on every exit path. }
+asm
+    fildq  (%eax)                { First and last 8 bytes, used both in .L33OrMore and ladder ending (.L9to16). }
+    fildq  -8(%eax,%ecx)
+    cmp    $16, %ecx
+    jle    .L9to16
+    cmp    $32, %ecx
+    jg     .L33OrMore
+    fildq  8(%eax)
+    fildq  -16(%eax,%ecx)
+    fistpq -16(%edx,%ecx)
+    fistpq 8(%edx)
+.L9to16:
+    fistpq -8(%edx,%ecx)         { 9–16 bytes }
+    fistpq (%edx)
+    pop    %ebx
+    ret
+
+.Lcancel:
+    fucompp                      { Pop two elements loaded at the beginning. }
+    pop    %ebx
+    ret
+    .byte  0x66,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16f into a no-op. }
+
+.L33OrMore:
+    sub    %edx, %eax            { eax = src - dest }
+    jz     .Lcancel              { exit if src=dest }
+    jnb    .LForward             { src>dest => forward move }
+
+    mov    %ecx, %ebx
+    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
+    jb     .Lback                { if no overlap, still do forward move }
+
+.LForward:
+    mov    %edx, %ebx            { remember original dest to write first 8 bytes }
+    add    %edx, %ecx            { Move dest to the next 8-byte boundary. +8 if already aligned, as first 8 bytes will be written separately anyway. }
+    add    $8, %edx
+    and    $-8, %edx
+    sub    %edx, %ecx
+
+    sub    $16, %ecx
+    jbe    .LPost16f
+
+    .balign 16                   { no-op }
+.Lloop16f:
+    fildq  (%eax,%edx)
+    fistpq (%edx)
+    fildq  8(%eax,%edx)
+    fistpq 8(%edx)
+    add    $16, %edx
+    sub    $16, %ecx
+    ja     .Lloop16f
+
+.LPost16f:                       { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
+    cmp    $-8, %ecx
+    jle    .LFirstAndLast8f
+    fildq  (%eax,%edx)
+    fistpq (%edx)
+.LFirstAndLast8f:
+    fistpq 8(%edx,%ecx)          { Write first and last 8 bytes after everything else. }
+    fistpq (%ebx)                { Important for <8-byte step between src and dest. }
+    pop    %ebx
+    ret
+    .byte  0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16b into a no-op. }
+
+{ backwards move }
+.Lback:
+    lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 8 bytes }
+    mov    %ebx, %ecx            { move dest to the previous 8-byte boundary... }
+    and    $-8, %ecx
+    sub    %edx, %ecx
+    add    %ecx, %edx
+
+    sub    $16, %ecx
+    jbe    .LPost16b
+
+    .balign 16                   { no-op }
+.Lloop16b:
+    sub    $16, %edx
+    fildq  8(%eax,%edx)
+    fistpq 8(%edx)
+    fildq  (%eax,%edx)
+    fistpq (%edx)
+    sub    $16, %ecx
+    ja     .Lloop16b
+
+.LPost16b:
+    cmp    $-8, %ecx
+    jle    .LFirstAndLast8b
+    fildq  -8(%eax,%edx)
+    fistpq -8(%edx)
+.LFirstAndLast8b:
+    sub    %ecx, %edx
+    fistpq -7(%ebx)              { last 8 bytes (top of x87 stack): ebx = dest + count - 1 }
+    fistpq -16(%edx)             { first 8 bytes }
+    pop    %ebx
+end;
+
+procedure Move_8OrMore_MMX; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). Pops the pushed ebx on exit. Counts below 72 are handed to Move_8OrMore_IA32; otherwise copies 8 bytes at a time through mm0/mm4/mm5 and executes emms before returning. }
+asm
+    cmp    $72, %ecx             { Size at which using MMX becomes worthwhile. }
+    jl     Move_8OrMore_IA32
+    movq   (%eax), %mm4          { First and last 8 bytes. }
+    movq   -8(%eax,%ecx), %mm5
+    sub    %edx, %eax            { eax = src - dest }
+    jz     .Lquit                { exit if src=dest }
+    jnb    .LForward             { src>dest => forward move }
+
+    mov    %ecx, %ebx
+    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
+    jb     .Lback                { if no overlap, still do forward move }
+
+.LForward:
+    mov    %edx, %ebx            { remember original dest to write first 8 bytes }
+    add    %edx, %ecx            { Move dest to the next 8-byte boundary. +8 if already aligned, as first 8 bytes will be written separately anyway. }
+    add    $8, %edx
+    and    $-8, %edx
+    sub    %edx, %ecx
+
+    sub    $16, %ecx
+    jbe    .LPost16f
+
+    .balign 16
+.Lloop16f:
+    movq   (%eax,%edx), %mm0
+    movq   %mm0, (%edx)
+    movq   8(%eax,%edx), %mm0
+    movq   %mm0, 8(%edx)
+    add    $16, %edx
+    sub    $16, %ecx
+    ja     .Lloop16f
+
+.LPost16f:                       { +16 fixup not applied after 16× loop, ecx = remaining - 16 here. }
+    cmp    $-8, %ecx
+    jle    .LFirstAndLast8f
+    movq   (%eax,%edx), %mm0
+    movq   %mm0, (%edx)
+.LFirstAndLast8f:
+    movq   %mm5, 8(%edx,%ecx)    { Write first and last 8 bytes after everything else. }
+    movq   %mm4, (%ebx)          { Important for <8-byte step between src and dest. }
+.Lquit:
+    emms
+    pop    %ebx
+    ret
+    .byte  0x66,0x66,0x66,0x66,0x66,0x2E,0x0F,0x1F,0x84,0,0,0,0,0 { Turns .balign 16 before .Lloop16b into a no-op. }
+
+{ backwards move }
+.Lback:
+    lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 8 bytes }
+    mov    %ebx, %ecx            { move dest to the previous 8-byte boundary... }
+    and    $-8, %ecx
+    sub    %edx, %ecx
+    add    %ecx, %edx
+
+    sub    $16, %ecx
+    jbe    .LPost16b
+
+    .balign 16                   { no-op }
+.Lloop16b:
+    sub    $16, %edx
+    movq   8(%eax,%edx), %mm0
+    movq   %mm0, 8(%edx)
+    movq   (%eax,%edx), %mm0
+    movq   %mm0, (%edx)
+    sub    $16, %ecx
+    ja     .Lloop16b
+
+.LPost16b:
+    cmp    $-8, %ecx
+    jle    .LFirstAndLast8b
+    movq   -8(%eax,%edx), %mm0
+    movq   %mm0, -8(%edx)
+.LFirstAndLast8b:
+    sub    %ecx, %edx
+    movq   %mm4, -16(%edx)       { first 8 bytes }
+    movq   %mm5, -7(%ebx)        { last 8 bytes: ebx = dest + count - 1 }
+    emms
+    pop    %ebx
+end;
+
+{$ifndef FASTMOVE_DISABLE_SSE}
+procedure Move_8OrMore_SSE; assembler; nostackframe;
+{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). Pops the pushed ebx on exit. Uses xmm0, xmm1, xmm4 and xmm5 without preserving them. Kept to SSE1 instructions: 8-byte transfers use movlps, because movq with an xmm operand is SSE2. }
+const
+  ErmsThreshold = 1536;
+  NtThreshold = 256 * 1024; { this limit must be processor-specific (1/2 L2 cache size) }
+  PrefetchDistance = 512;
+asm
+    cmp    $16, %ecx
+    jle    .L9to16
+    movups (%eax), %xmm4         { First and last 16 bytes, used both in .L33OrMore and 17–32 branch. }
+    movups -16(%eax,%ecx), %xmm5
+    cmp    $32, %ecx
+    jg     .L33OrMore
+    movups %xmm4, (%edx)         { 17–32 bytes }
+    movups %xmm5, -16(%edx,%ecx)
+    pop    %ebx
+    ret
+
+.L9to16:
+    movlps (%eax), %xmm0         { movlps moves the low 8 bytes only; upper halves of xmm0/xmm1 are don't-care here. }
+    movlps -8(%eax,%ecx), %xmm1
+    movlps %xmm0, (%edx)
+    movlps %xmm1, -8(%edx,%ecx)
+.Lquit:
+    pop    %ebx
+    ret
+    .byte  0x66,0x90             { 2-byte NOP. Turns .balign 16 before .Lloop32f into a no-op (code up to here is 46 bytes with the movlps encodings, assuming a 16-byte aligned entry). }
+
+.L33OrMore:
+    sub    %edx, %eax            { eax = src - dest }
+    jz     .Lquit                { exit if src=dest }
+    jnb    .LForward             { src>dest => forward move }
+
+    mov    %ecx, %ebx
+    add    %eax, %ebx            { eax is negative => ecx+eax > 0 if regions overlap }
+    jb     .Lback                { if no overlap, still do forward move }
+
+.LForward:
+    mov    %edx, %ebx            { remember original dest to write first 16 bytes }
+    add    %edx, %ecx            { Move dest to the next 16-byte boundary. +16 if already aligned, as first 16 bytes will be written separately anyway. }
+    add    $16, %edx
+    and    $-16, %edx
+    sub    %edx, %ecx
+
+.LRestAfterNTf:
+    sub    $32, %ecx             { During the N× loop, ecx is N bytes less than actually remained to allow sub N+jae .LLoop instead of sub N+cmp N+jae .LLoop. }
+    jbe    .LPost32f
+    cmp    $NtThreshold-32, %ecx
+    jae    .Lntf                 { might jump back right away after more checks, but the branch is taken only on huge moves so it's better to take these checks out of here... }
+.LNtIsNotBetter:
+    cmp    $ErmsThreshold-32, %ecx { Even enhanced REP MOV does not seem to use NT so falls behind on huge moves. So prioritize NT. }
+    jae    .LRepMovsF
+.LRepMovsIsNotBetter:
+    test   $15, %eax             { src - dest multiple of 16 => source is 16-byte aligned whenever dest is }
+    jz     .Lalignedloop32f
+
+    .balign 16                   { no-op }
+.Lloop32f:
+    movups (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+    movups 16(%eax,%edx), %xmm0
+    movaps %xmm0, 16(%edx)
+    add    $32, %edx
+    sub    $32, %ecx
+    ja     .Lloop32f
+
+.LPost32f:                       { +32 fixup not applied after 32× loop, ecx = remaining - 32 here. }
+    cmp    $-16, %ecx
+    jle    .LFirstAndLast16f
+    movups (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+.LFirstAndLast16f:
+    movups %xmm5, 16(%edx,%ecx)  { Write first and last 16 bytes after everything else. }
+    movups %xmm4, (%ebx)         { Important for <16-byte step between src and dest. }
+    pop    %ebx
+    ret
+
+    .balign 16
+.Lalignedloop32f:                { Same as above starting from .Lloop32f but with MOVAPSes. }
+    movaps (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+    movaps 16(%eax,%edx), %xmm0
+    movaps %xmm0, 16(%edx)
+    add    $32, %edx
+    sub    $32, %ecx
+    ja     .Lalignedloop32f
+
+.LalignedPost32f:
+    cmp    $-16, %ecx
+    jle    .LalignedFirstAndLast16f
+    movaps (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+.LalignedFirstAndLast16f:
+    movups %xmm5, 16(%edx,%ecx)
+    movups %xmm4, (%ebx)
+    pop    %ebx
+    ret
+
+.LRepMovsF:
+{$ifdef FPC_PIC}
+    push   %ebx
+    call   fpc_geteipasebx
+    addl   $_GLOBAL_OFFSET_TABLE_, %ebx
+    movl   fast_large_repmovstosb@GOT(%ebx), %ebx
+    cmpb   $1, (%ebx)
+    pop    %ebx
+{$else FPC_PIC}
+    cmpb   $1, fast_large_repmovstosb
+{$endif FPC_PIC}
+    jne    .LRepMovsIsNotBetter
+    push   %esi
+    push   %edi
+    lea    (%eax,%edx), %esi
+    mov    %edx, %edi
+    add    $32, %ecx             { undo the -32 bias: ecx = bytes remaining from the aligned dest }
+    rep movsb                    { NOTE(review): no cld under FPC_ENABLED_CLD here, unlike Move_8OrMore_Valgrind - confirm this is intended. }
+    movups %xmm4, (%ebx)         { last 16 aren't required }
+    pop    %edi
+    pop    %esi
+    pop    %ebx
+    ret
+
+.Lntf:
+    cmp    $NtThreshold, %eax    { Maybe change mind: don't bother bypassing cache if src and dest are close to each other }
+    jb     .LNtIsNotBetter       { (this check is performed here to not stand in the way of smaller counts) }
+    sub    $PrefetchDistance+32, %ecx { ecx = remaining - prefetch distance - bytes per loop (64), but 32 was subtracted already. }
+    test   $15, %eax
+    jz     .Lalignedntloop64f
+
+    .balign 16
+.Lntloop64f:
+    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
+    movups (%eax,%edx,1), %xmm0
+    movntps %xmm0, (%edx)
+    movups 16(%eax,%edx,1), %xmm0
+    movntps %xmm0, 16(%edx)
+    movups 32(%eax,%edx,1), %xmm0
+    movntps %xmm0, 32(%edx)
+    movups 48(%eax,%edx,1), %xmm0
+    movntps %xmm0, 48(%edx)
+    add    $64, %edx
+    sub    $64, %ecx
+    jae    .Lntloop64f
+
+    sfence
+    add    $PrefetchDistance+64, %ecx
+    jmp    .LRestAfterNTf        { go handle remaining bytes }
+
+    .balign 16
+.Lalignedntloop64f:              { Same as above starting from .Lntloop64f but with MOVAPSes. }
+    prefetchnta 0+PrefetchDistance(%eax,%edx,1)
+    movaps (%eax,%edx,1), %xmm0
+    movntps %xmm0, (%edx)
+    movaps 16(%eax,%edx,1), %xmm0
+    movntps %xmm0, 16(%edx)
+    movaps 32(%eax,%edx,1), %xmm0
+    movntps %xmm0, 32(%edx)
+    movaps 48(%eax,%edx,1), %xmm0
+    movntps %xmm0, 48(%edx)
+    add    $64, %edx
+    sub    $64, %ecx
+    jae    .Lalignedntloop64f
+
+    sfence
+    add    $PrefetchDistance+64, %ecx
+    jmp    .LRestAfterNTf
+    .byte  0x66,0x0F,0x1F,0x44,0,0 { Turns .balign 16 before .Lloop32b into a no-op. }
+
+{ backwards move }
+.Lback:
+    lea    -1(%edx,%ecx), %ebx   { points to the end of dest; remember to write last 16 bytes }
+    mov    %ebx, %ecx            { move dest to the previous 16-byte boundary... }
+    and    $-16, %ecx
+    sub    %edx, %ecx
+    add    %ecx, %edx
+
+.LRestAfterNTb:
+    sub    $32, %ecx
+    jbe    .LPost32b
+    cmp    $NtThreshold-32, %ecx
+    jae    .Lntb
+
+    .balign 16                   { no-op }
+.Lloop32b:
+    sub    $32, %edx
+    movups 16(%eax,%edx), %xmm0
+    movaps %xmm0, 16(%edx)
+    movups (%eax,%edx), %xmm0
+    movaps %xmm0, (%edx)
+    sub    $32, %ecx
+    ja     .Lloop32b
+
+.LPost32b:
+    cmp    $-16, %ecx
+    jle    .LFirstAndLast16b
+    movups -16(%eax,%edx), %xmm0
+    movaps %xmm0, -16(%edx)
+.LFirstAndLast16b:
+    sub    %ecx, %edx
+    movups %xmm4, -32(%edx)      { first 16 bytes }
+    movups %xmm5, -15(%ebx)      { last 16 bytes: ebx = dest + count - 1 }
+    pop    %ebx
+    ret
+
+.Lntb:
+    cmp    $-NtThreshold, %eax   { eax = src - dest is negative here; skip NT if src and dest are closer than NtThreshold }
+    jnb    .Lloop32b
+    sub    $PrefetchDistance+32, %ecx
+
+    .balign 16
+.Lntloop64b:
+    prefetchnta -PrefetchDistance(%eax,%edx,1)
+    sub    $64, %edx
+    movups 48(%eax,%edx,1), %xmm0
+    movntps %xmm0, 48(%edx)
+    movups 32(%eax,%edx,1), %xmm0
+    movntps %xmm0, 32(%edx)
+    movups 16(%eax,%edx,1), %xmm0
+    movntps %xmm0, 16(%edx)
+    movups (%eax,%edx,1), %xmm0
+    movntps %xmm0, (%edx)
+    sub    $64, %ecx
+    jae    .Lntloop64b
+
+    sfence
+    add    $PrefetchDistance+64, %ecx
+    jmp    .LRestAfterNTb
+end;
+{$endif ndef FASTMOVE_DISABLE_SSE}
+
+procedure Move_8OrMore_Dispatch; forward;
-{-------------------------------------------------------------------------}
-(*
-{Just to show that a good Pascal algorithm can beat the default BASM}
-procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
 var
-  S, D       : PtrUInt;
-  Temp, C, I : PtrInt;
-  L          : PPtrInt;
-begin
-  S := Cardinal(@Source);
-  D := Cardinal(@Dest);
-  if S = D then
-    Exit;
-  if Count <= 4 then
-    case Count of
-      1 : PByte(@Dest)^ := PByte(S)^;
-      2 : PWord(@Dest)^ := PWord(S)^;
-      3 : if D > S then
-            begin
-              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
-              PWord(@Dest)^ := PWord(S)^;
-            end
-          else
-            begin
-              PWord(@Dest)^ := PWord(S)^;
-              PByte(Integer(@Dest)+2)^ := PByte(S+2)^;
-            end;
-      4 : PInteger(@Dest)^ := PInteger(S)^
-      else Exit; {Count <= 0}
-    end
-  else
-    if D > S then
-      begin
-        Temp := PInteger(S)^;
-        I := Integer(@Dest);
-        C := Count - 4;
-        L :=
PInteger(Integer(@Dest) + C); - Inc(S, C); - repeat - L^ := PInteger(S)^; - if Count <= 8 then - Break; - Dec(Count, 4); - Dec(S, 4); - Dec(L); - until False; - PInteger(I)^ := Temp; - end - else - begin - C := Count - 4; - Temp := PInteger(S + Cardinal(C))^; - I := Integer(@Dest) + C; - L := @Dest; - repeat - L^ := PInteger(S)^; - if Count <= 8 then - Break; - Dec(Count, 4); - Inc(S, 4); - Inc(L); - until False; - PInteger(I)^ := Temp; - end; -end; {MoveJOH_PAS} -*) - -const - SMALLMOVESIZE = 36; - -{-------------------------------------------------------------------------} -{Perform Forward Move of 0..36 Bytes} -{On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count. Destroys ECX} -procedure SmallForwardMove_3;assembler;nostackframe; -asm - jmp dword ptr @@FwdJumpTable[ecx*4] - align 16 -@@FwdJumpTable: - dd @@Done {Removes need to test for zero size move} - dd @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08 - dd @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16 - dd @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24 - dd @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32 - dd @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36 -@@Fwd36: - mov ecx,[eax-36] - mov [edx-36],ecx -@@Fwd32: - mov ecx,[eax-32] - mov [edx-32],ecx -@@Fwd28: - mov ecx,[eax-28] - mov [edx-28],ecx -@@Fwd24: - mov ecx,[eax-24] - mov [edx-24],ecx -@@Fwd20: - mov ecx,[eax-20] - mov [edx-20],ecx -@@Fwd16: - mov ecx,[eax-16] - mov [edx-16],ecx -@@Fwd12: - mov ecx,[eax-12] - mov [edx-12],ecx -@@Fwd08: - mov ecx,[eax-8] - mov [edx-8],ecx -@@Fwd04: - mov ecx,[eax-4] - mov [edx-4],ecx - ret -@@Fwd35: - mov ecx,[eax-35] - mov [edx-35],ecx -@@Fwd31: - mov ecx,[eax-31] - mov [edx-31],ecx -@@Fwd27: - mov ecx,[eax-27] - mov [edx-27],ecx -@@Fwd23: - mov ecx,[eax-23] - mov [edx-23],ecx -@@Fwd19: - mov ecx,[eax-19] - mov [edx-19],ecx -@@Fwd15: - mov ecx,[eax-15] - mov [edx-15],ecx -@@Fwd11: - mov ecx,[eax-11] - mov [edx-11],ecx -@@Fwd07: - mov ecx,[eax-7] - mov 
[edx-7],ecx - mov ecx,[eax-4] - mov [edx-4],ecx - ret -@@Fwd03: - movzx ecx, word ptr [eax-3] - mov [edx-3],cx - movzx ecx, byte ptr [eax-1] - mov [edx-1],cl - ret -@@Fwd34: - mov ecx,[eax-34] - mov [edx-34],ecx -@@Fwd30: - mov ecx,[eax-30] - mov [edx-30],ecx -@@Fwd26: - mov ecx,[eax-26] - mov [edx-26],ecx -@@Fwd22: - mov ecx,[eax-22] - mov [edx-22],ecx -@@Fwd18: - mov ecx,[eax-18] - mov [edx-18],ecx -@@Fwd14: - mov ecx,[eax-14] - mov [edx-14],ecx -@@Fwd10: - mov ecx,[eax-10] - mov [edx-10],ecx -@@Fwd06: - mov ecx,[eax-6] - mov [edx-6],ecx -@@Fwd02: - movzx ecx, word ptr [eax-2] - mov [edx-2],cx - ret -@@Fwd33: - mov ecx,[eax-33] - mov [edx-33],ecx -@@Fwd29: - mov ecx,[eax-29] - mov [edx-29],ecx -@@Fwd25: - mov ecx,[eax-25] - mov [edx-25],ecx -@@Fwd21: - mov ecx,[eax-21] - mov [edx-21],ecx -@@Fwd17: - mov ecx,[eax-17] - mov [edx-17],ecx -@@Fwd13: - mov ecx,[eax-13] - mov [edx-13],ecx -@@Fwd09: - mov ecx,[eax-9] - mov [edx-9],ecx -@@Fwd05: - mov ecx,[eax-5] - mov [edx-5],ecx -@@Fwd01: - movzx ecx, byte ptr [eax-1] - mov [edx-1],cl -@@Done: -end; {SmallForwardMove} - -{-------------------------------------------------------------------------} -{Perform Backward Move of 0..36 Bytes} -{On Entry, ECX = Count, EAX = Source, EDX = Dest. 
Destroys ECX} -procedure SmallBackwardMove_3;assembler;nostackframe; -asm - jmp dword ptr @@BwdJumpTable[ecx*4] - align 16 -@@BwdJumpTable: - dd @@Done {Removes need to test for zero size move} - dd @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08 - dd @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16 - dd @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24 - dd @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32 - dd @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36 -@@Bwd36: - mov ecx,[eax+32] - mov [edx+32],ecx -@@Bwd32: - mov ecx,[eax+28] - mov [edx+28],ecx -@@Bwd28: - mov ecx,[eax+24] - mov [edx+24],ecx -@@Bwd24: - mov ecx,[eax+20] - mov [edx+20],ecx -@@Bwd20: - mov ecx,[eax+16] - mov [edx+16],ecx -@@Bwd16: - mov ecx,[eax+12] - mov [edx+12],ecx -@@Bwd12: - mov ecx,[eax+8] - mov [edx+8],ecx -@@Bwd08: - mov ecx,[eax+4] - mov [edx+4],ecx -@@Bwd04: - mov ecx,[eax] - mov [edx],ecx - ret -@@Bwd35: - mov ecx,[eax+31] - mov [edx+31],ecx -@@Bwd31: - mov ecx,[eax+27] - mov [edx+27],ecx -@@Bwd27: - mov ecx,[eax+23] - mov [edx+23],ecx -@@Bwd23: - mov ecx,[eax+19] - mov [edx+19],ecx -@@Bwd19: - mov ecx,[eax+15] - mov [edx+15],ecx -@@Bwd15: - mov ecx,[eax+11] - mov [edx+11],ecx -@@Bwd11: - mov ecx,[eax+7] - mov [edx+7],ecx -@@Bwd07: - mov ecx,[eax+3] - mov [edx+3],ecx - mov ecx,[eax] - mov [edx],ecx - ret -@@Bwd03: - movzx ecx, word ptr [eax+1] - mov [edx+1],cx - movzx ecx, byte ptr [eax] - mov [edx],cl - ret -@@Bwd34: - mov ecx,[eax+30] - mov [edx+30],ecx -@@Bwd30: - mov ecx,[eax+26] - mov [edx+26],ecx -@@Bwd26: - mov ecx,[eax+22] - mov [edx+22],ecx -@@Bwd22: - mov ecx,[eax+18] - mov [edx+18],ecx -@@Bwd18: - mov ecx,[eax+14] - mov [edx+14],ecx -@@Bwd14: - mov ecx,[eax+10] - mov [edx+10],ecx -@@Bwd10: - mov ecx,[eax+6] - mov [edx+6],ecx -@@Bwd06: - mov ecx,[eax+2] - mov [edx+2],ecx -@@Bwd02: - movzx ecx, word ptr [eax] - mov [edx],cx - ret -@@Bwd33: - mov ecx,[eax+29] - mov [edx+29],ecx -@@Bwd29: - mov ecx,[eax+25] - mov [edx+25],ecx 
-@@Bwd25: - mov ecx,[eax+21] - mov [edx+21],ecx -@@Bwd21: - mov ecx,[eax+17] - mov [edx+17],ecx -@@Bwd17: - mov ecx,[eax+13] - mov [edx+13],ecx -@@Bwd13: - mov ecx,[eax+9] - mov [edx+9],ecx -@@Bwd09: - mov ecx,[eax+5] - mov [edx+5],ecx -@@Bwd05: - mov ecx,[eax+1] - mov [edx+1],ecx -@@Bwd01: - movzx ecx, byte ptr[eax] - mov [edx],cl -@@Done: -end; {SmallBackwardMove} - - -{ at least valgrind up to 3.3 has a bug which prevents the default code to - work so we use a rather simple implementation here -} -procedure Forwards_Valgrind;assembler;nostackframe; -asm -{$ifdef FPC_ENABLED_CLD} - cld -{$endif FPC_ENABLED_CLD} - push esi - push edi - mov esi,eax - mov edi,edx - rep movsb - pop edi - pop esi -end; - -{ at least valgrind up to 3.3 has a bug which prevents the default code to - work so we use a rather simple implementation here -} -procedure Backwards_Valgrind;assembler;nostackframe; -asm - push esi - push edi - lea esi,[eax+ecx-1] - lea edi,[edx+ecx-1] -@@repeat: - mov al,[esi] - mov [edi],al - dec esi - dec edi - dec ecx - jnz @@repeat - pop edi - pop esi -end; - -{-------------------------------------------------------------------------} -{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)} -procedure Forwards_IA32_3;assembler;nostackframe; -asm - push ebx - mov ebx,edx - fild qword ptr [eax] - add eax,ecx {QWORD Align Writes} - add ecx,edx - add edx,7 - and edx,-8 - sub ecx,edx - add edx,ecx {Now QWORD Aligned} - sub ecx,16 - neg ecx -@FwdLoop: - fild qword ptr [eax+ecx-16] - fistp qword ptr [edx+ecx-16] - fild qword ptr [eax+ecx-8] - fistp qword ptr [edx+ecx-8] - add ecx,16 - jle @FwdLoop - fistp qword ptr [ebx] - neg ecx - add ecx,16 - pop ebx - jmp SmallForwardMove_3 -end; {Forwards_IA32} - -{-------------------------------------------------------------------------} -{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)} -procedure Backwards_IA32_3;assembler;nostackframe; -asm - push ebx - fild qword ptr 
[eax+ecx-8] - lea ebx,[edx+ecx] {QWORD Align Writes} - and ebx,7 - sub ecx,ebx - add ebx,ecx {Now QWORD Aligned, EBX = Original Length} - sub ecx,16 -@BwdLoop: - fild qword ptr [eax+ecx] - fild qword ptr [eax+ecx+8] - fistp qword ptr [edx+ecx+8] - fistp qword ptr [edx+ecx] - sub ecx,16 - jge @BwdLoop - fistp qword ptr [edx+ebx-8] - add ecx,16 - pop ebx - jmp SmallBackwardMove_3 -end; {Backwards_IA32} - -{-------------------------------------------------------------------------} -{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)} -procedure Forwards_MMX_3;assembler;nostackframe; -const - LARGESIZE = 1024; -asm - cmp ecx,LARGESIZE - jge @FwdLargeMove - cmp ecx,72 {Size at which using MMX becomes worthwhile} - jl Forwards_IA32_3 - push ebx - mov ebx,edx - movq mm0,[eax] {First 8 Characters} - {QWORD Align Writes} - add eax,ecx - add ecx,edx - add edx,7 - and edx,-8 - sub ecx,edx - add edx,ecx - {Now QWORD Aligned} - sub ecx,32 - neg ecx -@FwdLoopMMX: - movq mm1,[eax+ecx-32] - movq mm2,[eax+ecx-24] - movq mm3,[eax+ecx-16] - movq mm4,[eax+ecx- 8] - movq [edx+ecx-32],mm1 - movq [edx+ecx-24],mm2 - movq [edx+ecx-16],mm3 - movq [edx+ecx- 8],mm4 - add ecx,32 - jle @FwdLoopMMX - movq [ebx],mm0 {First 8 Characters} - emms - pop ebx - neg ecx - add ecx,32 - jmp SmallForwardMove_3 -@FwdLargeMove: - push ebx - mov ebx,ecx - test edx,15 - jz @FwdAligned - {16 byte Align Destination} - mov ecx,edx - add ecx,15 - and ecx,-16 - sub ecx,edx - add eax,ecx - add edx,ecx - sub ebx,ecx - {Destination now 16 Byte Aligned} - call SmallForwardMove_3 -@FwdAligned: - mov ecx,ebx - and ecx,-16 - sub ebx,ecx {EBX = Remainder} - push esi - push edi - mov esi,eax {ESI = Source} - mov edi,edx {EDI = Dest} - mov eax,ecx {EAX = Count} - and eax,-64 {EAX = No of Bytes to Blocks Moves} - and ecx,$3F {ECX = Remaining Bytes to Move (0..63)} - add esi,eax - add edi,eax - shr eax,3 {EAX = No of QWORD's to Block Move} - neg eax -@MMXcopyloop: - movq mm0,[esi+eax*8 ] - movq 
mm1,[esi+eax*8+ 8] - movq mm2,[esi+eax*8+16] - movq mm3,[esi+eax*8+24] - movq mm4,[esi+eax*8+32] - movq mm5,[esi+eax*8+40] - movq mm6,[esi+eax*8+48] - movq mm7,[esi+eax*8+56] - movq [edi+eax*8 ],mm0 - movq [edi+eax*8+ 8],mm1 - movq [edi+eax*8+16],mm2 - movq [edi+eax*8+24],mm3 - movq [edi+eax*8+32],mm4 - movq [edi+eax*8+40],mm5 - movq [edi+eax*8+48],mm6 - movq [edi+eax*8+56],mm7 - add eax,8 - jnz @MMXcopyloop - emms {Empty MMX State} -{$ifdef FPC_ENABLED_CLD} - cld -{$endif FPC_ENABLED_CLD} - add ecx,ebx - shr ecx,2 - rep movsd - mov ecx,ebx - and ecx,3 - rep movsb - pop edi - pop esi - pop ebx -end; {Forwards_MMX} - -{-------------------------------------------------------------------------} -{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)} -procedure Backwards_MMX_3;assembler;nostackframe; -asm - cmp ecx,72 {Size at which using MMX becomes worthwhile} - jl Backwards_IA32_3 - push ebx - movq mm0,[eax+ecx-8] {Get Last QWORD} - {QWORD Align Writes} - lea ebx,[edx+ecx] - and ebx,7 - sub ecx,ebx - add ebx,ecx - {Now QWORD Aligned} - sub ecx,32 -@BwdLoopMMX: - movq mm1,[eax+ecx ] - movq mm2,[eax+ecx+ 8] - movq mm3,[eax+ecx+16] - movq mm4,[eax+ecx+24] - movq [edx+ecx+24],mm4 - movq [edx+ecx+16],mm3 - movq [edx+ecx+ 8],mm2 - movq [edx+ecx ],mm1 - sub ecx,32 - jge @BwdLoopMMX - movq [edx+ebx-8], mm0 {Last QWORD} - emms - add ecx,32 - pop ebx - jmp SmallBackwardMove_3 -end; {Backwards_MMX} - -{$ifndef FASTMOVE_DISABLE_SSE3} -{-------------------------------------------------------------------------} -{Dest MUST be 16-Byes Aligned, Count MUST be multiple of 16 } -procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe; -const - Prefetch = 512; -asm - push esi - mov esi,eax {ESI = Source} - mov eax,ecx {EAX = Count} - and eax,-128 {EAX = No of Bytes to Block Move} - add esi,eax - add edx,eax - shr eax,3 {EAX = No of QWORD's to Block Move} - neg eax - cmp eax, -(32*1024) {Count > 256K} - jl @Large -@Small: 
{Count<=256K} - test esi,15 {Check if Both Source/Dest Aligned} - jnz @SmallUnaligned -@SmallAligned: {Both Source and Dest 16-Byte Aligned} -@SmallAlignedLoop: - movaps xmm0,[esi+8*eax] - movaps xmm1,[esi+8*eax+16] - movaps xmm2,[esi+8*eax+32] - movaps xmm3,[esi+8*eax+48] - movaps [edx+8*eax],xmm0 - movaps [edx+8*eax+16],xmm1 - movaps [edx+8*eax+32],xmm2 - movaps [edx+8*eax+48],xmm3 - movaps xmm4,[esi+8*eax+64] - movaps xmm5,[esi+8*eax+80] - movaps xmm6,[esi+8*eax+96] - movaps xmm7,[esi+8*eax+112] - movaps [edx+8*eax+64],xmm4 - movaps [edx+8*eax+80],xmm5 - movaps [edx+8*eax+96],xmm6 - movaps [edx+8*eax+112],xmm7 - add eax,16 - js @SmallAlignedLoop - jmp @Remainder -@SmallUnaligned: {Source Not 16-Byte Aligned} -@SmallUnalignedLoop: - movups xmm0,[esi+8*eax] - movups xmm1,[esi+8*eax+16] - movups xmm2,[esi+8*eax+32] - movups xmm3,[esi+8*eax+48] - movaps [edx+8*eax],xmm0 - movaps [edx+8*eax+16],xmm1 - movaps [edx+8*eax+32],xmm2 - movaps [edx+8*eax+48],xmm3 - movups xmm4,[esi+8*eax+64] - movups xmm5,[esi+8*eax+80] - movups xmm6,[esi+8*eax+96] - movups xmm7,[esi+8*eax+112] - movaps [edx+8*eax+64],xmm4 - movaps [edx+8*eax+80],xmm5 - movaps [edx+8*eax+96],xmm6 - movaps [edx+8*eax+112],xmm7 - add eax,16 - js @SmallUnalignedLoop - jmp @Remainder -@Large: {Count>256K} - test esi,15 {Check if Both Source/Dest Aligned} - jnz @LargeUnaligned -@LargeAligned: {Both Source and Dest 16-Byte Aligned} -@LargeAlignedLoop: - prefetchnta [esi+8*eax+Prefetch] - prefetchnta [esi+8*eax+Prefetch+64] - movaps xmm0,[esi+8*eax] - movaps xmm1,[esi+8*eax+16] - movaps xmm2,[esi+8*eax+32] - movaps xmm3,[esi+8*eax+48] - movntps [edx+8*eax],xmm0 - movntps [edx+8*eax+16],xmm1 - movntps [edx+8*eax+32],xmm2 - movntps [edx+8*eax+48],xmm3 - movaps xmm4,[esi+8*eax+64] - movaps xmm5,[esi+8*eax+80] - movaps xmm6,[esi+8*eax+96] - movaps xmm7,[esi+8*eax+112] - movntps [edx+8*eax+64],xmm4 - movntps [edx+8*eax+80],xmm5 - movntps [edx+8*eax+96],xmm6 - movntps [edx+8*eax+112],xmm7 - add eax,16 - js 
@LargeAlignedLoop - sfence - jmp @Remainder -@LargeUnaligned: {Source Not 16-Byte Aligned} -@LargeUnalignedLoop: - prefetchnta [esi+8*eax+Prefetch] - prefetchnta [esi+8*eax+Prefetch+64] - movups xmm0,[esi+8*eax] - movups xmm1,[esi+8*eax+16] - movups xmm2,[esi+8*eax+32] - movups xmm3,[esi+8*eax+48] - movntps [edx+8*eax],xmm0 - movntps [edx+8*eax+16],xmm1 - movntps [edx+8*eax+32],xmm2 - movntps [edx+8*eax+48],xmm3 - movups xmm4,[esi+8*eax+64] - movups xmm5,[esi+8*eax+80] - movups xmm6,[esi+8*eax+96] - movups xmm7,[esi+8*eax+112] - movntps [edx+8*eax+64],xmm4 - movntps [edx+8*eax+80],xmm5 - movntps [edx+8*eax+96],xmm6 - movntps [edx+8*eax+112],xmm7 - add eax,16 - js @LargeUnalignedLoop - sfence -@Remainder: - and ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)} - jz @Done - add esi,ecx - add edx,ecx - neg ecx -@RemainderLoop: - movups xmm0,[esi+ecx] - movaps [edx+ecx],xmm0 - add ecx,16 - jnz @RemainderLoop -@Done: - pop esi -end; {AlignedFwdMoveSSE} - -{-------------------------------------------------------------------------} -{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)} -procedure Forwards_SSE_3;assembler;nostackframe; -const - LARGESIZE = 2048; -asm - cmp ecx,LARGESIZE - jge @FwdLargeMove - cmp ecx,SMALLMOVESIZE+32 - movups xmm0,[eax] - jg @FwdMoveSSE - movups xmm1,[eax+16] - movups [edx],xmm0 - movups [edx+16],xmm1 - add eax,ecx - add edx,ecx - sub ecx,32 - jmp SmallForwardMove_3 -@FwdMoveSSE: - push ebx - mov ebx,edx - {Align Writes} - add eax,ecx - add ecx,edx - add edx,15 - and edx,-16 - sub ecx,edx - add edx,ecx - {Now Aligned} - sub ecx,32 - neg ecx -@FwdLoopSSE: - movups xmm1,[eax+ecx-32] - movups xmm2,[eax+ecx-16] - movaps [edx+ecx-32],xmm1 - movaps [edx+ecx-16],xmm2 - add ecx,32 - jle @FwdLoopSSE - movups [ebx],xmm0 {First 16 Bytes} - neg ecx - add ecx,32 - pop ebx - jmp SmallForwardMove_3 -@FwdLargeMove: - push ebx - mov ebx,ecx - test edx,15 - jz @FwdLargeAligned - {16 byte Align Destination} - mov ecx,edx - add 
ecx,15 - and ecx,-16 - sub ecx,edx - add eax,ecx - add edx,ecx - sub ebx,ecx - {Destination now 16 Byte Aligned} - call SmallForwardMove_3 - mov ecx,ebx -@FwdLargeAligned: - and ecx,-16 - sub ebx,ecx {EBX = Remainder} - push edx - push eax - push ecx - call AlignedFwdMoveSSE_3 - pop ecx - pop eax - pop edx - add ecx,ebx - add eax,ecx - add edx,ecx - mov ecx,ebx - pop ebx - jmp SmallForwardMove_3 -end; {Forwards_SSE} - -{-------------------------------------------------------------------------} -{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)} -procedure Backwards_SSE_3;assembler;nostackframe; -asm - cmp ecx,SMALLMOVESIZE+32 - jg @BwdMoveSSE - sub ecx,32 - movups xmm1,[eax+ecx] - movups xmm2,[eax+ecx+16] - movups [edx+ecx],xmm1 - movups [edx+ecx+16],xmm2 - jmp SmallBackwardMove_3 -@BwdMoveSSE: - push ebx - movups xmm0,[eax+ecx-16] {Last 16 Bytes} - {Align Writes} - lea ebx,[edx+ecx] - and ebx,15 - sub ecx,ebx - add ebx,ecx - {Now Aligned} - sub ecx,32 -@BwdLoop: - movups xmm1,[eax+ecx] - movups xmm2,[eax+ecx+16] - movaps [edx+ecx],xmm1 - movaps [edx+ecx+16],xmm2 - sub ecx,32 - jge @BwdLoop - movups [edx+ebx-16],xmm0 {Last 16 Bytes} - add ecx,32 - pop ebx - jmp SmallBackwardMove_3 -end; {Backwards_SSE} -{$endif ndef FASTMOVE_DISABLE_SSE3} - -const - fastmoveproc_forward : pointer = @Forwards_IA32_3; - fastmoveproc_backward : pointer = @Backwards_IA32_3; - -procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe; -asm - cmp ecx,SMALLMOVESIZE - ja @Large - cmp eax,edx - lea eax,[eax+ecx] - jle @SmallCheck -@SmallForward: - add edx,ecx - jmp SmallForwardMove_3 -@SmallCheck: - je @Done {For Compatibility with Delphi's move for Source = Dest} - sub eax,ecx - jmp SmallBackwardMove_3 -@Large: - jng @Done {For Compatibility with Delphi's move for Count < 0} - cmp eax,edx - jg @moveforward - je @Done {For Compatibility with Delphi's move for Source = Dest} - push eax - add eax,ecx - cmp eax,edx - pop 
eax - jg @movebackward -@moveforward: - jmp dword ptr fastmoveproc_forward -@movebackward: - jmp dword ptr fastmoveproc_backward {Source/Dest Overlap} -@Done: -end; - -{$asmmode att} + fastmoveproc : pointer = @Move_8OrMore_Dispatch; {$ifndef FPC_HAS_INDIRECT_ENTRY_INFORMATION} -var valgrind_used : boolean;external name '__fpc_valgrind'; {$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION} -procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif} - begin - { workaround valgrind bug } +function Move_8OrMore_HumanFriendlyDispatch: pointer; +begin + { workaround valgrind bug } {$ifdef FPC_HAS_INDIRECT_ENTRY_INFORMATION} - if EntryInformation.valgrind_used then + if EntryInformation.valgrind_used then {$else FPC_HAS_INDIRECT_ENTRY_INFORMATION} - if valgrind_used then + if valgrind_used then {$endif FPC_HAS_INDIRECT_ENTRY_INFORMATION} - begin - fastmoveproc_forward:=@Forwards_Valgrind; - fastmoveproc_backward:=@Backwards_Valgrind; - end -{$ifndef FASTMOVE_DISABLE_SSE3} - else if has_sse_support then - begin - fastmoveproc_forward:=@Forwards_SSE_3; - fastmoveproc_backward:=@Backwards_SSE_3; - end -{$endif ndef FASTMOVE_DISABLE_SSE3} - else if has_mmx_support then - begin - fastmoveproc_forward:=@Forwards_MMX_3; - fastmoveproc_backward:=@Backwards_MMX_3; - end; - end; + result:=@Move_8OrMore_Valgrind +{$ifndef FASTMOVE_DISABLE_SSE} + else if has_sse_support then + result:=@Move_8OrMore_SSE +{$endif ndef FASTMOVE_DISABLE_SSE} + else if has_mmx_support then + result:=@Move_8OrMore_MMX + else + result:=@Move_8OrMore_IA32; + if fpc_cpucodeinit_performed then + fastmoveproc:=result; +end; + +procedure Move_8OrMore_Dispatch; assembler; nostackframe; +{ ebx pushed, eax = source, edx = dest, ecx = count (ecx >= 8). 
} +asm + push %eax + push %edx + push %ecx + call Move_8OrMore_HumanFriendlyDispatch + mov %eax, %ebx + pop %ecx + pop %edx + pop %eax + jmp %ebx +end; + +procedure Move(const source;var dest;count:SizeInt); [public, alias: 'FPC_MOVE']; assembler; nostackframe; +asm + push %ebx + cmp $8, %ecx + jle .L8OrLess +{$ifdef FPC_PIC} + call fpc_geteipasebx + addl $_GLOBAL_OFFSET_TABLE_, %ebx + movl fastmoveproc@GOT(%ebx), %ebx + jmp (%ebx) +{$else} + jmp fastmoveproc +{$endif} + +.L8OrLess: + cmp $3, %ecx + jle .L3OrLess + mov (%eax), %ebx + mov -4(%eax,%ecx), %eax + mov %ebx, (%edx) + mov %eax, -4(%edx,%ecx) + pop %ebx + ret + +.L3OrLess: + cmp $1, %ecx + jl .LZero + movzbl (%eax), %ebx + je .LOne + movzwl -2(%eax,%ecx), %eax + mov %ax, -2(%edx,%ecx) +.LOne: + mov %bl, (%edx) +.LZero: + pop %ebx +end; {$endif FPC_SYSTEM_HAS_MOVE} - -{$endif} diff --git a/rtl/i386/i386.inc b/rtl/i386/i386.inc index f79abc3953..c72a938b8d 100644 --- a/rtl/i386/i386.inc +++ b/rtl/i386/i386.inc @@ -25,6 +25,8 @@ var os_supports_sse : boolean; { this variable is set to true, if currently an sse check is executed and no sig ill should be generated } sse_check : boolean; + fast_large_repmovstosb : boolean; { Enhanced REP MOVSB and STOSB (ERMSB) feature @ CPUID(7).ebx[9]. } + fpc_cpucodeinit_performed : boolean; { Code before fpc_cpucodeinit can call certain dispatched functions, such as Move. 
} {$asmmode ATT} @@ -47,15 +49,6 @@ function cpuid_support : boolean;assembler;nostackframe; setnz %al end; -{$ifndef FPC_PIC} -{$ifndef FPC_SYSTEM_HAS_MOVE} -{$ifndef OLD_ASSEMBLER} -{$define USE_FASTMOVE} -{$i fastmove.inc} -{$endif not OLD_ASSEMBLER} -{$endif FPC_SYSTEM_HAS_MOVE} -{$endif FPC_PIC} - {$define FPC_SYSTEM_HAS_FPC_CPUINIT} procedure fpc_cpuinit; begin @@ -63,7 +56,6 @@ procedure fpc_cpuinit; must be implemented OS dependend (FK) has_sse_support:=sse_support; has_mmx_support:=mmx_support; - setup_fastmove; } end; @@ -80,6 +72,12 @@ asm end; {$endif} +{$if not defined(FPC_SYSTEM_HAS_MOVE) + and not defined(OLD_ASSEMBLER) + and not defined(darwin)} +{$i fastmove.inc} +{$endif} + {$ifndef FPC_SYSTEM_HAS_MOVE} {$define FPC_SYSTEM_HAS_MOVE} @@ -2027,7 +2025,7 @@ Procedure SysResetFPU; { because of the brain dead sse detection on x86, this test is post poned } procedure fpc_cpucodeinit; var - _eax,_ecx_cpuid1,_edx_cpuid1,_ebx : longint; + _eax,_ecx_cpuid1,_edx_cpuid1,_ebx_cpuid7 : longint; begin if cpuid_support then begin @@ -2067,23 +2065,27 @@ procedure fpc_cpucodeinit; cpuid movl %eax,_eax end; - if (_eax>=7) and (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then + if _eax>=7 then begin asm + movl $7,%eax xorl %ecx,%ecx - .byte 0x0f,0x01,0xd0 { xgetbv } - movl %eax,_eax + cpuid + movl %ebx,_ebx_cpuid7 end; - if (_eax and 6)=6 then + fast_large_repmovstosb:=_ebx_cpuid7 and (1 shl 9)<>0; + if (_ecx_cpuid1 and $08000000<>0 {XGETBV support?}) then begin - has_avx_support:=(_ecx_cpuid1 and $10000000)<>0; asm - movl $7,%eax xorl %ecx,%ecx - cpuid - movl %ebx,_ebx + .byte 0x0f,0x01,0xd0 { xgetbv } + movl %eax,_eax end; - has_avx2_support:=(_ebx and $20)<>0; + if (_eax and 6)=6 then + begin + has_avx_support:=(_ecx_cpuid1 and $10000000)<>0; + has_avx2_support:=(_ebx_cpuid7 and $20)<>0; + end; end; end; end; @@ -2098,9 +2100,7 @@ procedure fpc_cpucodeinit; end; SysResetFPU; -{$ifdef USE_FASTMOVE} - setup_fastmove; -{$endif} + fpc_cpucodeinit_performed:=true; 
end; diff --git a/rtl/watcom/system.pp b/rtl/watcom/system.pp index 21d2ccedec..88a0e044f7 100644 --- a/rtl/watcom/system.pp +++ b/rtl/watcom/system.pp @@ -25,8 +25,8 @@ INTERFACE {$define FPC_ANSI_TEXTFILEREC} { include system-independent routine headers } -{ wasm does not support SSE3 instructions } -{$define FASTMOVE_DISABLE_SSE3} +{ wasm does not support SSE instructions } +{$define FASTMOVE_DISABLE_SSE} {$include systemh.inc}