diff --git a/rtl/i386/fastmove.inc b/rtl/i386/fastmove.inc new file mode 100644 index 0000000000..d5b1a88288 --- /dev/null +++ b/rtl/i386/fastmove.inc @@ -0,0 +1,854 @@ +{ + $Id$ + Copyright (c) 2004, John O'Harrow (john@almcrest.demon.co.uk) + +This software is provided 'as-is', without any express or implied warranty. +In no event will the authors be held liable for any damages arising from the +use of this software. + +Permission is granted to anyone to use this software for any purpose, including +commercial applications, and to alter it and redistribute it freely, subject to +the following restrictions: + +1. The origin of this software must not be misrepresented; you must not claim + that you wrote the original software. If you use this software in a product, + an acknowledgment in the product documentation would be appreciated but is + not required. + +2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original software. + +3. This notice may not be removed or altered from any source distribution. + +------------------------------------------------------------------------------- + +Version: 1.40 - 16-SEP-2004 +} + + +{$if (FPC_VERSION>1) or ((FPC_RELEASE>=9) and (FPC_PATCH>6))} + +{$ifndef FPC_SYSTEM_HAS_MOVE} +{$define FPC_SYSTEM_HAS_MOVE} + +{$asmmode intel} + +{-------------------------------------------------------------------------} +{Just to show that a good Pascal algorithm can beat the default BASM} +procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer); +var + S, D : PtrUInt; + Temp, C, I : PtrInt; + L : PPtrInt; +begin + S := Cardinal(@Source); + D := Cardinal(@Dest); + if S = D then + Exit; + if Count <= 4 then + case Count of + 1 : PByte(@Dest)^ := PByte(S)^; + 2 : PWord(@Dest)^ := PWord(S)^; + 3 : if D > S then + begin + PByte(Integer(@Dest)+2)^ := PByte(S+2)^; + PWord(@Dest)^ := PWord(S)^; + end + else + begin + PWord(@Dest)^ := PWord(S)^; + PByte(Integer(@Dest)+2)^ := PByte(S+2)^; + end; + 4 : PInteger(@Dest)^ := PInteger(S)^ + else Exit; {Count <= 0} + end + else + if D > S then + begin + Temp := PInteger(S)^; + I := Integer(@Dest); + C := Count - 4; + L := PInteger(Integer(@Dest) + C); + Inc(S, C); + repeat + L^ := PInteger(S)^; + if Count <= 8 then + Break; + Dec(Count, 4); + Dec(S, 4); + Dec(L); + until False; + PInteger(I)^ := Temp; + end + else + begin + C := Count - 4; + Temp := PInteger(S + Cardinal(C))^; + I := Integer(@Dest) + C; + L := @Dest; + repeat + L^ := PInteger(S)^; + if Count <= 8 then + Break; + Dec(Count, 4); + Inc(S, 4); + Inc(L); + until False; + PInteger(I)^ := Temp; + end; +end; {MoveJOH_PAS} + +const + SMALLMOVESIZE = 36; + +{-------------------------------------------------------------------------} +{Perform Forward Move of 0..36 Bytes} +{On Entry, ECX = Count, EAX = Source+Count, EDX = Dest+Count. Destroys ECX} +procedure SmallForwardMove_3;assembler;nostackframe; +asm + jmp dword ptr @@FwdJumpTable[ecx*4] + align 16 +@@FwdJumpTable: + dd @@Done {Removes need to test for zero size move} + dd @@Fwd01,@@Fwd02,@@Fwd03,@@Fwd04,@@Fwd05,@@Fwd06,@@Fwd07,@@Fwd08 + dd @@Fwd09,@@Fwd10,@@Fwd11,@@Fwd12,@@Fwd13,@@Fwd14,@@Fwd15,@@Fwd16 + dd @@Fwd17,@@Fwd18,@@Fwd19,@@Fwd20,@@Fwd21,@@Fwd22,@@Fwd23,@@Fwd24 + dd @@Fwd25,@@Fwd26,@@Fwd27,@@Fwd28,@@Fwd29,@@Fwd30,@@Fwd31,@@Fwd32 + dd @@Fwd33,@@Fwd34,@@Fwd35,@@Fwd36 +@@Fwd36: + mov ecx,[eax-36] + mov [edx-36],ecx +@@Fwd32: + mov ecx,[eax-32] + mov [edx-32],ecx +@@Fwd28: + mov ecx,[eax-28] + mov [edx-28],ecx +@@Fwd24: + mov ecx,[eax-24] + mov [edx-24],ecx +@@Fwd20: + mov ecx,[eax-20] + mov [edx-20],ecx +@@Fwd16: + mov ecx,[eax-16] + mov [edx-16],ecx +@@Fwd12: + mov ecx,[eax-12] + mov [edx-12],ecx +@@Fwd08: + mov ecx,[eax-8] + mov [edx-8],ecx +@@Fwd04: + mov ecx,[eax-4] + mov [edx-4],ecx + ret +@@Fwd35: + mov ecx,[eax-35] + mov [edx-35],ecx +@@Fwd31: + mov ecx,[eax-31] + mov [edx-31],ecx +@@Fwd27: + mov ecx,[eax-27] + mov [edx-27],ecx +@@Fwd23: + mov ecx,[eax-23] + mov [edx-23],ecx +@@Fwd19: + mov ecx,[eax-19] + mov [edx-19],ecx +@@Fwd15: + mov ecx,[eax-15] + mov [edx-15],ecx +@@Fwd11: + mov ecx,[eax-11] + mov [edx-11],ecx +@@Fwd07: + mov ecx,[eax-7] + mov [edx-7],ecx + mov ecx,[eax-4] + mov [edx-4],ecx + ret +@@Fwd03: + movzx ecx, word ptr [eax-3] + mov [edx-3],cx + movzx ecx, byte ptr [eax-1] + mov [edx-1],cl + ret +@@Fwd34: + mov ecx,[eax-34] + mov [edx-34],ecx +@@Fwd30: + mov ecx,[eax-30] + mov [edx-30],ecx +@@Fwd26: + mov ecx,[eax-26] + mov [edx-26],ecx +@@Fwd22: + mov ecx,[eax-22] + mov [edx-22],ecx +@@Fwd18: + mov ecx,[eax-18] + mov [edx-18],ecx +@@Fwd14: + mov ecx,[eax-14] + mov [edx-14],ecx +@@Fwd10: + mov ecx,[eax-10] + mov [edx-10],ecx +@@Fwd06: + mov ecx,[eax-6] + mov [edx-6],ecx +@@Fwd02: + movzx ecx, word ptr [eax-2] + mov [edx-2],cx + ret +@@Fwd33: + mov ecx,[eax-33] + mov [edx-33],ecx +@@Fwd29: + mov ecx,[eax-29] + mov [edx-29],ecx +@@Fwd25: + mov ecx,[eax-25] + mov [edx-25],ecx +@@Fwd21: + mov ecx,[eax-21] + mov [edx-21],ecx +@@Fwd17: + mov ecx,[eax-17] + mov [edx-17],ecx +@@Fwd13: + mov ecx,[eax-13] + mov [edx-13],ecx +@@Fwd09: + mov ecx,[eax-9] + mov [edx-9],ecx +@@Fwd05: + mov ecx,[eax-5] + mov [edx-5],ecx +@@Fwd01: + movzx ecx, byte ptr [eax-1] + mov [edx-1],cl +@@Done: +end; {SmallForwardMove} + +{-------------------------------------------------------------------------} +{Perform Backward Move of 0..36 Bytes} +{On Entry, ECX = Count, EAX = Source, EDX = Dest. Destroys ECX} +procedure SmallBackwardMove_3;assembler;nostackframe; +asm + jmp dword ptr @@BwdJumpTable[ecx*4] + align 16 +@@BwdJumpTable: + dd @@Done {Removes need to test for zero size move} + dd @@Bwd01,@@Bwd02,@@Bwd03,@@Bwd04,@@Bwd05,@@Bwd06,@@Bwd07,@@Bwd08 + dd @@Bwd09,@@Bwd10,@@Bwd11,@@Bwd12,@@Bwd13,@@Bwd14,@@Bwd15,@@Bwd16 + dd @@Bwd17,@@Bwd18,@@Bwd19,@@Bwd20,@@Bwd21,@@Bwd22,@@Bwd23,@@Bwd24 + dd @@Bwd25,@@Bwd26,@@Bwd27,@@Bwd28,@@Bwd29,@@Bwd30,@@Bwd31,@@Bwd32 + dd @@Bwd33,@@Bwd34,@@Bwd35,@@Bwd36 +@@Bwd36: + mov ecx,[eax+32] + mov [edx+32],ecx +@@Bwd32: + mov ecx,[eax+28] + mov [edx+28],ecx +@@Bwd28: + mov ecx,[eax+24] + mov [edx+24],ecx +@@Bwd24: + mov ecx,[eax+20] + mov [edx+20],ecx +@@Bwd20: + mov ecx,[eax+16] + mov [edx+16],ecx +@@Bwd16: + mov ecx,[eax+12] + mov [edx+12],ecx +@@Bwd12: + mov ecx,[eax+8] + mov [edx+8],ecx +@@Bwd08: + mov ecx,[eax+4] + mov [edx+4],ecx +@@Bwd04: + mov ecx,[eax] + mov [edx],ecx + ret +@@Bwd35: + mov ecx,[eax+31] + mov [edx+31],ecx +@@Bwd31: + mov ecx,[eax+27] + mov [edx+27],ecx +@@Bwd27: + mov ecx,[eax+23] + mov [edx+23],ecx +@@Bwd23: + mov ecx,[eax+19] + mov [edx+19],ecx +@@Bwd19: + mov ecx,[eax+15] + mov [edx+15],ecx +@@Bwd15: + mov ecx,[eax+11] + mov [edx+11],ecx +@@Bwd11: + mov ecx,[eax+7] + mov [edx+7],ecx +@@Bwd07: + mov ecx,[eax+3] + mov [edx+3],ecx + mov ecx,[eax] + mov [edx],ecx + ret +@@Bwd03: + movzx ecx, word ptr [eax+1] + mov [edx+1],cx + movzx ecx, byte ptr [eax] + mov [edx],cl + ret +@@Bwd34: + mov ecx,[eax+30] + mov [edx+30],ecx +@@Bwd30: + mov ecx,[eax+26] + mov [edx+26],ecx +@@Bwd26: + mov ecx,[eax+22] + mov [edx+22],ecx +@@Bwd22: + mov ecx,[eax+18] + mov [edx+18],ecx +@@Bwd18: + mov ecx,[eax+14] + mov [edx+14],ecx +@@Bwd14: + mov ecx,[eax+10] + mov [edx+10],ecx +@@Bwd10: + mov ecx,[eax+6] + mov [edx+6],ecx +@@Bwd06: + mov ecx,[eax+2] + mov [edx+2],ecx +@@Bwd02: + movzx ecx, word ptr [eax] + mov [edx],cx + ret +@@Bwd33: + mov ecx,[eax+29] + mov [edx+29],ecx +@@Bwd29: + mov ecx,[eax+25] + mov [edx+25],ecx +@@Bwd25: + mov ecx,[eax+21] + mov [edx+21],ecx +@@Bwd21: + mov ecx,[eax+17] + mov [edx+17],ecx +@@Bwd17: + mov ecx,[eax+13] + mov [edx+13],ecx +@@Bwd13: + mov ecx,[eax+9] + mov [edx+9],ecx +@@Bwd09: + mov ecx,[eax+5] + mov [edx+5],ecx +@@Bwd05: + mov ecx,[eax+1] + mov [edx+1],ecx +@@Bwd01: + movzx ecx, byte ptr[eax] + mov [edx],cl +@@Done: +end; {SmallBackwardMove} + +{-------------------------------------------------------------------------} +{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)} +procedure Forwards_IA32_3;assembler;nostackframe; +asm + push ebx + mov ebx,edx + fild qword ptr [eax] + add eax,ecx {QWORD Align Writes} + add ecx,edx + add edx,7 + and edx,-8 + sub ecx,edx + add edx,ecx {Now QWORD Aligned} + sub ecx,16 + neg ecx +@FwdLoop: + fild qword ptr [eax+ecx-16] + fistp qword ptr [edx+ecx-16] + fild qword ptr [eax+ecx-8] + fistp qword ptr [edx+ecx-8] + add ecx,16 + jle @FwdLoop + fistp qword ptr [ebx] + neg ecx + add ecx,16 + pop ebx + jmp SmallForwardMove_3 +end; {Forwards_IA32} + +{-------------------------------------------------------------------------} +{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)} +procedure Backwards_IA32_3;assembler;nostackframe; +asm + push ebx + fild qword ptr [eax+ecx-8] + lea ebx,[edx+ecx] {QWORD Align Writes} + and ebx,7 + sub ecx,ebx + add ebx,ecx {Now QWORD Aligned, EBX = Original Length} + sub ecx,16 +@BwdLoop: + fild qword ptr [eax+ecx] + fild qword ptr [eax+ecx+8] + fistp qword ptr [edx+ecx+8] + fistp qword ptr [edx+ecx] + sub ecx,16 + jge @BwdLoop + fistp qword ptr [edx+ebx-8] + add ecx,16 + pop ebx + jmp SmallBackwardMove_3 +end; {Backwards_IA32} + +{-------------------------------------------------------------------------} +{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)} +procedure Forwards_MMX_3;assembler;nostackframe; +const + LARGESIZE = 1024; +asm + cmp ecx,LARGESIZE + jge @FwdLargeMove + cmp ecx,72 {Size at which using MMX becomes worthwhile} + jl Forwards_IA32_3 + push ebx + mov ebx,edx + movq mm0,[eax] {First 8 Characters} + {QWORD Align Writes} + add eax,ecx + add ecx,edx + add edx,7 + and edx,-8 + sub ecx,edx + add edx,ecx + {Now QWORD Aligned} + sub ecx,32 + neg ecx +@FwdLoopMMX: + movq mm1,[eax+ecx-32] + movq mm2,[eax+ecx-24] + movq mm3,[eax+ecx-16] + movq mm4,[eax+ecx- 8] + movq [edx+ecx-32],mm1 + movq [edx+ecx-24],mm2 + movq [edx+ecx-16],mm3 + movq [edx+ecx- 8],mm4 + add ecx,32 + jle @FwdLoopMMX + movq [ebx],mm0 {First 8 Characters} + emms + pop ebx + neg ecx + add ecx,32 + jmp SmallForwardMove_3 +@FwdLargeMove: + push ebx + mov ebx,ecx + test edx,15 + jz @FwdAligned + {16 byte Align Destination} + mov ecx,edx + add ecx,15 + and ecx,-16 + sub ecx,edx + add eax,ecx + add edx,ecx + sub ebx,ecx + {Destination now 16 Byte Aligned} + call SmallForwardMove_3 +@FwdAligned: + mov ecx,ebx + and ecx,-16 + sub ebx,ecx {EBX = Remainder} + push esi + push edi + mov esi,eax {ESI = Source} + mov edi,edx {EDI = Dest} + mov eax,ecx {EAX = Count} + and eax,-64 {EAX = No of Bytes to Blocks Moves} + and ecx,$3F {ECX = Remaining Bytes to Move (0..63)} + add esi,eax + add edi,eax + shr eax,3 {EAX = No of QWORD's to Block Move} + neg eax +@MMXcopyloop: + movq mm0,[esi+eax*8 ] + movq mm1,[esi+eax*8+ 8] + movq mm2,[esi+eax*8+16] + movq mm3,[esi+eax*8+24] + movq mm4,[esi+eax*8+32] + movq mm5,[esi+eax*8+40] + movq mm6,[esi+eax*8+48] + movq mm7,[esi+eax*8+56] + movq [edi+eax*8 ],mm0 + movq [edi+eax*8+ 8],mm1 + movq [edi+eax*8+16],mm2 + movq [edi+eax*8+24],mm3 + movq [edi+eax*8+32],mm4 + movq [edi+eax*8+40],mm5 + movq [edi+eax*8+48],mm6 + movq [edi+eax*8+56],mm7 + add eax,8 + jnz @MMXcopyloop + emms {Empty MMX State} + add ecx,ebx + shr ecx,2 + rep movsd + mov ecx,ebx + and ecx,3 + rep movsb + pop edi + pop esi + pop ebx +end; {Forwards_MMX} + +{-------------------------------------------------------------------------} +{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)} +procedure Backwards_MMX_3;assembler;nostackframe; +asm + cmp ecx,72 {Size at which using MMX becomes worthwhile} + jl Backwards_IA32_3 + push ebx + movq mm0,[eax+ecx-8] {Get Last QWORD} + {QWORD Align Writes} + lea ebx,[edx+ecx] + and ebx,7 + sub ecx,ebx + add ebx,ecx + {Now QWORD Aligned} + sub ecx,32 +@BwdLoopMMX: + movq mm1,[eax+ecx ] + movq mm2,[eax+ecx+ 8] + movq mm3,[eax+ecx+16] + movq mm4,[eax+ecx+24] + movq [edx+ecx+24],mm4 + movq [edx+ecx+16],mm3 + movq [edx+ecx+ 8],mm2 + movq [edx+ecx ],mm1 + sub ecx,32 + jge @BwdLoopMMX + movq [edx+ebx-8], mm0 {Last QWORD} + emms + add ecx,32 + pop ebx + jmp SmallBackwardMove_3 +end; {Backwards_MMX} + +{-------------------------------------------------------------------------} +{Dest MUST be 16-Byes Aligned, Count MUST be multiple of 16 } +procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe; +const + Prefetch = 512; +asm + push esi + mov esi,eax {ESI = Source} + mov eax,ecx {EAX = Count} + and eax,-128 {EAX = No of Bytes to Block Move} + add esi,eax + add edx,eax + shr eax,3 {EAX = No of QWORD's to Block Move} + neg eax + cmp eax, -(32*1024) {Count > 256K} + jl @Large +@Small: {Count<=256K} + test esi,15 {Check if Both Source/Dest Aligned} + jnz @SmallUnaligned +@SmallAligned: {Both Source and Dest 16-Byte Aligned} +@SmallAlignedLoop: + movaps xmm0,[esi+8*eax] + movaps xmm1,[esi+8*eax+16] + movaps xmm2,[esi+8*eax+32] + movaps xmm3,[esi+8*eax+48] + movaps [edx+8*eax],xmm0 + movaps [edx+8*eax+16],xmm1 + movaps [edx+8*eax+32],xmm2 + movaps [edx+8*eax+48],xmm3 + movaps xmm4,[esi+8*eax+64] + movaps xmm5,[esi+8*eax+80] + movaps xmm6,[esi+8*eax+96] + movaps xmm7,[esi+8*eax+112] + movaps [edx+8*eax+64],xmm4 + movaps [edx+8*eax+80],xmm5 + movaps [edx+8*eax+96],xmm6 + movaps [edx+8*eax+112],xmm7 + add eax,16 + js @SmallAlignedLoop + jmp @Remainder +@SmallUnaligned: {Source Not 16-Byte Aligned} +@SmallUnalignedLoop: + movups xmm0,[esi+8*eax] + movups xmm1,[esi+8*eax+16] + movups xmm2,[esi+8*eax+32] + movups xmm3,[esi+8*eax+48] + movaps [edx+8*eax],xmm0 + movaps [edx+8*eax+16],xmm1 + movaps [edx+8*eax+32],xmm2 + movaps [edx+8*eax+48],xmm3 + movups xmm4,[esi+8*eax+64] + movups xmm5,[esi+8*eax+80] + movups xmm6,[esi+8*eax+96] + movups xmm7,[esi+8*eax+112] + movaps [edx+8*eax+64],xmm4 + movaps [edx+8*eax+80],xmm5 + movaps [edx+8*eax+96],xmm6 + movaps [edx+8*eax+112],xmm7 + add eax,16 + js @SmallUnalignedLoop + jmp @Remainder +@Large: {Count>256K} + test esi,15 {Check if Both Source/Dest Aligned} + jnz @LargeUnaligned +@LargeAligned: {Both Source and Dest 16-Byte Aligned} +@LargeAlignedLoop: + prefetchnta [esi+8*eax+Prefetch] + prefetchnta [esi+8*eax+Prefetch+64] + movaps xmm0,[esi+8*eax] + movaps xmm1,[esi+8*eax+16] + movaps xmm2,[esi+8*eax+32] + movaps xmm3,[esi+8*eax+48] + movntps [edx+8*eax],xmm0 + movntps [edx+8*eax+16],xmm1 + movntps [edx+8*eax+32],xmm2 + movntps [edx+8*eax+48],xmm3 + movaps xmm4,[esi+8*eax+64] + movaps xmm5,[esi+8*eax+80] + movaps xmm6,[esi+8*eax+96] + movaps xmm7,[esi+8*eax+112] + movntps [edx+8*eax+64],xmm4 + movntps [edx+8*eax+80],xmm5 + movntps [edx+8*eax+96],xmm6 + movntps [edx+8*eax+112],xmm7 + add eax,16 + js @LargeAlignedLoop + sfence + jmp @Remainder +@LargeUnaligned: {Source Not 16-Byte Aligned} +@LargeUnalignedLoop: + prefetchnta [esi+8*eax+Prefetch] + prefetchnta [esi+8*eax+Prefetch+64] + movups xmm0,[esi+8*eax] + movups xmm1,[esi+8*eax+16] + movups xmm2,[esi+8*eax+32] + movups xmm3,[esi+8*eax+48] + movntps [edx+8*eax],xmm0 + movntps [edx+8*eax+16],xmm1 + movntps [edx+8*eax+32],xmm2 + movntps [edx+8*eax+48],xmm3 + movups xmm4,[esi+8*eax+64] + movups xmm5,[esi+8*eax+80] + movups xmm6,[esi+8*eax+96] + movups xmm7,[esi+8*eax+112] + movntps [edx+8*eax+64],xmm4 + movntps [edx+8*eax+80],xmm5 + movntps [edx+8*eax+96],xmm6 + movntps [edx+8*eax+112],xmm7 + add eax,16 + js @LargeUnalignedLoop + sfence +@Remainder: + and ecx,$7F {ECX = Remainder (0..112 - Multiple of 16)} + jz @Done + add esi,ecx + add edx,ecx + neg ecx +@RemainderLoop: + movups xmm0,[esi+ecx] + movaps [edx+ecx],xmm0 + add ecx,16 + jnz @RemainderLoop +@Done: + pop esi +end; {AlignedFwdMoveSSE} + +{-------------------------------------------------------------------------} +{Move ECX Bytes from EAX to EDX, where EAX > EDX and ECX > 36 (SMALLMOVESIZE)} +procedure Forwards_SSE_3;assembler;nostackframe; +const + LARGESIZE = 2048; +asm + cmp ecx,LARGESIZE + jge @FwdLargeMove + cmp ecx,SMALLMOVESIZE+32 + movups xmm0,[eax] + jg @FwdMoveSSE + movups xmm1,[eax+16] + movups [edx],xmm0 + movups [edx+16],xmm1 + add eax,ecx + add edx,ecx + sub ecx,32 + jmp SmallForwardMove_3 +@FwdMoveSSE: + push ebx + mov ebx,edx + {Align Writes} + add eax,ecx + add ecx,edx + add edx,15 + and edx,-16 + sub ecx,edx + add edx,ecx + {Now Aligned} + sub ecx,32 + neg ecx +@FwdLoopSSE: + movups xmm1,[eax+ecx-32] + movups xmm2,[eax+ecx-16] + movaps [edx+ecx-32],xmm1 + movaps [edx+ecx-16],xmm2 + add ecx,32 + jle @FwdLoopSSE + movups [ebx],xmm0 {First 16 Bytes} + neg ecx + add ecx,32 + pop ebx + jmp SmallForwardMove_3 +@FwdLargeMove: + push ebx + mov ebx,ecx + test edx,15 + jz @FwdLargeAligned + {16 byte Align Destination} + mov ecx,edx + add ecx,15 + and ecx,-16 + sub ecx,edx + add eax,ecx + add edx,ecx + sub ebx,ecx + {Destination now 16 Byte Aligned} + call SmallForwardMove_3 + mov ecx,ebx +@FwdLargeAligned: + and ecx,-16 + sub ebx,ecx {EBX = Remainder} + push edx + push eax + push ecx + call AlignedFwdMoveSSE_3 + pop ecx + pop eax + pop edx + add ecx,ebx + add eax,ecx + add edx,ecx + mov ecx,ebx + pop ebx + jmp SmallForwardMove_3 +end; {Forwards_SSE} + +{-------------------------------------------------------------------------} +{Move ECX Bytes from EAX to EDX, where EAX < EDX and ECX > 36 (SMALLMOVESIZE)} +procedure Backwards_SSE_3;assembler;nostackframe; +asm + cmp ecx,SMALLMOVESIZE+32 + jg @BwdMoveSSE + sub ecx,32 + movups xmm1,[eax+ecx] + movups xmm2,[eax+ecx+16] + movups [edx+ecx],xmm1 + movups [edx+ecx+16],xmm2 + jmp SmallBackwardMove_3 +@BwdMoveSSE: + push ebx + movups xmm0,[eax+ecx-16] {Last 16 Bytes} + {Align Writes} + lea ebx,[edx+ecx] + and ebx,15 + sub ecx,ebx + add ebx,ecx + {Now Aligned} + sub ecx,32 +@BwdLoop: + movups xmm1,[eax+ecx] + movups xmm2,[eax+ecx+16] + movaps [edx+ecx],xmm1 + movaps [edx+ecx+16],xmm2 + sub ecx,32 + jge @BwdLoop + movups [edx+ebx-16],xmm0 {Last 16 Bytes} + add ecx,32 + pop ebx + jmp SmallBackwardMove_3 +end; {Backwards_SSE} + +const + fastmoveproc_forward : pointer = @Forwards_SSE_3; + fastmoveproc_backward : pointer = @Backwards_SSE_3; + +procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe; +asm + cmp ecx,SMALLMOVESIZE + ja @Large + cmp eax,edx + lea eax,[eax+ecx] + jle @SmallCheck +@SmallForward: + add edx,ecx + jmp SmallForwardMove_3 +@SmallCheck: + je @Done {For Compatibility with Delphi's move for Source = Dest} + sub eax,ecx + jmp SmallBackwardMove_3 +@Large: + jng @Done {For Compatibility with Delphi's move for Count < 0} + cmp eax,edx + jg @moveforward + je @Done {For Compatibility with Delphi's move for Source = Dest} + push eax + add eax,ecx + cmp eax,edx + pop eax + jg @movebackward +@moveforward: + jmp dword ptr fastmoveproc_forward +@movebackward: + jmp dword ptr fastmoveproc_backward {Source/Dest Overlap} +@Done: +end; + +{$asmmode att} + +procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif} + begin + if has_sse_support then + begin + fastmoveproc_forward:=@Forwards_SSE_3; + fastmoveproc_backward:=@Backwards_SSE_3; + end + else if has_mmx_support then + begin + fastmoveproc_forward:=@Forwards_MMX_3; + fastmoveproc_backward:=@Backwards_MMX_3; + end; + end; + +{$endif FPC_SYSTEM_HAS_MOVE} + +{$else} + +procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif} + begin + end; + +{$endif} diff --git a/rtl/i386/i386.inc b/rtl/i386/i386.inc index e0b5de5906..47914a6d35 100644 --- a/rtl/i386/i386.inc +++ b/rtl/i386/i386.inc @@ -15,14 +15,85 @@ **********************************************************************} -{$asmmode ATT} - {**************************************************************************** Primitives ****************************************************************************} +var + has_sse_support,has_mmx_support : boolean; + +{$asmmode intel} + +function cpuid_support : boolean;assembler; + { + Check if the ID-flag can be changed, if changed then CpuID is supported. + Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV) + } + asm + pushf + pushf + pop eax + mov ebx,eax + xor eax,200000h + push eax + popf + pushf + pop eax + popf + and eax,200000h + and ebx,200000h + cmp eax,ebx + setnz al + end; + +{$asmmode ATT} + +function sse_support : boolean; + var + _edx : longint; + begin + if cpuid_support then + begin + asm + movl $1,%eax + cpuid + movl %edx,_edx + end; + sse_support:=(_edx and $2000000)<>0; + end + else + { a cpu with without cpuid instruction supports never sse } + sse_support:=false; + end; + + +{ returns true, if the processor supports the mmx instructions } +function mmx_support : boolean; + + var + _edx : longint; + + begin + if cpuid_support then + begin + asm + movl $1,%eax + cpuid + movl %edx,_edx + end; + mmx_support:=(_edx and $800000)<>0; + end + else + { a cpu with without cpuid instruction supports never mmx } + mmx_support:=false; + end; + +{$i fastmove.inc} procedure fpc_cpuinit; begin + has_sse_support:=sse_support; + has_mmx_support:=mmx_support; + setup_fastmove; end; @@ -32,7 +103,6 @@ asm ret end; - {$ifndef FPC_SYSTEM_HAS_MOVE} {$define FPC_SYSTEM_HAS_MOVE} procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler; @@ -1510,7 +1580,10 @@ end; { $Log$ - Revision 1.66 2004-11-17 22:19:04 peter + Revision 1.67 2005-01-23 20:03:23 florian + + fastmove from John O'Harrow integrated + + Revision 1.66 2004/11/17 22:19:04 peter internconst, internproc and some external declarations moved to interface Revision 1.65 2004/11/01 12:43:29 peter