mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-07-21 10:56:05 +02:00
+ fastmove from John O'Harrow integrated
This commit is contained in:
parent
00324d38bf
commit
3600b51d32
854
rtl/i386/fastmove.inc
Normal file
854
rtl/i386/fastmove.inc
Normal file
@ -0,0 +1,854 @@
|
||||
{
|
||||
$Id$
|
||||
Copyright (c) 2004, John O'Harrow (john@almcrest.demon.co.uk)
|
||||
|
||||
This software is provided 'as-is', without any express or implied warranty.
|
||||
In no event will the authors be held liable for any damages arising from the
|
||||
use of this software.
|
||||
|
||||
Permission is granted to anyone to use this software for any purpose, including
|
||||
commercial applications, and to alter it and redistribute it freely, subject to
|
||||
the following restrictions:
|
||||
|
||||
1. The origin of this software must not be misrepresented; you must not claim
|
||||
that you wrote the original software. If you use this software in a product,
|
||||
an acknowledgment in the product documentation would be appreciated but is
|
||||
not required.
|
||||
|
||||
2. Altered source versions must be plainly marked as such, and must not be
|
||||
misrepresented as being the original software.
|
||||
|
||||
3. This notice may not be removed or altered from any source distribution.
|
||||
|
||||
-------------------------------------------------------------------------------
|
||||
|
||||
Version: 1.40 - 16-SEP-2004
|
||||
}
|
||||
|
||||
|
||||
{$if (FPC_VERSION>1) or ((FPC_RELEASE>=9) and (FPC_PATCH>6))}
|
||||
|
||||
{$ifndef FPC_SYSTEM_HAS_MOVE}
|
||||
{$define FPC_SYSTEM_HAS_MOVE}
|
||||
|
||||
{$asmmode intel}
|
||||
|
||||
{-------------------------------------------------------------------------}
|
||||
{Just to show that a good Pascal algorithm can beat the default BASM}
|
||||
procedure MoveJOH_PAS_3(const Source; var Dest; Count : Integer);
{
  Pure-Pascal, overlap-safe memory move.

  Source, Dest : untyped buffers; they may overlap.
  Count        : number of bytes to move; Count <= 0 moves nothing.

  Counts of 1..4 are moved with single byte/word/dword accesses.  Larger
  counts are moved in 4-byte steps: from the top down when Dest lies above
  Source, from the bottom up otherwise.  The dword at the far end of the walk
  is loaded before anything is stored and written last; it may overlap the
  final 4-byte step, which is how counts that are not a multiple of 4 are
  handled.

  All address arithmetic is done in PtrUInt and every 4-byte access goes
  through LongInt/PLongInt, so the fixed stride of 4 always matches the access
  width, independent of how Integer is defined in the including unit.
}
var
  SrcAddr, DstAddr : PtrUInt;
  Tail : PtrUInt;      {offset of the last dword inside the buffers (Count-4)}
  Saved : LongInt;     {dword that is read first and written last}
  SavedAt : PLongInt;  {where Saved has to be stored}
  Cur : PLongInt;      {current destination dword}
begin
  SrcAddr := PtrUInt(@Source);
  DstAddr := PtrUInt(@Dest);
  if SrcAddr = DstAddr then
    Exit;
  if Count <= 4 then
    case Count of
      1 : PByte(DstAddr)^ := PByte(SrcAddr)^;
      2 : PWord(DstAddr)^ := PWord(SrcAddr)^;
      3 : if DstAddr > SrcAddr then
            begin
              {Dest above Source: move the top byte first}
              PByte(DstAddr+2)^ := PByte(SrcAddr+2)^;
              PWord(DstAddr)^ := PWord(SrcAddr)^;
            end
          else
            begin
              {Dest below Source: move the bottom word first}
              PWord(DstAddr)^ := PWord(SrcAddr)^;
              PByte(DstAddr+2)^ := PByte(SrcAddr+2)^;
            end;
      4 : PLongInt(DstAddr)^ := PLongInt(SrcAddr)^
    else
      Exit; {Count <= 0}
    end
  else
    begin
      Tail := PtrUInt(Count - 4); {Count > 4 here, so this is positive}
      if DstAddr > SrcAddr then
        begin
          {Dest above Source: walk from the last dword down to the first}
          Saved := PLongInt(SrcAddr)^;
          SavedAt := PLongInt(DstAddr);
          Cur := PLongInt(DstAddr + Tail);
          Inc(SrcAddr, Tail);
          Cur^ := PLongInt(SrcAddr)^;
          while Count > 8 do
            begin
              Dec(Count, 4);
              Dec(SrcAddr, 4);
              Dec(Cur);
              Cur^ := PLongInt(SrcAddr)^;
            end;
          SavedAt^ := Saved;
        end
      else
        begin
          {Dest below Source: walk from the first dword up to the last}
          Saved := PLongInt(SrcAddr + Tail)^;
          SavedAt := PLongInt(DstAddr + Tail);
          Cur := PLongInt(DstAddr);
          Cur^ := PLongInt(SrcAddr)^;
          while Count > 8 do
            begin
              Dec(Count, 4);
              Inc(SrcAddr, 4);
              Inc(Cur);
              Cur^ := PLongInt(SrcAddr)^;
            end;
          SavedAt^ := Saved;
        end;
    end;
end; {MoveJOH_PAS}
|
||||
|
||||
const
|
||||
SMALLMOVESIZE = 36; {Largest byte count handled by the SmallForwardMove_3 / SmallBackwardMove_3 jump tables, which have entries for 0..36}
|
||||
|
||||
{-------------------------------------------------------------------------}
|
||||
{Forward move of 0..36 bytes, addressed from the END of both buffers.}
{On entry : ECX = Count, EAX = Source+Count, EDX = Dest+Count.}
{On exit  : ECX is destroyed; EAX and EDX are left as passed in.}
{ECX indexes a 37-entry jump table directly; no range check is made here.}
{Each chain below copies dwords from the lowest address upwards and ends}
{with the 0..3 byte tail that belongs to Count mod 4.}
procedure SmallForwardMove_3;assembler;nostackframe;
asm
  jmp     dword ptr @@Dispatch[ecx*4]
  align   16
@@Dispatch:
  dd      @@Return            {Count = 0: nothing to do, no separate zero test needed}
  dd      @@Tail01,@@Tail02,@@Tail03,@@Tail04,@@Tail05,@@Tail06,@@Tail07,@@Tail08
  dd      @@Tail09,@@Tail10,@@Tail11,@@Tail12,@@Tail13,@@Tail14,@@Tail15,@@Tail16
  dd      @@Tail17,@@Tail18,@@Tail19,@@Tail20,@@Tail21,@@Tail22,@@Tail23,@@Tail24
  dd      @@Tail25,@@Tail26,@@Tail27,@@Tail28,@@Tail29,@@Tail30,@@Tail31,@@Tail32
  dd      @@Tail33,@@Tail34,@@Tail35,@@Tail36

  {---- Count mod 4 = 0 : dwords only ----}
@@Tail36:
  mov     ecx,[eax-36]
  mov     [edx-36],ecx
@@Tail32:
  mov     ecx,[eax-32]
  mov     [edx-32],ecx
@@Tail28:
  mov     ecx,[eax-28]
  mov     [edx-28],ecx
@@Tail24:
  mov     ecx,[eax-24]
  mov     [edx-24],ecx
@@Tail20:
  mov     ecx,[eax-20]
  mov     [edx-20],ecx
@@Tail16:
  mov     ecx,[eax-16]
  mov     [edx-16],ecx
@@Tail12:
  mov     ecx,[eax-12]
  mov     [edx-12],ecx
@@Tail08:
  mov     ecx,[eax-8]
  mov     [edx-8],ecx
@@Tail04:
  mov     ecx,[eax-4]
  mov     [edx-4],ecx
  ret

  {---- Count mod 4 = 3 : dwords, then one dword overlapping the previous by a byte ----}
@@Tail35:
  mov     ecx,[eax-35]
  mov     [edx-35],ecx
@@Tail31:
  mov     ecx,[eax-31]
  mov     [edx-31],ecx
@@Tail27:
  mov     ecx,[eax-27]
  mov     [edx-27],ecx
@@Tail23:
  mov     ecx,[eax-23]
  mov     [edx-23],ecx
@@Tail19:
  mov     ecx,[eax-19]
  mov     [edx-19],ecx
@@Tail15:
  mov     ecx,[eax-15]
  mov     [edx-15],ecx
@@Tail11:
  mov     ecx,[eax-11]
  mov     [edx-11],ecx
@@Tail07:
  mov     ecx,[eax-7]
  mov     [edx-7],ecx
  mov     ecx,[eax-4]         {last 4 bytes; shares one byte with the dword above}
  mov     [edx-4],ecx
  ret
@@Tail03:
  movzx   ecx, word ptr [eax-3]
  mov     [edx-3],cx
  movzx   ecx, byte ptr [eax-1]
  mov     [edx-1],cl
  ret

  {---- Count mod 4 = 2 : dwords, then a word ----}
@@Tail34:
  mov     ecx,[eax-34]
  mov     [edx-34],ecx
@@Tail30:
  mov     ecx,[eax-30]
  mov     [edx-30],ecx
@@Tail26:
  mov     ecx,[eax-26]
  mov     [edx-26],ecx
@@Tail22:
  mov     ecx,[eax-22]
  mov     [edx-22],ecx
@@Tail18:
  mov     ecx,[eax-18]
  mov     [edx-18],ecx
@@Tail14:
  mov     ecx,[eax-14]
  mov     [edx-14],ecx
@@Tail10:
  mov     ecx,[eax-10]
  mov     [edx-10],ecx
@@Tail06:
  mov     ecx,[eax-6]
  mov     [edx-6],ecx
@@Tail02:
  movzx   ecx, word ptr [eax-2]
  mov     [edx-2],cx
  ret

  {---- Count mod 4 = 1 : dwords, then a byte ----}
@@Tail33:
  mov     ecx,[eax-33]
  mov     [edx-33],ecx
@@Tail29:
  mov     ecx,[eax-29]
  mov     [edx-29],ecx
@@Tail25:
  mov     ecx,[eax-25]
  mov     [edx-25],ecx
@@Tail21:
  mov     ecx,[eax-21]
  mov     [edx-21],ecx
@@Tail17:
  mov     ecx,[eax-17]
  mov     [edx-17],ecx
@@Tail13:
  mov     ecx,[eax-13]
  mov     [edx-13],ecx
@@Tail09:
  mov     ecx,[eax-9]
  mov     [edx-9],ecx
@@Tail05:
  mov     ecx,[eax-5]
  mov     [edx-5],ecx
@@Tail01:
  movzx   ecx, byte ptr [eax-1]
  mov     [edx-1],cl
@@Return:
end; {SmallForwardMove}
|
||||
|
||||
{-------------------------------------------------------------------------}
|
||||
{Backward move of 0..36 bytes, addressed from the START of both buffers.}
{On entry : ECX = Count, EAX = Source, EDX = Dest.}
{On exit  : ECX is destroyed; EAX and EDX are left as passed in.}
{ECX indexes a 37-entry jump table directly; no range check is made here.}
{Each chain below copies dwords from the highest address downwards and ends}
{with the 0..3 byte head that belongs to Count mod 4.}
procedure SmallBackwardMove_3;assembler;nostackframe;
asm
  jmp     dword ptr @@Dispatch[ecx*4]
  align   16
@@Dispatch:
  dd      @@Return            {Count = 0: nothing to do, no separate zero test needed}
  dd      @@Head01,@@Head02,@@Head03,@@Head04,@@Head05,@@Head06,@@Head07,@@Head08
  dd      @@Head09,@@Head10,@@Head11,@@Head12,@@Head13,@@Head14,@@Head15,@@Head16
  dd      @@Head17,@@Head18,@@Head19,@@Head20,@@Head21,@@Head22,@@Head23,@@Head24
  dd      @@Head25,@@Head26,@@Head27,@@Head28,@@Head29,@@Head30,@@Head31,@@Head32
  dd      @@Head33,@@Head34,@@Head35,@@Head36

  {---- Count mod 4 = 0 : dwords only ----}
@@Head36:
  mov     ecx,[eax+32]
  mov     [edx+32],ecx
@@Head32:
  mov     ecx,[eax+28]
  mov     [edx+28],ecx
@@Head28:
  mov     ecx,[eax+24]
  mov     [edx+24],ecx
@@Head24:
  mov     ecx,[eax+20]
  mov     [edx+20],ecx
@@Head20:
  mov     ecx,[eax+16]
  mov     [edx+16],ecx
@@Head16:
  mov     ecx,[eax+12]
  mov     [edx+12],ecx
@@Head12:
  mov     ecx,[eax+8]
  mov     [edx+8],ecx
@@Head08:
  mov     ecx,[eax+4]
  mov     [edx+4],ecx
@@Head04:
  mov     ecx,[eax]
  mov     [edx],ecx
  ret

  {---- Count mod 4 = 3 : dwords, then one dword overlapping the previous by a byte ----}
@@Head35:
  mov     ecx,[eax+31]
  mov     [edx+31],ecx
@@Head31:
  mov     ecx,[eax+27]
  mov     [edx+27],ecx
@@Head27:
  mov     ecx,[eax+23]
  mov     [edx+23],ecx
@@Head23:
  mov     ecx,[eax+19]
  mov     [edx+19],ecx
@@Head19:
  mov     ecx,[eax+15]
  mov     [edx+15],ecx
@@Head15:
  mov     ecx,[eax+11]
  mov     [edx+11],ecx
@@Head11:
  mov     ecx,[eax+7]
  mov     [edx+7],ecx
@@Head07:
  mov     ecx,[eax+3]
  mov     [edx+3],ecx
  mov     ecx,[eax]           {first 4 bytes; shares one byte with the dword above}
  mov     [edx],ecx
  ret
@@Head03:
  movzx   ecx, word ptr [eax+1]
  mov     [edx+1],cx
  movzx   ecx, byte ptr [eax]
  mov     [edx],cl
  ret

  {---- Count mod 4 = 2 : dwords, then a word ----}
@@Head34:
  mov     ecx,[eax+30]
  mov     [edx+30],ecx
@@Head30:
  mov     ecx,[eax+26]
  mov     [edx+26],ecx
@@Head26:
  mov     ecx,[eax+22]
  mov     [edx+22],ecx
@@Head22:
  mov     ecx,[eax+18]
  mov     [edx+18],ecx
@@Head18:
  mov     ecx,[eax+14]
  mov     [edx+14],ecx
@@Head14:
  mov     ecx,[eax+10]
  mov     [edx+10],ecx
@@Head10:
  mov     ecx,[eax+6]
  mov     [edx+6],ecx
@@Head06:
  mov     ecx,[eax+2]
  mov     [edx+2],ecx
@@Head02:
  movzx   ecx, word ptr [eax]
  mov     [edx],cx
  ret

  {---- Count mod 4 = 1 : dwords, then a byte ----}
@@Head33:
  mov     ecx,[eax+29]
  mov     [edx+29],ecx
@@Head29:
  mov     ecx,[eax+25]
  mov     [edx+25],ecx
@@Head25:
  mov     ecx,[eax+21]
  mov     [edx+21],ecx
@@Head21:
  mov     ecx,[eax+17]
  mov     [edx+17],ecx
@@Head17:
  mov     ecx,[eax+13]
  mov     [edx+13],ecx
@@Head13:
  mov     ecx,[eax+9]
  mov     [edx+9],ecx
@@Head09:
  mov     ecx,[eax+5]
  mov     [edx+5],ecx
@@Head05:
  mov     ecx,[eax+1]
  mov     [edx+1],ecx
@@Head01:
  movzx   ecx, byte ptr[eax]
  mov     [edx],cl
@@Return:
end; {SmallBackwardMove}
|
||||
|
||||
{-------------------------------------------------------------------------}
|
||||
{Forward move of ECX bytes from EAX to EDX using 8-byte x87 integer}
{load/store pairs.  Intended for ECX > 36 (SMALLMOVESIZE) where the source}
{lies above the destination or the buffers do not overlap.}
{The first 8 source bytes are parked on the FPU stack and stored last, the}
{main loop writes to a QWORD-aligned destination, and the final 0..15 bytes}
{are handed to SmallForwardMove_3.  EBX is saved and restored.}
{NOTE(review): fild/fistp only reproduce all 64 bits when the x87 precision}
{control permits a 64-bit significand - confirm for the target environment.}
procedure Forwards_IA32_3;assembler;nostackframe;
asm
  push    ebx
  mov     ebx,edx                     {EBX = original Dest, for the parked QWORD}
  fild    qword ptr [eax]             {park the first 8 source bytes}
  add     eax,ecx                     {EAX = Source + Count}
  add     ecx,edx                     {ECX = Dest + Count}
  add     edx,7
  and     edx,-8                      {EDX = Dest rounded up to a multiple of 8}
  sub     ecx,edx                     {ECX = bytes from the aligned Dest to the end}
  add     edx,ecx                     {EDX = Dest + Count}
  sub     ecx,16
  neg     ecx                         {ECX = 16 - remaining; indexes back from the end}
@Copy16:
  fild    qword ptr [eax+ecx-16]
  fistp   qword ptr [edx+ecx-16]
  fild    qword ptr [eax+ecx-8]
  fistp   qword ptr [edx+ecx-8]
  add     ecx,16
  jle     @Copy16
  fistp   qword ptr [ebx]             {now store the parked first QWORD}
  neg     ecx
  add     ecx,16                      {ECX = bytes still to move}
  pop     ebx
  jmp     SmallForwardMove_3          {EAX/EDX already point past the end}
end; {Forwards_IA32}
|
||||
|
||||
{-------------------------------------------------------------------------}
|
||||
{Backward move of ECX bytes from EAX to EDX using 8-byte x87 integer}
{load/store pairs.  Intended for ECX > 36 (SMALLMOVESIZE) where the source}
{lies below the destination.}
{The last 8 source bytes are parked on the FPU stack and stored after the}
{loop, the main loop runs from the top down with QWORD-aligned writes, and}
{the remaining bytes at the start are handed to SmallBackwardMove_3.}
{EBX is saved and restored.}
{NOTE(review): fild/fistp only reproduce all 64 bits when the x87 precision}
{control permits a 64-bit significand - confirm for the target environment.}
procedure Backwards_IA32_3;assembler;nostackframe;
asm
  push    ebx
  fild    qword ptr [eax+ecx-8]       {park the last 8 source bytes}
  lea     ebx,[edx+ecx]               {EBX = Dest + Count}
  and     ebx,7                       {EBX = misalignment of the end of Dest}
  sub     ecx,ebx                     {ECX = Count trimmed so Dest+ECX is QWORD aligned}
  add     ebx,ecx                     {EBX = original Count again}
  sub     ecx,16
@Copy16:
  fild    qword ptr [eax+ecx]
  fild    qword ptr [eax+ecx+8]
  fistp   qword ptr [edx+ecx+8]
  fistp   qword ptr [edx+ecx]
  sub     ecx,16
  jge     @Copy16
  fistp   qword ptr [edx+ebx-8]       {now store the parked last QWORD}
  add     ecx,16                      {ECX = bytes still to move at the start}
  pop     ebx
  jmp     SmallBackwardMove_3
end; {Backwards_IA32}
|
||||
|
||||
{-------------------------------------------------------------------------}
|
||||
{Forward move of ECX bytes from EAX to EDX using MMX registers.}
{Intended for ECX > 36 (SMALLMOVESIZE) where the source lies above the}
{destination or the buffers do not overlap.}
{  ECX < 72          : delegated to Forwards_IA32_3}
{  72 <= ECX < 1024  : 32-byte MMX loop with QWORD-aligned writes}
{  ECX >= LARGESIZE  : destination aligned to 16, then 64-byte MMX blocks,}
{                      the rest moved with rep movsd / rep movsb}
{EBX, ESI and EDI are saved and restored where they are used.}
procedure Forwards_MMX_3;assembler;nostackframe;
const
  LARGESIZE = 1024;
asm
  cmp     ecx,LARGESIZE
  jge     @BigMove
  cmp     ecx,72                      {below this size MMX is not worthwhile}
  jl      Forwards_IA32_3

  {---- medium sizes ----}
  push    ebx
  mov     ebx,edx                     {EBX = original Dest}
  movq    mm0,[eax]                   {keep the first 8 bytes; stored after the loop}
  add     eax,ecx                     {EAX = Source + Count}
  add     ecx,edx                     {ECX = Dest + Count}
  add     edx,7
  and     edx,-8                      {EDX = Dest rounded up to a multiple of 8}
  sub     ecx,edx                     {ECX = bytes from the aligned Dest to the end}
  add     edx,ecx                     {EDX = Dest + Count}
  sub     ecx,32
  neg     ecx                         {ECX = 32 - remaining; indexes back from the end}
@Copy32:
  movq    mm1,[eax+ecx-32]
  movq    mm2,[eax+ecx-24]
  movq    mm3,[eax+ecx-16]
  movq    mm4,[eax+ecx- 8]
  movq    [edx+ecx-32],mm1
  movq    [edx+ecx-24],mm2
  movq    [edx+ecx-16],mm3
  movq    [edx+ecx- 8],mm4
  add     ecx,32
  jle     @Copy32
  movq    [ebx],mm0                   {first 8 bytes}
  emms
  pop     ebx
  neg     ecx
  add     ecx,32                      {ECX = bytes still to move}
  jmp     SmallForwardMove_3

  {---- large sizes ----}
@BigMove:
  push    ebx
  mov     ebx,ecx                     {EBX = Count}
  test    edx,15
  jz      @DestAligned
  mov     ecx,edx
  add     ecx,15
  and     ecx,-16
  sub     ecx,edx                     {ECX = bytes needed to reach a 16-byte boundary}
  add     eax,ecx
  add     edx,ecx
  sub     ebx,ecx
  call    SmallForwardMove_3          {moves those leading bytes; Dest is now aligned}
@DestAligned:
  mov     ecx,ebx
  and     ecx,-16
  sub     ebx,ecx                     {EBX = remainder (Count mod 16)}
  push    esi
  push    edi
  mov     esi,eax                     {ESI = Source}
  mov     edi,edx                     {EDI = Dest}
  mov     eax,ecx                     {EAX = Count}
  and     eax,-64                     {EAX = bytes moved by the block loop}
  and     ecx,$3F                     {ECX = bytes left after the blocks (0..63)}
  add     esi,eax
  add     edi,eax
  shr     eax,3                       {EAX = number of QWORDs in the block part}
  neg     eax
@Copy64:
  movq    mm0,[esi+eax*8   ]
  movq    mm1,[esi+eax*8+ 8]
  movq    mm2,[esi+eax*8+16]
  movq    mm3,[esi+eax*8+24]
  movq    mm4,[esi+eax*8+32]
  movq    mm5,[esi+eax*8+40]
  movq    mm6,[esi+eax*8+48]
  movq    mm7,[esi+eax*8+56]
  movq    [edi+eax*8   ],mm0
  movq    [edi+eax*8+ 8],mm1
  movq    [edi+eax*8+16],mm2
  movq    [edi+eax*8+24],mm3
  movq    [edi+eax*8+32],mm4
  movq    [edi+eax*8+40],mm5
  movq    [edi+eax*8+48],mm6
  movq    [edi+eax*8+56],mm7
  add     eax,8
  jnz     @Copy64
  emms                                {leave MMX state}
  add     ecx,ebx                     {ECX = everything that is still left}
  shr     ecx,2
  rep     movsd
  mov     ecx,ebx
  and     ecx,3
  rep     movsb
  pop     edi
  pop     esi
  pop     ebx
end; {Forwards_MMX}
|
||||
|
||||
{-------------------------------------------------------------------------}
|
||||
{Backward move of ECX bytes from EAX to EDX using MMX registers.}
{Intended for ECX > 36 (SMALLMOVESIZE) where the source lies below the}
{destination.  Sizes below 72 are delegated to Backwards_IA32_3.}
{The last 8 source bytes are kept in MM0 and stored after the loop, the}
{32-byte loop runs from the top down with QWORD-aligned writes, and the}
{remaining bytes at the start go to SmallBackwardMove_3.}
{EBX is saved and restored.}
procedure Backwards_MMX_3;assembler;nostackframe;
asm
  cmp     ecx,72                      {below this size MMX is not worthwhile}
  jl      Backwards_IA32_3
  push    ebx
  movq    mm0,[eax+ecx-8]             {keep the last QWORD}
  lea     ebx,[edx+ecx]               {EBX = Dest + Count}
  and     ebx,7                       {EBX = misalignment of the end of Dest}
  sub     ecx,ebx                     {ECX trimmed so Dest+ECX is QWORD aligned}
  add     ebx,ecx                     {EBX = original Count again}
  sub     ecx,32
@Copy32:
  movq    mm1,[eax+ecx   ]
  movq    mm2,[eax+ecx+ 8]
  movq    mm3,[eax+ecx+16]
  movq    mm4,[eax+ecx+24]
  movq    [edx+ecx+24],mm4
  movq    [edx+ecx+16],mm3
  movq    [edx+ecx+ 8],mm2
  movq    [edx+ecx   ],mm1
  sub     ecx,32
  jge     @Copy32
  movq    [edx+ebx-8], mm0            {last QWORD}
  emms
  add     ecx,32                      {ECX = bytes still to move at the start}
  pop     ebx
  jmp     SmallBackwardMove_3
end; {Backwards_MMX}
|
||||
|
||||
{-------------------------------------------------------------------------}
|
||||
{Forward block move with SSE registers.}
{Dest MUST be 16-byte aligned, Count MUST be a multiple of 16.}
{On entry : EAX = Source, EDX = Dest, ECX = Count.}
{On exit  : EAX, ECX, EDX and XMM0..XMM7 are modified; ESI is preserved.}
{The bulk is moved in 128-byte blocks; above 256K the stores are}
{non-temporal (movntps) with prefetching and are followed by sfence.}
{What is left (0..112 bytes) is moved 16 bytes at a time.}
{A Count below 128 has no 128-byte block at all: the block loops are skipped}
{in that case, because they always execute at least once and would otherwise}
{move 128 bytes beyond the requested range.}
procedure AlignedFwdMoveSSE_3(const Source; var Dest; Count: Integer);assembler;nostackframe;
const
  Prefetch = 512;
asm
  push    esi
  mov     esi,eax             {ESI = Source}
  mov     eax,ecx             {EAX = Count}
  and     eax,-128            {EAX = No of Bytes to Block Move}
  add     esi,eax
  add     edx,eax
  shr     eax,3               {EAX = No of QWORD's to Block Move}
  neg     eax
  jz      @Remainder          {No complete 128-byte block: only the remainder is moved}
  cmp     eax, -(32*1024)     {Count > 256K}
  jl      @Large
@Small: {Count<=256K}
  test    esi,15              {Check if Both Source/Dest Aligned}
  jnz     @SmallUnaligned
@SmallAligned: {Both Source and Dest 16-Byte Aligned}
@SmallAlignedLoop:
  movaps  xmm0,[esi+8*eax]
  movaps  xmm1,[esi+8*eax+16]
  movaps  xmm2,[esi+8*eax+32]
  movaps  xmm3,[esi+8*eax+48]
  movaps  [edx+8*eax],xmm0
  movaps  [edx+8*eax+16],xmm1
  movaps  [edx+8*eax+32],xmm2
  movaps  [edx+8*eax+48],xmm3
  movaps  xmm4,[esi+8*eax+64]
  movaps  xmm5,[esi+8*eax+80]
  movaps  xmm6,[esi+8*eax+96]
  movaps  xmm7,[esi+8*eax+112]
  movaps  [edx+8*eax+64],xmm4
  movaps  [edx+8*eax+80],xmm5
  movaps  [edx+8*eax+96],xmm6
  movaps  [edx+8*eax+112],xmm7
  add     eax,16
  js      @SmallAlignedLoop
  jmp     @Remainder
@SmallUnaligned: {Source Not 16-Byte Aligned}
@SmallUnalignedLoop:
  movups  xmm0,[esi+8*eax]
  movups  xmm1,[esi+8*eax+16]
  movups  xmm2,[esi+8*eax+32]
  movups  xmm3,[esi+8*eax+48]
  movaps  [edx+8*eax],xmm0
  movaps  [edx+8*eax+16],xmm1
  movaps  [edx+8*eax+32],xmm2
  movaps  [edx+8*eax+48],xmm3
  movups  xmm4,[esi+8*eax+64]
  movups  xmm5,[esi+8*eax+80]
  movups  xmm6,[esi+8*eax+96]
  movups  xmm7,[esi+8*eax+112]
  movaps  [edx+8*eax+64],xmm4
  movaps  [edx+8*eax+80],xmm5
  movaps  [edx+8*eax+96],xmm6
  movaps  [edx+8*eax+112],xmm7
  add     eax,16
  js      @SmallUnalignedLoop
  jmp     @Remainder
@Large: {Count>256K}
  test    esi,15              {Check if Both Source/Dest Aligned}
  jnz     @LargeUnaligned
@LargeAligned: {Both Source and Dest 16-Byte Aligned}
@LargeAlignedLoop:
  prefetchnta [esi+8*eax+Prefetch]
  prefetchnta [esi+8*eax+Prefetch+64]
  movaps  xmm0,[esi+8*eax]
  movaps  xmm1,[esi+8*eax+16]
  movaps  xmm2,[esi+8*eax+32]
  movaps  xmm3,[esi+8*eax+48]
  movntps [edx+8*eax],xmm0
  movntps [edx+8*eax+16],xmm1
  movntps [edx+8*eax+32],xmm2
  movntps [edx+8*eax+48],xmm3
  movaps  xmm4,[esi+8*eax+64]
  movaps  xmm5,[esi+8*eax+80]
  movaps  xmm6,[esi+8*eax+96]
  movaps  xmm7,[esi+8*eax+112]
  movntps [edx+8*eax+64],xmm4
  movntps [edx+8*eax+80],xmm5
  movntps [edx+8*eax+96],xmm6
  movntps [edx+8*eax+112],xmm7
  add     eax,16
  js      @LargeAlignedLoop
  sfence
  jmp     @Remainder
@LargeUnaligned: {Source Not 16-Byte Aligned}
@LargeUnalignedLoop:
  prefetchnta [esi+8*eax+Prefetch]
  prefetchnta [esi+8*eax+Prefetch+64]
  movups  xmm0,[esi+8*eax]
  movups  xmm1,[esi+8*eax+16]
  movups  xmm2,[esi+8*eax+32]
  movups  xmm3,[esi+8*eax+48]
  movntps [edx+8*eax],xmm0
  movntps [edx+8*eax+16],xmm1
  movntps [edx+8*eax+32],xmm2
  movntps [edx+8*eax+48],xmm3
  movups  xmm4,[esi+8*eax+64]
  movups  xmm5,[esi+8*eax+80]
  movups  xmm6,[esi+8*eax+96]
  movups  xmm7,[esi+8*eax+112]
  movntps [edx+8*eax+64],xmm4
  movntps [edx+8*eax+80],xmm5
  movntps [edx+8*eax+96],xmm6
  movntps [edx+8*eax+112],xmm7
  add     eax,16
  js      @LargeUnalignedLoop
  sfence
@Remainder:
  and     ecx,$7F             {ECX = Remainder (0..112 - Multiple of 16)}
  jz      @Done
  add     esi,ecx
  add     edx,ecx
  neg     ecx
@RemainderLoop:
  movups  xmm0,[esi+ecx]
  movaps  [edx+ecx],xmm0
  add     ecx,16
  jnz     @RemainderLoop
@Done:
  pop     esi
end; {AlignedFwdMoveSSE}
|
||||
|
||||
{-------------------------------------------------------------------------}
|
||||
{Forward move of ECX bytes from EAX to EDX using SSE registers.}
{Intended for ECX > 36 (SMALLMOVESIZE) where the source lies above the}
{destination or the buffers do not overlap.}
{  ECX <= 68         : first 32 bytes with two unaligned 16-byte moves,}
{                      the rest through SmallForwardMove_3}
{  68 < ECX < 2048   : 32-byte loop with 16-byte aligned writes}
{  ECX >= LARGESIZE  : destination aligned to 16, bulk moved by}
{                      AlignedFwdMoveSSE_3, tail by SmallForwardMove_3}
{EBX is saved and restored where it is used.}
procedure Forwards_SSE_3;assembler;nostackframe;
const
  LARGESIZE = 2048;
asm
  cmp     ecx,LARGESIZE
  jge     @BigMove
  cmp     ecx,SMALLMOVESIZE+32
  movups  xmm0,[eax]                  {first 16 bytes; movups leaves the flags alone}
  jg      @MediumMove

  {---- 37..68 bytes ----}
  movups  xmm1,[eax+16]
  movups  [edx],xmm0
  movups  [edx+16],xmm1
  add     eax,ecx                     {EAX = Source + Count}
  add     edx,ecx                     {EDX = Dest + Count}
  sub     ecx,32                      {ECX = bytes after the first 32}
  jmp     SmallForwardMove_3

  {---- medium sizes ----}
@MediumMove:
  push    ebx
  mov     ebx,edx                     {EBX = original Dest}
  add     eax,ecx                     {EAX = Source + Count}
  add     ecx,edx                     {ECX = Dest + Count}
  add     edx,15
  and     edx,-16                     {EDX = Dest rounded up to a multiple of 16}
  sub     ecx,edx                     {ECX = bytes from the aligned Dest to the end}
  add     edx,ecx                     {EDX = Dest + Count}
  sub     ecx,32
  neg     ecx                         {ECX = 32 - remaining; indexes back from the end}
@Copy32:
  movups  xmm1,[eax+ecx-32]
  movups  xmm2,[eax+ecx-16]
  movaps  [edx+ecx-32],xmm1
  movaps  [edx+ecx-16],xmm2
  add     ecx,32
  jle     @Copy32
  movups  [ebx],xmm0                  {first 16 bytes}
  neg     ecx
  add     ecx,32                      {ECX = bytes still to move}
  pop     ebx
  jmp     SmallForwardMove_3

  {---- large sizes ----}
@BigMove:
  push    ebx
  mov     ebx,ecx                     {EBX = Count}
  test    edx,15
  jz      @DestAligned
  mov     ecx,edx
  add     ecx,15
  and     ecx,-16
  sub     ecx,edx                     {ECX = bytes needed to reach a 16-byte boundary}
  add     eax,ecx
  add     edx,ecx
  sub     ebx,ecx
  call    SmallForwardMove_3          {moves those leading bytes; Dest is now aligned}
  mov     ecx,ebx
@DestAligned:
  and     ecx,-16                     {ECX = multiple-of-16 part}
  sub     ebx,ecx                     {EBX = remainder (0..15)}
  push    edx
  push    eax
  push    ecx
  call    AlignedFwdMoveSSE_3
  pop     ecx
  pop     eax
  pop     edx
  add     ecx,ebx                     {ECX = bytes from the aligned start to the end}
  add     eax,ecx                     {EAX = end of Source}
  add     edx,ecx                     {EDX = end of Dest}
  mov     ecx,ebx                     {ECX = remainder for the small move}
  pop     ebx
  jmp     SmallForwardMove_3
end; {Forwards_SSE}
|
||||
|
||||
{-------------------------------------------------------------------------}
|
||||
{Backward move of ECX bytes from EAX to EDX using SSE registers.}
{Intended for ECX > 36 (SMALLMOVESIZE) where the source lies below the}
{destination.}
{  ECX <= 68 : last 32 bytes with two unaligned 16-byte moves, the rest}
{              through SmallBackwardMove_3}
{  ECX >  68 : last 16 bytes kept in XMM0 and stored after the loop; the}
{              32-byte loop runs from the top down with aligned writes; the}
{              bytes left at the start go to SmallBackwardMove_3}
{EBX is saved and restored where it is used.}
procedure Backwards_SSE_3;assembler;nostackframe;
asm
  cmp     ecx,SMALLMOVESIZE+32
  jg      @MediumMove

  {---- 37..68 bytes ----}
  sub     ecx,32                      {ECX = bytes in front of the last 32}
  movups  xmm1,[eax+ecx]
  movups  xmm2,[eax+ecx+16]
  movups  [edx+ecx],xmm1
  movups  [edx+ecx+16],xmm2
  jmp     SmallBackwardMove_3

  {---- larger sizes ----}
@MediumMove:
  push    ebx
  movups  xmm0,[eax+ecx-16]           {keep the last 16 bytes}
  lea     ebx,[edx+ecx]               {EBX = Dest + Count}
  and     ebx,15                      {EBX = misalignment of the end of Dest}
  sub     ecx,ebx                     {ECX trimmed so Dest+ECX is 16-byte aligned}
  add     ebx,ecx                     {EBX = original Count again}
  sub     ecx,32
@Copy32:
  movups  xmm1,[eax+ecx]
  movups  xmm2,[eax+ecx+16]
  movaps  [edx+ecx],xmm1
  movaps  [edx+ecx+16],xmm2
  sub     ecx,32
  jge     @Copy32
  movups  [edx+ebx-16],xmm0           {last 16 bytes}
  add     ecx,32                      {ECX = bytes still to move at the start}
  pop     ebx
  jmp     SmallBackwardMove_3
end; {Backwards_SSE}
|
||||
|
||||
const
  {Routines Move jumps to for counts above SMALLMOVESIZE.}
  {They start out as the plain IA32 versions, which use no MMX or SSE}
  {instructions, so a Move executed before setup_fastmove has run, or on a}
  {CPU without those extensions, never reaches an SSE/MMX instruction.}
  {setup_fastmove replaces them with the faster variants when available.}
  fastmoveproc_forward : pointer = @Forwards_IA32_3;
  fastmoveproc_backward : pointer = @Backwards_IA32_3;
|
||||
|
||||
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;nostackframe;
{
  Overlap-safe memory move.
  On entry : EAX = address of source, EDX = address of dest, ECX = count.

  count <= 0 or source = dest : nothing is moved.
  count <= SMALLMOVESIZE      : handled by the jump-table routines.
  larger counts               : dispatched through fastmoveproc_forward or
                                fastmoveproc_backward; the backward routine
                                is only chosen when source < dest and the
                                two ranges really overlap.

  Addresses are compared as UNSIGNED values (jbe/ja).  A signed comparison
  would pick the wrong direction for overlapping buffers that lie on
  different sides of address $80000000.
}
asm
  cmp     ecx,SMALLMOVESIZE
  ja      @Large              {Unsigned: Count > SMALLMOVESIZE or Count < 0}
  cmp     eax,edx
  lea     eax,[eax+ecx]       {EAX = Source+Count; lea keeps the flags of the cmp}
  jbe     @SmallCheck         {Source <= Dest (unsigned)}
@SmallForward:
  add     edx,ecx             {EDX = Dest+Count}
  jmp     SmallForwardMove_3
@SmallCheck:
  je      @Done               {For Compatibility with Delphi's move for Source = Dest}
  sub     eax,ecx             {EAX = Source again}
  jmp     SmallBackwardMove_3
@Large:
  jng     @Done               {Flags still from cmp ecx,SMALLMOVESIZE: Count < 0 (Delphi compatible)}
  cmp     eax,edx
  ja      @moveforward        {Source > Dest (unsigned)}
  je      @Done               {For Compatibility with Delphi's move for Source = Dest}
  push    eax
  add     eax,ecx
  cmp     eax,edx             {Does Source+Count reach beyond Dest?}
  pop     eax                 {pop leaves the flags intact}
  ja      @movebackward       {Source/Dest Overlap}
@moveforward:
  jmp     dword ptr fastmoveproc_forward
@movebackward:
  jmp     dword ptr fastmoveproc_backward {Source/Dest Overlap}
@Done:
end;
|
||||
|
||||
{$asmmode att}
|
||||
|
||||
procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
{
  Selects the routines Move dispatches to for large counts, based on
  has_sse_support and has_mmx_support: SSE if available, otherwise MMX,
  otherwise the plain IA32 versions.  The IA32 case is assigned explicitly so
  the result does not depend on the initial values of the procedure pointers.
}
begin
  if has_sse_support then
    begin
      fastmoveproc_forward:=@Forwards_SSE_3;
      fastmoveproc_backward:=@Backwards_SSE_3;
    end
  else if has_mmx_support then
    begin
      fastmoveproc_forward:=@Forwards_MMX_3;
      fastmoveproc_backward:=@Backwards_MMX_3;
    end
  else
    begin
      fastmoveproc_forward:=@Forwards_IA32_3;
      fastmoveproc_backward:=@Backwards_IA32_3;
    end;
end;
|
||||
|
||||
{$endif FPC_SYSTEM_HAS_MOVE}
|
||||
|
||||
{$else}
|
||||
|
||||
{Empty variant used when the compiler version check at the top of this file fails: there are no fast move routines to select.}
procedure setup_fastmove;{$ifdef SYSTEMINLINE}inline;{$endif}
|
||||
begin
|
||||
end;
|
||||
|
||||
{$endif}
|
@ -15,14 +15,85 @@
|
||||
|
||||
**********************************************************************}
|
||||
|
||||
{$asmmode ATT}
|
||||
|
||||
{****************************************************************************
|
||||
Primitives
|
||||
****************************************************************************}
|
||||
var
|
||||
{CPU feature flags: assigned in fpc_cpuinit from sse_support / mmx_support and read by setup_fastmove}
has_sse_support,has_mmx_support : boolean;
|
||||
|
||||
{$asmmode intel}
|
||||
|
||||
function cpuid_support : boolean;assembler;
{
  Check if the ID-flag can be changed, if changed then CpuID is supported.
  Tested under go32v1 and Linux on c6x86 with CpuID enabled and disabled (PFV)

  Returns true (in AL) when bit 21 (200000h) of EFLAGS could be toggled.
  The original EFLAGS value is restored before returning.
  Only EAX and ECX are used as scratch, so no callee-saved register is
  touched.
}
asm
  pushf                       {keep the caller's EFLAGS for the final popf}
  pushf
  pop     eax                 {EAX = current EFLAGS}
  mov     ecx,eax             {ECX = copy of the original value}
  xor     eax,200000h         {flip the ID bit}
  push    eax
  popf                        {try to write the modified value}
  pushf
  pop     eax                 {EAX = EFLAGS as actually stored}
  popf                        {restore the caller's EFLAGS}
  and     eax,200000h
  and     ecx,200000h
  cmp     eax,ecx
  setnz   al                  {bit differs from the original -> CPUID available}
end;
|
||||
|
||||
{$asmmode ATT}
|
||||
|
||||
{ returns true, if the processor reports SSE in the cpuid feature flags }
function sse_support : boolean;
  var
     _edx : longint;
  begin
     if cpuid_support then
       begin
          asm
             { cpuid overwrites %eax, %ebx, %ecx and %edx; %ebx is not
               mentioned anywhere else in this block, so it is saved and
               restored explicitly.  The result is stored only after the
               pop so that the stack pointer is unchanged when _edx is
               addressed. }
             pushl %ebx
             movl $1,%eax
             cpuid
             popl %ebx
             movl %edx,_edx
          end;
          { bit 25 of the function 1 feature flags in EDX }
          sse_support:=(_edx and $2000000)<>0;
       end
     else
       { a cpu without the cpuid instruction never supports sse }
       sse_support:=false;
  end;
|
||||
|
||||
|
||||
{ returns true, if the processor supports the mmx instructions }
function mmx_support : boolean;
  var
     _edx : longint;
  begin
     if cpuid_support then
       begin
          asm
             { cpuid overwrites %eax, %ebx, %ecx and %edx; %ebx is not
               mentioned anywhere else in this block, so it is saved and
               restored explicitly.  The result is stored only after the
               pop so that the stack pointer is unchanged when _edx is
               addressed. }
             pushl %ebx
             movl $1,%eax
             cpuid
             popl %ebx
             movl %edx,_edx
          end;
          { bit 23 of the function 1 feature flags in EDX }
          mmx_support:=(_edx and $800000)<>0;
       end
     else
       { a cpu without the cpuid instruction never supports mmx }
       mmx_support:=false;
  end;
|
||||
|
||||
{$i fastmove.inc}
|
||||
|
||||
{Detects SSE and MMX availability once, stores the results in the has_*_support variables, then lets setup_fastmove pick the matching move routines.}
procedure fpc_cpuinit;
|
||||
begin
|
||||
has_sse_support:=sse_support;
|
||||
has_mmx_support:=mmx_support;
|
||||
{must come after both flags are set: setup_fastmove reads them}
setup_fastmove;
|
||||
end;
|
||||
|
||||
|
||||
@ -32,7 +103,6 @@ asm
|
||||
ret
|
||||
end;
|
||||
|
||||
|
||||
{$ifndef FPC_SYSTEM_HAS_MOVE}
|
||||
{$define FPC_SYSTEM_HAS_MOVE}
|
||||
procedure Move(const source;var dest;count:SizeInt);[public, alias: 'FPC_MOVE'];assembler;
|
||||
@ -1510,7 +1580,10 @@ end;
|
||||
|
||||
{
|
||||
$Log$
|
||||
Revision 1.66 2004-11-17 22:19:04 peter
|
||||
Revision 1.67 2005-01-23 20:03:23 florian
|
||||
+ fastmove from John O'Harrow integrated
|
||||
|
||||
Revision 1.66 2004/11/17 22:19:04 peter
|
||||
internconst, internproc and some external declarations moved to interface
|
||||
|
||||
Revision 1.65 2004/11/01 12:43:29 peter
|
||||
|
Loading…
Reference in New Issue
Block a user