+ new, complete implementation of move procedure (including support for overlapping regions)
This commit is contained in:
Jonas Maebe 2001-03-02 13:24:10 +00:00
parent 6874cceff4
commit b7970bf7a4

View File

@ -24,80 +24,148 @@
{$define FPC_SYSTEM_HAS_MOVE}
{ NOTE(review): this span comes from a diff hunk (see the "@ -24,80 +24,148 @" marker)
  that was rendered without +/- markers. It appears to interleave the removed
  Move implementation (register scheme r13-r24, dotted labels) with the newly
  added one (r29/r30, labels starting with LMove); confirm against the real
  file before relying on the statement order shown here. }
procedure Move(var source;var dest;count:longint);
begin
{ register usage:
r3 source
r4 dest
r5 count
r13 ptr to end of source
r14 ptr to end of dest
r15 counter 1
r16 counter 2
r17 addr increment
r18 ptr to current source block
r19 ptr to current dest block
r20-24 buffer
f1-4 buffer
ctr Loop counter
notes:
Move uses FPRs for increased bandwidth
}
asm
{ do some param checking, initialization }
cmplwi cr2,r3,0
cmplwi cr3,r4,0
cmplw cr4,r3,r4
add r13,r3,r5
add r14,r4,r5
bt cr2,.MoveEnd //end if source=nil
bt cr3,.MoveEnd //end if dest=nil
bt cr4,.MoveEnd //end if source=dest
{ see if source and dest overlap }
cmplw cr2,r13,r4
cmplw cr3,r4,r3
srawi. r15,r5,$5 //r15 := count div 32
andi r16,r5,$1F //r16 := count mod 32
crand cr3,cr2,cr3
mtctr r15 //Load loop counter
bgt cr3,.MoveRL //dest overlaps source on right
li r17,$8 //Offset 8 bytes per doubleword copy
sub r18,r17,r3 //calculate the starting source
sub r19,r17,r4 // and dest ptrs
beq .MoveByByte //If count<32 skip 32 byte block copy
srawi. r15,r16,$2 //r15 := r16 div 4
andi r16,r15,$3 //r16 := r15 mod 4
cmpwi cr2,r16,0 //r16 = 0 ?
crand cr3,cr2,cr0 //r15 = 0 AND r16 = 0 ?
.MoveBlockLoop: //32 Byte block copy (fully optimized)
lfdux f1,r18,r17
lfdux f2,r18,r17
lfdux f3,r18,r17
lfdux f4,r18,r17
stfdux f1,r19,r17
stfdux f2,r19,r17
stfdux f3,r19,r17
stfdux f4,r19,r17
bdnz .MoveBlockLoop
{ Move (new implementation): copies count bytes from source to dest.
  According to the comments below, the copy runs backwards (from the end of
  both regions) when dest-source < count, so overlapping regions are handled.
  Register roles as used by the code:
    r3      source pointer, biased so that r3 + r30 is the next address to load
    r4      dest pointer, biased the same way
    r5      remaining byte count
    r29     scratch: masks, loaded data, loop counts
    r30     step: 1 or -1 for bytes, scaled to 4/-4 or 8/-8 for wider copies
    f28-f31 data buffer of the 32 byte loop
    cr0     count <= 0 test, later the alignment / remainder tests
    cr1     count < 11 test, later remainder vs. 11
    cr7     count < 39 test
  NOTE(review): the parameter name on the next line is split by an empty
  comment pair; presumably it should read "source" - confirm against the
  original file. }
procedure Move(var sou{}rce;var dest;count:longint);assembler;
asm
{ count <= 0 ? }
cmpwi cr0,r5,0
{ check if we have to do the move backwards because of overlap }
{ r30 := dest - source }
sub r30,r4,r3
{ the carry of this subtraction encodes boolean(dest-source < count) = boolean(overlap) }
{ NOTE(review): the original comment said "carry := boolean(dest-source < count)"; }
{ presumably the carry bit is actually clear in that case - confirm against the     }
{ subfc definition. The resulting r30 value below matches the comments either way.  }
subc r30,r30,r5
{ count < 11 ? (to decide whether we will move dwords or bytes) }
cmpwi cr1,r5,11
{ if overlap, then r30 := -1 else r30 := 0 }
subfe r30,r30,r30
{ count < 39 ? (32 + max. alignment (7)) }
cmpwi cr7,r5,39
{ if count <= 0, stop }
ble cr0,LMoveDone
{ if overlap, then r29 := count else r29 := 0 }
and r29,r5,r30
{ if overlap, then point source and dest to the end }
add r3,r3,r29
add r4,r4,r29
{ if overlap, then r29 := 0, else r29 := -1 }
not r29,r30
{ if overlap, then r30 := -2, else r30 := 0 }
slwi r30,r30,1
{ if overlap, then r30 := -1, else r30 := 1 }
addi r30,r30,1
{ if overlap, source/dest stay (they already point just past the end), }
{ otherwise source/dest += -1                                          }
{ After the next instruction, r3/r4 + r30 = next position }
{ to load/store from/to }
add r3,r3,r29
add r4,r4,r29
{ NOTE(review): the next 10 lines use the dotted labels and the r15-r20 register }
{ scheme of the older implementation; presumably they are removed lines of the   }
{ diff and not part of the new code path - confirm                               }
bt cr3,MoveEnd //Nothing left to do...
mtspr 1,r16 //XER := r16
beq .MoveBytes //There are fewer than 4 bytes left
mtctr r15 //load counter
andi r15,r15,$3 //r15 := r15 mod 4
srawi r17,$1 //Offset := Offset div 2
.MoveWordLoop: //4 byte copy
lwzux r20,r18,r17
stwux r20,r19,r17
bdnz .WordCopyLoop
{ if count < 11, copy everything byte by byte }
{ (cr1 was set by the compare of r5 with 11 near the top) }
blt cr1,LMoveBytes
{ NOTE(review): the next 7 lines again match the older implementation (dotted }
{ labels, r18-r20, begin/end pairing); presumably removed lines of the diff   }
bt cr2,MoveEnd //Nothing left to do...
.MoveBytes: //Copy remaining stragglers
lswx r20,r0,r18
stswx r20,r0,r19
.MoveEnd:
End;
End;
{ otherwise, guarantee 4 byte alignment for dest for starters }
{ each iteration copies one byte in direction r30 and updates r3/r4 }
LMove4ByteAlignLoop:
lbzux r29,r3,r30
stbux r29,r4,r30
{ is dest now 4 aligned? }
andi. r29,r4,3
subi r5,r5,1
{ while not aligned, continue }
bne cr0,LMove4ByteAlignLoop
{ check for 8 byte alignment }
{ (the result stays in cr0 until the beq below: none of the instructions }
{ in between is a record form)                                           }
andi. r29,r4,7
{ we are going to copy one byte again (the one at the newly }
{ aligned address), so increase count again }
addi r5,r5,1
{ count div 4 for number of dwords to copy }
srwi r29,r5,2
{ if 11 <= count < 39, copy using dwords }
blt cr7,LMoveDWords
beq cr0,L8BytesAligned
{ count >= 39 -> align to 8 byte boundary and then use the FPU }
{ since we're already at 4 byte alignment, use dword store }
{ NOTE(review): r30 is still 1 or -1 at this point and r5 is not adjusted }
{ for this copy - verify that this step really moves dest to an 8 byte    }
{ boundary                                                                }
lwzux r29,r3,r30
stwux r29,r4,r30
L8BytesAligned:
{ count div 32 ( >= 1, since count was >=39) }
srwi r29,r5,5
{ remainder; cr0 is tested after the 32 byte loop }
andi. r5,r5,31
{ to decide if we will do some dword stores afterwards or not }
cmpwi cr1,r5,11
mtctr r29
{ r29 := count div 4, will be moved to ctr when copying dwords }
srwi r29,r5,2
{ adjust the update count: it will now be 8 or -8 depending on overlap }
slwi r30,r30,3
{ adjust source and dest pointers: because of the above loop, dest is now }
{ aligned to 8 bytes. So if we subtract r30 we will still have an 8 bytes }
{ aligned address }
sub r3,r3,r30
sub r4,r4,r30
{ copy 32 bytes per iteration through four FPRs; ctr = count div 32 }
LMove32ByteLoop:
lfdux f31,r3,r30
lfdux f30,r3,r30
lfdux f29,r3,r30
lfdux f28,r3,r30
stfdux f31,r4,r30
stfdux f30,r4,r30
stfdux f29,r4,r30
stfdux f28,r4,r30
bdnz LMove32ByteLoop
{ cr0*4+eq is true if "count and 31" = 0 }
beq cr0,LMoveDone
{ make r30 again -1 or 1, but first adjust source/dest pointers }
{ NOTE(review): for r30 = 8 this leaves r3/r4 + 1 at the byte right after  }
{ the last copied doubleword; for r30 = -8 the same arithmetic appears to  }
{ end up 7 bytes below the last accessed address - verify the backward case }
add r3,r3,r30
add r4,r4,r30
srawi r30,r30,3
sub r3,r3,r30
sub r4,r4,r30
{ cr1 holds the compare of the remaining count (count and 31) with 11; }
{ if remainder <= 11, finish byte by byte                               }
ble cr1,LMoveBytes
{ undo the byte bias again: LMoveDWords expects unbiased pointers }
add r3,r3,r30
add r4,r4,r30
{ dword copy: r29 = number of dwords, r5 becomes the byte remainder }
LMoveDWords:
mtctr r29
{ remainder after the dword copy; cr0 is tested after the loop }
andi. r5,r5,3
{ r30 * 4 }
slwi r30,r30,2
{ bias the pointers so that r3/r4 + r30 = first dword to load/store }
sub r3,r3,r30
sub r4,r4,r30
LMoveDWordsLoop:
lwzux r29,r3,r30
stwux r29,r4,r30
bdnz LMoveDWordsLoop
{ nothing left if "count and 3" = 0 }
beq cr0,LMoveDone
{ make r30 again -1 or 1 }
{ NOTE(review): same pointer arithmetic as after the 32 byte loop; it is }
{ consistent for r30 = 4, the r30 = -4 case should be verified            }
add r3,r3,r30
add r4,r4,r30
srawi r30,r30,2
sub r3,r3,r30
sub r4,r4,r30
{ byte copy of the remaining r5 bytes in direction r30 }
LMoveBytes:
mtctr r5
LMoveBytesLoop:
lbzux r29,r3,r30
stbux r29,r4,r30
bdnz LMoveBytesLoop
LMoveDone:
{ register list following "end": presumably the registers this block modifies, }
{ declared for the compiler - see the Free Pascal assembler documentation      }
end ['R3','R4','R5','R29','R30','F28','F29','F30','F31','CTR','CR0','CR1','CR7'];
{$define FPC_SYSTEM_HAS_FILLCHAR}
@ -380,7 +448,11 @@ end ['r3','r4','r5','r29','r30','cr0','ctr'];
{
$Log$
Revision 1.2 2001-02-11 17:59:46 jonas
Revision 1.3 2001-03-02 13:24:10 jonas
+ new, complete implementation of move procedure (including support for
overlapping regions)
Revision 1.2 2001/02/11 17:59:46 jonas
* implemented several more procedures
Revision 1.1 2000/07/27 07:32:12 jonas