+ new, complete implementation of move procedure (including support for

overlapping regions)
2025-08-10 14:46:02 +02:00 · 2001-03-02 13:24:10 +00:00 · 2001-03-02 13:24:10 +00:00 · b7970bf7a4
commit b7970bf7a4
parent 6874cceff4
1 changed files with 145 additions and 73 deletions
--- a/rtl/powerpc/powerpc.inc
+++ b/rtl/powerpc/powerpc.inc
@ -24,80 +24,148 @@

 {$define FPC_SYSTEM_HAS_MOVE}

-procedure Move(var source;var dest;count:longint);
-begin
-{ register usage:
-  r3    source
-  r4    dest
-  r5    count
-  r13   ptr to end of source
-  r14   ptr to end of dest
-  r15   counter 1
-  r16   counter 2
-  r17   addr increment
-  r18   ptr to current source block
-  r19   ptr to current dest block
-  r20-24        buffer
-  f1-4  buffer
-  ctr   Loop counter
-  notes:
-  Move uses FPRs for increased bandwidth
-}
-        asm
-        { do some param checking, initialization }
-        cmplwi  cr2,r3,0
-        cmplwi  cr3,r4,0
-        cmplw   cr4,r3,r4
-        add             r13,r3,r5
-        add             r14,r4,r5
-        bt              cr2,.MoveEnd    //end if source=nil
-        bt              cr3,.MoveEnd    //end if dest=nil
-        bt              cr4,.MoveEnd    //end if source=dest
-        { see if source and dest overlap }
-        cmplw   cr2,r13,r4
-        cmplw   cr3,r4,r3
-        srawi.  r15,r5,$5               //r15 := count div 32
-        andi    r16,r5,$1F              //r16 := count mod 32
-        crand   cr3,cr2,cr3
-        mtctr   r15                             //Load loop counter
-        bgt             cr3,.MoveRL             //dest overlaps source on right
-        li              r17,$8                  //Offset 8 bytes per doubleword copy
-        sub             r18,r17,r3              //calculate the starting source
-        sub             r19,r17,r4              //                                      and dest ptrs
-        beq             .MoveByByte             //If count<32 skip 32 byte block copy
-        srawi.  r15,r16,$2              //r15 := r16 div 4
-        andi    r16,r15,$3              //r16 := r15 mod 4
-        cmpwi   cr2,r16,0               //r16 = 0 ?
-        crand   cr3,cr2,cr0             //r15 = 0 AND r16 = 0 ?
-.MoveBlockLoop:                                 //32 Byte block copy (fully optimized)
-                lfdux   f1,r18,r17
-                lfdux   f2,r18,r17
-                lfdux   f3,r18,r17
-                lfdux   f4,r18,r17
-                stfdux  f1,r19,r17
-                stfdux  f2,r19,r17
-                stfdux  f3,r19,r17
-                stfdux  f4,r19,r17
-                bdnz    .MoveBlockLoop
+{ Move: copies count bytes from source to dest; the two regions may overlap.
+  Nothing is copied when count <= 0.
+
+  register usage:
+    r3       source pointer (walks through the source)
+    r4       dest pointer (walks through the destination)
+    r5       number of bytes that still have to be copied
+    r29      scratch
+    r30      signed step of the update-form loads/stores:
+             +1/+4/+8 when copying forwards, -1/-4/-8 when copying backwards
+    f28-f31  buffer for the 32 byte block copy
+
+  Between two copy stages r3/r4 hold the "boundary" B of the work that is
+  left: when copying forwards the remaining bytes start at B, when copying
+  backwards they end right below B.
+  An update-form access with step r30 touches address pointer+r30, so before
+  a loop the pointers are set to B - max(r30,0) and after a loop adding
+  max(r30,0) again yields the new boundary. max(r30,0) is computed with
+  "srawi r29,r30,31 ; andc r29,r30,r29" (neither instruction changes cr0).
+}
+procedure Move(var source;var dest;count:longint);assembler;
+asm
+                {  count <= 0 ?  }
+                cmpwi   cr0,r5,0
+                {  check if we have to do the move backwards because of overlap  }
+                sub     r30,r4,r3
+                {  carry := boolean(dest-source >= count) = boolean(no overlap)  }
+                {  (unsigned compare, so dest < source also gives "no overlap")  }
+                subc    r30,r30,r5
+
+                {  count < 11 ? (to decide whether we will move dwords or bytes)  }
+                cmpwi   cr1,r5,11
+
+                {  if overlap, then r30 := -1 else r30 := 0  }
+                subfe   r30,r30,r30
+
+                {  count < 40 ? (32 + max. 8 bytes needed to align dest to 8 bytes; }
+                {  guarantees that the 32 byte loop runs at least once)            }
+                cmpwi   cr7,r5,40
+
+                {  if count <= 0, stop  }
+                ble     cr0,LMoveDone
+
+                {  if overlap, then r29 := count else r29 := 0  }
+                and     r29,r5,r30
+                {  if overlap, then point source and dest to the end  }
+                add     r3,r3,r29
+                add     r4,r4,r29
+                {  if overlap, then r29 := 0, else r29 := -1  }
+                not     r29,r30
+                {  if overlap, then r30 := -2, else r30 := 0  }
+                slwi    r30,r30,1
+                {  if overlap, then r30 := -1, else r30 := 1  }
+                addi    r30,r30,1
+                {  if no overlap, then source/dest += -1, otherwise they stay }
+                {  After the next instruction, r3/r4 + r30 = next position    }
+                {  to load/store from/to                                      }
+                add     r3,r3,r29
+                add     r4,r4,r29

-                bt              cr3,MoveEnd             //Nothing left to do...
-                mtspr   1,r16                   //XER := r16
-                beq             .MoveBytes              //There are fewer than 4 bytes left
-                mtctr   r15                             //load counter
-                andi    r15,r15,$3              //r15 := r15 mod 4
-                srawi   r17,$1                  //Offset := Offset div 2
-.MoveWordLoop:                                  //4 byte copy
-                lwzux   r20,r18,r17
-                stwux   r20,r19,r17
-                bdnz    .WordCopyLoop
+                {  if count < 11, copy everything byte by byte  }
+                blt     cr1,LMoveBytes

-                bt              cr2,MoveEnd             //Nothing left to do...
-.MoveBytes:                                             //Copy remaining stragglers
-                lswx    r20,r0,r18
-                stswx   r20,r0,r19
-.MoveEnd:
-                End;
-End;
+                {  otherwise, guarantee 4 byte alignment for dest for starters  }
+                {  (runs at most 4 times, count is >= 11 here)                  }
+LMove4ByteAlignLoop:
+                lbzux   r29,r3,r30
+                stbux   r29,r4,r30
+                {  is dest now 4 aligned?  }
+                andi.   r29,r4,3
+                subi    r5,r5,1
+                {  while not aligned, continue  }
+                bne     cr0,LMove4ByteAlignLoop
+
+                {  dest (r4) is now the 4 byte aligned address of the byte that  }
+                {  was stored last; r3 is the matching source address.           }
+                {  forwards:  the next dword starts at that very address, so the }
+                {             byte is copied once more -> count it again         }
+                {  backwards: the next dword ends right below that address, so   }
+                {             nothing is copied twice -> count stays             }
+                {  r29 := 1 if forwards, 0 if backwards  }
+                srawi   r29,r30,31
+                andc    r29,r30,r29
+                add     r5,r5,r29
+                {  from here on r3/r4 are the boundary described in the header  }
+
+                { check for 8 byte alignment }
+                andi.   r29,r4,7
+                {  if 11 <= count < 40, copy using dwords }
+                blt     cr7,LMoveDWords
+
+                beq     cr0,L8BytesAligned
+
+                {  count >= 40 -> align to 8 byte boundary and then use the FPU  }
+                {  since we're already at 4 byte alignment, copy one dword       }
+                {  r30 := 4 or -4  }
+                slwi    r30,r30,2
+                {  boundary -> pointers for update-form access  }
+                srawi   r29,r30,31
+                andc    r29,r30,r29
+                sub     r3,r3,r29
+                sub     r4,r4,r29
+                lwzux   r29,r3,r30
+                stwux   r29,r4,r30
+                {  pointers -> boundary (now 8 byte aligned for dest)  }
+                srawi   r29,r30,31
+                andc    r29,r30,r29
+                add     r3,r3,r29
+                add     r4,r4,r29
+                {  make r30 again -1 or 1 and account for the 4 copied bytes  }
+                srawi   r30,r30,2
+                subi    r5,r5,4
+L8BytesAligned:
+                { count div 32 ( >= 1, since count was >= 40 ) }
+                srwi    r29,r5,5
+                { remainder; cr0 is tested after the 32 byte loop }
+                andi.   r5,r5,31
+                { to decide if we will do some dword stores afterwards or not }
+                cmpwi   cr1,r5,11
+                mtctr   r29
+
+                {  adjust the update count: it will now be 8 or -8 depending on overlap  }
+                slwi    r30,r30,3
+
+                {  boundary -> pointers for update-form access. dest is 8 byte  }
+                {  aligned and the step is 8 or -8, so every stfdux is aligned  }
+                srawi   r29,r30,31
+                andc    r29,r30,r29
+                sub     r3,r3,r29
+                sub     r4,r4,r29
+
+                {  all 32 bytes are loaded before the first one is stored, which  }
+                {  keeps the block copy correct for overlapping regions           }
+LMove32ByteLoop:
+                lfdux   f31,r3,r30
+                lfdux   f30,r3,r30
+                lfdux   f29,r3,r30
+                lfdux   f28,r3,r30
+                stfdux  f31,r4,r30
+                stfdux  f30,r4,r30
+                stfdux  f29,r4,r30
+                stfdux  f28,r4,r30
+                bdnz    LMove32ByteLoop
+
+                { cr0*4+eq is true if "count and 31" = 0 }
+                beq     cr0,LMoveDone
+
+                {  pointers -> boundary, then make r30 again -1 or 1  }
+                srawi   r29,r30,31
+                andc    r29,r30,r29
+                add     r3,r3,r29
+                add     r4,r4,r29
+                srawi   r30,r30,3
+
+                { cr1 contains whether count <= 11 }
+                ble     cr1,LMoveBoundaryToBytes
+
+                {  entry conditions: r3/r4 = boundary, r30 = 1 or -1,  }
+                {  r5 = bytes left, r5 div 4 >= 1                      }
+LMoveDWords:
+                {  number of dwords to copy  }
+                srwi    r29,r5,2
+                mtctr   r29
+                {  bytes left after the dwords; cr0 is tested after the loop  }
+                andi.   r5,r5,3
+                {  r30 * 4  }
+                slwi    r30,r30,2
+                {  boundary -> pointers for update-form access  }
+                srawi   r29,r30,31
+                andc    r29,r30,r29
+                sub     r3,r3,r29
+                sub     r4,r4,r29
+
+LMoveDWordsLoop:
+                lwzux   r29,r3,r30
+                stwux   r29,r4,r30
+                bdnz    LMoveDWordsLoop
+
+                beq     cr0,LMoveDone
+                {  pointers -> boundary, then make r30 again -1 or 1  }
+                srawi   r29,r30,31
+                andc    r29,r30,r29
+                add     r3,r3,r29
+                add     r4,r4,r29
+                srawi   r30,r30,2
+
+                {  boundary -> pointers for byte wise update-form access  }
+LMoveBoundaryToBytes:
+                srawi   r29,r30,31
+                andc    r29,r30,r29
+                sub     r3,r3,r29
+                sub     r4,r4,r29
+
+                {  entry conditions: r3/r4 + r30 = next byte to load/store,  }
+                {  r30 = 1 or -1, r5 = bytes left ( >= 1 )                   }
+LMoveBytes:
+                mtctr   r5
+LMoveBytesLoop:
+                lbzux   r29,r3,r30
+                stbux   r29,r4,r30
+                bdnz    LMoveBytesLoop
+LMoveDone:
+end ['R3','R4','R5','R29','R30','F28','F29','F30','F31','CTR','CR0','CR1','CR7'];


 {$define FPC_SYSTEM_HAS_FILLCHAR}
@ -380,7 +448,11 @@ end ['r3','r4','r5','r29','r30','cr0','ctr'];

 {
  $Log$
-  Revision 1.2  2001-02-11 17:59:46  jonas
+  Revision 1.3  2001-03-02 13:24:10  jonas
+    + new, complete implementation of move procedure (including support for
+      overlapping regions)
+
+  Revision 1.2  2001/02/11 17:59:46  jonas
    * implemented several more procedures

  Revision 1.1  2000/07/27 07:32:12  jonas