* move() now uses dcbz if possible

Jonas Maebe 2003-05-29 12:14:02 +00:00
parent dbf5fd90ca
commit 71626ce890


@@ -136,8 +136,8 @@ asm
 { if overlap, then r10 := -1 else r10 := 0 }
 subfe r10,r10,r10
-{ count < 39 ? (32 + max. alignment (7)) }
-cmpwi cr7,r5,39
+{ count < 63 ? (32 + max. alignment (31)) }
+cmpwi cr7,r5,63
 { if count <= 0, stop }
 ble cr0,LMoveDone
@@ -152,7 +152,7 @@ asm
 { if overlap, then point source and dest to the end }
 add r3,r3,r0
 add r4,r4,r0
-{ if overlap, then r0 := 6, else r6 := -1 }
+{ if overlap, then r6 := 0, else r6 := -1 }
 not r6,r10
 { if overlap, then r10 := -2, else r10 := 0 }
 slwi r10,r10,1
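
The hunk above materializes the overlap test as a mask (r10 is all ones
when the regions overlap) from which the copy direction, the pointer
starting points, and the pointer-update stride are all derived without
branches. A minimal C sketch of the decision being implemented (the
function name and the byte-wise loop are illustrative assumptions, not
the commit's code):

    #include <stddef.h>

    /* Copy backward only when dest lies inside [src, src + n),
       i.e. a forward copy would clobber bytes not yet read. */
    static void move_sketch(const unsigned char *src, unsigned char *dst,
                            size_t n)
    {
        int backward = dst > src && dst < src + n;
        ptrdiff_t stride = backward ? -1 : 1;

        if (backward) {       /* point source and dest to the end */
            src += n - 1;
            dst += n - 1;
        }
        while (n--) {
            *dst = *src;
            src += stride;
            dst += stride;
        }
    }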
@@ -178,16 +178,30 @@ LMove4ByteAlignLoop:
 { while not aligned, continue }
 bne cr0,LMove4ByteAlignLoop
-{ check for 8 byte alignment }
-andi. r0,r4,7
+{ check for 32 byte alignment }
+andi. r7,r4,31
 { we are going to copy one byte again (the one at the newly }
 { aligned address), so increase count by 1 }
 addi r5,r5,1
 { count div 4 for number of dwords to copy }
 srwi r0,r5,2
-{ if 11 <= count < 39, copy using dwords }
+{ if 11 <= count < 63, copy using dwords }
 blt cr7,LMoveDWords
+{ # of dwords to copy to reach 32 byte alignment (*4) }
+{ (depends on forward/backward copy) }
+{ if forward copy, r6 = -1 -> r8 := 32 }
+{ if backward copy, r6 = 0 -> r8 := 0 }
+rlwinm r8,r6,0,31-6+1,31-6+1
+{ if forward copy, we have to copy 32 - unaligned count bytes }
+{ if backward copy, unaligned count bytes }
+sub r7,r8,r7
+{ if backward copy, the calculated value is now negative -> }
+{ make it positive again }
+not r8,r6
+add r7,r7,r8
+xor r7,r7,r8
 { multiply the update count with 4 }
 slwi r10,r10,2
 slwi r6,r6,2
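
The added block computes, without a branch, how many bytes must still
be moved before dest is 32 byte aligned: 32 minus the misalignment on a
forward copy, the misalignment itself on a backward copy, with
(x + m) xor m serving as a mask-controlled negate. A standalone C check
of that arithmetic (the function name and sample address are
assumptions for illustration):

    #include <assert.h>
    #include <stdint.h>

    /* The rlwinm/sub/not/add/xor sequence transcribed to C;
       "forward" selects r6 = -1 (forward copy) or 0 (backward). */
    static uint32_t align_byte_count(uint32_t dest, int forward)
    {
        uint32_t r6 = forward ? 0xFFFFFFFFu : 0u;
        uint32_t r7 = dest & 31u; /* andi. r7,r4,31: misalignment */
        uint32_t r8 = r6 & 32u;   /* rlwinm: isolate the "32" bit */
        r7 = r8 - r7;             /* fwd: 32 - misalign; bwd: -misalign */
        r8 = ~r6;                 /* 0 if forward, all ones if backward */
        return (r7 + r8) ^ r8;    /* (x - 1) xor ~0 == -x; (x + 0) xor 0 == x */
    }

    int main(void)
    {
        assert(align_byte_count(0x1004u, 1) == 28u); /* forward copy */
        assert(align_byte_count(0x1004u, 0) == 4u);  /* backward copy */
        return 0;
    }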
@@ -195,15 +209,18 @@ LMove4ByteAlignLoop:
 add r3,r3,r6
 add r4,r4,r6
-beq cr0,L8BytesAligned
+beq cr0,LMove32BytesAligned
+L32BytesAlignMoveLoop:
 { count >= 63 -> align to 32 byte boundary and then use the FPU }
 { since we're already at 4 byte alignment, use dword store }
+subic. r7,r7,4
 lwzux r0,r3,r10
-stwux r0,r4,r10
 subi r5,r5,4
-L8BytesAligned:
-{ count div 32 (>= 1, since count was >= 39) }
+stwux r0,r4,r10
+bne L32BytesAlignMoveLoop
+LMove32BytesAligned:
+{ count div 32 (>= 1, since count was >= 63) }
 srwi r0,r5,5
 { remainder }
 andi. r5,r5,31
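
With this hunk, the large-move path has three phases: the dword loop
above consumes the r7 bytes needed to reach 32 byte alignment, the main
loop then moves whole 32 byte chunks (with FPU loads/stores and dcbz in
the real code), and the masked-off remainder is handled afterwards. A
forward-only C sketch of that structure (the helper name and the use of
memcpy are illustrative assumptions; the real routine also runs
backward):

    #include <string.h>

    /* Phase structure of the new >= 63 byte path, forward case only. */
    static void copy_phases(unsigned char *dst, const unsigned char *src,
                            unsigned n, unsigned align_count /* r7 */)
    {
        while (align_count) {      /* L32BytesAlignMoveLoop */
            memcpy(dst, src, 4);
            src += 4; dst += 4;
            align_count -= 4; n -= 4;
        }
        unsigned chunks = n >> 5;  /* count div 32 (>= 1, count >= 63) */
        n &= 31;                   /* remainder */
        while (chunks--) {         /* LMove32ByteLoop */
            memcpy(dst, src, 32);
            src += 32; dst += 32;
        }
        memcpy(dst, src, n);       /* tail */
    }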
@@ -217,6 +234,7 @@ L8BytesAligned:
 { adjust the update count: it will now be 8 or -8 depending on overlap }
 slwi r10,r10,1
+{ get dcbz offset }
 { adjust source and dest pointers: because of the above loop, dest is now }
 { aligned to 8 bytes. So if we add r6 we will still have an 8 bytes }
@@ -226,16 +244,34 @@ L8BytesAligned:
 slwi r6,r6,1
 LMove32ByteLoop:
+{ the dcbz offset must give a 32 byte aligned address when added to }
+{ the current dest address, and the resulting address must point to }
+{ the bytes that will be overwritten in the current iteration. In }
+{ case of a forward loop, the dest address currently has an offset }
+{ of -8 compared to the bytes that will be overwritten (and r6 = -8). }
+{ In case of a backward loop, the dest address currently has an }
+{ offset of +32 compared to the bytes that will be overwritten }
+{ (and r6 = 0). So the forward dcbz offset must become +8 and the }
+{ backward one -32 -> (-r6 * 5) - 32 gives the correct offset }
+slwi r7,r6,2
+add r7,r7,r6
+neg r7,r7
+subi r7,r7,32
+LMove32ByteDcbz:
 lfdux f0,r3,r10
 lfdux f1,r3,r10
 lfdux f2,r3,r10
 lfdux f3,r3,r10
+{ must be done only now, in case source and dest are less than }
+{ 32 bytes apart! }
+dcbz r4,r7
 stfdux f0,r4,r10
 stfdux f1,r4,r10
 stfdux f2,r4,r10
 stfdux f3,r4,r10
-bdnz LMove32ByteLoop
+bdnz LMove32ByteDcbz
 LMove32ByteLoopDone:
 { cr0*4+eq is true if "count and 31" = 0 }
 beq cr0,LMoveDone
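
The "(-r6 * 5) - 32" derivation in the new comment block can be checked
by plugging in both values of r6 (-8 on a forward copy, 0 on a backward
one). A tiny standalone C verification of the slwi/add/neg/subi
arithmetic (for illustration only):

    #include <assert.h>

    int main(void)
    {
        int values[] = { -8, 0 };  /* r6: forward, backward */
        for (int i = 0; i < 2; i++) {
            int r6 = values[i];
            int r7 = r6 * 4;   /* slwi r7,r6,2  ->  4*r6 */
            r7 = r7 + r6;      /* add  r7,r7,r6 ->  5*r6 */
            r7 = -r7;          /* neg  r7,r7    -> -5*r6 */
            r7 = r7 - 32;      /* subi r7,r7,32 -> -5*r6 - 32 */
            assert(r7 == (r6 == -8 ? 8 : -32));
        }
        return 0;
    }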
@@ -897,7 +933,10 @@ end ['R3','R10'];
 {
 $Log$
-Revision 1.46 2003-05-17 00:19:51 jonas
+Revision 1.47 2003-05-29 12:14:02 jonas
+* move() now uses dcbz if possible
+
+Revision 1.46 2003/05/17 00:19:51 jonas
 * fixed inclocked
 
 Revision 1.45 2003/05/14 19:47:35 jonas