mirror of
https://gitlab.com/freepascal.org/fpc/source.git
synced 2025-04-21 18:09:30 +02:00
* move() now uses dcbz if possible
This commit is contained in:
parent
dbf5fd90ca
commit
71626ce890
@ -136,8 +136,8 @@ asm
|
||||
{ if overlap, then r10 := -1 else r10 := 0 }
|
||||
subfe r10,r10,r10
|
||||
|
||||
{ count < 39 ? (32 + max. alignment (7) }
|
||||
cmpwi cr7,r5,39
|
||||
{ count < 63 ? (32 + max. alignment (31)) }
|
||||
cmpwi cr7,r5,63
|
||||
|
||||
{ if count <= 0, stop }
|
||||
ble cr0,LMoveDone
|
||||
@ -152,7 +152,7 @@ asm
|
||||
{ if overlap, then point source and dest to the end }
|
||||
add r3,r3,r0
|
||||
add r4,r4,r0
|
||||
{ if overlap, then r0 := 6, else r6 := -1 }
|
||||
{ if overlap, then r6 := 0, else r6 := -1 }
|
||||
not r6,r10
|
||||
{ if overlap, then r10 := -2, else r10 := 0 }
|
||||
slwi r10,r10,1
|
||||
@ -178,16 +178,30 @@ LMove4ByteAlignLoop:
|
||||
{ while not aligned, continue }
|
||||
bne cr0,LMove4ByteAlignLoop
|
||||
|
||||
{ check for 8 byte alignment }
|
||||
andi. r0,r4,7
|
||||
{ check for 32 byte alignment }
|
||||
andi. r7,r4,31
|
||||
{ we are going to copy one byte again (the one at the newly }
|
||||
{ aligned address), so increase count by 1 }
|
||||
addi r5,r5,1
|
||||
{ count div 4 for number of dwords to copy }
|
||||
srwi r0,r5,2
|
||||
{ if 11 <= count < 39, copy using dwords }
|
||||
{ if 11 <= count < 63, copy using dwords }
|
||||
blt cr7,LMoveDWords
|
||||
|
||||
{ # of dwords to copy to reach 32 byte alignment (*4) }
|
||||
{ (depends on forward/backward copy) }
|
||||
|
||||
{ if forward copy, r6 = -1 -> r8 := 32 }
|
||||
{ if backward copy, r6 = 0 -> r8 := 0 }
|
||||
rlwinm r8,r6,0,31-6+1,31-6+1
|
||||
{ if forward copy, we have to copy 32 - unaligned count bytes }
|
||||
{ if backward copy unaligned count bytes }
|
||||
sub r7,r8,r7
|
||||
{ if backward copy, the calculated value is now negated -> }
|
||||
{ make it positive again }
|
||||
not r8, r6
|
||||
add r7, r7, r8
|
||||
xor r7, r7, r8
|
||||
{ multiply the update count with 4 }
|
||||
slwi r10,r10,2
|
||||
slwi r6,r6,2
|
||||
@ -195,15 +209,18 @@ LMove4ByteAlignLoop:
|
||||
add r3,r3,r6
|
||||
add r4,r4,r6
|
||||
|
||||
beq cr0,L8BytesAligned
|
||||
|
||||
beq cr0,LMove32BytesAligned
|
||||
L32BytesAlignMoveLoop:
|
||||
{ count >= 39 -> align to 8 byte boundary and then use the FPU }
|
||||
{ since we're already at 4 byte alignment, use dword store }
|
||||
subic. r7,r7,4
|
||||
lwzux r0,r3,r10
|
||||
stwux r0,r4,r10
|
||||
subi r5,r5,4
|
||||
L8BytesAligned:
|
||||
{ count div 32 ( >= 1, since count was >=39 }
|
||||
stwux r0,r4,r10
|
||||
bne L32BytesAlignMoveLoop
|
||||
|
||||
LMove32BytesAligned:
|
||||
{ count div 32 ( >= 1, since count was >= 63) }
|
||||
srwi r0,r5,5
|
||||
{ remainder }
|
||||
andi. r5,r5,31
|
||||
@ -217,6 +234,7 @@ L8BytesAligned:
|
||||
|
||||
{ adjust the update count: it will now be 8 or -8 depending on overlap }
|
||||
slwi r10,r10,1
|
||||
{ get dcbz offset }
|
||||
|
||||
{ adjust source and dest pointers: because of the above loop, dest is now }
|
||||
{ aligned to 8 bytes. So if we add r6 we will still have an 8 bytes }
|
||||
@ -226,16 +244,34 @@ L8BytesAligned:
|
||||
|
||||
slwi r6,r6,1
|
||||
|
||||
LMove32ByteLoop:
|
||||
{ the dcbz offset must give a 32 byte aligned address when added }
|
||||
{ to the current dest address and its address must point to the }
|
||||
{ bytes that will be overwritten in the current iteration. In case }
|
||||
{ of a forward loop, the dest address has currently an offset of }
|
||||
{ -8 compared to the bytes that will be overwritten (and r6 = -8). }
|
||||
{ In case of a backward loop, the dest address currently has }
|
||||
{ an offset of +32 compared to the bytes that will be overwritten }
|
||||
{ (and r6 = 0). So the forward dcbz offset must become +8 and the }
|
||||
{ backward -32 -> (-r6 * 5) - 32 gives the correct offset }
|
||||
slwi r7,r6,2
|
||||
add r7,r7,r6
|
||||
neg r7,r7
|
||||
subi r7,r7,32
|
||||
|
||||
LMove32ByteDcbz:
|
||||
lfdux f0,r3,r10
|
||||
lfdux f1,r3,r10
|
||||
lfdux f2,r3,r10
|
||||
lfdux f3,r3,r10
|
||||
{ must be done only now, in case source and dest are less than }
|
||||
{ 32 bytes apart! }
|
||||
dcbz r4,r7
|
||||
stfdux f0,r4,r10
|
||||
stfdux f1,r4,r10
|
||||
stfdux f2,r4,r10
|
||||
stfdux f3,r4,r10
|
||||
bdnz LMove32ByteLoop
|
||||
bdnz LMove32ByteDcbz
|
||||
LMove32ByteLoopDone:
|
||||
|
||||
{ cr0*4+eq is true if "count and 31" = 0 }
|
||||
beq cr0,LMoveDone
|
||||
@ -897,7 +933,10 @@ end ['R3','R10'];
|
||||
|
||||
{
|
||||
$Log$
|
||||
Revision 1.46 2003-05-17 00:19:51 jonas
|
||||
Revision 1.47 2003-05-29 12:14:02 jonas
|
||||
* move() now uses dcbz if possible
|
||||
|
||||
Revision 1.46 2003/05/17 00:19:51 jonas
|
||||
* fixed inclocked
|
||||
|
||||
Revision 1.45 2003/05/14 19:47:35 jonas
|
||||
|
Loading…
Reference in New Issue
Block a user