|
@@ -136,8 +136,8 @@ asm
|
|
{ if overlap, then r10 := -1 else r10 := 0 }
|
|
{ if overlap, then r10 := -1 else r10 := 0 }
|
|
subfe r10,r10,r10
|
|
subfe r10,r10,r10
|
|
|
|
|
|
- { count < 39 ? (32 + max. alignment (7) }
|
|
|
|
- cmpwi cr7,r5,39
|
|
|
|
|
|
+ { count < 63 ? (32 + max. alignment (31)) }
|
|
|
|
+ cmpwi cr7,r5,63
|
|
|
|
|
|
{ if count <= 0, stop }
|
|
{ if count <= 0, stop }
|
|
ble cr0,LMoveDone
|
|
ble cr0,LMoveDone
|
|
@@ -152,7 +152,7 @@ asm
|
|
{ if overlap, then point source and dest to the end }
|
|
{ if overlap, then point source and dest to the end }
|
|
add r3,r3,r0
|
|
add r3,r3,r0
|
|
add r4,r4,r0
|
|
add r4,r4,r0
|
|
- { if overlap, then r0 := 6, else r6 := -1 }
|
|
|
|
|
|
+ { if overlap, then r6 := 0, else r6 := -1 }
|
|
not r6,r10
|
|
not r6,r10
|
|
{ if overlap, then r10 := -2, else r10 := 0 }
|
|
{ if overlap, then r10 := -2, else r10 := 0 }
|
|
slwi r10,r10,1
|
|
slwi r10,r10,1
|
|
@@ -178,16 +178,30 @@ LMove4ByteAlignLoop:
|
|
{ while not aligned, continue }
|
|
{ while not aligned, continue }
|
|
bne cr0,LMove4ByteAlignLoop
|
|
bne cr0,LMove4ByteAlignLoop
|
|
|
|
|
|
- { check for 8 byte alignment }
|
|
|
|
- andi. r0,r4,7
|
|
|
|
|
|
+ { check for 32 byte alignment }
|
|
|
|
+ andi. r7,r4,31
|
|
{ we are going to copy one byte again (the one at the newly }
|
|
{ we are going to copy one byte again (the one at the newly }
|
|
{ aligned address), so increase count by 1 }
|
|
{ aligned address), so increase count by 1 }
|
|
addi r5,r5,1
|
|
addi r5,r5,1
|
|
{ count div 4 for number of dwords to copy }
|
|
{ count div 4 for number of dwords to copy }
|
|
srwi r0,r5,2
|
|
srwi r0,r5,2
|
|
- { if 11 <= count < 39, copy using dwords }
|
|
|
|
|
|
+ { if 11 <= count < 63, copy using dwords }
|
|
blt cr7,LMoveDWords
|
|
blt cr7,LMoveDWords
|
|
|
|
|
|
|
|
+ { # of dwords to copy to reach 32 byte alignment (*4) }
|
|
|
|
+ { (depends on forward/backward copy) }
|
|
|
|
+
|
|
|
|
+ { if forward copy, r6 = -1 -> r8 := 32 }
|
|
|
|
+ { if backward copy, r6 = 0 -> r8 := 0 }
|
|
|
|
+ rlwinm r8,r6,0,31-6+1,31-6+1
|
|
|
|
+ { if forward copy, we have to copy 32 - unaligned count bytes }
|
|
|
|
+ { if backward copy unaligned count bytes }
|
|
|
|
+ sub r7,r8,r7
|
|
|
|
+ { if backward copy, the calculated value is now negative -> }
|
|
|
|
+ { make it positive again }
|
|
|
|
+ not r8, r6
|
|
|
|
+ add r7, r7, r8
|
|
|
|
+ xor r7, r7, r8
|
|
{ multiply the update count with 4 }
|
|
{ multiply the update count with 4 }
|
|
slwi r10,r10,2
|
|
slwi r10,r10,2
|
|
slwi r6,r6,2
|
|
slwi r6,r6,2
|
|
@@ -195,15 +209,18 @@ LMove4ByteAlignLoop:
|
|
add r3,r3,r6
|
|
add r3,r3,r6
|
|
add r4,r4,r6
|
|
add r4,r4,r6
|
|
|
|
|
|
- beq cr0,L8BytesAligned
|
|
|
|
-
|
|
|
|
|
|
+ beq cr0,LMove32BytesAligned
|
|
|
|
+L32BytesAlignMoveLoop:
|
|
{ count >= 63 -> align to 32 byte boundary and then use the FPU }
|
|
{ count >= 63 -> align to 32 byte boundary and then use the FPU }
|
|
{ since we're already at 4 byte alignment, use dword store }
|
|
{ since we're already at 4 byte alignment, use dword store }
|
|
|
|
+ subic. r7,r7,4
|
|
lwzux r0,r3,r10
|
|
lwzux r0,r3,r10
|
|
- stwux r0,r4,r10
|
|
|
|
subi r5,r5,4
|
|
subi r5,r5,4
|
|
-L8BytesAligned:
|
|
|
|
- { count div 32 ( >= 1, since count was >=39 }
|
|
|
|
|
|
+ stwux r0,r4,r10
|
|
|
|
+ bne L32BytesAlignMoveLoop
|
|
|
|
+
|
|
|
|
+LMove32BytesAligned:
|
|
|
|
+ { count div 32 ( >= 1, since count was >= 63 ) }
|
|
srwi r0,r5,5
|
|
srwi r0,r5,5
|
|
{ remainder }
|
|
{ remainder }
|
|
andi. r5,r5,31
|
|
andi. r5,r5,31
|
|
@@ -217,6 +234,7 @@ L8BytesAligned:
|
|
|
|
|
|
{ adjust the update count: it will now be 8 or -8 depending on overlap }
|
|
{ adjust the update count: it will now be 8 or -8 depending on overlap }
|
|
slwi r10,r10,1
|
|
slwi r10,r10,1
|
|
|
|
+ { get dcbz offset }
|
|
|
|
|
|
{ adjust source and dest pointers: because of the above loop, dest is now }
|
|
{ adjust source and dest pointers: because of the above loop, dest is now }
|
|
{ aligned to 8 bytes. So if we add r6 we will still have an 8 bytes }
|
|
{ aligned to 8 bytes. So if we add r6 we will still have an 8 bytes }
|
|
@@ -226,16 +244,34 @@ L8BytesAligned:
|
|
|
|
|
|
slwi r6,r6,1
|
|
slwi r6,r6,1
|
|
|
|
|
|
-LMove32ByteLoop:
|
|
|
|
|
|
+ { the dcbz offset must give a 32 byte aligned address when added }
|
|
|
|
+ { to the current dest address and its address must point to the }
|
|
|
|
+ { bytes that will be overwritten in the current iteration. In case }
|
|
|
|
+ { of a forward loop, the dest address has currently an offset of }
|
|
|
|
+ { -8 compared to the bytes that will be overwritten (and r6 = -8). }
|
|
|
|
+ { In case of a backward loop, the dest address currently has }
|
|
|
|
+ { an offset of +32 compared to the bytes that will be overwritten }
|
|
|
|
+ { (and r6 = 0). So the forward dcbz offset must become +8 and the }
|
|
|
|
+ { backward -32 -> (-r6 * 5) - 32 gives the correct offset }
|
|
|
|
+ slwi r7,r6,2
|
|
|
|
+ add r7,r7,r6
|
|
|
|
+ neg r7,r7
|
|
|
|
+ subi r7,r7,32
|
|
|
|
+
|
|
|
|
+LMove32ByteDcbz:
|
|
lfdux f0,r3,r10
|
|
lfdux f0,r3,r10
|
|
lfdux f1,r3,r10
|
|
lfdux f1,r3,r10
|
|
lfdux f2,r3,r10
|
|
lfdux f2,r3,r10
|
|
lfdux f3,r3,r10
|
|
lfdux f3,r3,r10
|
|
|
|
+ { must be done only now, in case source and dest are less than }
|
|
|
|
+ { 32 bytes apart! }
|
|
|
|
+ dcbz r4,r7
|
|
stfdux f0,r4,r10
|
|
stfdux f0,r4,r10
|
|
stfdux f1,r4,r10
|
|
stfdux f1,r4,r10
|
|
stfdux f2,r4,r10
|
|
stfdux f2,r4,r10
|
|
stfdux f3,r4,r10
|
|
stfdux f3,r4,r10
|
|
- bdnz LMove32ByteLoop
|
|
|
|
|
|
+ bdnz LMove32ByteDcbz
|
|
|
|
+LMove32ByteLoopDone:
|
|
|
|
|
|
{ cr0*4+eq is true if "count and 31" = 0 }
|
|
{ cr0*4+eq is true if "count and 31" = 0 }
|
|
beq cr0,LMoveDone
|
|
beq cr0,LMoveDone
|
|
@@ -897,7 +933,10 @@ end ['R3','R10'];
|
|
|
|
|
|
{
|
|
{
|
|
$Log$
|
|
$Log$
|
|
- Revision 1.46 2003-05-17 00:19:51 jonas
|
|
|
|
|
|
+ Revision 1.47 2003-05-29 12:14:02 jonas
|
|
|
|
+ * move() now uses dcbz if possible
|
|
|
|
+
|
|
|
|
+ Revision 1.46 2003/05/17 00:19:51 jonas
|
|
* fixed inclocked
|
|
* fixed inclocked
|
|
|
|
|
|
Revision 1.45 2003/05/14 19:47:35 jonas
|
|
Revision 1.45 2003/05/14 19:47:35 jonas
|