|
@@ -24,80 +24,148 @@
|
|
|
|
|
|
{$define FPC_SYSTEM_HAS_MOVE}
|
|
{$define FPC_SYSTEM_HAS_MOVE}
|
|
|
|
|
|
-procedure Move(var source;var dest;count:longint);
|
|
|
|
-begin
|
|
|
|
-{ register usage:
|
|
|
|
- r3 source
|
|
|
|
- r4 dest
|
|
|
|
- r5 count
|
|
|
|
- r13 ptr to end of source
|
|
|
|
- r14 ptr to end of dest
|
|
|
|
- r15 counter 1
|
|
|
|
- r16 counter 2
|
|
|
|
- r17 addr increment
|
|
|
|
- r18 ptr to current source block
|
|
|
|
- r19 ptr to current dest block
|
|
|
|
- r20-24 buffer
|
|
|
|
- f1-4 buffer
|
|
|
|
- ctr Loop counter
|
|
|
|
- notes:
|
|
|
|
- Move uses FPRs for increased bandwidth
|
|
|
|
-}
|
|
|
|
- asm
|
|
|
|
- { do some param checking, initialization }
|
|
|
|
- cmplwi cr2,r3,0
|
|
|
|
- cmplwi cr3,r4,0
|
|
|
|
- cmplw cr4,r3,r4
|
|
|
|
- add r13,r3,r5
|
|
|
|
- add r14,r4,r5
|
|
|
|
- bt cr2,.MoveEnd //end if source=nil
|
|
|
|
- bt cr3,.MoveEnd //end if dest=nil
|
|
|
|
- bt cr4,.MoveEnd //end if source=dest
|
|
|
|
- { see if source and dest overlap }
|
|
|
|
- cmplw cr2,r13,r4
|
|
|
|
- cmplw cr3,r4,r3
|
|
|
|
- srawi. r15,r5,$5 //r15 := count div 32
|
|
|
|
- andi r16,r5,$1F //r16 := count mod 32
|
|
|
|
- crand cr3,cr2,cr3
|
|
|
|
- mtctr r15 //Load loop counter
|
|
|
|
- bgt cr3,.MoveRL //dest overlaps source on right
|
|
|
|
- li r17,$8 //Offset 8 bytes per doubleword copy
|
|
|
|
- sub r18,r17,r3 //calculate the starting source
|
|
|
|
- sub r19,r17,r4 // and dest ptrs
|
|
|
|
- beq .MoveByByte //If count<32 skip 32 byte block copy
|
|
|
|
- srawi. r15,r16,$2 //r15 := r16 div 4
|
|
|
|
- andi r16,r15,$3 //r16 := r15 mod 4
|
|
|
|
- cmpwi cr2,r16,0 //r16 = 0 ?
|
|
|
|
- crand cr3,cr2,cr0 //r15 = 0 AND r16 = 0 ?
|
|
|
|
-.MoveBlockLoop: //32 Byte block copy (fully optimized)
|
|
|
|
- lfdux f1,r18,r17
|
|
|
|
- lfdux f2,r18,r17
|
|
|
|
- lfdux f3,r18,r17
|
|
|
|
- lfdux f4,r18,r17
|
|
|
|
- stfdux f1,r19,r17
|
|
|
|
- stfdux f2,r19,r17
|
|
|
|
- stfdux f3,r19,r17
|
|
|
|
- stfdux f4,r19,r17
|
|
|
|
- bdnz .MoveBlockLoop
|
|
|
|
-
|
|
|
|
- bt cr3,MoveEnd //Nothing left to do...
|
|
|
|
- mtspr 1,r16 //XER := r16
|
|
|
|
- beq .MoveBytes //There are fewer than 4 bytes left
|
|
|
|
- mtctr r15 //load counter
|
|
|
|
- andi r15,r15,$3 //r15 := r15 mod 4
|
|
|
|
- srawi r17,$1 //Offset := Offset div 2
|
|
|
|
-.MoveWordLoop: //4 byte copy
|
|
|
|
- lwzux r20,r18,r17
|
|
|
|
- stwux r20,r19,r17
|
|
|
|
- bdnz .WordCopyLoop
|
|
|
|
-
|
|
|
|
- bt cr2,MoveEnd //Nothing left to do...
|
|
|
|
-.MoveBytes: //Copy remaining stragglers
|
|
|
|
- lswx r20,r0,r18
|
|
|
|
- stswx r20,r0,r19
|
|
|
|
-.MoveEnd:
|
|
|
|
- End;
|
|
|
|
-End;
|
|
|
|
|
|
+procedure Move(var source;var dest;count:longint);assembler;
+{ Copy count bytes from source to dest. The two regions may overlap.
+
+  register usage:
+    r3       current source pointer (read as "source" on entry)
+    r4       current dest pointer (read as "dest" on entry)
+    r5       number of bytes still to copy (read as "count" on entry)
+    r29      scratch: masks, loop counts, data buffer
+    r30      pointer step used by the update-form loads/stores
+             (+1/-1 for bytes, 4 for dwords, 8 for doubles)
+    f28-f31  buffer for the 32 byte block loop
+    ctr      loop counter
+    cr0      count <= 0, later overlap, alignment and remainder tests
+    cr1      "fewer than/at most 11 bytes" tests
+    cr7      count < 39
+
+  notes:
+    - count <= 0 copies nothing
+    - if dest-source (unsigned) < count, the copy starts at the last byte
+      and walks down one byte at a time. The dword and FPU paths re-copy
+      the byte at the aligned address and then step upwards from it, which
+      is only valid for ascending addresses, so they are used for forward
+      copies only.
+    - forward copies: count < 11 goes byte by byte, 11 <= count < 39 aligns
+      dest to 4 bytes and copies dwords, count >= 39 aligns dest to 8 bytes
+      and copies 32 byte blocks through the FPU registers
+    - NOTE(review): subc/subfe also change the carry bit, which is not part
+      of the register list after "end" - confirm whether it has to be
+}
+asm
+  { count <= 0 ? (used by the ble below) }
+  cmpwi   cr0,r5,0
+  { check if we have to do the move backwards because of overlap }
+  sub     r30,r4,r3
+  { carry := boolean(dest-source >= count) = boolean(no overlap) }
+  { (unsigned comparison: subc sets the carry when there is no borrow) }
+  subc    r30,r30,r5
+
+  { count < 11 ? (to decide whether we will move dwords or bytes) }
+  cmpwi   cr1,r5,11
+
+  { if overlap, then r30 := -1 else r30 := 0 }
+  subfe   r30,r30,r30
+
+  { count < 39 ? (32 + max. alignment (7)) }
+  cmpwi   cr7,r5,39
+
+  { if count <= 0, stop }
+  ble     cr0,LMoveDone
+
+  { if overlap, then r29 := count else r29 := 0 }
+  and     r29,r5,r30
+  { if overlap, then point source and dest to the end }
+  add     r3,r3,r29
+  add     r4,r4,r29
+  { if overlap, then r29 := 0, else r29 := -1 }
+  not     r29,r30
+  { if overlap, then r30 := -2, else r30 := 0 }
+  slwi    r30,r30,1
+  { if overlap, then r30 := -1, else r30 := 1 }
+  addi    r30,r30,1
+  { cr0 is no longer needed for the count test: cr0*4+lt := overlap }
+  cmpwi   cr0,r30,0
+  { if no overlap, then source/dest += -1, otherwise they stay }
+  { After the next instruction, r3/r4 + r30 = next position }
+  { to load/store from/to }
+  add     r3,r3,r29
+  add     r4,r4,r29
+
+  { if count < 11, copy everything byte by byte }
+  blt     cr1,LMoveBytes
+  { if overlap, also copy everything byte by byte: the alignment, dword }
+  { and FPU code below only keeps its pointers and count consistent when }
+  { moving towards higher addresses (r30 = 1) }
+  blt     cr0,LMoveBytes
+
+  { otherwise, guarantee 4 byte alignment for dest for starters }
+LMove4ByteAlignLoop:
+  lbzux   r29,r3,r30
+  stbux   r29,r4,r30
+  { is dest now 4 aligned? }
+  andi.   r29,r4,3
+  subi    r5,r5,1
+  { while not aligned, continue }
+  bne     cr0,LMove4ByteAlignLoop
+
+  { check for 8 byte alignment }
+  andi.   r29,r4,7
+  { we are going to copy one byte again (the one at the newly }
+  { aligned address), so increase count again }
+  addi    r5,r5,1
+  { count div 4 for number of dwords to copy }
+  srwi    r29,r5,2
+  { if 11 <= count < 39, copy using dwords }
+  blt     cr7,LMoveDWords
+
+  beq     cr0,L8BytesAligned
+
+  { count >= 39 -> align to 8 byte boundary and then use the FPU }
+  { since we're already at 4 byte alignment, use dword store. }
+  { r3/r4 point at the aligned position itself (not one step before it), }
+  { so copy the dword in place, then advance the pointers and reduce the }
+  { count by the 4 bytes that were handled }
+  lwz     r29,0(r3)
+  stw     r29,0(r4)
+  addi    r3,r3,4
+  addi    r4,r4,4
+  subi    r5,r5,4
+L8BytesAligned:
+  { count div 32 ( >= 1, since count was >= 39 and at most 3 + 4 bytes }
+  { were used up for the alignment) }
+  srwi    r29,r5,5
+  { remainder }
+  andi.   r5,r5,31
+  { to decide if we will do some dword stores afterwards or not }
+  cmpwi   cr1,r5,11
+  mtctr   r29
+
+  { r29 := count div 4, will be moved to ctr when copying dwords }
+  srwi    r29,r5,2
+
+  { adjust the update count: it will now be 8 (only forward copies get here) }
+  slwi    r30,r30,3
+
+  { adjust source and dest pointers: because of the above code, dest is now }
+  { aligned to 8 bytes. So if we subtract r30 we will still have an 8 bytes }
+  { aligned address }
+  sub     r3,r3,r30
+  sub     r4,r4,r30
+
+LMove32ByteLoop:
+  lfdux   f31,r3,r30
+  lfdux   f30,r3,r30
+  lfdux   f29,r3,r30
+  lfdux   f28,r3,r30
+  stfdux  f31,r4,r30
+  stfdux  f30,r4,r30
+  stfdux  f29,r4,r30
+  stfdux  f28,r4,r30
+  bdnz    LMove32ByteLoop
+
+  { cr0*4+eq is true if "count and 31" = 0 }
+  beq     cr0,LMoveDone
+
+  { make r30 again 1, but first adjust source/dest pointers }
+  add     r3,r3,r30
+  add     r4,r4,r30
+  srawi   r30,r30,3
+  sub     r3,r3,r30
+  sub     r4,r4,r30
+
+  { cr1 contains whether count <= 11 }
+  ble     cr1,LMoveBytes
+  { LMoveDWords expects r3/r4 to point at the next position itself }
+  add     r3,r3,r30
+  add     r4,r4,r30
+
+LMoveDWords:
+  mtctr   r29
+  { remainder after the dwords; cr0 is tested after the loop }
+  andi.   r5,r5,3
+  { r30 * 4 }
+  slwi    r30,r30,2
+  sub     r3,r3,r30
+  sub     r4,r4,r30
+
+LMoveDWordsLoop:
+  lwzux   r29,r3,r30
+  stwux   r29,r4,r30
+  bdnz    LMoveDWordsLoop
+
+  { cr0*4+eq is true if "count and 3" = 0 }
+  beq     cr0,LMoveDone
+  { make r30 again 1, but first adjust source/dest pointers }
+  add     r3,r3,r30
+  add     r4,r4,r30
+  srawi   r30,r30,2
+  sub     r3,r3,r30
+  sub     r4,r4,r30
+LMoveBytes:
+  { r5 > 0 on every path that gets here }
+  mtctr   r5
+LMoveBytesLoop:
+  lbzux   r29,r3,r30
+  stbux   r29,r4,r30
+  bdnz    LMoveBytesLoop
+LMoveDone:
+end ['R3','R4','R5','R29','R30','F28','F29','F30','F31','CTR','CR0','CR1','CR7'];
|
|
|
|
|
|
|
|
|
|
{$define FPC_SYSTEM_HAS_FILLCHAR}
|
|
{$define FPC_SYSTEM_HAS_FILLCHAR}
|
|
@@ -380,7 +448,11 @@ end ['r3','r4','r5','r29','r30','cr0','ctr'];
|
|
|
|
|
|
{
|
|
{
|
|
$Log$
|
|
$Log$
|
|
- Revision 1.2 2001-02-11 17:59:46 jonas
|
|
|
|
|
|
+ Revision 1.3 2001-03-02 13:24:10 jonas
|
|
|
|
+ + new, complete implementation of move procedure (including support for
|
|
|
|
+ overlapping regions)
|
|
|
|
+
|
|
|
|
+ Revision 1.2 2001/02/11 17:59:46 jonas
|
|
* implemented several more procedures
|
|
* implemented several more procedures
|
|
|
|
|
|
Revision 1.1 2000/07/27 07:32:12 jonas
|
|
Revision 1.1 2000/07/27 07:32:12 jonas
|