25 years ago · b7970bf7a4
--- a/rtl/powerpc/powerpc.inc
+++ b/rtl/powerpc/powerpc.inc
@@ -24,80 +24,148 @@
 
				 
			
 
				 {$define FPC_SYSTEM_HAS_MOVE}
			
 
				 
			
 
				-procedure Move(var source;var dest;count:longint);
			
 
				-begin
			
 
				-{ register usage:
			
 
				-  r3    source
			
 
				-  r4    dest
			
 
				-  r5    count
			
 
				-  r13   ptr to end of source
			
 
				-  r14   ptr to end of dest
			
 
				-  r15   counter 1
			
 
				-  r16   counter 2
			
 
				-  r17   addr increment
			
 
				-  r18   ptr to current source block
			
 
				-  r19   ptr to current dest block
			
 
				-  r20-24        buffer
			
 
				-  f1-4  buffer
			
 
				-  ctr   Loop counter
			
 
				-  notes:
			
 
				-  Move uses FPRs for increased bandwidth
			
 
				-}
			
 
				-        asm
			
 
				-        { do some param checking, initialization }
			
 
				-        cmplwi  cr2,r3,0
			
 
				-        cmplwi  cr3,r4,0
			
 
				-        cmplw   cr4,r3,r4
			
 
				-        add             r13,r3,r5
			
 
				-        add             r14,r4,r5
			
 
				-        bt              cr2,.MoveEnd    //end if source=nil
			
 
				-        bt              cr3,.MoveEnd    //end if dest=nil
			
 
				-        bt              cr4,.MoveEnd    //end if source=dest
			
 
				-        { see if source and dest overlap }
			
 
				-        cmplw   cr2,r13,r4
			
 
				-        cmplw   cr3,r4,r3
			
 
				-        srawi.  r15,r5,$5               //r15 := count div 32
			
 
				-        andi    r16,r5,$1F              //r16 := count mod 32
			
 
				-        crand   cr3,cr2,cr3
			
 
				-        mtctr   r15                             //Load loop counter
			
 
				-        bgt             cr3,.MoveRL             //dest overlaps source on right
			
 
				-        li              r17,$8                  //Offset 8 bytes per doubleword copy
			
 
				-        sub             r18,r17,r3              //calculate the starting source
			
 
				-        sub             r19,r17,r4              //                                      and dest ptrs
			
 
				-        beq             .MoveByByte             //If count<32 skip 32 byte block copy
			
 
				-        srawi.  r15,r16,$2              //r15 := r16 div 4
			
 
				-        andi    r16,r15,$3              //r16 := r15 mod 4
			
 
				-        cmpwi   cr2,r16,0               //r16 = 0 ?
			
 
				-        crand   cr3,cr2,cr0             //r15 = 0 AND r16 = 0 ?
			
 
				-.MoveBlockLoop:                                 //32 Byte block copy (fully optimized)
			
 
				-                lfdux   f1,r18,r17
			
 
				-                lfdux   f2,r18,r17
			
 
				-                lfdux   f3,r18,r17
			
 
				-                lfdux   f4,r18,r17
			
 
				-                stfdux  f1,r19,r17
			
 
				-                stfdux  f2,r19,r17
			
 
				-                stfdux  f3,r19,r17
			
 
				-                stfdux  f4,r19,r17
			
 
				-                bdnz    .MoveBlockLoop
			
 
				-
			
 
				-                bt              cr3,MoveEnd             //Nothing left to do...
			
 
				-                mtspr   1,r16                   //XER := r16
			
 
				-                beq             .MoveBytes              //There are fewer than 4 bytes left
			
 
				-                mtctr   r15                             //load counter
			
 
				-                andi    r15,r15,$3              //r15 := r15 mod 4
			
 
				-                srawi   r17,$1                  //Offset := Offset div 2
			
 
				-.MoveWordLoop:                                  //4 byte copy
			
 
				-                lwzux   r20,r18,r17
			
 
				-                stwux   r20,r19,r17
			
 
				-                bdnz    .WordCopyLoop
			
 
				-
			
 
				-                bt              cr2,MoveEnd             //Nothing left to do...
			
 
				-.MoveBytes:                                             //Copy remaining stragglers
			
 
				-                lswx    r20,r0,r18
			
 
				-                stswx   r20,r0,r19
			
 
				-.MoveEnd:
			
 
				-                End;
			
 
				-End;
			
 
				+{ Move: copies "count" bytes from "source" to "dest".
				+
				+  Nothing is copied when count <= 0. When dest-source < count (unsigned
				+  compare) the regions are treated as overlapping and the copy runs from
				+  the end towards the start; otherwise it runs from start to end.
				+
				+  Copy width is chosen from count:
				+    count < 11         byte by byte
				+    11 <= count < 39   bytes until dest is 4 byte aligned, then 4 byte
				+                       words, then the remaining bytes
				+    count >= 39        as above, then 32 bytes per iteration through
				+                       f28..f31, then words and/or bytes for the rest
				+
				+  Register usage, as this code uses them:
				+    r3    source pointer (pre-update form: r3 + r30 is the next address)
				+    r4    dest pointer   (same convention)
				+    r5    number of bytes still to copy
				+    r29   scratch / data buffer / loop count on its way to ctr
				+    r30   signed step: +1 or -1, scaled to 4 or 8 for the wider loops
				+    f28-f31  data buffer of the 32 byte loop
				+    ctr   loop counter
				+    cr0, cr1, cr7  results of the count and alignment tests
				+}
				+procedure Move(var source;var dest;count:longint);assembler;
				+asm
				+                {  count <= 0 ?  }
				+                cmpwi   cr0,r5,0
				+                {  check if we have to do the move backwards because of overlap  }
				+                sub     r30,r4,r3
				+                {  carry is cleared when dest-source < count (unsigned), i.e. on  }
				+                {  overlap; it is set when there is no overlap                    }
				+                subc    r30,r30,r5
				+
				+                {  count < 11 ? (to decide whether we will move dwords or bytes)  }
				+                cmpwi   cr1,r5,11
				+
				+                {  if overlap, then r30 := -1 else r30 := 0  }
				+                subfe   r30,r30,r30
				+
				+                {  count < 39 ? (32 + max. alignment (7))  }
				+                cmpwi   cr7,r5,39
				+
				+                {  if count <= 0, stop  }
				+                ble     cr0,LMoveDone
				+
				+                {  if overlap, then r29 := count else r29 := 0  }
				+                and     r29,r5,r30
				+                {  if overlap, then point source and dest to the end  }
				+                add     r3,r3,r29
				+                add     r4,r4,r29
				+                {  if overlap, then r29 := 0, else r29 := -1  }
				+                not     r29,r30
				+                {  if overlap, then r30 := -2, else r30 := 0  }
				+                slwi    r30,r30,1
				+                {  if overlap, then r30 := -1, else r30 := 1  }
				+                addi    r30,r30,1
				+                {  if no overlap, then source/dest += -1, otherwise they stay }
				+                {  After the next instruction, r3/r4 + r30 = next position    }
				+                {  to load/store from/to                                      }
				+                add     r3,r3,r29
				+                add     r4,r4,r29
				+
				+                {  if count < 11, copy everything byte by byte  }
				+                blt     cr1,LMoveBytes
				+
				+                {  otherwise, guarantee 4 byte alignment for dest for starters  }
				+LMove4ByteAlignLoop:
				+                lbzux   r29,r3,r30
				+                stbux   r29,r4,r30
				+                {  is dest now 4 aligned?  }
				+                andi.   r29,r4,3
				+                subi    r5,r5,1
				+                {  while not aligned, continue  }
				+                bne     cr0,LMove4ByteAlignLoop
				+
				+                { check for 8 byte alignment (result stays in cr0 until the beq below) }
				+                andi.   r29,r4,7
				+                { we are going to copy one byte again (the one at the newly }
				+                { aligned address), so increase count again                 }
				+                addi    r5,r5,1
				+                { count div 4 for number of dwords to copy }
				+                srwi    r29,r5,2
				+                {  if 11 <= count < 39, copy using dwords }
				+                blt     cr7,LMoveDWords
				+
				+                beq     cr0,L8BytesAligned
				+
				+                {  count >= 39 -> align to 8 byte boundary and then use the FPU  }
				+                {  since we're already at 4 byte alignment, use dword store      }
				+                {  NOTE(review): r30 is still +1 or -1 here, so this pair moves  }
				+                {  r3/r4 by one byte rather than four, and r5 is not reduced by  }
				+                {  the word copied; dest does not look 8 byte aligned after it - }
				+                {  confirm against a test with a 4-but-not-8 aligned dest        }
				+                lwzux   r29,r3,r30
				+                stwux   r29,r4,r30
				+L8BytesAligned:
				+                { count div 32 ( >= 1, since count was >=39 ) }
				+                srwi    r29,r5,5
				+                { remainder }
				+                andi.   r5,r5,31
				+                { to decide if we will do some dword stores afterwards or not }
				+                cmpwi   cr1,r5,11
				+                mtctr   r29
				+
				+                {  r29 := count div 4, will be moved to ctr when copying dwords  }
				+                srwi    r29,r5,2
				+
				+                {  adjust the update count: it will now be 8 or -8 depending on overlap  }
				+                slwi    r30,r30,3
				+
				+                {  adjust source and dest pointers: because of the above loop, dest is now  }
				+                {  aligned to 8 bytes. So if we subtract r30 we will still have an 8 bytes  }
				+                {  aligned address                                                          }
				+                sub     r3,r3,r30
				+                sub     r4,r4,r30
				+
				+LMove32ByteLoop:
				+                lfdux   f31,r3,r30
				+                lfdux   f30,r3,r30
				+                lfdux   f29,r3,r30
				+                lfdux   f28,r3,r30
				+                stfdux  f31,r4,r30
				+                stfdux  f30,r4,r30
				+                stfdux  f29,r4,r30
				+                stfdux  f28,r4,r30
				+                bdnz    LMove32ByteLoop
				+
				+                { cr0*4+eq is true if "count and 31" = 0 }
				+                beq     cr0,LMoveDone
				+
				+                {  make r30 again -1 or 1, but first adjust source/dest pointers }
				+                add     r3,r3,r30
				+                add     r4,r4,r30
				+                srawi   r30,r30,3
				+                sub     r3,r3,r30
				+                sub     r4,r4,r30
				+
				+                { cr1 contains whether count <= 11 }
				+                ble     cr1,LMoveBytes
				+                {  LMoveDWords expects r3/r4 one step further along  }
				+                add     r3,r3,r30
				+                add     r4,r4,r30
				+
				+LMoveDWords:
				+                {  NOTE(review): the pointer fix-ups before and after the word and  }
				+                {  double loops were only traced for the forward (r30 = 1) case;    }
				+                {  verify the backward (overlap) case separately                    }
				+                mtctr   r29
				+                {  cr0 := whether any bytes remain after the words  }
				+                andi.   r5,r5,3
				+                {  r30 * 4  }
				+                slwi    r30,r30,2
				+                sub     r3,r3,r30
				+                sub     r4,r4,r30
				+
				+LMoveDWordsLoop:
				+                lwzux   r29,r3,r30
				+                stwux   r29,r4,r30
				+                bdnz    LMoveDWordsLoop
				+
				+                beq     cr0,LMoveDone
				+                {  make r30 again -1 or 1  }
				+                add     r3,r3,r30
				+                add     r4,r4,r30
				+                srawi   r30,r30,2
				+                sub     r3,r3,r30
				+                sub     r4,r4,r30
				+LMoveBytes:
				+                mtctr   r5
				+LMoveBytesLoop:
				+                lbzux   r29,r3,r30
				+                stbux   r29,r4,r30
				+                bdnz    LMoveBytesLoop
				+LMoveDone:
				+end ['R3','R4','R5','R29','R30','F28','F29','F30','F31','CTR','CR0','CR1','CR7'];
			
 
				 
			
 
				 
			
 
				 {$define FPC_SYSTEM_HAS_FILLCHAR}
			
@@ -380,7 +448,11 @@ end ['r3','r4','r5','r29','r30','cr0','ctr'];
 
				 
			
 
				 {
			
 
				   $Log$
			
 
				-  Revision 1.2  2001-02-11 17:59:46  jonas
			
 
				+  Revision 1.3  2001-03-02 13:24:10  jonas
			
 
				+    + new, complete implementation of move procedure (including support for
			
 
				+      overlapping regions)
			
 
				+
			
 
				+  Revision 1.2  2001/02/11 17:59:46  jonas
			
 
				     * implemented several more procedures
			
 
				 
			
 
				   Revision 1.1  2000/07/27 07:32:12  jonas