24 years ago · 8f2e700b97
--- a/rtl/powerpc/powerpc.inc
+++ b/rtl/powerpc/powerpc.inc
@@ -26,8 +26,6 @@
 
				 
			
 
				 procedure Move(var source;var dest;count:longint);assembler;
			
 
				 asm
			
 
				-          {  load the begin of the source in the data cache }
			
 
				-          dcbt    0,r3
			
 
				           {  count <= 0 ?  }
			
 
				           cmpwi   cr0,r5,0
			
 
				           {  check if we have to do the move backwards because of overlap  }
			
@@ -35,8 +33,8 @@ asm
 
				           {  carry := boolean(dest-source < count) = boolean(overlap) }
			
 
				           subc    r10,r10,r5
			
 
				 
			
 
				-          {  count < 11 ? (to decide whether we will move dwords or bytes  }
			
 
				-          cmpwi   cr1,r5,11
			
 
				+          {  count < 15 ? (to decide whether we will move dwords or bytes)  }
			
 
				+          cmpwi   cr1,r5,15
			
 
				 
			
 
				           {  if overlap, then r10 := -1 else r10 := 0  }
			
 
				           subfe   r10,r10,r10
			
@@ -47,6 +45,11 @@ asm
 
				           {  if count <= 0, stop  }
			
 
				           ble     cr0,LMoveDone
			
 
				 
			
 
				+          {  load the begin of the source in the data cache }
			
 
				+          dcbt    0,r3
			
 
				+          { and the dest as well }
			
 
				+          dcbst   0,r4
			
 
				+
			
 
				           {  if overlap, then r0 := count else r0 := 0  }
			
 
				           and     r0,r5,r10
			
 
				           {  if overlap, then point source and dest to the end  }
			
@@ -64,7 +67,7 @@ asm
 
				           add     r3,r3,r0
			
 
				           add     r4,r4,r0
			
 
				 
			
 
				-          {  if count < 11, copy everything byte by byte  }
			
 
				+          {  if count < 15, copy everything byte by byte  }
			
 
				           blt     cr1,LMoveBytes
			
 
				 
			
 
				           {  otherwise, guarantee 4 byte alignment for dest for starters  }
			
@@ -177,52 +180,133 @@ end ['R0','R3','R4','R5','R10','F0','F11','F12','F13','CTR','CR0','CR1','CR7'];
 
				 
			
 
				 {$define FPC_SYSTEM_HAS_FILLCHAR}
			
 
				 
			
 
				-Procedure FillChar(var x;count:longint;value:byte);
			
 
				-begin
			
 
				-        asm
			
 
				-{ Register Usage:
			
 
				-        r3      x
			
 
				-        r4      count
			
 
				-        r5      value
			
 
				-        r13     value.value.value.value
			
 
				-        r14     ptr to current dest char
			
 
				-        r15     byte increment, Scratch
			
 
				-        r16     Block count
			
 
				-        r17 misalignment byte count
			
 
				-}
			
 
				-                cmpwi   cr2,r4,12
			
 
				-                mr              r14,r3
			
 
				-                andi.   r17,r3,3
			
 
				-                sub             r14,r3,r17              //32 bit align
			
 
				-                blt             cr2,.FillBytes  //if count<12 then fill byte by byte
			
 
				-                sub             r16,r4,r17
			
 
				-                andi    r17,r16,3
			
 
				-                cmpwi   cr2,r17,0
			
 
				-                srwi    r16,r16,2               //r16:=count div 4
			
 
				-                subi    r16,r16,2
			
 
				-                mtctr   r16                             //counter:=r16
			
 
				-                mr              r13,r5                  //insert
			
 
				-                insrwi  r13,r5,8,16             //              value into all four bytes
			
 
				-                insrwi  r13,r13,16,0    //                                                                      of r13
			
 
				-                li              r15,4
			
 
				-                stw             r13,0(r3)               //fill first few bytes
			
 
				-.FillWordLoop:
			
 
				-                stwux   r13,r14,r15
			
 
				-                bdnz    .FillWordLoop
			
 
				-                beq             cr2,FillEnd             //No trailing bytes, so exit
			
 
				-                add             r14,r3,r4
			
 
				-                stw             r13,-4(r14)             //fill last few bytes
			
 
				-                b               .FillEnd
			
 
				-
			
 
				-.FillBytes:
			
 
				-                mtctr   r4                              //counter:=count
			
 
				-                li              r15,1
			
 
				-                subi    r14,r3,1
			
 
				-.FillByteLoop:
			
 
				-                stbux   r13,r14,r15
			
 
				-                bdnz    .FillByteLoop
			
 
				-.FillEnd:
			
 
				-        end [r13,r14,r15,r16,r17,ctr];
			
 
				+Procedure FillChar(var x;count:longint;value:byte);assembler;
				+{ Fills count bytes starting at x with value; does nothing if count <= 0.     }
				+{ input: x in r3, count in r4, value in r5                                    }
				+{ modifies: r3, r4, r5, r10, f0, ctr, cr0, cr1, cr6, cr7                      }
				+{ note: the dcbz based loops below advance 32 bytes per dcbz, i.e. they       }
				+{ treat a data cache block as 32 bytes                                        }
				+
				+{$ifndef ABI_AIX}
				+{ in the AIX ABI, we can use the red zone for temp storage, otherwise we have }
				+{ to explicitly allocate room                                                 }
				+var
				+  temp: record
				+    case byte of
				+      0: (l1,l2: longint);
				+      1: (d: double);
				+    end;
				+{$endif ABI_AIX}
				+asm
				+        { no bytes (or a negative count)? }
				+        cmpwi     cr6,r4,0
				+        { less than 15 bytes? }
				+        cmpwi     cr7,r4,15
				+        { less than 64 bytes? (with at least 64 bytes, at least one complete }
				+        { 32 byte block remains after aligning: 64-4-28 = 32, so the block   }
				+        { loops below never start with ctr = 0)                              }
				+        cmpwi     cr1,r4,64
				+        { fill r5 with ValueValueValueValue }
				+        rlwimi    r5,r5,8,16,23
				+        { setup for aligning x to multiple of 4 }
				+        rlwinm    r10,r3,0,31-2+1,31
				+        rlwimi    r5,r5,16,0,15
				+        { count <= 0 -> nothing to do (signed test, same as in Move) }
				+        ble       cr6,LFillCharDone
				+        { get the start of the data in the cache (and mark it as "will be }
				+        { modified"): dcbtst is the touch-for-store hint, whereas dcbst   }
				+        { would write the block back to memory                            }
				+        dcbtst    0,r3
				+        { r10 := number of bytes up to the next multiple of 4 (1..4) }
				+        subfic    r10,r10,4
				+        blt       cr7,LFillCharVerySmall
				+        { just store 4 bytes instead of using a loop to align (there are }
				+        { plenty of other instructions now to keep the processor busy    }
				+        { while it handles the (possibly unaligned) store)               }
				+        stw       r5,0(r3)
				+        { r3 := align(r3,4) }
				+        add       r3,r3,r10
				+        { decrease count with number of bytes already stored }
				+        sub       r4,r4,r10
				+        { 15..63 bytes requested: at least 11 are left here, so the dword }
				+        { loop of LFillCharSmall runs at least twice                      }
				+        blt       cr1,LFillCharSmall
				+        { if we have to fill with 0 (which happens a lot), we can simply use }
				+        { dcbz for the most part, which is very fast, so make a special case }
				+        { for that                                                           }
				+        cmplwi    cr1,r5,0
				+        { align to a multiple of 32 (and immediately check whether we aren't }
				+        { already 32 byte aligned)                                           }
				+        rlwinm.   r10,r3,0,31-5+1,31
				+        { setup r3 for using update forms of store instructions }
				+        subi      r3,r3,4
				+        { get number of bytes to store (subfic leaves cr0 untouched) }
				+        subfic    r10,r10,32
				+        { if already 32byte aligned, skip align loop }
				+        beq       L32ByteAlignLoopDone
				+        { subtract from the total count }
				+        sub       r4,r4,r10
				+L32ByteAlignLoop:
				+        { we were already aligned to 4 bytes, so this will count down to }
				+        { exactly 0                                                      }
				+        subic.    r10,r10,4
				+        stwu      r5,4(r3)
				+        bne       L32ByteAlignLoop
				+L32ByteAlignLoopDone:
				+        { get the amount of 32 byte blocks }
				+        srwi      r10,r4,5
				+        { and keep the rest (count mod 32) in r4, recording in cr0 whether }
				+        { there is any rest                                                }
				+        rlwinm.   r4,r4,0,31-5+1,31
				+        { move to ctr }
				+        mtctr     r10
				+        { check how many rest there is (to decide whether we'll use }
				+        { FillCharSmall or FillCharVerySmall); nothing below changes }
				+        { cr0 or cr7 until they are tested after the block loops     }
				+        cmplwi    cr7,r4,11
				+        { if filling with zero, only use dcbz }
				+        bne       cr1, LFillCharNoZero
				+        { make r3 point again to the actual store position }
				+        addi      r3,r3,4
				+LFillCharDCBZLoop:
				+        dcbz      0,r3
				+        addi      r3,r3,32
				+        bdnz      LFillCharDCBZLoop
				+        { if there was no rest, we're finished }
				+        beq       LFillCharDone
				+        { fewer than 11 bytes left: store them byte by byte (the dword }
				+        { loop must not be entered with ctr = 0)                       }
				+        blt       cr7,LFillCharVerySmall
				+        b         LFillCharSmall
				+LFillCharNoZero:
				+        { build a double consisting of 8 copies of value in f0 }
				+{$ifdef ABI_AIX}
				+        { the red zone lies below the stack pointer; 0(r1) holds the back }
				+        { chain and must not be overwritten                               }
				+        stw       r5,-8(r1)
				+        stw       r5,-4(r1)
				+        lfd       f0,-8(r1)
				+{$else ABI_AIX}
				+        stw       r5,temp.l1
				+        stw       r5,temp.l2
				+        lfd       f0,temp.d
				+{$endif ABI_AIX}
				+        { make r3 point to address-8, so we're able to use fp double stores }
				+        { with update (it's already -4 now)                                 }
				+        subi      r3,r3,4
				+        { load r10 with 8, so that dcbz uses the correct address }
				+        li        r10,8
				+LFillChar32ByteLoop:
				+        { establish the block at r3+8 in the cache without reading it from }
				+        { memory, then overwrite all 32 bytes of it                        }
				+        dcbz      r3,r10
				+        stfdu     f0,8(r3)
				+        stfdu     f0,8(r3)
				+        stfdu     f0,8(r3)
				+        stfdu     f0,8(r3)
				+        bdnz      LFillChar32ByteLoop
				+        { if there was no rest, we're finished }
				+        beq       LFillCharDone
				+        { make r3 point again to the next byte that must be written (it }
				+        { is still at address-8 because of the update stores)           }
				+        addi      r3,r3,8
				+        { fewer than 11 bytes left: store them byte by byte }
				+        blt       cr7,LFillCharVerySmall
				+LFillCharSmall:
				+        { when we arrive here, we're already 4 byte aligned and r4 >= 11 }
				+        { get count div 4 to store dwords }
				+        srwi      r10,r4,2
				+        { get ready for use of update stores }
				+        subi      r3,r3,4
				+        mtctr     r10
				+        { r4 := count mod 4, recording in cr0 whether any bytes remain }
				+        rlwinm.   r4,r4,0,31-2+1,31
				+LFillCharSmallLoop:
				+        stwu      r5,4(r3)
				+        bdnz      LFillCharSmallLoop
				+        { if nothing left, stop }
				+        beq       LFillCharDone
				+        { get ready to store bytes }
				+        addi      r3,r3,4
				+LFillCharVerySmall:
				+        { r4 = number of bytes left (always > 0 here), r3 = next address }
				+        mtctr     r4
				+        subi      r3,r3,1
				+LFillCharVerySmallLoop:
				+        stbu      r5,1(r3)
				+        bdnz      LFillCharVerySmallLoop
				+LFillCharDone:
				 end;
			
 
				 
			
 
				 
			
@@ -758,7 +842,12 @@ end ['r3','r10'];
 
				 
			
 
				 {
			
 
				   $Log$
			
 
				-  Revision 1.6  2001-09-27 15:30:29  jonas
			
 
				+  Revision 1.7  2001-09-28 13:28:49  jonas
			
 
				+    * small changes to move (different count values trigger the selection of
			
 
				+      moving bytes instead of dwords/doubles and move dcbt instruction)
			
 
				+    + implemented fillchar (untested)
			
 
				+
			
 
				+  Revision 1.6  2001/09/27 15:30:29  jonas
			
 
				     * conversion to compilerproc and to structure used by i386 rtl
			
 
				     * some bugfixes
			
 
				     * powerpc.inc is almost complete (only fillchar/word/dword, get_frame etc