Browse Source

* small changes to move (a different count threshold now selects moving
bytes instead of dwords/doubles, and the dcbt instruction was moved)
+ implemented fillchar (untested)

Jonas Maebe 24 years ago
parent
commit
8f2e700b97
1 changed files with 141 additions and 52 deletions
  1. 141 52
      rtl/powerpc/powerpc.inc

+ 141 - 52
rtl/powerpc/powerpc.inc

@@ -26,8 +26,6 @@
 
 procedure Move(var source;var dest;count:longint);assembler;
 asm
-          {  load the begin of the source in the data cache }
-          dcbt    0,r3
           {  count <= 0 ?  }
           cmpwi   cr0,r5,0
           {  check if we have to do the move backwards because of overlap  }
@@ -35,8 +33,8 @@ asm
           {  carry := boolean(dest-source < count) = boolean(overlap) }
           subc    r10,r10,r5
 
-          {  count < 11 ? (to decide whether we will move dwords or bytes  }
-          cmpwi   cr1,r5,11
+          {  count < 15 ? (to decide whether we will move dwords or bytes  }
+          cmpwi   cr1,r5,15
 
           {  if overlap, then r10 := -1 else r10 := 0  }
           subfe   r10,r10,r10
@@ -47,6 +45,11 @@ asm
           {  if count <= 0, stop  }
           ble     cr0,LMoveDone
 
+          {  load the begin of the source in the data cache }
+          dcbt    0,r3
+          { and the dest as well }
+          dcbst   0,r4
+
           {  if overlap, then r0 := count else r0 := 0  }
           and     r0,r5,r10
           {  if overlap, then point source and dest to the end  }
@@ -64,7 +67,7 @@ asm
           add     r3,r3,r0
           add     r4,r4,r0
 
-          {  if count < 11, copy everything byte by byte  }
+          {  if count < 15, copy everything byte by byte  }
           blt     cr1,LMoveBytes
 
           {  otherwise, guarantee 4 byte alignment for dest for starters  }
@@ -177,52 +180,133 @@ end ['R0','R3','R4','R5','R10','F0','F11','F12','F13','CTR','CR0','CR1','CR7'];
 
 {$define FPC_SYSTEM_HAS_FILLCHAR}
 
-Procedure FillChar(var x;count:longint;value:byte);
-begin
-        asm
-{ Register Usage:
-        r3      x
-        r4      count
-        r5      value
-        r13     value.value.value.value
-        r14     ptr to current dest char
-        r15     byte increment, Scratch
-        r16     Block count
-        r17 misalignment byte count
-}
-                cmpwi   cr2,r4,12
-                mr              r14,r3
-                andi.   r17,r3,3
-                sub             r14,r3,r17              //32 bit align
-                blt             cr2,.FillBytes  //if count<12 then fill byte by byte
-                sub             r16,r4,r17
-                andi    r17,r16,3
-                cmpwi   cr2,r17,0
-                srwi    r16,r16,2               //r16:=count div 4
-                subi    r16,r16,2
-                mtctr   r16                             //counter:=r16
-                mr              r13,r5                  //insert
-                insrwi  r13,r5,8,16             //              value into all four bytes
-                insrwi  r13,r13,16,0    //                                                                      of r13
-                li              r15,4
-                stw             r13,0(r3)               //fill first few bytes
-.FillWordLoop:
-                stwux   r13,r14,r15
-                bdnz    .FillWordLoop
-                beq             cr2,FillEnd             //No trailing bytes, so exit
-                add             r14,r3,r4
-                stw             r13,-4(r14)             //fill last few bytes
-                b               .FillEnd
-
-.FillBytes:
-                mtctr   r4                              //counter:=count
-                li              r15,1
-                subi    r14,r3,1
-.FillByteLoop:
-                stbux   r13,r14,r15
-                bdnz    .FillByteLoop
-.FillEnd:
-        end [r13,r14,r15,r16,r17,ctr];
+Procedure FillChar(var x;count:longint;value:byte);assembler;
+{ Fills count bytes starting at x with value; does nothing if count <= 0.   }
+{ input: x in r3, count in r4, value in r5                                  }
+{ register usage:                                                           }
+{   r3   current store address (biased for the update forms of the stores)  }
+{   r4   number of bytes that still have to be stored                       }
+{   r5   value, replicated into all four bytes                              }
+{   r10  scratch (alignment byte counts, block count, dcbz offset)          }
+{   f0   value replicated into all eight bytes (non-zero bulk fill only)    }
+{   cr0, cr1, cr6, cr7 and ctr steer the branches and loops                 }
+{ NOTE(review): the bulk loops assume a 32 byte cache block and memory on   }
+{ which dcbz is permitted -- confirm for all supported targets              }
+
+{$ifndef ABI_AIX}
+{ in the AIX ABI, we can use the red zone for temp storage, otherwise we have }
+{ to explicitly allocate room                                                 }
+var
+  temp: record
+    case byte of
+      0: (l1,l2: longint);
+      1: (d: double);
+    end;
+{$endif ABI_AIX}
+asm
+        { no bytes (or a negative count)? }
+        cmpwi     cr6,r4,0
+        { less than 15 bytes? }
+        cmpwi     cr7,r4,15
+        { less than 64 bytes? With at least 64 bytes, at least 32 are left }
+        { after the 4 byte (max 4) and 32 byte (max 28) alignment stores,  }
+        { so the block count moved to ctr below can never be 0             }
+        cmpwi     cr1,r4,64
+        { fill r5 with ValueValueValueValue }
+        rlwimi    r5,r5,8,16,23
+        { setup for aligning x to multiple of 4 (r10 := x and 3) }
+        rlwinm    r10,r3,0,31-2+1,31
+        rlwimi    r5,r5,16,0,15
+        { count <= 0 -> nothing to do (a negative count must not reach the }
+        { loops, since it would be used as an unsigned ctr value)          }
+        ble       cr6,LFillCharDone
+        { get the start of the data in the cache (and mark it as "will be }
+        { modified"): touch-for-store, not dcbst, which would write the   }
+        { block back to memory                                            }
+        dcbtst    0,r3
+        { r10 := number of bytes up to the next multiple of 4 (1..4) }
+        subfic    r10,r10,4
+        blt       cr7,LFillCharVerySmall
+        { just store 4 bytes instead of using a loop to align (there are }
+        { plenty of other instructions now to keep the processor busy    }
+        { while it handles the (possibly unaligned) store)               }
+        stw       r5,0(r3)
+        { r3 := align(r3,4) }
+        add       r3,r3,r10
+        { decrease count with number of bytes already stored }
+        sub       r4,r4,r10
+        { here r4 >= 11, so FillCharSmall stores at least two words }
+        blt       cr1,LFillCharSmall
+        { if we have to fill with 0 (which happens a lot), we can simply use }
+        { dcbz for the most part, which is very fast, so make a special case }
+        { for that                                                           }
+        cmplwi    cr1,r5,0
+        { align to a multiple of 32 (and immediately check whether we aren't }
+        { already 32 byte aligned)                                           }
+        rlwinm.   r10,r3,0,31-5+1,31
+        { setup r3 for using update forms of store instructions }
+        subi      r3,r3,4
+        { get number of bytes to store }
+        subfic    r10,r10,32
+        { if already 32byte aligned, skip align loop }
+        beq       L32ByteAlignLoopDone
+        { subtract from the total count }
+        sub       r4,r4,r10
+L32ByteAlignLoop:
+        { we were already aligned to 4 bytes, so this will count down to }
+        { exactly 0                                                      }
+        subic.    r10,r10,4
+        stwu      r5,4(r3)
+        bne       L32ByteAlignLoop
+L32ByteAlignLoopDone:
+        { get the amount of 32 byte blocks (at least 1, see above) }
+        srwi      r10,r4,5
+        { and keep the rest (count and 31) in r4, recording in cr0 whether }
+        { there is any rest                                                }
+        rlwinm.   r4,r4,0,31-5+1,31
+        { move to ctr }
+        mtctr     r10
+        { check how many rest there is (to decide whether we'll use }
+        { FillCharSmall or FillCharVerySmall); FillCharSmall needs  }
+        { at least one full word, otherwise its ctr would be 0      }
+        cmplwi    cr7,r4,11
+        { if filling with zero, only use dcbz }
+        bne       cr1,LFillCharNoZero
+        { make r3 point again to the actual store position }
+        addi      r3,r3,4
+LFillCharDCBZLoop:
+        dcbz      0,r3
+        addi      r3,r3,32
+        bdnz      LFillCharDCBZLoop
+        { if there was no rest, we're finished (cr0 is still the one set }
+        { by the rlwinm. above)                                          }
+        beq       LFillCharDone
+        { r3 points to the next byte to store, as both tails expect }
+        blt       cr7,LFillCharVerySmall
+        b         LFillCharSmall
+LFillCharNoZero:
+        { build a double containing 8 times value in f0 via memory }
+{$ifdef ABI_AIX}
+        { the red zone lies below the stack pointer; 0(sp) and 4(sp) hold }
+        { the back chain and the cr save word and must not be overwritten }
+        stw       r5,-8(sp)
+        stw       r5,-4(sp)
+        lfd       f0,-8(sp)
+{$else ABI_AIX}
+        stw       r5,temp.l1
+        stw       r5,temp.l2
+        lfd       f0,temp.d
+{$endif ABI_AIX}
+        { make r3 point to address-8, so we're able to use fp double stores }
+        { with update (it's already -4 now)                                 }
+        subi      r3,r3,4
+        { load r10 with 8, so that dcbz uses the correct address (r3+8 is }
+        { the start of the block that is about to be filled)              }
+        li        r10,8
+LFillChar32ByteLoop:
+        dcbz      r3,r10
+        stfdu     f0,8(r3)
+        stfdu     f0,8(r3)
+        stfdu     f0,8(r3)
+        stfdu     f0,8(r3)
+        bdnz      LFillChar32ByteLoop
+        { undo the -8 bias: r3 must point to the next byte to store again }
+        { (addi does not touch cr0)                                       }
+        addi      r3,r3,8
+        { if there was no rest, we're finished }
+        beq       LFillCharDone
+        { too few bytes left for the word loop? }
+        blt       cr7,LFillCharVerySmall
+LFillCharSmall:
+        { when we arrive here, we're already 4 byte aligned, r3 points to }
+        { the next byte to store and r4 >= 4                              }
+        { get count div 4 to store dwords }
+        srwi      r10,r4,2
+        { get ready for use of update stores }
+        subi      r3,r3,4
+        mtctr     r10
+        { r4 := count and 3 (recording in cr0 whether anything is left) }
+        rlwinm.   r4,r4,0,31-2+1,31
+LFillCharSmallLoop:
+        stwu      r5,4(r3)
+        bdnz      LFillCharSmallLoop
+        { if nothing left, stop }
+        beq       LFillCharDone
+        { get ready to store bytes }
+        addi      r3,r3,4
+LFillCharVerySmall:
+        { r3 points to the next byte to store, r4 = bytes left (> 0) }
+        mtctr     r4
+        subi      r3,r3,1
+LFillCharVerySmallLoop:
+        stbu      r5,1(r3)
+        bdnz      LFillCharVerySmallLoop
+LFillCharDone:
+end ['R3','R4','R5','R10','F0','CTR','CR0','CR1','CR6','CR7'];
 
 
@@ -758,7 +842,12 @@ end ['r3','r10'];
 
 {
   $Log$
-  Revision 1.6  2001-09-27 15:30:29  jonas
+  Revision 1.7  2001-09-28 13:28:49  jonas
+    * small changes to move (a different count threshold now selects moving
+      bytes instead of dwords/doubles, and the dcbt instruction was moved)
+    + implemented fillchar (untested)
+
+  Revision 1.6  2001/09/27 15:30:29  jonas
     * conversion to compilerproc and to structure used by i386 rtl
     * some bugfixes
     * powerpc.inc is almost complete (only fillchar/word/dword, get_frame etc