Browse Source

+ inlined versions of concatcopy

git-svn-id: trunk@29959 -
Jonas Maebe 10 years ago
parent
commit
07455fb889
1 changed files with 408 additions and 129 deletions
  1. 408 129
      compiler/aarch64/cgcpu.pas

+ 408 - 129
compiler/aarch64/cgcpu.pas

@@ -92,9 +92,8 @@ interface
         procedure g_maybe_got_init(list: TAsmList); override;
         procedure g_restore_registers(list: TAsmList);override;
         procedure g_save_registers(list: TAsmList);override;
+        procedure g_concatcopy_move(list: TAsmList; const source, dest: treference; len: tcgint);
         procedure g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);override;
-        procedure g_concatcopy_unaligned(list: TAsmList; const source, dest: treference; len: tcgint);override;
-        procedure g_concatcopy_move(list: TAsmList; const source, dest: treference; len : tcgint);
         procedure g_adjust_self_value(list: TAsmList; procdef: tprocdef; ioffset: tcgint);override;
         procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override;
        private
@@ -1468,7 +1467,6 @@ implementation
       end;
 
 
-
     procedure tcgaarch64.g_proc_entry(list: TAsmList; localsize: longint; nostackframe: boolean);
       var
         ref: treference;
@@ -1676,152 +1674,433 @@ implementation
       end;
 
 
-    procedure tcgaarch64.g_concatcopy(list:TAsmList;const source,dest:treference;len:tcgint);
-(*
+    procedure tcgaarch64.g_concatcopy(list: TAsmList; const source, dest: treference; len: tcgint);
+
       var
-        tmpreg1,
-        hreg,
+        sourcebasereplaced, destbasereplaced: boolean;
+
+      { get optimal memory operation to use for loading/storing data
+        in an unrolled loop }
+      procedure getmemop(scaledop, unscaledop: tasmop; const startref, endref: treference; opsize: tcgsize; postfix: toppostfix; out memop: tasmop; out needsimplify: boolean);
+        begin
+          if (simple_ref_type(scaledop,opsize,postfix,startref)=sr_simple) and
+             (simple_ref_type(scaledop,opsize,postfix,endref)=sr_simple) then
+            begin
+              memop:=unscaledop;
+              needsimplify:=true;
+            end
+          else if (unscaledop<>A_NONE) and
+             (simple_ref_type(unscaledop,opsize,postfix,startref)=sr_simple) and
+             (simple_ref_type(unscaledop,opsize,postfix,endref)=sr_simple) then
+            begin
+              memop:=unscaledop;
+              needsimplify:=false;
+            end
+          else
+            begin
+              memop:=scaledop;
+              needsimplify:=true;
+            end;
+        end;
+
+      { adjust the offset and/or addressing mode after a load/store so it's
+        correct for the next one of the same size }
+      procedure updaterefafterloadstore(var ref: treference; oplen: longint);
+        begin
+          case ref.addressmode of
+            AM_OFFSET:
+              inc(ref.offset,oplen);
+            AM_POSTINDEXED:
+              { base register updated by instruction, next offset can remain
+                the same }
+              ;
+            AM_PREINDEXED:
+              begin
+                { base register updated by instruction -> next instruction can
+                  use post-indexing with offset = sizeof(operation) }
+                ref.offset:=0;
+                ref.addressmode:=AM_OFFSET;
+              end;
+          end;
+        end;
+
+      { generate a load/store and adjust the reference offset to the next
+        memory location if necessary }
+      procedure genloadstore(list: TAsmList; op: tasmop; reg: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
+        begin
+          list.concat(setoppostfix(taicpu.op_reg_ref(op,reg,ref),postfix));
+          updaterefafterloadstore(ref,tcgsize2size[opsize]);
+        end;
+
+      { generate a dual load/store (ldp/stp) and adjust the reference offset to
+        the next memory location if necessary }
+      procedure gendualloadstore(list: TAsmList; op: tasmop; reg1, reg2: tregister; var ref: treference; postfix: toppostfix; opsize: tcgsize);
+        begin
+          list.concat(setoppostfix(taicpu.op_reg_reg_ref(op,reg1,reg2,ref),postfix));
+          updaterefafterloadstore(ref,tcgsize2size[opsize]*2);
+        end;
+
+      { turn a reference into a pre- or post-indexed reference for use in a
+        load/store of a particular size }
+      procedure makesimpleforcopy(list: TAsmList; var scaledop: tasmop; opsize: tcgsize; postfix: toppostfix; forcepostindexing: boolean; var ref: treference; var basereplaced: boolean);
+        var
+          tmpreg: tregister;
+          scaledoffset: longint;
+          orgaddressmode: taddressmode;
+        begin
+          scaledoffset:=tcgsize2size[opsize];
+          if scaledop in [A_LDP,A_STP] then
+            scaledoffset:=scaledoffset*2;
+          { can we use the reference as post-indexed without changes? }
+          if forcepostindexing then
+            begin
+              orgaddressmode:=ref.addressmode;
+              ref.addressmode:=AM_POSTINDEXED;
+              if (orgaddressmode=AM_POSTINDEXED) or
+                 ((ref.offset=0) and
+                  (simple_ref_type(scaledop,opsize,postfix,ref)=sr_simple)) then
+                begin
+                  { just change the post-indexed offset to the access size }
+                  ref.offset:=scaledoffset;
+                  { and replace the base register if that didn't happen yet
+                    (could be sp or a regvar) }
+                  if not basereplaced then
+                    begin
+                      tmpreg:=getaddressregister(list);
+                      a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
+                      ref.base:=tmpreg;
+                      basereplaced:=true;
+                    end;
+                  exit;
+                end;
+              ref.addressmode:=orgaddressmode;
+            end;
+{$ifdef dummy}
+          This could in theory be useful in case you have a concatcopy from
+          e.g. x1+255 to x1+267 *and* the reference is aligned, but this seems
+          very unlikely. Disabled because it still needs fixes, as it
+          also generates pre-indexed loads right now at the very end for the
+          left-over gencopies
+
+          { can we turn it into a pre-indexed reference for free? (after the
+            first operation, it will be turned into an offset one) }
+          if not forcepostindexing and
+             (ref.offset<>0) then
+            begin
+              orgaddressmode:=ref.addressmode;
+              ref.addressmode:=AM_PREINDEXED;
+              tmpreg:=ref.base;
+              if not basereplaced and
+                 (ref.base=tmpreg) then
+                begin
+                  tmpreg:=getaddressregister(list);
+                  a_load_reg_reg(list,OS_ADDR,OS_ADDR,ref.base,tmpreg);
+                  ref.base:=tmpreg;
+                  basereplaced:=true;
+                end;
+              if simple_ref_type(scaledop,opsize,postfix,ref)<>sr_simple then
+                make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
+              exit;
+            end;
+{$endif dummy}
+          if not forcepostindexing then
+            begin
+              ref.addressmode:=AM_OFFSET;
+              make_simple_ref(list,scaledop,opsize,postfix,ref,NR_NO);
+              { this may still cause problems if the final offset is no longer
+                a simple ref; it's a bit complicated to pass all information
+                through at all places and check that here, so play safe: we
+                currently never generate unrolled copies for more than 64
+                bytes (32 with non-double-register copies) }
+              if ref.index=NR_NO then
+                begin
+                  if ((scaledop in [A_LDP,A_STP]) and
+                      (ref.offset<((64-8)*tcgsize2size[opsize]))) or
+                     ((scaledop in [A_LDUR,A_STUR]) and
+                      (ref.offset<(255-8*tcgsize2size[opsize]))) or
+                     ((scaledop in [A_LDR,A_STR]) and
+                      (ref.offset<((4096-8)*tcgsize2size[opsize]))) then
+                    exit;
+                end;
+            end;
+          tmpreg:=getaddressregister(list);
+          a_loadaddr_ref_reg(list,ref,tmpreg);
+          basereplaced:=true;
+          if forcepostindexing then
+            begin
+              reference_reset_base(ref,tmpreg,scaledoffset,ref.alignment);
+              ref.addressmode:=AM_POSTINDEXED;
+            end
+          else
+            begin
+              reference_reset_base(ref,tmpreg,0,ref.alignment);
+              ref.addressmode:=AM_OFFSET;
+            end
+        end;
+
+      { prepare a reference for use by gencopy. This is done both after the
+        unrolled and regular copy loop -> get rid of post-indexing mode, make
+        sure ref is valid }
+      procedure preparecopy(list: tasmlist; scaledop, unscaledop: tasmop; var ref: treference; opsize: tcgsize; postfix: toppostfix; out op: tasmop; var basereplaced: boolean);
+        var
+          simplify: boolean;
+        begin
+          if ref.addressmode=AM_POSTINDEXED then
+            ref.offset:=tcgsize2size[opsize];
+          getmemop(scaledop,scaledop,ref,ref,opsize,postfix,op,simplify);
+          if simplify then
+            begin
+              makesimpleforcopy(list,scaledop,opsize,postfix,false,ref,basereplaced);
+              op:=scaledop;
+            end;
+        end;
+
+      { generate a copy from source to dest of size opsize/postfix }
+      procedure gencopy(list: TAsmList; var source, dest: treference; postfix: toppostfix; opsize: tcgsize);
+        var
+          reg: tregister;
+          loadop, storeop: tasmop;
+        begin
+          preparecopy(list,A_LDR,A_LDUR,source,opsize,postfix,loadop,sourcebasereplaced);
+          preparecopy(list,A_STR,A_STUR,dest,opsize,postfix,storeop,destbasereplaced);
+          reg:=getintregister(list,opsize);
+          genloadstore(list,loadop,reg,source,postfix,opsize);
+          genloadstore(list,storeop,reg,dest,postfix,opsize);
+        end;
+
+
+      { copy the leftovers after an unrolled or regular copy loop }
+      procedure gencopyleftovers(list: TAsmList; var source, dest: treference; len: longint);
+        begin
+          { stop post-indexing if we did so in the loop, since in that case all
+            offsets definitely can be represented now }
+          if source.addressmode=AM_POSTINDEXED then
+            begin
+              source.addressmode:=AM_OFFSET;
+              source.offset:=0;
+            end;
+          if dest.addressmode=AM_POSTINDEXED then
+            begin
+              dest.addressmode:=AM_OFFSET;
+              dest.offset:=0;
+            end;
+          { transfer the leftovers }
+          if len>=8 then
+            begin
+              dec(len,8);
+              gencopy(list,source,dest,PF_NONE,OS_64);
+            end;
+          if len>=4 then
+            begin
+              dec(len,4);
+              gencopy(list,source,dest,PF_NONE,OS_32);
+            end;
+          if len>=2 then
+            begin
+              dec(len,2);
+              gencopy(list,source,dest,PF_H,OS_16);
+            end;
+          if len>=1 then
+            begin
+              dec(len);
+              gencopy(list,source,dest,PF_B,OS_8);
+            end;
+        end;
+
+
+      const
+        { load_length + loop dec + cbnz }
+        loopoverhead=12;
+        { loop overhead + load + store }
+        totallooplen=loopoverhead + 8;
+      var
+        totalalign: longint;
+        maxlenunrolled: tcgint;
+        loadop, storeop: tasmop;
+        opsize: tcgsize;
+        postfix: toppostfix;
+        tmpsource, tmpdest: treference;
+        scaledstoreop, unscaledstoreop,
+        scaledloadop, unscaledloadop: tasmop;
+        regs: array[1..8] of tregister;
         countreg: tregister;
-        src, dst: treference;
-        lab: tasmlabel;
-        count, count2: aint;
-*)
+        i, regcount: longint;
+        hl: tasmlabel;
+        simplifysource, simplifydest: boolean;
       begin
-(*
-        { anybody wants to determine a good value here :)? }
-        if len>100 then
-*)
-          g_concatcopy_move(list,source,dest,len)
-(*
+        if len=0 then
+          exit;
+        sourcebasereplaced:=false;
+        destbasereplaced:=false;
+        { maximum common alignment }
+        totalalign:=max(1,newalignment(source.alignment,dest.alignment));
+        { use a simple load/store? }
+        if (len in [1,2,4,8]) and
+           ((totalalign>=(len div 2)) or
+            (source.alignment=len) or
+            (dest.alignment=len)) then
+          begin
+            opsize:=int_cgsize(len);
+            a_load_ref_ref(list,opsize,opsize,source,dest);
+            exit;
+          end;
+
+        { alignment > length is not useful, and would break some checks below }
+        while totalalign>len do
+          totalalign:=totalalign div 2;
+
+        { operation sizes to use based on common alignment }
+        case totalalign of
+          1:
+            begin
+              postfix:=PF_B;
+              opsize:=OS_8;
+            end;
+          2:
+            begin
+              postfix:=PF_H;
+              opsize:=OS_16;
+            end;
+          4:
+            begin
+              postfix:=PF_None;
+              opsize:=OS_32;
+            end
+          else
+            begin
+              totalalign:=8;
+              postfix:=PF_None;
+              opsize:=OS_64;
+            end;
+        end;
+        { maximum length to handled with an unrolled loop (4 loads + 4 stores) }
+        maxlenunrolled:=min(totalalign,8)*4;
+        { ldp/stp -> 2 registers per instruction }
+        if (totalalign>=4) and
+           (len>=totalalign*2) then
+          begin
+            maxlenunrolled:=maxlenunrolled*2;
+            scaledstoreop:=A_STP;
+            scaledloadop:=A_LDP;
+            unscaledstoreop:=A_NONE;
+            unscaledloadop:=A_NONE;
+          end
         else
           begin
-            count:=len div 4;
-            if (count<=4) and reference_is_reusable(source) then
-              src:=source
-            else
-              begin
-                reference_reset_base(src,getintregister(list,OS_ADDR),0,sizeof(aint));
-                a_loadaddr_ref_reg(list,source,src.base);
-              end;
-            if (count<=4) and reference_is_reusable(dest) then
-              dst:=dest
-            else
-              begin
-                reference_reset_base(dst,getintregister(list,OS_ADDR),0,sizeof(aint));
-                a_loadaddr_ref_reg(list,dest,dst.base);
-              end;
-            { generate a loop }
-            if count>4 then
-              begin
-                countreg:=GetIntRegister(list,OS_INT);
-                tmpreg1:=GetIntRegister(list,OS_INT);
-                a_load_const_reg(list,OS_INT,count,countreg);
-                current_asmdata.getjumplabel(lab);
-                a_label(list, lab);
-                list.concat(taicpu.op_ref_reg(A_LD,src,tmpreg1));
-                list.concat(taicpu.op_reg_ref(A_ST,tmpreg1,dst));
-                list.concat(taicpu.op_reg_const_reg(A_ADD,src.base,4,src.base));
-                list.concat(taicpu.op_reg_const_reg(A_ADD,dst.base,4,dst.base));
-                list.concat(taicpu.op_reg_const_reg(A_SUBcc,countreg,1,countreg));
-                a_jmp_cond(list,OC_NE,lab);
-                len := len mod 4;
-              end;
-            { unrolled loop }
-            count:=len div 4;
-            if count>0 then
+            scaledstoreop:=A_STR;
+            scaledloadop:=A_LDR;
+            unscaledstoreop:=A_STUR;
+            unscaledloadop:=A_LDUR;
+          end;
+        { we only need 4 instructions extra to call FPC_MOVE }
+        if cs_opt_size in current_settings.optimizerswitches then
+          maxlenunrolled:=maxlenunrolled div 2;
+        if (len>maxlenunrolled) and
+           (len>totalalign*8) then
+          begin
+            g_concatcopy_move(list,source,dest,len);
+            exit;
+          end;
+
+        simplifysource:=true;
+        simplifydest:=true;
+        tmpsource:=source;
+        tmpdest:=dest;
+        { can we directly encode all offsets in an unrolled loop? }
+        if len<=maxlenunrolled then
+          begin
+{$ifdef extdebug}
+            list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop; len/opsize/align: '+tostr(len)+'/'+tostr(tcgsize2size[opsize])+'/'+tostr(totalalign))));
+{$endif extdebug}
+            { the leftovers will be handled separately -> -(len mod opsize) }
+            inc(tmpsource.offset,len-(len mod tcgsize2size[opsize]));
+            { additionally, the last regular load/store will be at
+              offset+len-opsize (if len-(len mod opsize)>len) }
+            if tmpsource.offset>source.offset then
+              dec(tmpsource.offset,tcgsize2size[opsize]);
+            getmemop(scaledloadop,unscaledloadop,source,tmpsource,opsize,postfix,loadop,simplifysource);
+            inc(tmpdest.offset,len-(len mod tcgsize2size[opsize]));
+            if tmpdest.offset>dest.offset then
+              dec(tmpdest.offset,tcgsize2size[opsize]);
+            getmemop(scaledstoreop,unscaledstoreop,dest,tmpdest,opsize,postfix,storeop,simplifydest);
+            tmpsource:=source;
+            tmpdest:=dest;
+            { if we can't directly encode all offsets, simplify }
+            if simplifysource then
               begin
-                tmpreg1:=GetIntRegister(list,OS_INT);
-                for count2 := 1 to count do
-                  begin
-                    list.concat(taicpu.op_ref_reg(A_LD,src,tmpreg1));
-                    list.concat(taicpu.op_reg_ref(A_ST,tmpreg1,dst));
-                    inc(src.offset,4);
-                    inc(dst.offset,4);
-                  end;
-                len := len mod 4;
+                loadop:=scaledloadop;
+                makesimpleforcopy(list,loadop,opsize,postfix,false,tmpsource,sourcebasereplaced);
               end;
-            if (len and 4) <> 0 then
+            if simplifydest then
               begin
-                hreg:=GetIntRegister(list,OS_INT);
-                a_load_ref_reg(list,OS_32,OS_32,src,hreg);
-                a_load_reg_ref(list,OS_32,OS_32,hreg,dst);
-                inc(src.offset,4);
-                inc(dst.offset,4);
+                storeop:=scaledstoreop;
+                makesimpleforcopy(list,storeop,opsize,postfix,false,tmpdest,destbasereplaced);
               end;
-            { copy the leftovers }
-            if (len and 2) <> 0 then
+            regcount:=len div tcgsize2size[opsize];
+            { in case we transfer two registers at a time, we copy an even
+              number of registers }
+            if loadop=A_LDP then
+              regcount:=regcount and not(1);
+            { max 4 loads/stores -> max 8 registers (in case of ldp/stdp) }
+            for i:=1 to regcount do
+              regs[i]:=getintregister(list,opsize);
+            if loadop=A_LDP then
               begin
-                hreg:=GetIntRegister(list,OS_INT);
-                a_load_ref_reg(list,OS_16,OS_16,src,hreg);
-                a_load_reg_ref(list,OS_16,OS_16,hreg,dst);
-                inc(src.offset,2);
-                inc(dst.offset,2);
-              end;
-            if (len and 1) <> 0 then
+                { load registers }
+                for i:=1 to (regcount div 2) do
+                  gendualloadstore(list,loadop,regs[i*2-1],regs[i*2],tmpsource,postfix,opsize);
+                { store registers }
+                for i:=1 to (regcount div 2) do
+                  gendualloadstore(list,storeop,regs[i*2-1],regs[i*2],tmpdest,postfix,opsize);
+              end
+            else
               begin
-                hreg:=GetIntRegister(list,OS_INT);
-                a_load_ref_reg(list,OS_8,OS_8,src,hreg);
-                a_load_reg_ref(list,OS_8,OS_8,hreg,dst);
+                for i:=1 to regcount do
+                  genloadstore(list,loadop,regs[i],tmpsource,postfix,opsize);
+                for i:=1 to regcount do
+                  genloadstore(list,storeop,regs[i],tmpdest,postfix,opsize);
               end;
-          end;
-*)
-      end;
-
-
-    procedure tcgaarch64.g_concatcopy_unaligned(list : TAsmList;const source,dest : treference;len : tcgint);
-(*
-      var
-        src, dst: treference;
-        tmpreg1,
-        countreg: tregister;
-        i : aint;
-        lab: tasmlabel;
-*)
-      begin
-(*
-        if len>31 then
-*)
-          g_concatcopy_move(list,source,dest,len)
-(*
+            { leftover }
+            len:=len-regcount*tcgsize2size[opsize];
+{$ifdef extdebug}
+            list.concat(tai_comment.Create(strpnew('concatcopy unrolled loop leftover: '+tostr(len))));
+{$endif extdebug}
+          end
         else
           begin
-            reference_reset(src,source.alignment);
-            reference_reset(dst,dest.alignment);
-            { load the address of source into src.base }
-            src.base:=GetAddressRegister(list);
-            a_loadaddr_ref_reg(list,source,src.base);
-            { load the address of dest into dst.base }
-            dst.base:=GetAddressRegister(list);
-            a_loadaddr_ref_reg(list,dest,dst.base);
-            { generate a loop }
-            if len>4 then
+{$ifdef extdebug}
+            list.concat(tai_comment.Create(strpnew('concatcopy regular loop; len/align: '+tostr(len)+'/'+tostr(totalalign))));
+{$endif extdebug}
+            { regular loop -> definitely use post-indexing }
+            loadop:=scaledloadop;
+            makesimpleforcopy(list,loadop,opsize,postfix,true,tmpsource,sourcebasereplaced);
+            storeop:=scaledstoreop;
+            makesimpleforcopy(list,storeop,opsize,postfix,true,tmpdest,destbasereplaced);
+            current_asmdata.getjumplabel(hl);
+            countreg:=getintregister(list,OS_32);
+            if loadop=A_LDP then
+              a_load_const_reg(list,OS_32,len div tcgsize2size[opsize]*2,countreg)
+            else
+              a_load_const_reg(list,OS_32,len div tcgsize2size[opsize],countreg);
+            a_label(list,hl);
+            a_op_const_reg(list,OP_SUB,OS_32,1,countreg);
+            if loadop=A_LDP then
               begin
-                countreg:=GetIntRegister(list,OS_INT);
-                tmpreg1:=GetIntRegister(list,OS_INT);
-                a_load_const_reg(list,OS_INT,len,countreg);
-                current_asmdata.getjumplabel(lab);
-                a_label(list, lab);
-                list.concat(taicpu.op_ref_reg(A_LDUB,src,tmpreg1));
-                list.concat(taicpu.op_reg_ref(A_STB,tmpreg1,dst));
-                list.concat(taicpu.op_reg_const_reg(A_ADD,src.base,1,src.base));
-                list.concat(taicpu.op_reg_const_reg(A_ADD,dst.base,1,dst.base));
-                list.concat(taicpu.op_reg_const_reg(A_SUBcc,countreg,1,countreg));
-                a_jmp_cond(list,OC_NE,lab);
+                regs[1]:=getintregister(list,opsize);
+                regs[2]:=getintregister(list,opsize);
+                gendualloadstore(list,loadop,regs[1],regs[2],tmpsource,postfix,opsize);
+                gendualloadstore(list,storeop,regs[1],regs[2],tmpdest,postfix,opsize);
               end
             else
               begin
-                { unrolled loop }
-                tmpreg1:=GetIntRegister(list,OS_INT);
-                for i:=1 to len do
-                  begin
-                    list.concat(taicpu.op_ref_reg(A_LDUB,src,tmpreg1));
-                    list.concat(taicpu.op_reg_ref(A_STB,tmpreg1,dst));
-                    inc(src.offset);
-                    inc(dst.offset);
-                  end;
+                regs[1]:=getintregister(list,opsize);
+                genloadstore(list,loadop,regs[1],tmpsource,postfix,opsize);
+                genloadstore(list,storeop,regs[1],tmpdest,postfix,opsize);
               end;
+            list.concat(taicpu.op_reg_sym_ofs(A_CBNZ,countreg,hl,0));
+            len:=len mod tcgsize2size[opsize];
           end;
-*)
+        gencopyleftovers(list,tmpsource,tmpdest,len);
       end;