6 years ago · dd2d1bf68b
--- a/compiler/avr/cgcpu.pas
+++ b/compiler/avr/cgcpu.pas
@@ -438,6 +438,11 @@ unit cgcpu;
 
				 
			
 
				 
			
 
				      procedure tcgavr.a_op_const_reg_reg(list: TAsmList; op: TOpCg; size: tcgsize; a: tcgint; src, dst: tregister);
			
 
				+       var
			
 
				+         tmpSrc, tmpDst, countreg: TRegister;
			
 
				+         b, b2, i, j: byte;
			
 
				+         s1, s2, t1: integer;
			
 
				+         l1: TAsmLabel;
			
 
				        begin
			
 
				          if (op in [OP_MUL,OP_IMUL]) and (size in [OS_16,OS_S16]) and (a in [2,4,8]) then
			
 
				            begin
			
@@ -451,6 +456,102 @@ unit cgcpu;
 
				                  a:=a shr 1;
			
 
				                end;
			
 
				            end
			
 
				+
			
 
				+         else if (op in [OP_SHL,OP_SHR]) and
			
 
				+           { a=0 get eliminated later by tcg.optimize_op_const }
			
 
				+           (a>0)  then
			
 
				+           begin
			
 
				+             { number of bytes to shift }
			
 
				+             b:=a div 8;
			
 
				+
			
 
				+             { Ensure that b is never larger than base type }
			
 
				+             if b>tcgsize2size[size] then
			
 
				+               begin
			
 
				+                 b:=tcgsize2size[size];
			
 
				+                 b2:=0;
			
 
				+               end
			
 
				+             else
			
 
				+               b2:=a mod 8;
			
 
				+
			
 
				+             if b < tcgsize2size[size] then
			
 
				+               { copy from src to dst accounting for shift offset }
			
 
				+               for i:=0 to (tcgsize2size[size]-b-1) do
			
 
				+                 if op=OP_SHL then
			
 
				+                   a_load_reg_reg(list,OS_8,OS_8,
			
 
				+                     GetOffsetReg64(src,NR_NO,i),
			
 
				+                     GetOffsetReg64(dst,NR_NO,i+b))
			
 
				+                 else
			
 
				+                   a_load_reg_reg(list,OS_8,OS_8,
			
 
				+                     GetOffsetReg64(src,NR_NO,i+b),
			
 
				+                     GetOffsetReg64(dst,NR_NO,i));
			
 
				+
			
 
				+             { remaining bit shifts }
			
 
				+             if b2 > 0 then
			
 
				+               begin
			
 
				+                 { Cost of loop }
			
 
				+                 s1:=3+tcgsize2size[size]-b;
			
 
				+                 t1:=b2*(tcgsize2size[size]-b+3);
			
 
				+                 { Cost of loop unrolling,t2=s2 }
			
 
				+                 s2:=b2*(tcgsize2size[size]-b);
			
 
				+
			
 
				+                 if ((cs_opt_size in current_settings.optimizerswitches) and (s1<s2)) or
			
 
				+                    (((s2-s1)-t1/s2)>0) then
			
 
				+                   begin
			
 
				+                     { Shift non-moved bytes in loop }
			
 
				+                     current_asmdata.getjumplabel(l1);
			
 
				+                     countreg:=getintregister(list,OS_8);
			
 
				+                     a_load_const_reg(list,OS_8,b2,countreg);
			
 
				+                     cg.a_label(list,l1);
			
 
				+                     if op=OP_SHL then
			
 
				+                       list.concat(taicpu.op_reg(A_LSL,GetOffsetReg64(dst,NR_NO,b)))
			
 
				+                     else
			
 
				+                       list.concat(taicpu.op_reg(A_LSR,GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-1-b)));
			
 
				+
			
 
				+                     if size in [OS_S16,OS_16,OS_S32,OS_32,OS_S64,OS_64] then
			
 
				+                       begin
			
 
				+                         for i:=2+b to tcgsize2size[size] do
			
 
				+                           if op=OP_SHL then
			
 
				+                             list.concat(taicpu.op_reg(A_ROL,GetOffsetReg64(dst,NR_NO,i-1)))
			
 
				+                           else
			
 
				+                             list.concat(taicpu.op_reg(A_ROR,GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-i-b)));
			
 
				+                       end;
			
 
				+                     list.concat(taicpu.op_reg(A_DEC,countreg));
			
 
				+                     a_jmp_flags(list,F_NE,l1);
			
 
				+                     { keep registers alive }
			
 
				+                     a_reg_sync(list,countreg);
			
 
				+                   end
			
 
				+                 else
			
 
				+                   begin
			
 
				+                     { Unroll shift loop over non-moved bytes }
			
 
				+                     for j:=1 to b2 do
			
 
				+                     begin
			
 
				+                       if op=OP_SHL then
			
 
				+                         list.concat(taicpu.op_reg(A_LSL,
			
 
				+                         GetOffsetReg64(dst,NR_NO,b)))
			
 
				+                       else
			
 
				+                         list.concat(taicpu.op_reg(A_LSR,
			
 
				+                           GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-b-1)));
			
 
				+
			
 
				+                       if not(size in [OS_8,OS_S8]) then
			
 
				+                         for i:=2 to tcgsize2size[size]-b do
			
 
				+                           if op=OP_SHL then
			
 
				+                             list.concat(taicpu.op_reg(A_ROL,
			
 
				+                               GetOffsetReg64(dst,NR_NO,b+i-1)))
			
 
				+                           else
			
 
				+                             list.concat(taicpu.op_reg(A_ROR,
			
 
				+                               GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-b-i)));
			
 
				+                     end;
			
 
				+                   end;
			
 
				+               end;
			
 
				+
			
 
				+               { fill skipped destination registers with 0
			
 
				+                 Do last,then optimizer can optimize register moves }
			
 
				+               for i:=1 to b do
			
 
				+                 if op=OP_SHL then
			
 
				+                   emit_mov(list,GetOffsetReg64(dst,NR_NO,i-1),NR_R1)
			
 
				+                 else
			
 
				+                   emit_mov(list,GetOffsetReg64(dst,NR_NO,tcgsize2size[size]-i),NR_R1);
			
 
				+           end
			
 
				          else
			
 
				            inherited a_op_const_reg_reg(list,op,size,a,src,dst);
			
 
				        end;
			
@@ -687,8 +788,8 @@ unit cgcpu;
 
				 
			
 
				                list.concat(taicpu.op_reg(A_DEC,countreg));
			
 
				                a_jmp_flags(list,F_NE,l1);
			
 
				-               // keep registers alive
			
 
				-               list.concat(taicpu.op_reg_reg(A_MOV,countreg,countreg));
			
 
				+               { keep registers alive }
			
 
				+               a_reg_sync(list,countreg);
			
 
				                cg.a_label(list,l2);
			
 
				              end;
			
 
				 
			
@@ -2417,8 +2518,8 @@ unit cgcpu;
 
				             cg.ungetcpuregister(list,NR_R27);
			
 
				             cg.ungetcpuregister(list,NR_R30);
			
 
				             cg.ungetcpuregister(list,NR_R31);
			
 
				-            // keep registers alive
			
 
				-            list.concat(taicpu.op_reg_reg(A_MOV,countreg,countreg));
			
 
				+            { keep registers alive }
			
 
				+            a_reg_sync(list,countreg);
			
 
				           end
			
 
				         else
			
 
				           begin