2
0
Эх сурвалжийг харах

+ Support omitting the frame pointer on arm even in procedures with incoming parameters
on the stack; this can be enabled by -OoFORCENOSTACKFRAME. This reduces the required
entry/exit code and makes an extra register available to the compiler. However, since this is
based on an estimation of the required stack size, it has two potential drawbacks:
either the stack frame is estimated too big, so the program requires a bigger stack than needed,
or it is estimated too small, in which case the compiler throws an internalerror during compilation. These
issues can be overcome as soon as the compiler supports recompiling subroutines if needed.

git-svn-id: trunk@27239 -

florian 11 жил өмнө
parent
commit
686a2d2f3f

+ 31 - 15
compiler/arm/cgcpu.pas

@@ -1762,14 +1762,18 @@ unit cgcpu;
          r : byte;
          mmregs,
          regs, saveregs : tcpuregisterset;
+         registerarea,
          r7offset,
          stackmisalignment : pint;
          postfix: toppostfix;
          imm1, imm2: DWord;
+         stack_parameters : Boolean;
       begin
         LocalSize:=align(LocalSize,4);
+        stack_parameters:=current_procinfo.procdef.stack_tainting_parameter(calleeside);
+
         { call instruction does not put anything on the stack }
-        stackmisalignment:=0;
+        registerarea:=0;
         tarmprocinfo(current_procinfo).stackpaddingreg:=High(TSuperRegister);
         lastfloatreg:=RS_NO;
         if not(nostackframe) then
@@ -1789,7 +1793,7 @@ unit cgcpu;
                         if firstfloatreg=RS_NO then
                           firstfloatreg:=r;
                         lastfloatreg:=r;
-                        inc(stackmisalignment,12);
+                        inc(registerarea,12);
                       end;
                 end;
               fpu_vfpv2,
@@ -1829,16 +1833,16 @@ unit cgcpu;
                    begin
                      for r:=RS_R0 to RS_R15 do
                        if r in regs then
-                         inc(stackmisalignment,4);
+                         inc(registerarea,4);
 
                      { if the stack is not 8 byte aligned, try to add an extra register,
                        so we can avoid the extra sub/add ...,#4 later (KB) }
-                     if ((stackmisalignment mod current_settings.alignment.localalignmax) <> 0) then
+                     if ((registerarea mod current_settings.alignment.localalignmax) <> 0) then
                        for r:=RS_R3 downto RS_R0 do
                          if not(r in regs) then
                            begin
                              regs:=regs+[r];
-                             inc(stackmisalignment,4);
+                             inc(registerarea,4);
                              tarmprocinfo(current_procinfo).stackpaddingreg:=r;
                              break;
                            end;
@@ -1876,7 +1880,7 @@ unit cgcpu;
                     for r:=RS_R0 to RS_R15 do
                       if r in saveregs then
                         begin
-                          inc(stackmisalignment,4);
+                          inc(registerarea,4);
                           if r<RS_FRAME_POINTER_REG then
                             inc(r7offset,4);
                         end;
@@ -1894,19 +1898,26 @@ unit cgcpu;
                       begin
                         for r:=RS_R8 to RS_R11 do
                           if r in saveregs then
-                            inc(stackmisalignment,4);
+                            inc(registerarea,4);
                         list.concat(setoppostfix(taicpu.op_ref_regset(A_STM,ref,R_INTREGISTER,R_SUBWHOLE,saveregs),PF_FD));
                       end;
                   end;
               end;
 
-            stackmisalignment:=stackmisalignment mod current_settings.alignment.localalignmax;
+            stackmisalignment:=registerarea mod current_settings.alignment.localalignmax;
             if (LocalSize<>0) or
                ((stackmisalignment<>0) and
                 ((pi_do_call in current_procinfo.flags) or
                  (po_assembler in current_procinfo.procdef.procoptions))) then
               begin
                 localsize:=align(localsize+stackmisalignment,current_settings.alignment.localalignmax)-stackmisalignment;
+                if stack_parameters and (pi_estimatestacksize in current_procinfo.flags) then
+                  begin
+                    if localsize>tarmprocinfo(current_procinfo).stackframesize then
+                      internalerror(2014030901)
+                    else
+                      localsize:=tarmprocinfo(current_procinfo).stackframesize-registerarea;
+                  end;
                 if is_shifter_const(localsize,shift) then
                   begin
                     a_reg_dealloc(list,NR_R12);
@@ -1989,6 +2000,7 @@ unit cgcpu;
          mmregs,
          saveregs,
          regs : tcpuregisterset;
+         registerarea,
          stackmisalignment: pint;
          paddingreg: TSuperRegister;
          mmpostfix: toppostfix;
@@ -1996,7 +2008,7 @@ unit cgcpu;
       begin
         if not(nostackframe) then
           begin
-            stackmisalignment:=0;
+            registerarea:=0;
             firstfloatreg:=RS_NO;
             lastfloatreg:=RS_NO;
             mmregs:=[];
@@ -2016,7 +2028,7 @@ unit cgcpu;
                         lastfloatreg:=r;
                         { floating point register space is already included in
                           localsize below by calc_stackframe_size
-                         inc(stackmisalignment,12);
+                         inc(registerarea,12);
                         }
                       end;
                 end;
@@ -2108,13 +2120,13 @@ unit cgcpu;
                     ref.addressmode:=AM_PREINDEXED;
                     for r:=RS_R8 to RS_R11 do
                       if r in saveregs then
-                        inc(stackmisalignment,4);
+                        inc(registerarea,4);
                     regs:=regs-saveregs;
                   end;
               end;
             for r:=RS_R0 to RS_R15 do
               if r in regs then
-                inc(stackmisalignment,4);
+                inc(registerarea,4);
 
             { reapply the stack padding reg, in case there was one, see the complementary
               comment in g_proc_entry() (KB) }
@@ -2125,9 +2137,9 @@ unit cgcpu;
               else
                 begin
                   regs:=regs+[paddingreg];
-                  inc(stackmisalignment,4);
+                  inc(registerarea,4);
                 end;
-            stackmisalignment:=stackmisalignment mod current_settings.alignment.localalignmax;
+            stackmisalignment:=registerarea mod current_settings.alignment.localalignmax;
             if (current_procinfo.framepointer=NR_STACK_POINTER_REG) or
                (target_info.system in systems_darwin) then
               begin
@@ -2137,7 +2149,11 @@ unit cgcpu;
                     ((pi_do_call in current_procinfo.flags) or
                      (po_assembler in current_procinfo.procdef.procoptions))) then
                   begin
-                    localsize:=align(localsize+stackmisalignment,current_settings.alignment.localalignmax)-stackmisalignment;
+                    if pi_estimatestacksize in current_procinfo.flags then
+                      LocalSize:=tarmprocinfo(current_procinfo).stackframesize-registerarea
+                    else
+                      localsize:=align(localsize+stackmisalignment,current_settings.alignment.localalignmax)-stackmisalignment;
+
                     if is_shifter_const(LocalSize,shift) then
                       list.concat(taicpu.op_reg_reg_const(A_ADD,NR_STACK_POINTER_REG,NR_STACK_POINTER_REG,LocalSize))
                     else if split_into_shifter_const(localsize, imm1, imm2) then

+ 1 - 1
compiler/arm/cpuinfo.pas

@@ -643,7 +643,7 @@ Const
                                  { no need to write info about those }
                                  [cs_opt_level1,cs_opt_level2,cs_opt_level3]+
                                  [cs_opt_regvar,cs_opt_loopunroll,cs_opt_tailrecursion,
-				  cs_opt_stackframe,cs_opt_nodecse,cs_opt_reorder_fields,cs_opt_fastmath];
+                                  cs_opt_stackframe,cs_opt_nodecse,cs_opt_reorder_fields,cs_opt_fastmath,cs_opt_forcenostackframe];
 
    level1optimizerswitches = genericlevel1optimizerswitches;
    level2optimizerswitches = genericlevel2optimizerswitches + level1optimizerswitches +

+ 8 - 10
compiler/arm/cpupara.pas

@@ -44,8 +44,8 @@ unit cpupara;
           function get_funcretloc(p : tabstractprocdef; side: tcallercallee; forcetempdef: tdef): tcgpara;override;
          private
           procedure init_values(p: tabstractprocdef; side: tcallercallee; var curintreg,
-           curfloatreg, curmmreg: tsuperregister; var cur_stack_offset: aword;
- var sparesinglereg: tregister);
+            curfloatreg, curmmreg: tsuperregister; var cur_stack_offset: aword;
+            var sparesinglereg: tregister);
           function create_paraloc_info_intern(p : tabstractprocdef; side: tcallercallee; paras: tparalist;
             var curintreg, curfloatreg, curmmreg: tsuperregister; var cur_stack_offset: aword; var sparesinglereg: tregister; isvariadic: boolean):longint;
        end;
@@ -54,7 +54,9 @@ unit cpupara;
 
     uses
        verbose,systems,cutils,
-       defutil,symsym,symtable;
+       defutil,symsym,symtable,
+       { PowerPC uses procinfo as well in cpupara, so this should not hurt }
+       procinfo;
 
 
     function tarmparamanager.get_volatile_registers_int(calloption : tproccalloption):tcpuregisterset;
@@ -298,7 +300,7 @@ unit cpupara;
         curfloatreg:=RS_F0;
         curmmreg:=RS_D0;
 
-        if GenerateThumbCode and (side=calleeside) then
+        if (side=calleeside) and (GenerateThumbCode or (pi_estimatestacksize in current_procinfo.flags)) then
           cur_stack_offset:=(p as tprocdef).total_stackframe_size
         else
           cur_stack_offset:=0;
@@ -581,13 +583,9 @@ unit cpupara;
                    begin
                      if paraloc^.loc=LOC_REFERENCE then
                        begin
-                         if GenerateThumbCode then
+                         paraloc^.reference.index:=current_procinfo.framepointer;
+                         if current_procinfo.framepointer=NR_FRAME_POINTER_REG then
                            begin
-                             paraloc^.reference.index:=NR_STACK_POINTER_REG;
-                           end
-                         else
-                           begin
-                             paraloc^.reference.index:=NR_FRAME_POINTER_REG;
                              { on non-Darwin, the framepointer contains the value
                                of the stack pointer on entry. On Darwin, the
                                framepointer points to the previously saved

+ 2 - 2
compiler/arm/cpupi.pas

@@ -99,7 +99,7 @@ unit cpupi;
           tg.setfirsttemp(maxpushedparasize);
 
         { estimate stack frame size }
-        if GenerateThumbCode then
+        if GenerateThumbCode or (pi_estimatestacksize in flags) then
           begin
             stackframesize:=maxpushedparasize+32;
             localsize:=0;
@@ -145,7 +145,7 @@ unit cpupi;
          floatsavesize : aword;
          regs: tcpuregisterset;
       begin
-        if GenerateThumbCode then
+        if GenerateThumbCode or (pi_estimatestacksize in flags) then
           result:=stackframesize
         else
           begin

+ 7 - 4
compiler/globtype.pas

@@ -277,7 +277,8 @@ interface
          { compiler checks for empty procedures/methods and removes calls to them if possible }
          cs_opt_remove_emtpy_proc,
          cs_opt_constant_propagate,
-         cs_opt_dead_store_eliminate
+         cs_opt_dead_store_eliminate,
+         cs_opt_forcenostackframe
        );
        toptimizerswitches = set of toptimizerswitch;
 
@@ -305,14 +306,14 @@ interface
        end;
 
     const
-       OptimizerSwitchStr : array[toptimizerswitch] of string[16] = ('',
+       OptimizerSwitchStr : array[toptimizerswitch] of string[17] = ('',
          'LEVEL1','LEVEL2','LEVEL3',
          'REGVAR','UNCERTAIN','SIZE','STACKFRAME',
          'PEEPHOLE','ASMCSE','LOOPUNROLL','TAILREC','CSE',
          'DFA','STRENGTH','SCHEDULE','AUTOINLINE','USEEBP','USERBP',
          'ORDERFIELDS','FASTMATH','DEADVALUES','REMOVEEMPTYPROCS',
          'CONSTPROP',
-         'DEADSTORE'
+         'DEADSTORE','FORCENOSTACKFRAME'
        );
        WPOptimizerSwitchStr : array [twpoptimizerswitch] of string[14] = (
          'DEVIRTCALLS','OPTVMTS','SYMBOLLIVENESS'
@@ -606,7 +607,9 @@ interface
          { subroutine has nested exit }
          pi_has_nested_exit,
          { allocates memory on stack, so stack is unbalanced on exit }
-         pi_has_stack_allocs
+         pi_has_stack_allocs,
+         { set if the stack frame of the procedure is estimated }
+         pi_estimatestacksize
        );
        tprocinfoflags=set of tprocinfoflag;
 

+ 23 - 1
compiler/psub.pas

@@ -999,7 +999,7 @@ implementation
                 not(cs_profile in current_settings.moduleswitches) and
                 not(po_assembler in procdef.procoptions) and
                 not ((pi_has_stackparameter in flags)
-{$ifndef arm}  { Outgoing parameter(s) on stack do not need stackframe on x86 targets
+{$ifndef arm}   { Outgoing parameter(s) on stack do not need stackframe on x86 targets
                  with fixed stack. On ARM it fails, see bug #25050 }
                   and (not paramanager.use_fixed_stack)
 {$endif arm}
@@ -1021,13 +1021,35 @@ implementation
                   (necessary to init para_stack_size)
                 }
                 generate_parameter_info;
+
                 if not(procdef.stack_tainting_parameter(calleeside)) and
                    not(has_assembler_child) and (para_stack_size=0) then
                   begin
                     { Only need to set the framepointer }
                     framepointer:=NR_STACK_POINTER_REG;
                     tg.direction:=1;
+                  end
+{$if defined(arm)}
+                { On arm, the stack frame size can be estimated to avoid using an extra frame pointer,
+                  in case parameters are passed on the stack.
+
+                  However, the drawback is that if the estimation fails, compilation will break later on
+                  with an internal error, so this switch is not enabled by default yet. To overcome this,
+                  multipass compilation of subroutines must be supported
+                }
+                else if (cs_opt_forcenostackframe in current_settings.optimizerswitches) and
+                   not(has_assembler_child) then
+                  begin
+                    { Only need to set the framepointer }
+                    framepointer:=NR_STACK_POINTER_REG;
+                    tg.direction:=1;
+                    include(flags,pi_estimatestacksize);
+                    set_first_temp_offset;
+                    procdef.has_paraloc_info:=callnoside;
+                    generate_parameter_info;
+                    exit;
                   end;
+{$endif defined(arm)}
               end;
           end;
 {$endif defined(x86) or defined(arm)}

+ 3 - 2
compiler/utils/ppuutils/ppudump.pp

@@ -1196,8 +1196,9 @@ const
          (mask:pi_has_nested_exit;
          str:' subroutine contains a nested subroutine which calls the exit of the current one '),
          (mask:pi_has_stack_allocs;
-         str:' allocates memory on stack, so stack may be unbalanced on exit ')
-
+         str:' allocates memory on stack, so stack may be unbalanced on exit '),
+         (mask:pi_estimatestacksize;
+         str:' stack size is estimated before subroutine is compiled ')
   );
 var
   procinfooptions : tprocinfoflags;