Parcourir la source

* avoid creating a stack frame on aarch64 if possible

Florian Klämpfl il y a 6 ans
Parent
commit
a47f153dae

+ 44 - 27
compiler/aarch64/cgcpu.pas

@@ -1872,6 +1872,13 @@ implementation
         ref: treference;
         totalstackframesize: longint;
       begin
+        { on aarch64, we need to store the link register and generate a frame pointer if the subroutine either
+          - receives parameters on the stack
+          - is not a leaf procedure
+          - has nested procedures
+          - helpers retrieve the stack pointer
+        }
+
         hitem:=list.last;
         { pi_has_unwind_info may already be set at this point if there are
           SEH directives in assembler body. In this case, .seh_endprologue
@@ -1885,28 +1892,30 @@ implementation
 
             if target_info.system=system_aarch64_win64 then
               include(current_procinfo.flags,pi_has_unwind_info);
-
-            { save stack pointer and return address }
-            reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
-            ref.addressmode:=AM_PREINDEXED;
-            list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref));
-            current_asmdata.asmcfi.cfa_def_cfa_offset(list,2*sizeof(pint));
-            current_asmdata.asmcfi.cfa_offset(list,NR_FP,-16);
-            current_asmdata.asmcfi.cfa_offset(list,NR_LR,-8);
-            if target_info.system=system_aarch64_win64 then
-              list.concat(cai_seh_directive.create_offset(ash_savefplr_x,16));
-            { initialise frame pointer }
-            if current_procinfo.procdef.proctypeoption<>potype_exceptfilter then
+            if not(pi_no_framepointer_needed in current_procinfo.flags) then
               begin
-                a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);
-                current_asmdata.asmcfi.cfa_def_cfa_register(list,NR_FP);
+                { save frame pointer and return address }
+                reference_reset_base(ref,NR_SP,-16,ctempposinvalid,16,[]);
+                ref.addressmode:=AM_PREINDEXED;
+                list.concat(taicpu.op_reg_reg_ref(A_STP,NR_FP,NR_LR,ref));
+                current_asmdata.asmcfi.cfa_def_cfa_offset(list,2*sizeof(pint));
+                current_asmdata.asmcfi.cfa_offset(list,NR_FP,-16);
+                current_asmdata.asmcfi.cfa_offset(list,NR_LR,-8);
                 if target_info.system=system_aarch64_win64 then
-                  list.concat(cai_seh_directive.create(ash_setfp));
-              end
-            else
-              begin
-                gen_load_frame_for_exceptfilter(list);
-                localsize:=current_procinfo.maxpushedparasize;
+                  list.concat(cai_seh_directive.create_offset(ash_savefplr_x,16));
+                { initialise frame pointer }
+                if current_procinfo.procdef.proctypeoption<>potype_exceptfilter then
+                  begin
+                    a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_SP,NR_FP);
+                    current_asmdata.asmcfi.cfa_def_cfa_register(list,NR_FP);
+                    if target_info.system=system_aarch64_win64 then
+                      list.concat(cai_seh_directive.create(ash_setfp));
+                  end
+                else
+                  begin
+                    gen_load_frame_for_exceptfilter(list);
+                    localsize:=current_procinfo.maxpushedparasize;
+                  end;
               end;
 
             totalstackframesize:=localsize;
@@ -2081,7 +2090,6 @@ implementation
       end;
 
 
-
     procedure tcgaarch64.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);
       var
         ref: treference;
@@ -2122,13 +2130,22 @@ implementation
                 load_regs(list,R_INTREGISTER,RS_X19,RS_X28,R_SUBWHOLE);
               end
             else if current_procinfo.final_localsize<>0 then
-              { restore stack pointer }
-              a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP);
+              begin
+                { restore stack pointer }
+                if pi_no_framepointer_needed in current_procinfo.flags then
+                  handle_reg_imm12_reg(list,A_ADD,OS_ADDR,current_procinfo.framepointer,current_procinfo.final_localsize,
+                    current_procinfo.framepointer,NR_IP0,false,true)
+                else
+                  a_load_reg_reg(list,OS_ADDR,OS_ADDR,NR_FP,NR_SP);
+              end;
 
-            { restore framepointer and return address }
-            reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
-            ref.addressmode:=AM_POSTINDEXED;
-            list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref));
+            if not(pi_no_framepointer_needed in current_procinfo.flags) then
+              begin
+                { restore framepointer and return address }
+                reference_reset_base(ref,NR_SP,16,ctempposinvalid,16,[]);
+                ref.addressmode:=AM_POSTINDEXED;
+                list.concat(taicpu.op_reg_reg_ref(A_LDP,NR_FP,NR_LR,ref));
+              end;
           end;
 
         { return }

+ 2 - 1
compiler/armgen/aoptarm.pas

@@ -1133,7 +1133,8 @@ Implementation
           GetNextInstruction(p, hp1) and
           (hp1.typ = ait_instruction) and
           (taicpu(hp1).condition = C_None) and
-          (taicpu(hp1).oppostfix = taicpu(p).oppostfix) then
+          (taicpu(hp1).oppostfix = taicpu(p).oppostfix) and
+          (taicpu(hp1).ops>0) and (taicpu(hp1).oper[0]^.typ=top_reg) then
           begin
             { Saves constant dereferencing and makes it easier to change the size if necessary }
             SrcReg := taicpu(p).oper[0]^.reg;

+ 3 - 1
compiler/globtype.pas

@@ -786,7 +786,9 @@ interface
          { subroutine uses get_frame }
          pi_uses_get_frame,
          { x86 only: subroutine uses ymm registers, requires vzeroupper call }
-         pi_uses_ymm
+         pi_uses_ymm,
+         { set if no frame pointer is needed; the rules for when this applies are target specific }
+         pi_no_framepointer_needed
        );
        tprocinfoflags=set of tprocinfoflag;
 

+ 10 - 4
compiler/psub.pas

@@ -1046,7 +1046,7 @@ implementation
       end;
 
 
-{$if defined(i386) or defined(x86_64) or defined(arm) or defined(riscv32) or defined(riscv64) or defined(m68k)}
+{$if defined(i386) or defined(x86_64) or defined(arm) or defined(aarch64) or defined(riscv32) or defined(riscv64) or defined(m68k)}
     const
       exception_flags: array[boolean] of tprocinfoflags = (
         [],
@@ -1058,7 +1058,7 @@ implementation
       begin
         tg:=tgobjclass.create;
 
-{$if defined(i386) or defined(x86_64) or defined(arm) or defined(m68k)}
+{$if defined(i386) or defined(x86_64) or defined(arm) or defined(aarch64) or defined(m68k)}
 {$if defined(arm)}
         { frame and stack pointer must be always the same on arm thumb so it makes no
           sense to fiddle with a frame pointer }
@@ -1102,11 +1102,16 @@ implementation
                 not(cs_generate_stackframes in current_settings.localswitches) and
                 not(cs_profile in current_settings.moduleswitches) and
                 not(po_assembler in procdef.procoptions) and
+{$if defined(aarch64)}
+               { on aarch64, it must be a leaf subroutine }
+                not(pi_do_call in flags) and
+{$endif defined(aarch64)}
                 not ((pi_has_stackparameter in flags)
-{$ifndef arm}   { Outgoing parameter(s) on stack do not need stackframe on x86 targets
+{$if defined(i386) or defined(x86_64)}
+               { Outgoing parameter(s) on stack do not need stackframe on x86 targets
                  with fixed stack. On ARM it fails, see bug #25050 }
                   and (not paramanager.use_fixed_stack)
-{$endif arm}
+{$endif defined(i386) or defined(x86_64)}
                   ) and
                 ((flags*([pi_has_assembler_block,pi_is_assembler,
                         pi_needs_stackframe]+
@@ -1137,6 +1142,7 @@ implementation
                     { Only need to set the framepointer }
                     framepointer:=NR_STACK_POINTER_REG;
                     tg.direction:=1;
+                    Include(flags,pi_no_framepointer_needed)
                   end
 {$if defined(arm)}
                 { On arm, the stack frame size can be estimated to avoid using an extra frame pointer,

+ 4 - 1
compiler/utils/ppuutils/ppudump.pp

@@ -1661,7 +1661,10 @@ const
          (mask:pi_uses_get_frame;
          str:' uses get_frame'),
          (mask:pi_uses_ymm;
-         str:' uses ymm register (x86 only)')
+         str:' uses ymm register (x86 only)'),
+         (mask:pi_no_framepointer_needed;
+         str:' set if no frame pointer is needed, the rules when this applies is target specific'
+         )
   );
 var
   procinfooptions : tprocinfoflags;