Browse Source

* x86_64: reworked register saving/restoring to use PUSH/POP instructions for procedures with an RSP-based frame and for SEH finalization procedures. In these cases XMM registers are also saved/restored without involving tempgen. This prevents SEH finalization procedures from saving registers in the stack frame of their parent procedures, which fixes incorrect unwind bytecode (Mantis #24791). It also reduces executable size (for the compiler itself, by about 100 KB).

git-svn-id: trunk@25389 -
sergei 12 years ago
parent
commit
e41149a7ec
1 changed files with 112 additions and 9 deletions
  1. 112 9
      compiler/x86_64/cgcpu.pas

+ 112 - 9
compiler/x86_64/cgcpu.pas

@@ -40,9 +40,14 @@ unit cgcpu;
         procedure g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);override;
         procedure g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);override;
         procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override;
         procedure g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);override;
         procedure g_local_unwind(list: TAsmList; l: TAsmLabel);override;
         procedure g_local_unwind(list: TAsmList; l: TAsmLabel);override;
+        procedure g_save_registers(list: TAsmList);override;
+        procedure g_restore_registers(list: TAsmList);override;
 
 
         procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize;intreg, mmreg: tregister; shuffle: pmmshuffle); override;
         procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize;intreg, mmreg: tregister; shuffle: pmmshuffle); override;
         procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize;mmreg, intreg: tregister;shuffle : pmmshuffle); override;
         procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize;mmreg, intreg: tregister;shuffle : pmmshuffle); override;
+      private
+        function use_push: boolean;
+        function saved_xmm_reg_size: longint;
       end;
       end;
 
 
     procedure create_codegen;
     procedure create_codegen;
@@ -103,6 +108,29 @@ unit cgcpu;
       end;
       end;
 
 
 
 
+    function tcgx86_64.use_push: boolean; { True when the current procedure saves/restores registers with PUSH/POP (see g_proc_entry and g_proc_exit) }
+      begin
+        result:=(current_procinfo.framepointer=NR_STACK_POINTER_REG) or { the frame is addressed through the stack pointer }
+          (current_procinfo.procdef.proctypeoption=potype_exceptfilter); { or the routine has proctypeoption potype_exceptfilter }
+      end;
+
+
+    function tcgx86_64.saved_xmm_reg_size: longint; { Number of bytes needed to store the used entries of saved_mm_registers; 0 when nothing has to be stored }
+      var
+        i: longint;
+      begin
+        result:=0;
+        if (target_info.system<>system_x86_64_win64) or { a non-zero size is only reported for the win64 target }
+           (not uses_registers(R_MMREGISTER)) then { and only when the procedure uses MM registers at all }
+          exit;
+        for i:=low(saved_mm_registers) to high(saved_mm_registers) do
+          begin
+            if (saved_mm_registers[i] in rg[R_MMREGISTER].used_in_proc) then
+              inc(result,tcgsize2size[OS_VECTOR]); { one OS_VECTOR-sized slot per register used in this procedure }
+          end;
+      end;
+
+
     procedure tcgx86_64.g_proc_entry(list : TAsmList;localsize:longint;nostackframe:boolean);
     procedure tcgx86_64.g_proc_entry(list : TAsmList;localsize:longint;nostackframe:boolean);
       var
       var
         hitem: tlinkedlistitem;
         hitem: tlinkedlistitem;
@@ -113,7 +141,31 @@ unit cgcpu;
         suppress_endprologue: boolean;
         suppress_endprologue: boolean;
         stackmisalignment: longint;
         stackmisalignment: longint;
         para: tparavarsym;
         para: tparavarsym;
+        xmmsize: longint;
+
+      procedure push_one_reg(reg: tregister); { Appends "PUSH reg" to list and, on win64, the matching SEH pushreg directive }
+        begin
+          list.concat(taicpu.op_reg(A_PUSH,tcgsize2opsize[OS_ADDR],reg));
+          if (target_info.system=system_x86_64_win64) then
+            begin
+              list.concat(cai_seh_directive.create_reg(ash_pushreg,reg)); { SEH directive describing the push just emitted }
+              include(current_procinfo.flags,pi_has_unwind_info); { flag the procedure as carrying unwind info }
+            end;
+        end;
+
+      procedure push_regs; { Pushes every entry of saved_standard_registers that is used in the current procedure }
+        var
+          r: longint;
         begin
         begin
+          for r := low(saved_standard_registers) to high(saved_standard_registers) do
+            if saved_standard_registers[r] in rg[R_INTREGISTER].used_in_proc then
+              begin
+                inc(stackmisalignment,sizeof(pint)); { each push adds sizeof(pint) to the misalignment tracked by g_proc_entry }
+                push_one_reg(newreg(R_INTREGISTER,saved_standard_registers[r],R_SUBWHOLE));
+              end;
+        end;
+
+      begin
         hitem:=list.last;
         hitem:=list.last;
         { pi_has_unwind_info may already be set at this point if there are
         { pi_has_unwind_info may already be set at this point if there are
           SEH directives in assembler body. In this case, .seh_endprologue
           SEH directives in assembler body. In this case, .seh_endprologue
@@ -127,17 +179,15 @@ unit cgcpu;
             stackmisalignment := sizeof(pint);
             stackmisalignment := sizeof(pint);
             list.concat(tai_regalloc.alloc(current_procinfo.framepointer,nil));
             list.concat(tai_regalloc.alloc(current_procinfo.framepointer,nil));
             if current_procinfo.framepointer=NR_STACK_POINTER_REG then
             if current_procinfo.framepointer=NR_STACK_POINTER_REG then
-              CGmessage(cg_d_stackframe_omited)
+              begin
+                push_regs;
+                CGmessage(cg_d_stackframe_omited);
+              end
             else
             else
               begin
               begin
                 { push <frame_pointer> }
                 { push <frame_pointer> }
                 inc(stackmisalignment,sizeof(pint));
                 inc(stackmisalignment,sizeof(pint));
-                list.concat(Taicpu.op_reg(A_PUSH,tcgsize2opsize[OS_ADDR],NR_FRAME_POINTER_REG));
-                if (target_info.system=system_x86_64_win64) then
-                  begin
-                    list.concat(cai_seh_directive.create_reg(ash_pushreg,NR_FRAME_POINTER_REG));
-                    include(current_procinfo.flags,pi_has_unwind_info);
-                  end;
+                push_one_reg(NR_FRAME_POINTER_REG);
                 { Return address and FP are both on stack }
                 { Return address and FP are both on stack }
                 current_asmdata.asmcfi.cfa_def_cfa_offset(list,2*sizeof(pint));
                 current_asmdata.asmcfi.cfa_def_cfa_offset(list,2*sizeof(pint));
                 current_asmdata.asmcfi.cfa_offset(list,NR_FRAME_POINTER_REG,-(2*sizeof(pint)));
                 current_asmdata.asmcfi.cfa_offset(list,NR_FRAME_POINTER_REG,-(2*sizeof(pint)));
@@ -145,6 +195,7 @@ unit cgcpu;
                   list.concat(Taicpu.op_reg_reg(A_MOV,tcgsize2opsize[OS_ADDR],NR_STACK_POINTER_REG,NR_FRAME_POINTER_REG))
                   list.concat(Taicpu.op_reg_reg(A_MOV,tcgsize2opsize[OS_ADDR],NR_STACK_POINTER_REG,NR_FRAME_POINTER_REG))
                 else
                 else
                   begin
                   begin
+                    push_regs;
                     { load framepointer from hidden $parentfp parameter }
                     { load framepointer from hidden $parentfp parameter }
                     para:=tparavarsym(current_procinfo.procdef.paras[0]);
                     para:=tparavarsym(current_procinfo.procdef.paras[0]);
                     if not (vo_is_parentfp in para.varoptions) then
                     if not (vo_is_parentfp in para.varoptions) then
@@ -170,6 +221,14 @@ unit cgcpu;
                 }
                 }
               end;
               end;
 
 
+            xmmsize:=saved_xmm_reg_size;
+            if use_push and (xmmsize<>0) then
+              begin
+                localsize:=align(localsize,target_info.stackalign)+xmmsize;
+                reference_reset_base(current_procinfo.save_regs_ref,NR_STACK_POINTER_REG,
+                  localsize-xmmsize,tcgsize2size[OS_VECTOR]);
+              end;
+
             { allocate stackframe space }
             { allocate stackframe space }
             if (localsize<>0) or
             if (localsize<>0) or
                ((target_info.stackalign>sizeof(pint)) and
                ((target_info.stackalign>sizeof(pint)) and
@@ -188,6 +247,16 @@ unit cgcpu;
                     if localsize<>0 then
                     if localsize<>0 then
                       list.concat(cai_seh_directive.create_offset(ash_stackalloc,localsize));
                       list.concat(cai_seh_directive.create_offset(ash_stackalloc,localsize));
                     include(current_procinfo.flags,pi_has_unwind_info);
                     include(current_procinfo.flags,pi_has_unwind_info);
+                    if use_push and (xmmsize<>0) then
+                      begin
+                        href:=current_procinfo.save_regs_ref;
+                        for r:=low(saved_mm_registers) to high(saved_mm_registers) do
+                          if saved_mm_registers[r] in rg[R_MMREGISTER].used_in_proc then
+                            begin
+                              a_loadmm_reg_ref(list,OS_VECTOR,OS_VECTOR,newreg(R_MMREGISTER,saved_mm_registers[r],R_SUBMMWHOLE),href,nil);
+                              inc(href.offset,tcgsize2size[OS_VECTOR]);
+                            end;
+                      end;
                   end;
                   end;
                end;
                end;
           end;
           end;
@@ -209,6 +278,8 @@ unit cgcpu;
           since registers are not modified before they are saved, and saves do not
           since registers are not modified before they are saved, and saves do not
           change RSP, 'logically' all saves can happen at the end of prologue. }
           change RSP, 'logically' all saves can happen at the end of prologue. }
         href:=current_procinfo.save_regs_ref;
         href:=current_procinfo.save_regs_ref;
+        if (not use_push) then
+          begin
             for r:=low(saved_standard_registers) to high(saved_standard_registers) do
             for r:=low(saved_standard_registers) to high(saved_standard_registers) do
               if saved_standard_registers[r] in rg[R_INTREGISTER].used_in_proc then
               if saved_standard_registers[r] in rg[R_INTREGISTER].used_in_proc then
                 begin
                 begin
@@ -217,6 +288,7 @@ unit cgcpu;
                     href.offset+frame_offset));
                     href.offset+frame_offset));
                  inc(href.offset,sizeof(aint));
                  inc(href.offset,sizeof(aint));
                 end;
                 end;
+          end;
         if uses_registers(R_MMREGISTER) then
         if uses_registers(R_MMREGISTER) then
           begin
           begin
             if (href.offset mod tcgsize2size[OS_VECTOR])<>0 then
             if (href.offset mod tcgsize2size[OS_VECTOR])<>0 then
@@ -256,6 +328,8 @@ unit cgcpu;
 
 
       var
       var
         href : treference;
         href : treference;
+        hreg : tregister;
+        r : longint;
       begin
       begin
         { Release PIC register }
         { Release PIC register }
         if cs_create_pic in current_settings.moduleswitches then
         if cs_create_pic in current_settings.moduleswitches then
@@ -268,11 +342,26 @@ unit cgcpu;
         { remove stackframe }
         { remove stackframe }
         if not nostackframe then
         if not nostackframe then
           begin
           begin
-            if (current_procinfo.framepointer=NR_STACK_POINTER_REG) or
-               (current_procinfo.procdef.proctypeoption=potype_exceptfilter) then
+            if use_push then
               begin
               begin
+                if (saved_xmm_reg_size<>0) then
+                  begin
+                    href:=current_procinfo.save_regs_ref;
+                    for r:=low(saved_mm_registers) to high(saved_mm_registers) do
+                      if saved_mm_registers[r] in rg[R_MMREGISTER].used_in_proc then
+                        begin
+                          { Allocate register so the optimizer does not remove the load }
+                          hreg:=newreg(R_MMREGISTER,saved_mm_registers[r],R_SUBMMWHOLE);
+                          a_reg_alloc(list,hreg);
+                          a_loadmm_ref_reg(list,OS_VECTOR,OS_VECTOR,href,hreg,nil);
+                          inc(href.offset,tcgsize2size[OS_VECTOR]);
+                        end;
+                  end;
+
                 if (current_procinfo.final_localsize<>0) then
                 if (current_procinfo.final_localsize<>0) then
                   increase_sp(current_procinfo.final_localsize);
                   increase_sp(current_procinfo.final_localsize);
+                internal_restore_regs(list,true);
+
                 if (current_procinfo.procdef.proctypeoption=potype_exceptfilter) then
                 if (current_procinfo.procdef.proctypeoption=potype_exceptfilter) then
                   list.concat(Taicpu.op_reg(A_POP,tcgsize2opsize[OS_ADDR],NR_FRAME_POINTER_REG));
                   list.concat(Taicpu.op_reg(A_POP,tcgsize2opsize[OS_ADDR],NR_FRAME_POINTER_REG));
               end
               end
@@ -300,6 +389,20 @@ unit cgcpu;
       end;
       end;
 
 
 
 
+    procedure tcgx86_64.g_save_registers(list: TAsmList); { Delegates to the inherited implementation unless use_push is true }
+      begin
+        if (not use_push) then { when use_push is true nothing is emitted here; g_proc_entry pushes the registers instead }
+          inherited g_save_registers(list);
+      end;
+
+
+    procedure tcgx86_64.g_restore_registers(list: TAsmList); { Delegates to the inherited implementation unless use_push is true }
+      begin
+        if (not use_push) then { when use_push is true nothing is emitted here; g_proc_exit restores via internal_restore_regs instead }
+          inherited g_restore_registers(list);
+      end;
+
+
     procedure tcgx86_64.g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);
     procedure tcgx86_64.g_intf_wrapper(list: TAsmList; procdef: tprocdef; const labelname: string; ioffset: longint);
       var
       var
         make_global : boolean;
         make_global : boolean;