ソースを参照

merge with trunk

git-svn-id: branches/tg74/avx2@28414 -
tg74 11 年 前
コミット
a78fc3fbee

+ 3 - 1
.gitattributes

@@ -203,7 +203,9 @@ compiler/i386/i386prop.inc svneol=native#text/plain
 compiler/i386/i386tab.inc svneol=native#text/plain
 compiler/i386/n386add.pas svneol=native#text/plain
 compiler/i386/n386cal.pas svneol=native#text/plain
+compiler/i386/n386flw.pas svneol=native#text/plain
 compiler/i386/n386inl.pas svneol=native#text/plain
+compiler/i386/n386ld.pas svneol=native#text/plain
 compiler/i386/n386mat.pas svneol=native#text/plain
 compiler/i386/n386mem.pas svneol=native#text/plain
 compiler/i386/n386set.pas svneol=native#text/plain
@@ -218,7 +220,6 @@ compiler/i386/r386nasm.inc svneol=native#text/plain
 compiler/i386/r386nor.inc svneol=native#text/plain
 compiler/i386/r386nri.inc svneol=native#text/plain
 compiler/i386/r386num.inc svneol=native#text/plain
-compiler/i386/r386op.inc svneol=native#text/plain
 compiler/i386/r386ot.inc svneol=native#text/plain
 compiler/i386/r386rni.inc svneol=native#text/plain
 compiler/i386/r386sri.inc svneol=native#text/plain
@@ -228,6 +229,7 @@ compiler/i386/ra386att.pas svneol=native#text/plain
 compiler/i386/ra386int.pas svneol=native#text/plain
 compiler/i386/rgcpu.pas svneol=native#text/plain
 compiler/i386/rropt386.pas svneol=native#text/plain
+compiler/i386/symcpu.pas svneol=native#text/plain
 compiler/i8086/aoptcpu.pas svneol=native#text/plain
 compiler/i8086/aoptcpub.pas svneol=native#text/plain
 compiler/i8086/aoptcpud.pas svneol=native#text/plain

+ 16 - 10
compiler/i386/aopt386.pas

@@ -64,13 +64,16 @@ Begin
    { or nil                                                                }
      While Assigned(BlockStart) Do
        Begin
-         if pass = 0 then
-           PrePeepHoleOpts(AsmL, BlockStart, BlockEnd);
-        { Peephole optimizations }
-         PeepHoleOptPass1(AsmL, BlockStart, BlockEnd);
-        { Only perform them twice in the first pass }
-         if pass = 0 then
-           PeepHoleOptPass1(AsmL, BlockStart, BlockEnd);
+         if (cs_opt_peephole in current_settings.optimizerswitches) then
+           begin
+            if (pass = 0) then
+              PrePeepHoleOpts(AsmL, BlockStart, BlockEnd);
+              { Peephole optimizations }
+               PeepHoleOptPass1(AsmL, BlockStart, BlockEnd);
+              { Only perform them twice in the first pass }
+               if pass = 0 then
+                 PeepHoleOptPass1(AsmL, BlockStart, BlockEnd);
+           end;
         { Data flow analyzer }
          If (cs_opt_asmcse in current_settings.optimizerswitches) Then
            begin
@@ -79,9 +82,12 @@ Begin
                changed := CSE(asmL, blockStart, blockEnd, pass) or changed;
            end;
         { More peephole optimizations }
-         PeepHoleOptPass2(AsmL, BlockStart, BlockEnd);
-         if lastLoop then
-           PostPeepHoleOpts(AsmL, BlockStart, BlockEnd);
+         if (cs_opt_peephole in current_settings.optimizerswitches) then
+           begin
+             PeepHoleOptPass2(AsmL, BlockStart, BlockEnd);
+             if lastLoop then
+               PostPeepHoleOpts(AsmL, BlockStart, BlockEnd);
+           end;
 
         { Free memory }
         dfa.clear;

+ 113 - 56
compiler/i386/cgcpu.pas

@@ -30,7 +30,7 @@ unit cgcpu;
        cgbase,cgobj,cg64f32,cgx86,
        aasmbase,aasmtai,aasmdata,aasmcpu,
        cpubase,parabase,cgutils,
-       symconst,symdef
+       symconst,symdef,symsym
        ;
 
     type
@@ -200,7 +200,7 @@ unit cgcpu;
         if use_push(cgpara) then
           begin
             { Record copy? }
-            if (cgpara.size in [OS_NO,OS_F64]) or (size=OS_NO) then
+            if (cgpara.size=OS_NO) or (size=OS_NO) then
               begin
                 cgpara.check_simple_location;
                 len:=align(cgpara.intsize,cgpara.alignment);
@@ -212,9 +212,19 @@ unit cgcpu;
               begin
                 if tcgsize2size[cgpara.size]<>tcgsize2size[size] then
                   internalerror(200501161);
-                { We need to push the data in reverse order,
-                  therefor we use a recursive algorithm }
-                pushdata(cgpara.location,0);
+                if (cgpara.size=OS_F64) then
+                  begin
+                    href:=r;
+                    make_simple_ref(list,href);
+                    inc(href.offset,4);
+                    list.concat(taicpu.op_ref(A_PUSH,S_L,href));
+                    dec(href.offset,4);
+                    list.concat(taicpu.op_ref(A_PUSH,S_L,href));
+                  end
+                else
+                  { We need to push the data in reverse order,
+                    therefor we use a recursive algorithm }
+                  pushdata(cgpara.location,0);
               end
           end
         else
@@ -294,17 +304,15 @@ unit cgcpu;
 
     procedure tcg386.g_proc_exit(list : TAsmList;parasize:longint;nostackframe:boolean);
 
-      procedure increase_fp(a : tcgint);
+      procedure increase_sp(a : tcgint);
         var
           href : treference;
         begin
-          reference_reset_base(href,current_procinfo.framepointer,a,0);
+          reference_reset_base(href,NR_STACK_POINTER_REG,a,0);
           { normally, lea is a better choice than an add }
-          list.concat(Taicpu.op_ref_reg(A_LEA,TCGSize2OpSize[OS_ADDR],href,current_procinfo.framepointer));
+          list.concat(Taicpu.op_ref_reg(A_LEA,TCGSize2OpSize[OS_ADDR],href,NR_STACK_POINTER_REG));
         end;
 
-      var
-        stacksize : longint;
       begin
         { MMX needs to call EMMS }
         if assigned(rg[R_MMXREGISTER]) and
@@ -314,20 +322,15 @@ unit cgcpu;
         { remove stackframe }
         if not nostackframe then
           begin
-            if current_procinfo.framepointer=NR_STACK_POINTER_REG then
+            if (current_procinfo.framepointer=NR_STACK_POINTER_REG) or
+               (current_procinfo.procdef.proctypeoption=potype_exceptfilter) then
               begin
-                stacksize:=current_procinfo.calc_stackframe_size;
-                if (target_info.stackalign>4) and
-                   ((stacksize <> 0) or
-                    (pi_do_call in current_procinfo.flags) or
-                    { can't detect if a call in this case -> use nostackframe }
-                    { if you (think you) know what you are doing              }
-                    (po_assembler in current_procinfo.procdef.procoptions)) then
-                  stacksize := align(stacksize+sizeof(aint),target_info.stackalign) - sizeof(aint);
-                if stacksize<>0 then
-                  increase_fp(stacksize);
+                if current_procinfo.final_localsize<>0 then
+                  increase_sp(current_procinfo.final_localsize);
                 if (not paramanager.use_fixed_stack) then
                   internal_restore_regs(list,true);
+                if (current_procinfo.procdef.proctypeoption=potype_exceptfilter) then
+                  list.concat(Taicpu.op_reg(A_POP,tcgsize2opsize[OS_ADDR],NR_FRAME_POINTER_REG));
               end
             else
               begin
@@ -387,7 +390,8 @@ unit cgcpu;
            { but not on win32 }
            { and not for safecall with hidden exceptions, because the result }
            { wich contains the exception is passed in EAX }
-           if (target_info.system <> system_i386_win32) and
+           if ((target_info.system <> system_i386_win32) or
+               (target_info.abi=abi_old_win32_gnu)) and
               not ((current_procinfo.procdef.proccalloption = pocall_safecall) and
                (tf_safecall_exceptions in target_info.flags)) and
               paramanager.ret_in_param(current_procinfo.procdef.returndef,
@@ -411,7 +415,7 @@ unit cgcpu;
 
     procedure tcg386.g_copyvaluepara_openarray(list : TAsmList;const ref:treference;const lenloc:tlocation;elesize:tcgint;destreg:tregister);
       var
-        power,len  : longint;
+        power  : longint;
         opsize : topsize;
 {$ifndef __NOWINPECOFF__}
         again,ok : tasmlabel;
@@ -421,9 +425,21 @@ unit cgcpu;
         getcpuregister(list,NR_EDI);
         a_load_loc_reg(list,OS_INT,lenloc,NR_EDI);
         list.concat(Taicpu.op_reg(A_INC,S_L,NR_EDI));
-        { Now EDI contains (high+1). Copy it to ECX for later use. }
-        getcpuregister(list,NR_ECX);
-        list.concat(Taicpu.op_reg_reg(A_MOV,S_L,NR_EDI,NR_ECX));
+        { Now EDI contains (high+1). }
+
+        { special case handling for elesize=8, 4 and 2:
+          set ECX = (high+1) instead of ECX = (high+1)*elesize.
+
+          In the case of elesize=4 and 2, this allows us to avoid the SHR later.
+          In the case of elesize=8, we can later use a SHL ECX, 1 instead of
+          SHR ECX, 2 which is one byte shorter. }
+        if (elesize=8) or (elesize=4) or (elesize=2) then
+          begin
+            { Now EDI contains (high+1). Copy it to ECX for later use. }
+            getcpuregister(list,NR_ECX);
+            list.concat(Taicpu.op_reg_reg(A_MOV,S_L,NR_EDI,NR_ECX));
+          end;
+        { EDI := EDI * elesize }
         if (elesize<>1) then
          begin
            if ispowerof2(elesize, power) then
@@ -431,6 +447,12 @@ unit cgcpu;
            else
              list.concat(Taicpu.op_const_reg(A_IMUL,S_L,elesize,NR_EDI));
          end;
+        if (elesize<>8) and (elesize<>4) and (elesize<>2) then
+          begin
+            { Now EDI contains (high+1)*elesize. Copy it to ECX for later use. }
+            getcpuregister(list,NR_ECX);
+            list.concat(Taicpu.op_reg_reg(A_MOV,S_L,NR_EDI,NR_ECX));
+          end;
 {$ifndef __NOWINPECOFF__}
         { windows guards only a few pages for stack growing, }
         { so we have to access every page first              }
@@ -464,27 +486,38 @@ unit cgcpu;
         a_loadaddr_ref_reg(list,ref,NR_ESI);
 
         { calculate size }
-        len:=elesize;
         opsize:=S_B;
-        if (len and 3)=0 then
+        if elesize=8 then
+          begin
+            opsize:=S_L;
+            { ECX is number of qwords, convert to dwords }
+            list.concat(Taicpu.op_const_reg(A_SHL,S_L,1,NR_ECX))
+          end
+        else if elesize=4 then
+          begin
+            opsize:=S_L;
+            { ECX is already number of dwords, so no need to SHL/SHR }
+          end
+        else if elesize=2 then
+          begin
+            opsize:=S_W;
+            { ECX is already number of words, so no need to SHL/SHR }
+          end
+        else
+         if (elesize and 3)=0 then
          begin
            opsize:=S_L;
-           len:=len shr 2;
+           { ECX is number of bytes, convert to dwords }
+           list.concat(Taicpu.op_const_reg(A_SHR,S_L,2,NR_ECX))
          end
         else
-         if (len and 1)=0 then
+         if (elesize and 1)=0 then
           begin
             opsize:=S_W;
-            len:=len shr 1;
+            { ECX is number of bytes, convert to words }
+            list.concat(Taicpu.op_const_reg(A_SHR,S_L,1,NR_ECX))
           end;
 
-        if len>1 then
-          begin
-            if ispowerof2(len, power) then
-              list.concat(Taicpu.op_const_reg(A_SHL,S_L,power,NR_ECX))
-            else
-              list.concat(Taicpu.op_const_reg(A_IMUL,S_L,len,NR_ECX));
-          end;
         if ts_cld in current_settings.targetswitches then
           list.concat(Taicpu.op_none(A_CLD,S_NO));
         list.concat(Taicpu.op_none(A_REP,S_NO));
@@ -532,7 +565,7 @@ unit cgcpu;
       begin
         if not paramanager.use_fixed_stack then
           begin
-            cg.a_reg_alloc(list,NR_FUNCTION_RESULT_REG);
+            a_reg_alloc(list,NR_FUNCTION_RESULT_REG);
             list.concat(Taicpu.op_reg(A_POP,tcgsize2opsize[OS_INT],NR_FUNCTION_RESULT_REG))
           end
         else
@@ -561,7 +594,7 @@ unit cgcpu;
                (current_settings.optimizecputype in [cpu_Pentium2,cpu_Pentium3,cpu_Pentium4]) } then
               begin
                 current_module.requires_ebx_pic_helper:=true;
-                cg.a_call_name_static(list,'fpc_geteipasebx');
+                a_call_name_static(list,'fpc_geteipasebx');
               end
             else
               begin
@@ -587,17 +620,17 @@ unit cgcpu;
       possible calling conventions:
                     default stdcall cdecl pascal register
       default(0):      OK     OK    OK     OK       OK
-      virtual(1):      OK     OK    OK     OK       OK(2)
+      virtual(1):      OK     OK    OK     OK       OK(2 or 1)
 
       (0):
           set self parameter to correct value
           jmp mangledname
 
-      (1): The wrapper code use %eax to reach the virtual method address
+      (1): The wrapper code use %ecx to reach the virtual method address
            set self to correct value
            move self,%eax
-           mov  0(%eax),%eax ; load vmt
-           jmp  vmtoffs(%eax) ; method offs
+           mov  0(%eax),%ecx ; load vmt
+           jmp  vmtoffs(%ecx) ; method offs
 
       (2): Virtual use values pushed on stack to reach the method address
            so the following code be generated:
@@ -613,6 +646,30 @@ unit cgcpu;
 
       }
 
+      { returns whether ECX is used (either as a parameter or is nonvolatile and shouldn't be changed) }
+      function is_ecx_used: boolean;
+        var
+          i: Integer;
+          hp: tparavarsym;
+          paraloc: PCGParaLocation;
+        begin
+          if not (RS_ECX in paramanager.get_volatile_registers_int(procdef.proccalloption)) then
+            exit(true);
+          for i:=0 to procdef.paras.count-1 do
+           begin
+             hp:=tparavarsym(procdef.paras[i]);
+             procdef.init_paraloc_info(calleeside);
+             paraloc:=hp.paraloc[calleeside].Location;
+             while paraloc<>nil do
+               begin
+                 if (paraloc^.Loc=LOC_REGISTER) and (getsupreg(paraloc^.register)=RS_ECX) then
+                   exit(true);
+                 paraloc:=paraloc^.Next;
+               end;
+           end;
+          Result:=false;
+        end;
+
       procedure getselftoeax(offs: longint);
         var
           href : treference;
@@ -627,27 +684,27 @@ unit cgcpu;
               else
                 selfoffsetfromsp:=sizeof(aint);
               reference_reset_base(href,NR_ESP,selfoffsetfromsp+offs,4);
-              cg.a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,NR_EAX);
+              a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,NR_EAX);
             end;
         end;
 
-      procedure loadvmttoeax;
+      procedure loadvmtto(reg: tregister);
         var
           href : treference;
         begin
-          { mov  0(%eax),%eax ; load vmt}
+          { mov  0(%eax),%reg ; load vmt}
           reference_reset_base(href,NR_EAX,0,4);
-          cg.a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,NR_EAX);
+          a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,reg);
         end;
 
-      procedure op_oneaxmethodaddr(op: TAsmOp);
+      procedure op_onregmethodaddr(op: TAsmOp; reg: tregister);
         var
           href : treference;
         begin
           if (procdef.extnumber=$ffff) then
             Internalerror(200006139);
-          { call/jmp  vmtoffs(%eax) ; method offs }
-          reference_reset_base(href,NR_EAX,tobjectdef(procdef.struct).vmtmethodoffset(procdef.extnumber),4);
+          { call/jmp  vmtoffs(%reg) ; method offs }
+          reference_reset_base(href,reg,tobjectdef(procdef.struct).vmtmethodoffset(procdef.extnumber),4);
           list.concat(taicpu.op_ref(op,S_L,href));
         end;
 
@@ -660,7 +717,7 @@ unit cgcpu;
             Internalerror(200006139);
           { mov vmtoffs(%eax),%eax ; method offs }
           reference_reset_base(href,NR_EAX,tobjectdef(procdef.struct).vmtmethodoffset(procdef.extnumber),4);
-          cg.a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,NR_EAX);
+          a_load_ref_reg(list,OS_ADDR,OS_ADDR,href,NR_EAX);
         end;
 
 
@@ -695,13 +752,13 @@ unit cgcpu;
         if (po_virtualmethod in procdef.procoptions) and
             not is_objectpascal_helper(procdef.struct) then
           begin
-            if (procdef.proccalloption=pocall_register) then
+            if (procdef.proccalloption=pocall_register) and is_ecx_used then
               begin
                 { case 2 }
                 list.concat(taicpu.op_reg(A_PUSH,S_L,NR_EBX)); { allocate space for address}
                 list.concat(taicpu.op_reg(A_PUSH,S_L,NR_EAX));
                 getselftoeax(8);
-                loadvmttoeax;
+                loadvmtto(NR_EAX);
                 loadmethodoffstoeax;
                 { mov %eax,4(%esp) }
                 reference_reset_base(href,NR_ESP,4,4);
@@ -715,8 +772,8 @@ unit cgcpu;
               begin
                 { case 1 }
                 getselftoeax(0);
-                loadvmttoeax;
-                op_oneaxmethodaddr(A_JMP);
+                loadvmtto(NR_ECX);
+                op_onregmethodaddr(A_JMP,NR_ECX);
               end;
           end
         { case 0 }

+ 2 - 0
compiler/i386/cpubase.inc

@@ -38,6 +38,7 @@
         S_YMM
       );
 
+      TOpSizes = set of topsize;
 
 {*****************************************************************************
                                 Registers
@@ -138,6 +139,7 @@
       }
       saved_standard_registers : array[0..3] of tsuperregister = (RS_EBX,RS_ESI,RS_EDI,RS_EBP);
 
+      saved_address_registers : array[0..0] of tsuperregister = (RS_INVALID);
       saved_mm_registers : array[0..0] of tsuperregister = (RS_INVALID);
       {# Required parameter alignment when calling a routine declared as
          stdcall and cdecl. The alignment value should be the one defined

+ 2 - 0
compiler/i386/cpuelf.pas

@@ -110,6 +110,8 @@ implementation
           result:=R_386_GOTPC;
         RELOC_PLT32 :
           result:=R_386_PLT32;
+        RELOC_GOTOFF:
+          result:=R_386_GOTOFF;
       else
         result:=0;
         InternalError(2012082301);

+ 37 - 3
compiler/i386/cpuinfo.pas

@@ -46,7 +46,10 @@ Type
        cpu_Pentium2,
        cpu_Pentium3,
        cpu_Pentium4,
-       cpu_PentiumM
+       cpu_PentiumM,
+       cpu_core_i,
+       cpu_core_avx,
+       cpu_core_avx2
       );
 
    tfputype =
@@ -85,7 +88,10 @@ Const
      'PENTIUM2',
      'PENTIUM3',
      'PENTIUM4',
-     'PENTIUMM'
+     'PENTIUMM',
+     'COREI',
+     'COREAVX',
+     'COREAVX2'
    );
 
    fputypestr : array[tfputype] of string[6] = ('',
@@ -117,12 +123,40 @@ Const
                                   cs_opt_tailrecursion,cs_opt_nodecse,cs_useebp,
 				  cs_opt_reorder_fields,cs_opt_fastmath];
 
-   level1optimizerswitches = genericlevel1optimizerswitches + [cs_opt_peephole];
+   level1optimizerswitches = genericlevel1optimizerswitches;
    level2optimizerswitches = genericlevel2optimizerswitches + level1optimizerswitches +
      [cs_opt_regvar,cs_opt_stackframe,cs_opt_tailrecursion,cs_opt_nodecse];
    level3optimizerswitches = genericlevel3optimizerswitches + level2optimizerswitches + [{,cs_opt_loopunroll}];
    level4optimizerswitches = genericlevel4optimizerswitches + level3optimizerswitches + [cs_useebp];
 
+type
+   tcpuflags =
+      (CPUX86_HAS_SSEUNIT,
+       CPUX86_HAS_BMI1,
+       CPUX86_HAS_BMI2,
+       CPUX86_HAS_POPCNT,
+       CPUX86_HAS_AVXUNIT,
+       CPUX86_HAS_LZCNT,
+       CPUX86_HAS_MOVBE,
+       CPUX86_HAS_FMA,
+       CPUX86_HAS_FMA4
+      );
+
+ const
+   cpu_capabilities : array[tcputype] of set of tcpuflags = (
+     { cpu_none      } [],
+     { cpu_386       } [],
+     { cpu_Pentium   } [],
+     { cpu_Pentium2  } [],
+     { cpu_Pentium3  } [CPUX86_HAS_SSEUNIT],
+     { cpu_Pentium4  } [CPUX86_HAS_SSEUNIT],
+     { cpu_PentiumM  } [CPUX86_HAS_SSEUNIT],
+     { cpu_core_i    } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT],
+     { cpu_core_avx  } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT],
+     { cpu_core_avx2 } [CPUX86_HAS_SSEUNIT,CPUX86_HAS_POPCNT,CPUX86_HAS_AVXUNIT,CPUX86_HAS_BMI1,CPUX86_HAS_BMI2,CPUX86_HAS_LZCNT,CPUX86_HAS_MOVBE,CPUX86_HAS_FMA]
+   );
+
+
 Implementation
 
 end.

+ 7 - 1
compiler/i386/cpunode.pas

@@ -51,10 +51,16 @@ unit cpunode;
 
        n386add,
        n386cal,
+       n386ld,
        n386mem,
        n386set,
        n386inl,
-       n386mat
+{$ifdef TEST_WIN32_SEH}
+       n386flw,
+{$endif TEST_WIN32_SEH}
+       n386mat,
+       { symtable }
+       symcpu
        ;
 
 end.

+ 0 - 51
compiler/i386/cpupara.pas

@@ -40,12 +40,6 @@ unit cpupara;
           function get_volatile_registers_int(calloption : tproccalloption):tcpuregisterset;override;
           function get_volatile_registers_fpu(calloption : tproccalloption):tcpuregisterset;override;
           function get_volatile_registers_mm(calloption : tproccalloption):tcpuregisterset;override;
-          { Returns the location for the nr-st 32 Bit int parameter
-            if every parameter before is an 32 Bit int parameter as well
-            and if the calling conventions for the helper routines of the
-            rtl are used.
-          }
-          procedure getintparaloc(pd : tabstractprocdef; nr : longint; var cgpara : tcgpara);override;
           function create_paraloc_info(p : tabstractprocdef; side: tcallercallee):longint;override;
           function create_varargs_paraloc_info(p : tabstractprocdef; varargspara:tvarargsparalist):longint;override;
           procedure createtempparaloc(list: TAsmList;calloption : tproccalloption;parasym : tparavarsym;can_use_final_stack_loc : boolean;var cgpara:TCGPara);override;
@@ -274,51 +268,6 @@ unit cpupara;
       end;
 
 
-    procedure ti386paramanager.getintparaloc(pd : tabstractprocdef; nr : longint; var cgpara : tcgpara);
-      var
-        paraloc : pcgparalocation;
-        psym: tparavarsym;
-        pdef: tdef;
-      begin
-        psym:=tparavarsym(pd.paras[nr-1]);
-        pdef:=psym.vardef;
-        if push_addr_param(psym.varspez,pdef,pd.proccalloption) then
-          pdef:=getpointerdef(pdef);
-        cgpara.reset;
-        cgpara.size:=def_cgsize(pdef);
-        cgpara.intsize:=tcgsize2size[cgpara.size];
-        cgpara.alignment:=get_para_align(pd.proccalloption);
-        cgpara.def:=pdef;
-        paraloc:=cgpara.add_location;
-        with paraloc^ do
-         begin
-           size:=def_cgsize(pdef);
-           def:=pdef;
-           if pd.proccalloption=pocall_register then
-             begin
-               if (nr<=length(parasupregs)) then
-                 begin
-                   if nr=0 then
-                     internalerror(200309271);
-                   loc:=LOC_REGISTER;
-                   register:=newreg(R_INTREGISTER,parasupregs[nr-1],R_SUBWHOLE);
-                 end
-               else
-                 begin
-                   loc:=LOC_REFERENCE;
-                   reference.index:=NR_STACK_POINTER_REG;
-                   { the previous parameters didn't take up room in memory }
-                   reference.offset:=sizeof(aint)*(nr-length(parasupregs)-1)
-                 end;
-             end
-           else
-             begin
-               loc:=LOC_REFERENCE;
-               reference.index:=NR_STACK_POINTER_REG;
-               reference.offset:=sizeof(aint)*nr;
-             end;
-          end;
-      end;
 
 
     function  ti386paramanager.get_funcretloc(p : tabstractprocdef; side: tcallercallee; forcetempdef: tdef): TCGPara;

+ 17 - 7
compiler/i386/csopt386.pas

@@ -353,6 +353,7 @@ var
 
   function getNextRegToTest(var prev: tai; currentReg: tsuperregister): tsuperregister;
   begin
+    getNextRegToTest := RS_INVALID;
     if not checkingPrevSequences then
       begin
         if (currentreg = RS_INVALID) then
@@ -423,9 +424,13 @@ begin {CheckSequence}
   TmpResult := False;
   FillChar(OrgRegInfo, Sizeof(OrgRegInfo), 0);
   FillChar(startRegInfo, sizeof(startRegInfo), 0);
+  FillChar(HighRegInfo, sizeof(HighRegInfo), 0);
+  FillChar(prevreginfo, sizeof(prevreginfo), 0);
   OrgRegFound := 0;
   HighFound := 0;
   OrgRegResult := False;
+  highPrev := nil;
+  orgPrev := nil;
   with startRegInfo do
     begin
       newRegsEncountered := [RS_EBP, RS_ESP];
@@ -1074,6 +1079,7 @@ var
   prev: tai;
   newOrgRegRState, newOrgRegWState: byte;
 begin
+  newOrgRegwState := 0;
   if getLastInstruction(hp,prev) then
     with ptaiprop(prev.optinfo)^ do
       begin
@@ -1429,6 +1435,7 @@ var
 
 begin
   replacereg := false;
+  readStateChanged := false;
   if canreplacereg(orgsupreg,newsupreg,p,orgregcanbemodified,newregmodified, orgregread, removelast,endp) then
     begin
 {$ifdef replaceregdebug}
@@ -1595,12 +1602,12 @@ var
   regcounter: tsuperregister;
   optimizable: boolean;
 begin
+  memtoreg := NR_NO;
+
   if not getlastinstruction(t,hp) or
      not issimplememloc(ref) then
-    begin
-      memtoreg := NR_NO;
-      exit;
-    end;
+    exit;
+
   p := ptaiprop(hp.optinfo);
   optimizable := false;
   for regcounter := RS_EAX to RS_EDI do
@@ -1832,15 +1839,18 @@ procedure doCSE(asml: TAsmList; First, Last: tai; findPrevSeqs, doSubOpts: boole
  removed immediately because sometimes an instruction needs to be checked in
  two different sequences}
 var cnt, cnt2, {cnt3,} orgNrofMods: longint;
-    p, hp1, hp2, prevSeq: tai;
-    hp4: tai;
-    hp5 : tai;
+    p, hp1, hp2, hp4, hp5, prevSeq: tai;
     reginfo: toptreginfo;
     memreg: tregister;
     regcounter: tsuperregister;
 begin
   p := First;
   SkipHead(p);
+  hp1 := nil;
+  hp2 := nil;
+  hp4 := nil;
+  hp5 := nil;
+  cnt := 0;
   while (p <> Last) do
     begin
       case p.typ of

+ 1 - 0
compiler/i386/daopt386.pas

@@ -901,6 +901,7 @@ var
   Cnt: Word;
 begin
   TmpResult := False;
+  Result := False;
   if supreg = RS_INVALID then
     exit;
   if (p1.typ = ait_instruction) then

+ 5 - 5
compiler/i386/hlcgcpu.pas

@@ -78,10 +78,10 @@ implementation
                      (cgpara.location^.reference.index=NR_STACK_POINTER_REG) then
                     begin
                       cg.g_stackpointer_alloc(list,stacksize);
-                      reference_reset_base(href,NR_STACK_POINTER_REG,0,sizeof(pint));
+                      reference_reset_base(href,voidstackpointertype,NR_STACK_POINTER_REG,0,voidstackpointertype.size);
                     end
                   else
-                    reference_reset_base(href,cgpara.location^.reference.index,cgpara.location^.reference.offset,cgpara.alignment);
+                    reference_reset_base(href,voidstackpointertype,cgpara.location^.reference.index,cgpara.location^.reference.offset,cgpara.alignment);
                   cg.a_loadfpu_reg_ref(list,locsize,locsize,l.register,href);
                 end;
               LOC_FPUREGISTER:
@@ -123,10 +123,10 @@ implementation
                      (cgpara.location^.reference.index=NR_STACK_POINTER_REG) then
                     begin
                       cg.g_stackpointer_alloc(list,stacksize);
-                      reference_reset_base(href,NR_STACK_POINTER_REG,0,sizeof(pint));
+                      reference_reset_base(href,voidstackpointertype,NR_STACK_POINTER_REG,0,voidstackpointertype.size);
                     end
                   else
-                    reference_reset_base(href,cgpara.location^.reference.index,cgpara.location^.reference.offset,cgpara.alignment);
+                    reference_reset_base(href,voidstackpointertype,cgpara.location^.reference.index,cgpara.location^.reference.offset,cgpara.alignment);
                   cg.a_loadmm_reg_ref(list,locsize,locsize,l.register,href,mms_movescalar);
                 end;
               LOC_FPUREGISTER:
@@ -152,7 +152,7 @@ implementation
                     cg.a_load_ref_cgpara(list,locsize,l.reference,cgpara)
                   else
                     begin
-                      reference_reset_base(href,cgpara.location^.reference.index,cgpara.location^.reference.offset,cgpara.alignment);
+                      reference_reset_base(href,voidstackpointertype,cgpara.location^.reference.index,cgpara.location^.reference.offset,cgpara.alignment);
                       cg.g_concatcopy(list,l.reference,href,stacksize);
                     end;
                 end;

+ 228 - 85
compiler/i386/n386add.pas

@@ -31,10 +31,14 @@ interface
     type
        ti386addnode = class(tx86addnode)
          function use_generic_mul32to64: boolean; override;
+         function use_generic_mul64bit: boolean; override;
          procedure second_addordinal; override;
          procedure second_add64bit;override;
          procedure second_cmp64bit;override;
          procedure second_mul(unsigned: boolean);
+         procedure second_mul64bit;
+       protected
+         procedure set_mul_result_location;
        end;
 
   implementation
@@ -58,6 +62,12 @@ interface
       result := False;
     end;
 
+    function ti386addnode.use_generic_mul64bit: boolean;
+    begin
+      result:=(cs_check_overflow in current_settings.localswitches) or
+        (cs_opt_size in current_settings.optimizerswitches);
+    end;
+
     { handles all unsigned multiplications, and 32->64 bit signed ones.
       32bit-only signed mul is handled by generic codegen }
     procedure ti386addnode.second_addordinal;
@@ -66,6 +76,11 @@ interface
     begin
       unsigned:=not(is_signed(left.resultdef)) or
                 not(is_signed(right.resultdef));
+      { use IMUL instead of MUL in case overflow checking is off and we're
+        doing a 32->32-bit multiplication }
+      if not (cs_check_overflow in current_settings.localswitches) and
+         not is_64bit(resultdef) then
+        unsigned:=false;
       if (nodetype=muln) and (unsigned or is_64bit(resultdef)) then
         second_mul(unsigned)
       else
@@ -117,6 +132,11 @@ interface
             op:=OP_OR;
           andn:
             op:=OP_AND;
+          muln:
+            begin
+              second_mul64bit;
+              exit;
+            end
           else
             begin
               { everything should be handled in pass_1 (JM) }
@@ -209,8 +229,7 @@ interface
 
     procedure ti386addnode.second_cmp64bit;
       var
-        hregister,
-        hregister2 : tregister;
+        hlab       : tasmlabel;
         href       : treference;
         unsigned   : boolean;
 
@@ -227,10 +246,12 @@ interface
            case nodetype of
               ltn,gtn:
                 begin
-                   cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(unsigned),current_procinfo.CurrTrueLabel);
+                   if (hlab<>current_procinfo.CurrTrueLabel) then
+                     cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(unsigned),current_procinfo.CurrTrueLabel);
                    { cheat a little bit for the negative test }
                    toggleflag(nf_swapped);
-                   cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(unsigned),current_procinfo.CurrFalseLabel);
+                   if (hlab<>current_procinfo.CurrFalseLabel) then
+                     cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(unsigned),current_procinfo.CurrFalseLabel);
                    toggleflag(nf_swapped);
                 end;
               lten,gten:
@@ -240,13 +261,15 @@ interface
                      nodetype:=ltn
                    else
                      nodetype:=gtn;
-                   cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(unsigned),current_procinfo.CurrTrueLabel);
+                   if (hlab<>current_procinfo.CurrTrueLabel) then
+                     cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(unsigned),current_procinfo.CurrTrueLabel);
                    { cheat for the negative test }
                    if nodetype=ltn then
                      nodetype:=gtn
                    else
                      nodetype:=ltn;
-                   cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(unsigned),current_procinfo.CurrFalseLabel);
+                   if (hlab<>current_procinfo.CurrFalseLabel) then
+                     cg.a_jmp_flags(current_asmdata.CurrAsmList,getresflags(unsigned),current_procinfo.CurrFalseLabel);
                    nodetype:=oldnodetype;
                 end;
               equaln:
@@ -289,24 +312,46 @@ interface
                   ((right.resultdef.typ=orddef) and
                    (torddef(right.resultdef).ordtype=u64bit));
 
+        { we have LOC_JUMP as result }
+        location_reset(location,LOC_JUMP,OS_NO);
+
+        { Relational compares against constants having low dword=0 can omit the
+          second compare based on the fact that any unsigned value is >=0 }
+        hlab:=nil;
+        if (right.location.loc=LOC_CONSTANT) and
+           (lo(right.location.value64)=0) then
+          begin
+            case getresflags(true) of
+              F_AE: hlab:=current_procinfo.CurrTrueLabel;
+              F_B:  hlab:=current_procinfo.CurrFalseLabel;
+            end;
+          end;
+
+        if (right.location.loc=LOC_CONSTANT) and
+           (left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
+          begin
+            tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
+            href:=left.location.reference;
+            inc(href.offset,4);
+            emit_const_ref(A_CMP,S_L,aint(hi(right.location.value64)),href);
+            firstjmp64bitcmp;
+            if assigned(hlab) then
+              cg.a_jmp_always(current_asmdata.CurrAsmList,hlab)
+            else
+              begin
+                emit_const_ref(A_CMP,S_L,aint(lo(right.location.value64)),left.location.reference);
+                secondjmp64bitcmp;
+              end;
+            location_freetemp(current_asmdata.CurrAsmList,left.location);
+            exit;
+          end;
+
         { left and right no register?  }
         { then one must be demanded    }
-        if (left.location.loc<>LOC_REGISTER) then
+        if not (left.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
          begin
-           if (right.location.loc<>LOC_REGISTER) then
-            begin
-              { we can reuse a CREGISTER for comparison }
-              if (left.location.loc<>LOC_CREGISTER) then
-               begin
-                 hregister:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-                 hregister2:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-                 cg64.a_load64_loc_reg(current_asmdata.CurrAsmList,left.location,joinreg64(hregister,hregister2));
-                 location_freetemp(current_asmdata.CurrAsmList,left.location);
-                 location_reset(left.location,LOC_REGISTER,left.location.size);
-                 left.location.register64.reglo:=hregister;
-                 left.location.register64.reghi:=hregister2;
-               end;
-            end
+           if not (right.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
+             hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,true)
            else
             begin
               location_swap(left.location,right.location);
@@ -314,51 +359,44 @@ interface
             end;
          end;
 
-        { at this point, left.location.loc should be LOC_REGISTER }
-        if right.location.loc=LOC_REGISTER then
-         begin
-           emit_reg_reg(A_CMP,S_L,right.location.register64.reghi,left.location.register64.reghi);
-           firstjmp64bitcmp;
-           emit_reg_reg(A_CMP,S_L,right.location.register64.reglo,left.location.register64.reglo);
-           secondjmp64bitcmp;
-         end
+        { at this point, left.location.loc should be LOC_[C]REGISTER }
+        case right.location.loc of
+          LOC_REGISTER,
+          LOC_CREGISTER :
+            begin
+              emit_reg_reg(A_CMP,S_L,right.location.register64.reghi,left.location.register64.reghi);
+              firstjmp64bitcmp;
+              emit_reg_reg(A_CMP,S_L,right.location.register64.reglo,left.location.register64.reglo);
+              secondjmp64bitcmp;
+            end;
+          LOC_CREFERENCE,
+          LOC_REFERENCE :
+            begin
+              tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
+              href:=right.location.reference;
+              inc(href.offset,4);
+              emit_ref_reg(A_CMP,S_L,href,left.location.register64.reghi);
+              firstjmp64bitcmp;
+              emit_ref_reg(A_CMP,S_L,right.location.reference,left.location.register64.reglo);
+              secondjmp64bitcmp;
+              location_freetemp(current_asmdata.CurrAsmList,right.location);
+            end;
+          LOC_CONSTANT :
+            begin
+              current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_CMP,S_L,aint(hi(right.location.value64)),left.location.register64.reghi));
+              firstjmp64bitcmp;
+              if assigned(hlab) then
+                cg.a_jmp_always(current_asmdata.CurrAsmList,hlab)
+              else
+                begin
+                  current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_CMP,S_L,aint(lo(right.location.value64)),left.location.register64.reglo));
+                  secondjmp64bitcmp;
+                end;
+            end;
         else
-         begin
-           case right.location.loc of
-             LOC_CREGISTER :
-               begin
-                 emit_reg_reg(A_CMP,S_L,right.location.register64.reghi,left.location.register64.reghi);
-                 firstjmp64bitcmp;
-                 emit_reg_reg(A_CMP,S_L,right.location.register64.reglo,left.location.register64.reglo);
-                 secondjmp64bitcmp;
-               end;
-             LOC_CREFERENCE,
-             LOC_REFERENCE :
-               begin
-                 tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
-                 href:=right.location.reference;
-                 inc(href.offset,4);
-                 emit_ref_reg(A_CMP,S_L,href,left.location.register64.reghi);
-                 firstjmp64bitcmp;
-                 emit_ref_reg(A_CMP,S_L,right.location.reference,left.location.register64.reglo);
-                 secondjmp64bitcmp;
-                 cg.a_jmp_always(current_asmdata.CurrAsmList,current_procinfo.CurrFalseLabel);
-                 location_freetemp(current_asmdata.CurrAsmList,right.location);
-               end;
-             LOC_CONSTANT :
-               begin
-                 current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_CMP,S_L,aint(hi(right.location.value64)),left.location.register64.reghi));
-                 firstjmp64bitcmp;
-                 current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_CMP,S_L,aint(lo(right.location.value64)),left.location.register64.reglo));
-                 secondjmp64bitcmp;
-               end;
-             else
-               internalerror(200203282);
-           end;
-         end;
+          internalerror(200203282);
+        end;
 
-        { we have LOC_JUMP as result }
-        location_reset(location,LOC_JUMP,OS_NO)
       end;
 
 
@@ -366,6 +404,32 @@ interface
                                 x86 MUL
 *****************************************************************************}
 
+    procedure ti386addnode.set_mul_result_location;
+    begin
+      location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
+      {Free EAX,EDX}
+      cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
+      if is_64bit(resultdef) then
+      begin
+        {Allocate a couple of registers and store EDX:EAX into it}
+        location.register64.reghi := cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
+        cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, NR_EDX, location.register64.reghi);
+        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
+        location.register64.reglo := cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
+        cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, NR_EAX, location.register64.reglo);
+      end
+      else
+      begin
+        {Allocate a new register and store the result in EAX in it.}
+        location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
+        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
+        cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EAX,location.register);
+      end;
+      location_freetemp(current_asmdata.CurrAsmList,left.location);
+      location_freetemp(current_asmdata.CurrAsmList,right.location);
+    end;
+
+
     procedure ti386addnode.second_mul(unsigned: boolean);
 
     var reg:Tregister;
@@ -379,8 +443,6 @@ interface
     begin
       pass_left_right;
 
-      {The location.register will be filled in later (JM)}
-      location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
       { Mul supports registers and references, so if not register/reference,
         load the location into a register.
         The variant of IMUL which is capable of doing 32->64 bits has the same restrictions. }
@@ -418,26 +480,107 @@ interface
           cg.a_call_name(current_asmdata.CurrAsmList,'FPC_OVERFLOW',false);
           cg.a_label(current_asmdata.CurrAsmList,hl4);
         end;
-      {Free EAX,EDX}
-      cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
-      if is_64bit(resultdef) then
-      begin
-        {Allocate a couple of registers and store EDX:EAX into it}
-        location.register64.reghi := cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-        cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, NR_EDX, location.register64.reghi);
-        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
-        location.register64.reglo := cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-        cg.a_load_reg_reg(current_asmdata.CurrAsmList, OS_INT, OS_INT, NR_EAX, location.register64.reglo);
-      end
+      set_mul_result_location;
+    end;
+
+
+    procedure ti386addnode.second_mul64bit;
+    var
+      list: TAsmList;
+      hreg1,hreg2: tregister;
+    begin
+      { 64x64 multiplication yields 128-bit result, but we're only
+        interested in its lower 64 bits. This lower part is independent
+        of operand signs, and so is the generated code. }
+      { pass_left_right already called from second_add64bit }
+      list:=current_asmdata.CurrAsmList;
+      if left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
+        tcgx86(cg).make_simple_ref(list,left.location.reference);
+      if right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
+        tcgx86(cg).make_simple_ref(list,right.location.reference);
+
+      { calculate 32-bit terms lo(right)*hi(left) and hi(left)*lo(right) }
+      if (right.location.loc=LOC_CONSTANT) then
+        begin
+          { Omit zero terms, if any }
+          hreg1:=NR_NO;
+          hreg2:=NR_NO;
+          if lo(right.location.value64)<>0 then
+            hreg1:=cg.getintregister(list,OS_INT);
+          if hi(right.location.value64)<>0 then
+            hreg2:=cg.getintregister(list,OS_INT);
+
+          { Take advantage of 3-operand form of IMUL }
+          case left.location.loc of
+            LOC_REGISTER,LOC_CREGISTER:
+              begin
+                if hreg1<>NR_NO then
+                  emit_const_reg_reg(A_IMUL,S_L,longint(lo(right.location.value64)),left.location.register64.reghi,hreg1);
+                if hreg2<>NR_NO then
+                  emit_const_reg_reg(A_IMUL,S_L,longint(hi(right.location.value64)),left.location.register64.reglo,hreg2);
+              end;
+            LOC_REFERENCE,LOC_CREFERENCE:
+              begin
+                if hreg2<>NR_NO then
+                  list.concat(taicpu.op_const_ref_reg(A_IMUL,S_L,longint(hi(right.location.value64)),left.location.reference,hreg2));
+                inc(left.location.reference.offset,4);
+                if hreg1<>NR_NO then
+                  list.concat(taicpu.op_const_ref_reg(A_IMUL,S_L,longint(lo(right.location.value64)),left.location.reference,hreg1));
+                dec(left.location.reference.offset,4);
+              end;
+          else
+            InternalError(2014011602);
+          end;
+        end
       else
-      begin
-        {Allocate a new register and store the result in EAX in it.}
-        location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
-        cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EAX,location.register);
-      end;
-      location_freetemp(current_asmdata.CurrAsmList,left.location);
-      location_freetemp(current_asmdata.CurrAsmList,right.location);
+        begin
+          hreg1:=cg.getintregister(list,OS_INT);
+          hreg2:=cg.getintregister(list,OS_INT);
+          cg64.a_load64low_loc_reg(list,left.location,hreg1);
+          cg64.a_load64high_loc_reg(list,left.location,hreg2);
+          case right.location.loc of
+            LOC_REGISTER,LOC_CREGISTER:
+              begin
+                emit_reg_reg(A_IMUL,S_L,right.location.register64.reghi,hreg1);
+                emit_reg_reg(A_IMUL,S_L,right.location.register64.reglo,hreg2);
+              end;
+            LOC_REFERENCE,LOC_CREFERENCE:
+              begin
+                emit_ref_reg(A_IMUL,S_L,right.location.reference,hreg2);
+                inc(right.location.reference.offset,4);
+                emit_ref_reg(A_IMUL,S_L,right.location.reference,hreg1);
+                dec(right.location.reference.offset,4);
+              end;
+          else
+            InternalError(2014011603);
+          end;
+        end;
+      { add hi*lo and lo*hi terms together }
+      if (hreg1<>NR_NO) and (hreg2<>NR_NO) then
+        emit_reg_reg(A_ADD,S_L,hreg2,hreg1);
+
+      { load lo(right) into EAX }
+      cg.getcpuregister(list,NR_EAX);
+      cg64.a_load64low_loc_reg(list,right.location,NR_EAX);
+
+      { multiply EAX by lo(left), producing 64-bit value in EDX:EAX }
+      cg.getcpuregister(list,NR_EDX);
+      if (left.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
+        emit_reg(A_MUL,S_L,left.location.register64.reglo)
+      else if (left.location.loc in [LOC_REFERENCE,LOC_CREFERENCE]) then
+        emit_ref(A_MUL,S_L,left.location.reference)
+      else
+        InternalError(2014011604);
+      { add previously calculated terms to the high half }
+      if (hreg1<>NR_NO) then
+        emit_reg_reg(A_ADD,S_L,hreg1,NR_EDX)
+      else if (hreg2<>NR_NO) then
+        emit_reg_reg(A_ADD,S_L,hreg2,NR_EDX)
+      else
+        InternalError(2014011604);
+
+      { Result is now in EDX:EAX. Copy it to virtual registers. }
+      set_mul_result_location;
     end;
 
 

+ 2 - 1
compiler/i386/n386cal.pas

@@ -93,7 +93,8 @@ implementation
           it is always the first parameter (apart from hidden parentfp,
           but this one is never put into a register (vs_nonregable set)
           so funcret is always in EAX for register calling }
-        if (target_info.system = system_i386_win32) and
+        if ((target_info.system = system_i386_win32) and
+            not (target_info.abi=abi_old_win32_gnu)) and
             paramanager.ret_in_param(procdefinition.returndef,procdefinition) and
             not ((procdefinition.proccalloption=pocall_register) or
                  ((procdefinition.proccalloption=pocall_internproc) and

+ 685 - 0
compiler/i386/n386flw.pas

@@ -0,0 +1,685 @@
+{
+    Copyright (c) 2011 by Free Pascal development team
+
+    Generate Win32-specific exception handling code
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ ****************************************************************************
+}
+unit n386flw;
+
+{$i fpcdefs.inc}
+
+interface
+
+  uses
+    node,nflw,ncgflw,psub;
+
+  type
+    ti386raisenode=class(tcgraisenode)
+      function pass_1 : tnode;override;
+    end;
+
+    ti386onnode=class(tcgonnode)
+      procedure pass_generate_code;override;
+    end;
+
+    ti386tryexceptnode=class(tcgtryexceptnode)
+      procedure pass_generate_code;override;
+    end;
+
+    ti386tryfinallynode=class(tcgtryfinallynode)
+      finalizepi: tcgprocinfo;
+      constructor create(l,r:TNode);override;
+      constructor create_implicit(l,r,_t1:TNode);override;
+      function pass_1: tnode;override;
+      function simplify(forinline: boolean): tnode;override;
+      procedure pass_generate_code;override;
+    end;
+
+implementation
+
+  uses
+    cutils,globtype,globals,verbose,systems,
+    nbas,ncal,nmem,nutils,
+    symconst,symbase,symtable,symsym,symdef,
+    cgbase,cgobj,cgcpu,cgutils,tgobj,
+    cpubase,htypechk,
+    parabase,paramgr,pdecsub,pass_1,pass_2,ncgutil,cga,
+    aasmbase,aasmtai,aasmdata,aasmcpu,procinfo,cpupi;
+
+  var
+    endexceptlabel: tasmlabel;
+
+
+{ ti386raisenode }
+
+function ti386raisenode.pass_1 : tnode;
+  var
+    statements : tstatementnode;
+    raisenode : tcallnode;
+  begin
+    { difference from generic code is that address stack is not popped on reraise }
+    if (target_info.system<>system_i386_win32) or assigned(left) then
+      result:=inherited pass_1
+    else
+      begin
+        result:=internalstatements(statements);
+        raisenode:=ccallnode.createintern('fpc_reraise',nil);
+        include(raisenode.callnodeflags,cnf_call_never_returns);
+        addstatement(statements,raisenode);
+      end;
+end;
+
+{ ti386onnode }
+
+procedure ti386onnode.pass_generate_code;
+  var
+    oldflowcontrol : tflowcontrol;
+    exceptvarsym : tlocalvarsym;
+  begin
+    if (target_info.system<>system_i386_win32) then
+      begin
+        inherited pass_generate_code;
+        exit;
+      end;
+
+    location_reset(location,LOC_VOID,OS_NO);
+
+    oldflowcontrol:=flowcontrol;
+    flowcontrol:=flowcontrol*[fc_unwind]+[fc_inflowcontrol];
+
+    { RTL will put exceptobject into EAX when jumping here }
+    cg.a_reg_alloc(current_asmdata.CurrAsmList,NR_FUNCTION_RESULT_REG);
+
+    { Retrieve exception variable }
+    if assigned(excepTSymtable) then
+      exceptvarsym:=tlocalvarsym(excepTSymtable.SymList[0])
+    else
+      exceptvarsym:=nil;
+
+    if assigned(exceptvarsym) then
+      begin
+        exceptvarsym.localloc.loc:=LOC_REFERENCE;
+        exceptvarsym.localloc.size:=OS_ADDR;
+        tg.GetLocal(current_asmdata.CurrAsmList,sizeof(pint),voidpointertype,exceptvarsym.localloc.reference);
+        cg.a_load_reg_ref(current_asmdata.CurrAsmList,OS_ADDR,OS_ADDR,NR_FUNCTION_RESULT_REG,exceptvarsym.localloc.reference);
+      end;
+    cg.a_reg_dealloc(current_asmdata.CurrAsmList,NR_FUNCTION_RESULT_REG);
+
+    if assigned(right) then
+      secondpass(right);
+
+    { deallocate exception symbol }
+    if assigned(exceptvarsym) then
+      begin
+        tg.UngetLocal(current_asmdata.CurrAsmList,exceptvarsym.localloc.reference);
+        exceptvarsym.localloc.loc:=LOC_INVALID;
+      end;
+    cg.g_call(current_asmdata.CurrAsmList,'FPC_DONEEXCEPTION');
+    cg.a_jmp_always(current_asmdata.CurrAsmList,endexceptlabel);
+
+    flowcontrol:=oldflowcontrol+(flowcontrol-[fc_inflowcontrol]);
+  end;
+
+{ ti386tryfinallynode }
+
+function reset_regvars(var n: tnode; arg: pointer): foreachnoderesult;
+  begin
+    case n.nodetype of
+      temprefn:
+        make_not_regable(n,[]);
+      calln:
+        include(tprocinfo(arg).flags,pi_do_call);
+    end;
+    result:=fen_true;
+  end;
+
+function copy_parasize(var n: tnode; arg: pointer): foreachnoderesult;
+  begin
+    case n.nodetype of
+      calln:
+        tcgprocinfo(arg).allocate_push_parasize(tcallnode(n).pushed_parasize);
+    end;
+    result:=fen_true;
+  end;
+
+constructor ti386tryfinallynode.create(l, r: TNode);
+  begin
+    inherited create(l,r);
+    if (target_info.system<>system_i386_win32) or (
+      { Don't create child procedures for generic methods, their nested-like
+        behavior causes compilation errors because real nested procedures
+        aren't allowed for generics. Not creating them doesn't harm because
+        generic node tree is discarded without generating code. }
+        assigned(current_procinfo.procdef.struct) and
+        (df_generic in current_procinfo.procdef.struct.defoptions)
+      ) then
+      exit;
+    finalizepi:=tcgprocinfo(cprocinfo.create(current_procinfo));
+    finalizepi.force_nested;
+    finalizepi.procdef:=create_finalizer_procdef;
+    finalizepi.entrypos:=r.fileinfo;
+    finalizepi.entryswitches:=r.localswitches;
+    finalizepi.exitpos:=current_filepos; // last_endtoken_pos?
+    finalizepi.exitswitches:=current_settings.localswitches;
+    { Regvar optimization for symbols is suppressed when using exceptions, but
+      temps may be still placed into registers. This must be fixed. }
+    foreachnodestatic(r,@reset_regvars,finalizepi);
+    include(finalizepi.flags,pi_has_assembler_block);
+    include(finalizepi.flags,pi_do_call);
+    include(finalizepi.flags,pi_uses_exceptions);
+  end;
+
+constructor ti386tryfinallynode.create_implicit(l, r, _t1: TNode);
+  begin
+    inherited create_implicit(l, r, _t1);
+    if (target_info.system<>system_i386_win32) then
+      exit;
+
+    { safecall procedures can handle implicit finalization as part of "except" flow }
+    if implicitframe and (current_procinfo.procdef.proccalloption=pocall_safecall) then
+      exit;
+
+    if assigned(current_procinfo.procdef.struct) and
+      (df_generic in current_procinfo.procdef.struct.defoptions) then
+      InternalError(2013012501);
+
+    finalizepi:=tcgprocinfo(cprocinfo.create(current_procinfo));
+    finalizepi.force_nested;
+    finalizepi.procdef:=create_finalizer_procdef;
+    finalizepi.entrypos:=current_filepos;
+    finalizepi.exitpos:=current_filepos; // last_endtoken_pos?
+    finalizepi.entryswitches:=r.localswitches;
+    finalizepi.exitswitches:=current_settings.localswitches;
+    include(finalizepi.flags,pi_has_assembler_block);
+    include(finalizepi.flags,pi_do_call);
+    include(finalizepi.flags,pi_uses_exceptions);
+  end;
+
+
+function ti386tryfinallynode.pass_1: tnode;
+  var
+    selfsym: tparavarsym;
+  begin
+    result:=inherited pass_1;
+    if (target_info.system=system_i386_win32) then
+      begin
+        { safecall method will access 'self' from except block -> make it non-regable }
+        if implicitframe and (current_procinfo.procdef.proccalloption=pocall_safecall) and
+          is_class(current_procinfo.procdef.struct) then
+          begin
+            selfsym:=tparavarsym(current_procinfo.procdef.parast.Find('self'));
+            if (selfsym=nil) or (selfsym.typ<>paravarsym) then
+              InternalError(2011123101);
+            selfsym.varregable:=vr_none;
+          end;
+      end;
+  end;
+
+
+function ti386tryfinallynode.simplify(forinline: boolean): tnode;
+  begin
+    result:=inherited simplify(forinline);
+    if (target_info.system<>system_i386_win32) then
+      exit;
+
+    if (result=nil) and assigned(finalizepi) then
+      begin
+        finalizepi.code:=right;
+        foreachnodestatic(right,@copy_parasize,finalizepi);
+        right:=ccallnode.create(nil,tprocsym(finalizepi.procdef.procsym),nil,nil,[]);
+        firstpass(right);
+        { For implicit frames, no actual code is available at this time,
+          it is added later in assembler form. So store the nested procinfo
+          for later use. }
+        if implicitframe then
+          begin
+            current_procinfo.finalize_procinfo:=finalizepi;
+            { don't leave dangling pointer }
+            tcgprocinfo(current_procinfo).final_asmnode:=nil;
+          end;
+      end;
+  end;
+
+
+procedure emit_scope_start(handler,data: TAsmSymbol);
+  var
+    href: treference;
+    hreg: tregister;
+  begin
+    hreg:=cg.getintregister(current_asmdata.CurrAsmList,OS_ADDR);
+    reference_reset_base(href,hreg,0,sizeof(pint));
+    href.segment:=NR_FS;
+    emit_reg_reg(A_XOR,S_L,hreg,hreg);
+    emit_sym(A_PUSH,S_L,data);
+    emit_reg(A_PUSH,S_L,NR_FRAME_POINTER_REG);
+    emit_sym(A_PUSH,S_L,handler);
+    emit_ref(A_PUSH,S_L,href);
+    emit_reg_ref(A_MOV,S_L,NR_ESP,href);
+  end;
+
+procedure emit_scope_end;
+  var
+    href: treference;
+    hreg,hreg2: tregister;
+  begin
+    hreg:=cg.getintregister(current_asmdata.CurrAsmList,OS_ADDR);
+    hreg2:=cg.getintregister(current_asmdata.CurrAsmList,OS_ADDR);
+    reference_reset_base(href,hreg,0,sizeof(pint));
+    href.segment:=NR_FS;
+    emit_reg_reg(A_XOR,S_L,hreg,hreg);
+    emit_reg(A_POP,S_L,hreg2);
+    emit_const_reg(A_ADD,S_L,3*sizeof(pint),NR_ESP);
+    emit_reg_ref(A_MOV,S_L,hreg2,href);
+  end;
+
+procedure ti386tryfinallynode.pass_generate_code;
+  var
+    finallylabel,
+    exceptlabel,
+    safecalllabel,
+    endfinallylabel,
+    exitfinallylabel,
+    continuefinallylabel,
+    breakfinallylabel,
+    oldCurrExitLabel,
+    oldContinueLabel,
+    oldBreakLabel : tasmlabel;
+    oldflowcontrol,tryflowcontrol : tflowcontrol;
+    is_safecall: boolean;
+    hreg: tregister;
+  begin
+    if (target_info.system<>system_i386_win32) then
+      begin
+        inherited pass_generate_code;
+        exit;
+      end;
+    location_reset(location,LOC_VOID,OS_NO);
+    tryflowcontrol:=[];
+    oldBreakLabel:=nil;
+    oldContinueLabel:=nil;
+    continuefinallylabel:=nil;
+    breakfinallylabel:=nil;
+    exceptlabel:=nil;
+    safecalllabel:=nil;
+    is_safecall:=implicitframe and (current_procinfo.procdef.proccalloption=pocall_safecall);
+
+    { check if child nodes do a break/continue/exit }
+    oldflowcontrol:=flowcontrol;
+    flowcontrol:=[fc_inflowcontrol];
+    current_asmdata.getjumplabel(finallylabel);
+    current_asmdata.getjumplabel(endfinallylabel);
+
+    { the finally block must catch break, continue and exit }
+    { statements                                            }
+    oldCurrExitLabel:=current_procinfo.CurrExitLabel;
+    if implicitframe then
+      exitfinallylabel:=finallylabel
+    else
+      current_asmdata.getjumplabel(exitfinallylabel);
+    current_procinfo.CurrExitLabel:=exitfinallylabel;
+    if assigned(current_procinfo.CurrBreakLabel) then
+      begin
+        oldContinueLabel:=current_procinfo.CurrContinueLabel;
+        oldBreakLabel:=current_procinfo.CurrBreakLabel;
+        if implicitframe then
+          begin
+            breakfinallylabel:=finallylabel;
+            continuefinallylabel:=finallylabel;
+          end
+        else
+          begin
+            current_asmdata.getjumplabel(breakfinallylabel);
+            current_asmdata.getjumplabel(continuefinallylabel);
+          end;
+        current_procinfo.CurrContinueLabel:=continuefinallylabel;
+        current_procinfo.CurrBreakLabel:=breakfinallylabel;
+      end;
+
+    { Start of scope }
+    if is_safecall then
+      begin
+        with cg.rg[R_INTREGISTER] do
+          used_in_proc:=used_in_proc+[RS_EBX,RS_ESI,RS_EDI];
+
+        current_asmdata.getjumplabel(exceptlabel);
+        emit_scope_start(
+          current_asmdata.RefAsmSymbol('__FPC_except_safecall'),
+          exceptlabel
+        );
+      end
+    else
+      emit_scope_start(
+        current_asmdata.RefAsmSymbol('__FPC_finally_handler'),
+        current_asmdata.RefAsmSymbol(finalizepi.procdef.mangledname)
+      );
+
+    { try code }
+    if assigned(left) then
+      begin
+        secondpass(left);
+        tryflowcontrol:=flowcontrol;
+        if codegenerror then
+          exit;
+      end;
+
+    { don't generate line info for internal cleanup }
+    current_asmdata.CurrAsmList.concat(tai_marker.create(mark_NoLineInfoStart));
+
+    cg.a_label(current_asmdata.CurrAsmList,finallylabel);
+    emit_scope_end;
+    if is_safecall then
+      begin
+        current_asmdata.getjumplabel(safecalllabel);
+        hreg:=cg.GetIntRegister(current_asmdata.CurrAsmList,OS_INT);
+        cg.a_load_const_reg(current_asmdata.CurrAsmList,OS_INT,0,hreg);
+        cg.a_jmp_always(current_asmdata.CurrAsmList,safecalllabel);
+        { RTL handler will jump here on exception }
+        cg.a_label(current_asmdata.CurrAsmList,exceptlabel);
+        handle_safecall_exception;
+        cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_FUNCTION_RESULT_REG,hreg);
+        cg.a_label(current_asmdata.CurrAsmList,safecalllabel);
+      end;
+
+    { end cleanup }
+    current_asmdata.CurrAsmList.concat(tai_marker.create(mark_NoLineInfoEnd));
+
+    { generate finally code as a separate procedure }
+    { !!! this resets flowcontrol, how to check flow away? }
+    if not implicitframe then
+      tcgprocinfo(current_procinfo).generate_exceptfilter(finalizepi);
+
+    flowcontrol:=[fc_inflowcontrol];
+    { right is a call to finalizer procedure }
+    secondpass(right);
+
+    { goto is allowed if it stays inside the finally block,
+      this is checked using the exception block number }
+    if (flowcontrol-[fc_gotolabel])<>[fc_inflowcontrol] then
+      CGMessage(cg_e_control_flow_outside_finally);
+    if codegenerror then
+      exit;
+
+    { don't generate line info for internal cleanup }
+    current_asmdata.CurrAsmList.concat(tai_marker.create(mark_NoLineInfoStart));
+
+    if not implicitframe then
+      begin
+        if tryflowcontrol*[fc_exit,fc_break,fc_continue]<>[] then
+          cg.a_jmp_always(current_asmdata.CurrAsmList,endfinallylabel);
+        { do some magic for exit,break,continue in the try block }
+        if fc_exit in tryflowcontrol then
+          begin
+            cg.a_label(current_asmdata.CurrAsmList,exitfinallylabel);
+            cg.g_call(current_asmdata.CurrAsmList,'_FPC_leave');
+            cg.a_jmp_always(current_asmdata.CurrAsmList,oldCurrExitLabel);
+          end;
+        if fc_break in tryflowcontrol then
+          begin
+            cg.a_label(current_asmdata.CurrAsmList,breakfinallylabel);
+            cg.g_call(current_asmdata.CurrAsmList,'_FPC_leave');
+            cg.a_jmp_always(current_asmdata.CurrAsmList,oldBreakLabel);
+          end;
+        if fc_continue in tryflowcontrol then
+          begin
+            cg.a_label(current_asmdata.CurrAsmList,continuefinallylabel);
+            cg.g_call(current_asmdata.CurrAsmList,'_FPC_leave');
+            cg.a_jmp_always(current_asmdata.CurrAsmList,oldContinueLabel);
+          end;
+      end;
+    if is_safecall then
+      cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,hreg,NR_FUNCTION_RETURN_REG);
+    cg.a_label(current_asmdata.CurrAsmList,endfinallylabel);
+
+    { end cleanup }
+    current_asmdata.CurrAsmList.concat(tai_marker.create(mark_NoLineInfoEnd));
+
+    current_procinfo.CurrExitLabel:=oldCurrExitLabel;
+    if assigned(current_procinfo.CurrBreakLabel) then
+      begin
+        current_procinfo.CurrContinueLabel:=oldContinueLabel;
+        current_procinfo.CurrBreakLabel:=oldBreakLabel;
+      end;
+    flowcontrol:=oldflowcontrol+(tryflowcontrol-[fc_inflowcontrol]);
+  end;
+
+{ ti386tryexceptnode }
+
+procedure ti386tryexceptnode.pass_generate_code;
+  var
+    exceptlabel,oldendexceptlabel,
+    lastonlabel,
+    exitexceptlabel,
+    continueexceptlabel,
+    breakexceptlabel,
+    exittrylabel,
+    continuetrylabel,
+    breaktrylabel,
+    oldCurrExitLabel,
+    oldContinueLabel,
+    oldBreakLabel : tasmlabel;
+    onlabel,
+    filterlabel: tasmlabel;
+    oldflowcontrol,tryflowcontrol,
+    exceptflowcontrol : tflowcontrol;
+    hnode : tnode;
+    hlist : tasmlist;
+    onnodecount : tai_const;
+  label
+    errorexit;
+  begin
+    if (target_info.system<>system_i386_win32) then
+      begin
+        inherited pass_generate_code;
+        exit;
+      end;
+    location_reset(location,LOC_VOID,OS_NO);
+
+    oldflowcontrol:=flowcontrol;
+    flowcontrol:=[fc_inflowcontrol];
+    { this can be called recursivly }
+    oldBreakLabel:=nil;
+    oldContinueLabel:=nil;
+    oldendexceptlabel:=endexceptlabel;
+
+    { Win32 SEH unwinding does not preserve registers. Indicate that they are
+      going to be destroyed. }
+    cg.alloccpuregisters(current_asmdata.CurrAsmList,R_INTREGISTER,[RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]);
+    cg.dealloccpuregisters(current_asmdata.CurrAsmList,R_INTREGISTER,[RS_EAX,RS_EBX,RS_ECX,RS_EDX,RS_ESI,RS_EDI]);
+
+    { save the old labels for control flow statements }
+    oldCurrExitLabel:=current_procinfo.CurrExitLabel;
+    if assigned(current_procinfo.CurrBreakLabel) then
+      begin
+        oldContinueLabel:=current_procinfo.CurrContinueLabel;
+        oldBreakLabel:=current_procinfo.CurrBreakLabel;
+      end;
+
+    { get new labels for the control flow statements }
+    current_asmdata.getjumplabel(exittrylabel);
+    current_asmdata.getjumplabel(exitexceptlabel);
+    if assigned(current_procinfo.CurrBreakLabel) then
+      begin
+        current_asmdata.getjumplabel(breaktrylabel);
+        current_asmdata.getjumplabel(continuetrylabel);
+        current_asmdata.getjumplabel(breakexceptlabel);
+        current_asmdata.getjumplabel(continueexceptlabel);
+      end;
+
+    current_asmdata.getjumplabel(exceptlabel);
+    current_asmdata.getjumplabel(endexceptlabel);
+    current_asmdata.getjumplabel(lastonlabel);
+    filterlabel:=nil;
+
+    { start of scope }
+    if assigned(right) then
+      begin
+        current_asmdata.getdatalabel(filterlabel);
+        emit_scope_start(
+          current_asmdata.RefAsmSymbol('__FPC_on_handler'),
+          filterlabel);
+      end
+    else
+      emit_scope_start(
+        current_asmdata.RefAsmSymbol('__FPC_except_handler'),
+        exceptlabel);
+
+    { set control flow labels for the try block }
+    current_procinfo.CurrExitLabel:=exittrylabel;
+    if assigned(oldBreakLabel) then
+      begin
+        current_procinfo.CurrContinueLabel:=continuetrylabel;
+        current_procinfo.CurrBreakLabel:=breaktrylabel;
+      end;
+
+    secondpass(left);
+    tryflowcontrol:=flowcontrol;
+    if codegenerror then
+      goto errorexit;
+
+    emit_scope_end;
+    { jump over except handlers }
+    cg.a_jmp_always(current_asmdata.CurrAsmList,endexceptlabel);
+
+    if fc_exit in tryflowcontrol then
+      begin
+        cg.a_label(current_asmdata.CurrAsmList,exittrylabel);
+        emit_scope_end;
+        cg.a_jmp_always(current_asmdata.CurrAsmList,oldCurrExitLabel);
+      end;
+    if fc_break in tryflowcontrol then
+      begin
+        cg.a_label(current_asmdata.CurrAsmList,breaktrylabel);
+        emit_scope_end;
+        cg.a_jmp_always(current_asmdata.CurrAsmList,oldBreakLabel);
+      end;
+    if fc_continue in tryflowcontrol then
+      begin
+        cg.a_label(current_asmdata.CurrAsmList,continuetrylabel);
+        emit_scope_end;
+        cg.a_jmp_always(current_asmdata.CurrAsmList,oldContinueLabel);
+      end;
+
+    { target for catch-all handler }
+    cg.a_label(current_asmdata.CurrAsmList,exceptlabel);
+
+    { set control flow labels for the except block }
+    { and the on statements                        }
+    current_procinfo.CurrExitLabel:=exitexceptlabel;
+    if assigned(oldBreakLabel) then
+      begin
+        current_procinfo.CurrContinueLabel:=continueexceptlabel;
+        current_procinfo.CurrBreakLabel:=breakexceptlabel;
+      end;
+
+    flowcontrol:=[fc_inflowcontrol];
+    { on statements }
+    if assigned(right) then
+      begin
+        { emit filter table to a temporary asmlist }
+        hlist:=TAsmList.Create;
+        new_section(hlist,sec_rodata,filterlabel.name,4);
+        cg.a_label(hlist,filterlabel);
+        onnodecount:=tai_const.create_32bit(0);
+        hlist.concat(onnodecount);
+
+        hnode:=right;
+        while assigned(hnode) do
+          begin
+            if hnode.nodetype<>onn then
+              InternalError(2011103101);
+            { TODO: make it done without using global label }
+            current_asmdata.getglobaljumplabel(onlabel);
+            hlist.concat(tai_const.create_sym(current_asmdata.RefAsmSymbol(tonnode(hnode).excepttype.vmt_mangledname,AT_DATA)));
+            hlist.concat(tai_const.create_sym(onlabel));
+            cg.a_label(current_asmdata.CurrAsmList,onlabel);
+            secondpass(hnode);
+            inc(onnodecount.value);
+            hnode:=tonnode(hnode).left;
+          end;
+        { add 'else' node to the filter list, too }
+        if assigned(t1) then
+          begin
+            hlist.concat(tai_const.create_32bit(-1));
+            hlist.concat(tai_const.create_sym(lastonlabel));
+            inc(onnodecount.value);
+          end;
+        { now move filter table to permanent list all at once }
+        maybe_new_object_file(current_asmdata.asmlists[al_typedconsts]);
+        current_asmdata.asmlists[al_typedconsts].concatlist(hlist);
+        hlist.free;
+      end;
+
+    cg.a_label(current_asmdata.CurrAsmList,lastonlabel);
+    if assigned(t1) then
+      begin
+        { here we don't have to reset flowcontrol           }
+        { the default and on flowcontrols are handled equal }
+        secondpass(t1);
+        cg.g_call(current_asmdata.CurrAsmList,'FPC_DONEEXCEPTION');
+        if (flowcontrol*[fc_exit,fc_break,fc_continue]<>[]) then
+          cg.a_jmp_always(current_asmdata.CurrAsmList,endexceptlabel);
+      end;
+    exceptflowcontrol:=flowcontrol;
+
+    if fc_exit in exceptflowcontrol then
+      begin
+        { do some magic for exit in the try block }
+        cg.a_label(current_asmdata.CurrAsmList,exitexceptlabel);
+        cg.g_call(current_asmdata.CurrAsmList,'FPC_DONEEXCEPTION');
+        cg.a_jmp_always(current_asmdata.CurrAsmList,oldCurrExitLabel);
+      end;
+
+    if fc_break in exceptflowcontrol then
+      begin
+        cg.a_label(current_asmdata.CurrAsmList,breakexceptlabel);
+        cg.g_call(current_asmdata.CurrAsmList,'FPC_DONEEXCEPTION');
+        cg.a_jmp_always(current_asmdata.CurrAsmList,oldBreakLabel);
+      end;
+
+    if fc_continue in exceptflowcontrol then
+      begin
+        cg.a_label(current_asmdata.CurrAsmList,continueexceptlabel);
+        cg.g_call(current_asmdata.CurrAsmList,'FPC_DONEEXCEPTION');
+        cg.a_jmp_always(current_asmdata.CurrAsmList,oldContinueLabel);
+      end;
+
+    cg.a_label(current_asmdata.CurrAsmList,endexceptlabel);
+
+errorexit:
+    { restore all saved labels }
+    endexceptlabel:=oldendexceptlabel;
+
+    { restore the control flow labels }
+    current_procinfo.CurrExitLabel:=oldCurrExitLabel;
+    if assigned(oldBreakLabel) then
+      begin
+        current_procinfo.CurrContinueLabel:=oldContinueLabel;
+        current_procinfo.CurrBreakLabel:=oldBreakLabel;
+      end;
+
+    { return all used control flow statements }
+    flowcontrol:=oldflowcontrol+(exceptflowcontrol +
+      tryflowcontrol - [fc_inflowcontrol]);
+  end;
+
+initialization
+  craisenode:=ti386raisenode;
+  connode:=ti386onnode;
+  ctryexceptnode:=ti386tryexceptnode;
+  ctryfinallynode:=ti386tryfinallynode;
+end.

+ 86 - 2
compiler/i386/n386inl.pas

@@ -26,16 +26,100 @@ unit n386inl;
 interface
 
     uses
-       nx86inl;
+       node,nx86inl;
 
     type
        ti386inlinenode = class(tx86inlinenode)
+       public
+         function first_sar: tnode; override;
+         procedure second_rox_sar; override;
        end;
 
 implementation
 
   uses
-    ninl;
+    globtype,globals,
+    defutil,
+    aasmbase,aasmdata,
+    cgbase,pass_2,
+    cpuinfo,cpubase,
+    cga,cgutils,cgx86,cgobj,hlcgobj,
+    ninl,ncon,ncal;
+
+
+  function ti386inlinenode.first_sar: tnode;
+    begin
+      if is_64bitint(resultdef) and (
+        (inlinenumber=in_sar_x) or (
+          (inlinenumber=in_sar_x_y) and
+          (tcallparanode(left).left.nodetype=ordconstn)
+      )) then
+        begin
+          result:=nil;
+          expectloc:=LOC_REGISTER;
+        end
+      else
+        result:=inherited first_sar;
+    end;
+
+
+  procedure ti386inlinenode.second_rox_sar;
+    var
+      op1: tnode;
+      hreg64hi,hreg64lo: tregister;
+      v: aint;
+    begin
+      if is_64bitint(resultdef) and (
+        (inlinenumber=in_sar_x) or (
+          (inlinenumber=in_sar_x_y) and
+          (tcallparanode(left).left.nodetype=ordconstn)
+      )) then
+        begin
+          { x sar constant }
+          if (left.nodetype=callparan) and
+             assigned(tcallparanode(left).right) then
+            begin
+              op1:=tcallparanode(tcallparanode(left).right).left;
+              secondpass(tcallparanode(left).left);
+              v:=Tordconstnode(tcallparanode(left).left).value.svalue and 63;
+            end
+          else
+            begin
+              op1:=left;
+              v:=1;
+            end;
+          secondpass(op1);
+
+          location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
+
+          { load left operator in a register }
+          hlcg.location_force_reg(current_asmdata.CurrAsmList,op1.location,op1.resultdef,resultdef,false);
+          hreg64hi:=op1.location.register64.reghi;
+          hreg64lo:=op1.location.register64.reglo;
+
+          if (v=63) then
+            begin
+              emit_const_reg(A_SAR,S_L,31,hreg64hi);
+              cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_32,OS_32,hreg64hi,hreg64lo);
+            end
+          else if (v>31) then
+            begin
+              cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_32,OS_32,hreg64hi,hreg64lo);
+              emit_const_reg(A_SAR,S_L,31,hreg64hi);
+              emit_const_reg(A_SAR,S_L,v and 31,hreg64lo);
+            end
+          else
+            begin
+              emit_const_reg_reg(A_SHRD,S_L,v and 31,hreg64hi,hreg64lo);
+              emit_const_reg(A_SAR,S_L,v and 31,hreg64hi);
+            end;
+          location.register64.reghi:=hreg64hi;
+          location.register64.reglo:=hreg64lo;
+        end
+      else
+        inherited second_rox_sar;
+    end;
+
 
 begin
    cinlinenode:=ti386inlinenode;

+ 61 - 0
compiler/i386/n386ld.pas

@@ -0,0 +1,61 @@
+{
+    Copyright (c) 1998-2014 by Florian Klaempfl
+
+    Generate i386 assembler for load nodes
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ ****************************************************************************
+}
+unit n386ld;
+
+{$i fpcdefs.inc}
+
+interface
+
+    uses
+      globtype,
+      symsym,
+      node,ncgld,pass_1;
+
+    type
+      ti386loadnode = class(tcgloadnode)
+         procedure generate_absaddr_access(vs: tabsolutevarsym); override;
+      end;
+
+
+implementation
+
+    uses
+      globals,
+      symcpu,
+      nld,
+      cpubase;
+
+{*****************************************************************************
+                            TI386LOADNODE
+*****************************************************************************}
+
+    procedure ti386loadnode.generate_absaddr_access(vs: tabsolutevarsym);
+      begin
+        if tcpuabsolutevarsym(symtableentry).absseg then
+          location.reference.segment:=NR_FS;
+        inherited;
+      end;
+
+
+begin
+   cloadnode:=ti386loadnode;
+end.

+ 27 - 275
compiler/i386/n386mat.pas

@@ -29,7 +29,7 @@ interface
       node,nmat,ncgmat,nx86mat;
 
     type
-      ti386moddivnode = class(tmoddivnode)
+      ti386moddivnode = class(tx86moddivnode)
          procedure pass_generate_code;override;
       end;
 
@@ -61,295 +61,47 @@ implementation
                              TI386MODDIVNODE
 *****************************************************************************}
 
-    function log2(i : dword) : dword;
-      begin
-        result:=0;
-        i:=i shr 1;
-        while i<>0 do
-          begin
-            i:=i shr 1;
-            inc(result);
-          end;
-      end;
 
 
    procedure ti386moddivnode.pass_generate_code;
       var
-        hreg1,hreg2:Tregister;
+        hreg1:Tregister;
         power:longint;
         hl:Tasmlabel;
-        op:Tasmop;
-        e : longint;
-        d,l,r,s,m,a,n,t : dword;
-        m_low,m_high,j,k : qword;
       begin
-        secondpass(left);
-        if codegenerror then
-          exit;
-        secondpass(right);
-        if codegenerror then
-          exit;
-
         if is_64bitint(resultdef) then
           { should be handled in pass_1 (JM) }
           internalerror(200109052);
-        { put numerator in register }
-        location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
-        hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,resultdef,false);
-        hreg1:=left.location.register;
 
-        if (nodetype=divn) and (right.nodetype=ordconstn) then
+        if (nodetype=divn) and (right.nodetype=ordconstn) and
+          is_signed(left.resultdef) and
+          ispowerof2(tordconstnode(right).value.svalue,power) and
+          ((current_settings.optimizecputype = cpu_386) or
+           (cs_opt_size in current_settings.optimizerswitches)) then
           begin
-            if ispowerof2(tordconstnode(right).value.svalue,power) then
-              begin
-                { for signed numbers, the numerator must be adjusted before the
-                  shift instruction, but not wih unsigned numbers! Otherwise,
-                  "Cardinal($ffffffff) div 16" overflows! (JM) }
-                if is_signed(left.resultdef) Then
-                  begin
-                    if (current_settings.optimizecputype <> cpu_386) and
-                       not(cs_opt_size in current_settings.optimizerswitches) then
-                      { use a sequence without jumps, saw this in
-                        comp.compilers (JM) }
-                      begin
-                        { no jumps, but more operations }
-                        hreg2:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-                        emit_reg_reg(A_MOV,S_L,hreg1,hreg2);
-                        {If the left value is signed, hreg2=$ffffffff, otherwise 0.}
-                        emit_const_reg(A_SAR,S_L,31,hreg2);
-                        {If signed, hreg2=right value-1, otherwise 0.}
-                        emit_const_reg(A_AND,S_L,tordconstnode(right).value.svalue-1,hreg2);
-                        { add to the left value }
-                        emit_reg_reg(A_ADD,S_L,hreg2,hreg1);
-                        { do the shift }
-                        emit_const_reg(A_SAR,S_L,power,hreg1);
-                      end
-                    else
-                      begin
-                        { a jump, but less operations }
-                        emit_reg_reg(A_TEST,S_L,hreg1,hreg1);
-                        current_asmdata.getjumplabel(hl);
-                        cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NS,hl);
-                        if power=1 then
-                          emit_reg(A_INC,S_L,hreg1)
-                        else
-                          emit_const_reg(A_ADD,S_L,tordconstnode(right).value.svalue-1,hreg1);
-                        cg.a_label(current_asmdata.CurrAsmList,hl);
-                        emit_const_reg(A_SAR,S_L,power,hreg1);
-                      end
-                  end
-                else
-                  emit_const_reg(A_SHR,S_L,power,hreg1);
-                location.register:=hreg1;
-              end
+            { signed divide-by-power-of-two optimized for size }
+            secondpass(left);
+            if codegenerror then
+              exit;
+            secondpass(right);
+            if codegenerror then
+              exit;
+            hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,resultdef,false);
+            hreg1:=left.location.register;
+            emit_reg_reg(A_TEST,S_L,hreg1,hreg1);
+            current_asmdata.getjumplabel(hl);
+            cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NS,hl);
+            if power=1 then
+              emit_reg(A_INC,S_L,hreg1)
             else
-              begin
-                if is_signed(left.resultdef) then
-                  begin
-                    e:=tordconstnode(right).value.svalue;
-                    d:=abs(e);
-                    { Determine algorithm (a), multiplier (m), and shift factor (s) for 32-bit
-                      signed integer division. Based on: Granlund, T.; Montgomery, P.L.:
-                      "Division by Invariant Integers using Multiplication". SIGPLAN Notices,
-                      Vol. 29, June 1994, page 61.
-                    }
-
-                    l:=log2(d);
-                    j:=qword($80000000) mod qword(d);
-                    k:=(qword(1) shl (32+l)) div (qword($80000000-j));
-                    m_low:=((qword(1)) shl (32+l)) div d;
-                    m_high:=(((qword(1)) shl (32+l)) + k) div d;
-                    while ((m_low shr 1) < (m_high shr 1)) and (l > 0) do
-                      begin
-                        m_low:=m_low shr 1;
-                        m_high:=m_high shr 1;
-                        dec(l);
-                      end;
-                    m:=dword(m_high);
-                    s:=l;
-                    if (m_high shr 31)<>0 then
-                      a:=1
-                    else
-                      a:=0;
-                    cg.getcpuregister(current_asmdata.CurrAsmList,NR_EAX);
-                    emit_const_reg(A_MOV,S_L,aint(m),NR_EAX);
-                    cg.getcpuregister(current_asmdata.CurrAsmList,NR_EDX);
-                    emit_reg(A_IMUL,S_L,hreg1);
-                    emit_reg_reg(A_MOV,S_L,hreg1,NR_EAX);
-                    if a<>0 then
-                      begin
-                        emit_reg_reg(A_ADD,S_L,NR_EAX,NR_EDX);
-                        {
-                          printf ("; dividend: memory location or register other than EAX or EDX\n");
-                          printf ("\n");
-                          printf ("MOV EAX, 0%08LXh\n", m);
-                          printf ("IMUL dividend\n");
-                          printf ("MOV EAX, dividend\n");
-                          printf ("ADD EDX, EAX\n");
-                          if (s) printf ("SAR EDX, %d\n", s);
-                          printf ("SHR EAX, 31\n");
-                          printf ("ADD EDX, EAX\n");
-                          if (e < 0) printf ("NEG EDX\n");
-                          printf ("\n");
-                          printf ("; quotient now in EDX\n");
-                        }
-                      end;
-                      {
-                        printf ("; dividend: memory location of register other than EAX or EDX\n");
-                        printf ("\n");
-                        printf ("MOV EAX, 0%08LXh\n", m);
-                        printf ("IMUL dividend\n");
-                        printf ("MOV EAX, dividend\n");
-                        if (s) printf ("SAR EDX, %d\n", s);
-                        printf ("SHR EAX, 31\n");
-                        printf ("ADD EDX, EAX\n");
-                        if (e < 0) printf ("NEG EDX\n");
-                        printf ("\n");
-                        printf ("; quotient now in EDX\n");
-                      }
-                    if s<>0 then
-                      emit_const_reg(A_SAR,S_L,s,NR_EDX);
-                    emit_const_reg(A_SHR,S_L,31,NR_EAX);
-                    emit_reg_reg(A_ADD,S_L,NR_EAX,NR_EDX);
-                    if e<0 then
-                      emit_reg(A_NEG,S_L,NR_EDX);
-                    cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
-                    cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
-                    location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-                    cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EDX,location.register)
-                  end
-                else
-                  begin
-                    d:=tordconstnode(right).value.svalue;
-                    if d>=$80000000 then
-                      begin
-                        emit_const_reg(A_CMP,S_L,aint(d),hreg1);
-                        location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-                        emit_const_reg(A_MOV,S_L,0,location.register);
-                        emit_const_reg(A_SBB,S_L,-1,location.register);
-                      end
-                    else
-                      begin
-                        { Reduce divisor until it becomes odd }
-                        n:=0;
-                        t:=d;
-                        while (t and 1)=0 do
-                          begin
-                            t:=t shr 1;
-                            inc(n);
-                          end;
-                        { Generate m, s for algorithm 0. Based on: Granlund, T.; Montgomery,
-                        P.L.: "Division by Invariant Integers using Multiplication".
-                        SIGPLAN Notices, Vol. 29, June 1994, page 61.
-                        }
-                        l:=log2(t)+1;
-                        j:=qword($ffffffff) mod qword(t);
-                        k:=(qword(1) shl (32+l)) div (qword($ffffffff-j));
-                        m_low:=((qword(1)) shl (32+l)) div t;
-                        m_high:=(((qword(1)) shl (32+l)) + k) div t;
-                        while ((m_low shr 1) < (m_high shr 1)) and (l>0) do
-                          begin
-                            m_low:=m_low shr 1;
-                            m_high:=m_high shr 1;
-                            l:=l-1;
-                          end;
-                        if (m_high shr 32)=0 then
-                          begin
-                            m:=dword(m_high);
-                            s:=l;
-                            a:=0;
-                          end
-
-                        { Generate m, s for algorithm 1. Based on: Magenheimer, D.J.; et al:
-                        "Integer Multiplication and Division on the HP Precision Architecture".
-                        IEEE Transactions on Computers, Vol 37, No. 8, August 1988, page 980.
-                        }
-                        else
-                          begin
-                            s:=log2(t);
-                            m_low:=(qword(1) shl (32+s)) div qword(t);
-                            r:=dword(((qword(1)) shl (32+s)) mod qword(t));
-                            if (r < ((t>>1)+1)) then
-                              m:=dword(m_low)
-                            else
-                              m:=dword(m_low)+1;
-                            a:=1;
-                          end;
-                        { Reduce multiplier for either algorithm to smallest possible }
-                        while (m and 1)=0 do
-                          begin
-                            m:=m shr 1;
-                            dec(s);
-                          end;
-                        { Adjust multiplier for reduction of even divisors }
-                        inc(s,n);
-                        cg.getcpuregister(current_asmdata.CurrAsmList,NR_EAX);
-                        emit_const_reg(A_MOV,S_L,aint(m),NR_EAX);
-                        cg.getcpuregister(current_asmdata.CurrAsmList,NR_EDX);
-                        emit_reg(A_MUL,S_L,hreg1);
-                        if a<>0 then
-                          begin
-                            {
-                            printf ("; dividend: register other than EAX or memory location\n");
-                            printf ("\n");
-                            printf ("MOV EAX, 0%08lXh\n", m);
-                            printf ("MUL dividend\n");
-                            printf ("ADD EAX, 0%08lXh\n", m);
-                            printf ("ADC EDX, 0\n");
-                            if (s) printf ("SHR EDX, %d\n", s);
-                            printf ("\n");
-                            printf ("; quotient now in EDX\n");
-                            }
-                            emit_const_reg(A_ADD,S_L,aint(m),NR_EAX);
-                            emit_const_reg(A_ADC,S_L,0,NR_EDX);
-                          end;
-                        if s<>0 then
-                          emit_const_reg(A_SHR,S_L,aint(s),NR_EDX);
-                        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
-                        cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
-                        location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-                        cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EDX,location.register)
-                      end;
-                  end
-              end
+              emit_const_reg(A_ADD,S_L,tordconstnode(right).value.svalue-1,hreg1);
+            cg.a_label(current_asmdata.CurrAsmList,hl);
+            emit_const_reg(A_SAR,S_L,power,hreg1);
+            location_reset(location,LOC_REGISTER,def_cgsize(resultdef));
+            location.register:=hreg1;
           end
         else
-          begin
-            cg.getcpuregister(current_asmdata.CurrAsmList,NR_EAX);
-            emit_reg_reg(A_MOV,S_L,hreg1,NR_EAX);
-            cg.getcpuregister(current_asmdata.CurrAsmList,NR_EDX);
-            {Sign extension depends on the left type.}
-            if torddef(left.resultdef).ordtype=u32bit then
-              emit_reg_reg(A_XOR,S_L,NR_EDX,NR_EDX)
-            else
-              emit_none(A_CDQ,S_NO);
-
-            {Division depends on the right type.}
-            if Torddef(right.resultdef).ordtype=u32bit then
-              op:=A_DIV
-            else
-              op:=A_IDIV;
-
-            if right.location.loc in [LOC_REFERENCE,LOC_CREFERENCE] then
-              emit_ref(op,S_L,right.location.reference)
-            else if right.location.loc in [LOC_REGISTER,LOC_CREGISTER] then
-              emit_reg(op,S_L,right.location.register)
-            else
-              begin
-                hreg1:=cg.getintregister(current_asmdata.CurrAsmList,right.location.size);
-                hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,u32inttype,right.location,hreg1);
-                emit_reg(op,S_L,hreg1);
-              end;
-
-            {Copy the result into a new register. Release EAX & EDX.}
-            cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EDX);
-            cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_EAX);
-            location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-            if nodetype=divn then
-              cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EAX,location.register)
-            else
-              cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,NR_EDX,location.register);
-          end;
+          inherited pass_generate_code;
       end;
 
 

+ 15 - 4
compiler/i386/n386mem.pas

@@ -28,10 +28,13 @@ interface
     uses
       globtype,
       cgbase,cpuinfo,cpubase,
-      node,nmem,ncgmem,nx86mem;
+      node,nmem,ncgmem,nx86mem,ni86mem;
 
     type
-       ti386addrnode = class(tcgaddrnode)
+       ti386addrnode = class(ti86addrnode)
+         protected
+          procedure set_absvarsym_resultdef; override;
+         public
           procedure pass_generate_code;override;
        end;
 
@@ -44,7 +47,7 @@ implementation
     uses
       systems,
       cutils,verbose,
-      symdef,paramgr,
+      symconst,symdef,symcpu,paramgr,
       aasmtai,aasmdata,
       nld,ncon,nadd,
       cgutils,cgobj;
@@ -53,8 +56,16 @@ implementation
                              TI386ADDRNODE
 *****************************************************************************}
 
-    procedure ti386addrnode.pass_generate_code;
+    procedure ti386addrnode.set_absvarsym_resultdef;
+      begin
+        if not(nf_typedaddr in flags) then
+          resultdef:=voidnearfspointertype
+        else
+          resultdef:=tcpupointerdefclass(cpointerdef).createx86(left.resultdef,x86pt_near_fs);
+      end;
 
+
+    procedure ti386addrnode.pass_generate_code;
       begin
         inherited pass_generate_code;
         { for use of other segments, not used }

+ 293 - 81
compiler/i386/popt386.pas

@@ -23,6 +23,8 @@ unit popt386;
 
 {$i fpcdefs.inc}
 
+{ $define DEBUG_AOPTCPU}
+
 interface
 
 uses Aasmbase,aasmtai,aasmdata,aasmcpu,verbose;
@@ -35,7 +37,7 @@ procedure PostPeepHoleOpts(asml: TAsmList; BlockStart, BlockEnd: tai);
 implementation
 
 uses
-  globtype,systems,
+  cutils,globtype,systems,
   globals,cgbase,procinfo,
   symsym,
 {$ifdef finaldestdebug}
@@ -318,7 +320,7 @@ begin
                          12: begin
                             {imul 12, reg1, reg2 to
                                lea (,reg1,4), reg2
-                               lea (,reg1,8) reg2
+                               lea (reg2,reg1,8), reg2
                              imul 12, reg1 to
                                lea (reg1,reg1,2), reg1
                                lea (,reg1,4), reg1}
@@ -440,9 +442,101 @@ begin
 end;
 
 
+function MatchInstruction(const instr: tai; const op: TAsmOp; const opsize: topsizes): boolean;
+  begin
+    result :=
+      (instr.typ = ait_instruction) and
+      (taicpu(instr).opcode = op) and
+      ((opsize = []) or (taicpu(instr).opsize in opsize));
+  end;
+
+
+function MatchInstruction(const instr: tai; const op1,op2: TAsmOp; const opsize: topsizes): boolean;
+  begin
+    result :=
+      (instr.typ = ait_instruction) and
+      ((taicpu(instr).opcode = op1) or
+       (taicpu(instr).opcode = op2)
+      ) and
+      ((opsize = []) or (taicpu(instr).opsize in opsize));
+  end;
+
+
+function MatchInstruction(const instr: tai; const op1,op2,op3: TAsmOp; const opsize: topsizes): boolean;
+  begin
+    result :=
+      (instr.typ = ait_instruction) and
+      ((taicpu(instr).opcode = op1) or
+       (taicpu(instr).opcode = op2) or
+       (taicpu(instr).opcode = op3)
+      ) and
+      ((opsize = []) or (taicpu(instr).opsize in opsize));
+  end;
+
+
+function MatchOperand(const oper: TOper; const reg: TRegister): boolean; inline;
+  begin
+    result := (oper.typ = top_reg) and (oper.reg = reg);
+  end;
+
+
+function MatchOperand(const oper: TOper; const a: tcgint): boolean; inline;
+  begin
+    result := (oper.typ = top_const) and (oper.val = a);
+  end;
+
+
+function MatchOperand(const oper1: TOper; const oper2: TOper): boolean; inline;
+  begin
+    result := oper1.typ = oper2.typ;
+
+    if result then
+      case oper1.typ of
+        top_const:
+          Result:=oper1.val = oper2.val;
+        top_reg:
+          Result:=oper1.reg = oper2.reg;
+        top_ref:
+          Result:=RefsEqual(oper1.ref^, oper2.ref^);
+        else
+          internalerror(2013102801);
+      end
+  end;
+
+
+function MatchReference(const ref : treference;base,index : TRegister) : Boolean;
+  begin
+   Result:=(ref.offset=0) and
+     (ref.scalefactor in [0,1]) and
+     (ref.segment=NR_NO) and
+     (ref.symbol=nil) and
+     (ref.relsymbol=nil) and
+     ((base=NR_INVALID) or
+      (ref.base=base)) and
+     ((index=NR_INVALID) or
+      (ref.index=index));
+  end;
+
 
+{ First pass of peephole optimizations }
 procedure PeepHoleOptPass1(Asml: TAsmList; BlockStart, BlockEnd: tai);
-{First pass of peepholeoptimizations}
+
+{$ifdef DEBUG_AOPTCPU}
+  procedure DebugMsg(const s: string;p : tai);
+    begin
+      asml.insertbefore(tai_comment.Create(strpnew(s)), p);
+    end;
+{$else DEBUG_AOPTCPU}
+  procedure DebugMsg(const s: string;p : tai);inline;
+    begin
+    end;
+{$endif DEBUG_AOPTCPU}
+
+function WriteOk : Boolean;
+  begin
+    writeln('Ok');
+    Result:=True;
+  end;
 
 var
   l : longint;
@@ -626,6 +720,7 @@ begin
       case p.Typ Of
         ait_instruction:
           begin
+            current_filepos:=taicpu(p).fileinfo;
             if InsContainsSegRef(taicpu(p)) then
               begin
                 p := tai(p.next);
@@ -758,6 +853,8 @@ begin
                         S_B: v:=$80;
                         S_W: v:=$8000;
                         S_L: v:=aint($80000000);
+                        else
+                          internalerror(2013112905);
                       end;
                       if (taicpu(p).oper[0]^.typ=Top_const) and
                          (taicpu(p).oper[0]^.val=v) and
@@ -938,47 +1035,89 @@ begin
                          (getsupreg(taicpu(p).oper[0]^.ref^.base) in [RS_EAX..RS_ESP]) and
                          (taicpu(p).oper[0]^.ref^.index = NR_NO) and
                          (not(Assigned(taicpu(p).oper[0]^.ref^.Symbol))) then
-                        if (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) and
-                           (taicpu(p).oper[0]^.ref^.offset = 0) then
-                          begin
-                            hp1 := taicpu.op_reg_reg(A_MOV, S_L,taicpu(p).oper[0]^.ref^.base,
-                              taicpu(p).oper[1]^.reg);
-                            InsertLLItem(asml,p.previous,p.next, hp1);
-                            p.free;
-                            p := hp1;
-                            continue;
-                          end
-                        else if (taicpu(p).oper[0]^.ref^.offset = 0) then
-                          begin
-                            hp1 := tai(p.Next);
-                            asml.remove(p);
-                            p.free;
-                            p := hp1;
-                            continue;
-                          end
-                        else
-                          with taicpu(p).oper[0]^.ref^ do
-                            if (base = taicpu(p).oper[1]^.reg) then
-                              begin
-                                l := offset;
-                                if (l=1) and UseIncDec then
-                                  begin
-                                    taicpu(p).opcode := A_INC;
-                                    taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
-                                    taicpu(p).ops := 1
-                                  end
-                                else if (l=-1) and UseIncDec then
-                                  begin
-                                    taicpu(p).opcode := A_DEC;
-                                    taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
-                                    taicpu(p).ops := 1;
-                                  end
-                                else
-                                  begin
-                                    taicpu(p).opcode := A_ADD;
-                                    taicpu(p).loadConst(0,l);
-                                  end;
-                              end;
+                        begin
+                          if (taicpu(p).oper[0]^.ref^.base <> taicpu(p).oper[1]^.reg) and
+                             (taicpu(p).oper[0]^.ref^.offset = 0) then
+                            begin
+                              hp1 := taicpu.op_reg_reg(A_MOV, S_L,taicpu(p).oper[0]^.ref^.base,
+                                taicpu(p).oper[1]^.reg);
+                              InsertLLItem(asml,p.previous,p.next, hp1);
+                              p.free;
+                              p := hp1;
+                              continue;
+                            end
+                          else if (taicpu(p).oper[0]^.ref^.offset = 0) then
+                            begin
+                              hp1 := tai(p.Next);
+                              asml.remove(p);
+                              p.free;
+                              p := hp1;
+                              continue;
+                            end
+                          { continue to use lea to adjust the stack pointer,
+                            it is the recommended way, but only if not optimizing for size }
+                          else if (taicpu(p).oper[1]^.reg<>NR_STACK_POINTER_REG) or
+                            (cs_opt_size in current_settings.optimizerswitches) then
+                            with taicpu(p).oper[0]^.ref^ do
+                              if (base = taicpu(p).oper[1]^.reg) then
+                                begin
+                                  l := offset;
+                                  if (l=1) and UseIncDec then
+                                    begin
+                                      taicpu(p).opcode := A_INC;
+                                      taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
+                                      taicpu(p).ops := 1
+                                    end
+                                  else if (l=-1) and UseIncDec then
+                                    begin
+                                      taicpu(p).opcode := A_DEC;
+                                      taicpu(p).loadreg(0,taicpu(p).oper[1]^.reg);
+                                      taicpu(p).ops := 1;
+                                    end
+                                  else
+                                    begin
+                                      if (l<0) and (l<>-2147483648) then
+                                        begin
+                                          taicpu(p).opcode := A_SUB;
+                                          taicpu(p).loadConst(0,-l);
+                                        end
+                                      else
+                                        begin
+                                          taicpu(p).opcode := A_ADD;
+                                          taicpu(p).loadConst(0,l);
+                                        end;
+                                    end;
+                                end;
+                        end
+(*
+                      This is unsafe, lea doesn't modify the flags but "add"
+                      does. This breaks webtbs/tw15694.pp. The above
+                      transformations are also unsafe, but they don't seem to
+                      be triggered by code that FPC generators (or that at
+                      least does not occur in the tests...). This needs to be
+                      fixed by checking for the liveness of the flags register.
+
+                      else if MatchReference(taicpu(p).oper[0]^.ref^,taicpu(p).oper[1]^.reg,NR_INVALID) then
+                        begin
+                          hp1:=taicpu.op_reg_reg(A_ADD,S_L,taicpu(p).oper[0]^.ref^.index,
+                            taicpu(p).oper[0]^.ref^.base);
+                          InsertLLItem(asml,p.previous,p.next, hp1);
+                          DebugMsg('Peephole Lea2AddBase done',hp1);
+                          p.free;
+                          p:=hp1;
+                          continue;
+                        end
+                      else if MatchReference(taicpu(p).oper[0]^.ref^,NR_INVALID,taicpu(p).oper[1]^.reg) then
+                        begin
+                          hp1:=taicpu.op_reg_reg(A_ADD,S_L,taicpu(p).oper[0]^.ref^.base,
+                            taicpu(p).oper[0]^.ref^.index);
+                          InsertLLItem(asml,p.previous,p.next,hp1);
+                          DebugMsg('Peephole Lea2AddIndex done',hp1);
+                          p.free;
+                          p:=hp1;
+                          continue;
+                        end
+*)
                     end;
                   A_MOV:
                     begin
@@ -1103,9 +1242,7 @@ begin
                                 end;
                     { Next instruction is also a MOV ? }
                       if GetNextInstruction(p, hp1) and
-                         (tai(hp1).typ = ait_instruction) and
-                         (taicpu(hp1).opcode = A_MOV) and
-                         (taicpu(hp1).opsize = taicpu(p).opsize) then
+                        MatchInstruction(hp1,A_MOV,[taicpu(p).opsize]) then
                         begin
                           if (taicpu(hp1).oper[0]^.typ = taicpu(p).oper[1]^.typ) and
                              (taicpu(hp1).oper[1]^.typ = taicpu(p).oper[0]^.typ) then
@@ -1264,23 +1401,17 @@ begin
                                   taicpu(hp1).loadReg(0,taicpu(hp1).oper[1]^.reg);
                                   taicpu(hp1).loadRef(1,taicpu(p).oper[1]^.ref^);
                                   taicpu(p).loadReg(1,taicpu(hp1).oper[0]^.reg);
+                                  taicpu(hp1).fileinfo := taicpu(p).fileinfo;
                                 end
                         end;
                       if GetNextInstruction(p, hp1) and
-                         (Tai(hp1).typ = ait_instruction) and
-                         ((Taicpu(hp1).opcode = A_BTS) or (Taicpu(hp1).opcode = A_BTR)) and
-                         (Taicpu(hp1).opsize = Taicpu(p).opsize) and
+                         MatchInstruction(hp1,A_BTS,A_BTR,[Taicpu(p).opsize]) and
                          GetNextInstruction(hp1, hp2) and
-                         (Tai(hp2).typ = ait_instruction) and
-                         (Taicpu(hp2).opcode = A_OR) and
-                         (Taicpu(hp1).opsize = Taicpu(p).opsize) and
-                         (Taicpu(hp2).opsize = Taicpu(p).opsize) and
-                         (Taicpu(p).oper[0]^.typ = top_const) and (Taicpu(p).oper[0]^.val=0) and
+                         MatchInstruction(hp2,A_OR,[Taicpu(p).opsize]) and
+                         MatchOperand(Taicpu(p).oper[0]^,0) and
                          (Taicpu(p).oper[1]^.typ = top_reg) and
-                         (Taicpu(hp1).oper[1]^.typ = top_reg) and
-                         (Taicpu(p).oper[1]^.reg=Taicpu(hp1).oper[1]^.reg) and
-                         (Taicpu(hp2).oper[1]^.typ = top_reg) and
-                         (Taicpu(p).oper[1]^.reg=Taicpu(hp2).oper[1]^.reg) then
+                         MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp1).oper[1]^) and
+                         MatchOperand(Taicpu(p).oper[1]^,Taicpu(hp2).oper[1]^) then
                          {mov reg1,0
                           bts reg1,operand1             -->      mov reg1,operand2
                           or  reg1,operand2                      bts reg1,operand1}
@@ -1290,6 +1421,33 @@ begin
                           insertllitem(asml,hp2,hp2.next,hp1);
                           asml.remove(p);
                           p.free;
+                          p:=hp1;
+                        end;
+                      if GetNextInstruction(p, hp1) and
+                         MatchInstruction(hp1,A_LEA,[S_L]) and
+                         (Taicpu(p).oper[0]^.typ = top_ref) and
+                         (Taicpu(p).oper[1]^.typ = top_reg) and
+                         ((MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(hp1).oper[1]^.reg,Taicpu(p).oper[1]^.reg) and
+                           (Taicpu(hp1).oper[0]^.ref^.base<>Taicpu(p).oper[1]^.reg)
+                          ) or
+                          (MatchReference(Taicpu(hp1).oper[0]^.ref^,Taicpu(p).oper[1]^.reg,Taicpu(hp1).oper[1]^.reg) and
+                           (Taicpu(hp1).oper[0]^.ref^.index<>Taicpu(p).oper[1]^.reg)
+                          )
+                         ) then
+                         {mov reg1,ref
+                          lea reg2,[reg1,reg2]          -->      add reg2,ref}
+                        begin
+                          TmpUsedRegs := UsedRegs;
+                          { reg1 may not be used afterwards }
+                          if not(RegUsedAfterInstruction(taicpu(p).oper[1]^.reg, hp1, TmpUsedRegs)) then
+                            begin
+                              Taicpu(hp1).opcode:=A_ADD;
+                              Taicpu(hp1).oper[0]^.ref^:=Taicpu(p).oper[0]^.ref^;
+                              DebugMsg('Peephole MovLea2Add done',hp1);
+                              asml.remove(p);
+                              p.free;
+                              p:=hp1;
+                            end;
                         end;
                     end;
 
@@ -1302,8 +1460,7 @@ begin
                          IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) and
                          (getsupreg(taicpu(hp1).oper[0]^.reg) in [RS_EAX, RS_EBX, RS_ECX, RS_EDX]) and
                          GetNextInstruction(hp1,hp2) and
-                         (hp2.typ = ait_instruction) and
-                         (taicpu(hp2).opcode = A_MOV) and
+                         MatchInstruction(hp2,A_MOV,[]) and
                          (taicpu(hp2).oper[0]^.typ = top_reg) and
                          OpsEqual(taicpu(hp2).oper[1]^,taicpu(p).oper[0]^) and
                          (((taicpu(hp1).ops=2) and
@@ -1791,6 +1948,17 @@ end;
 
 procedure PeepHoleOptPass2(asml: TAsmList; BlockStart, BlockEnd: tai);
 
+{$ifdef DEBUG_AOPTCPU}
+  procedure DebugMsg(const s: string;p : tai);
+    begin
+      asml.insertbefore(tai_comment.Create(strpnew(s)), p);
+    end;
+{$else DEBUG_AOPTCPU}
+  procedure DebugMsg(const s: string;p : tai);inline;
+    begin
+    end;
+{$endif DEBUG_AOPTCPU}
+
   function CanBeCMOV(p : tai) : boolean;
     begin
        CanBeCMOV:=assigned(p) and (p.typ=ait_instruction) and
@@ -1808,10 +1976,9 @@ procedure PeepHoleOptPass2(asml: TAsmList; BlockStart, BlockEnd: tai);
     end;
 
 var
-  p,hp1,hp2: tai;
+  p,hp1,hp2,hp3: tai;
   l : longint;
   condition : tasmcond;
-  hp3: tai;
   UsedRegs, TmpUsedRegs: TRegSet;
   carryadd_opcode: Tasmop;
 
@@ -2064,12 +2231,19 @@ begin
                   else if (taicpu(p).oper[0]^.typ = top_ref) and
                      GetNextInstruction(p,hp1) and
                      (hp1.typ = ait_instruction) and
-                     IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) and
+                     (IsFoldableArithOp(taicpu(hp1),taicpu(p).oper[1]^.reg) or
+                      ((taicpu(hp1).opcode=A_LEA) and
+                       (taicpu(hp1).oper[1]^.reg = taicpu(p).oper[1]^.reg) and
+                       ((MatchReference(taicpu(hp1).oper[0]^.ref^,taicpu(p).oper[1]^.reg,NR_INVALID) and
+                        (taicpu(hp1).oper[0]^.ref^.index<>taicpu(p).oper[1]^.reg)) or
+                        (MatchReference(taicpu(hp1).oper[0]^.ref^,NR_INVALID,taicpu(p).oper[1]^.reg) and
+                        (taicpu(hp1).oper[0]^.ref^.base<>taicpu(p).oper[1]^.reg))
+                       )
+                      )
+                     ) and
                      GetNextInstruction(hp1,hp2) and
-                     (hp2.typ = ait_instruction) and
-                     (taicpu(hp2).opcode = A_MOV) and
-                     (taicpu(hp2).oper[0]^.typ = top_reg) and
-                     (taicpu(hp2).oper[0]^.reg = taicpu(p).oper[1]^.reg) and
+                     MatchInstruction(hp2,A_MOV,[]) and
+                     MatchOperand(taicpu(p).oper[1]^,taicpu(hp2).oper[0]^) and
                      (taicpu(hp2).oper[1]^.typ = top_ref) then
                     begin
                       TmpUsedRegs := UsedRegs;
@@ -2085,7 +2259,17 @@ begin
                         begin
                           case taicpu(hp1).opcode of
                             A_INC,A_DEC:
-                              taicpu(hp1).loadRef(0,taicpu(p).oper[0]^.ref^)
+                              taicpu(hp1).loadRef(0,taicpu(p).oper[0]^.ref^);
+                            A_LEA:
+                              begin
+                                taicpu(hp1).opcode:=A_ADD;
+                                if taicpu(hp1).oper[0]^.ref^.index<>taicpu(p).oper[1]^.reg then
+                                  taicpu(hp1).loadreg(0,taicpu(hp1).oper[0]^.ref^.index)
+                                else
+                                  taicpu(hp1).loadreg(0,taicpu(hp1).oper[0]^.ref^.base);
+                                taicpu(hp1).loadRef(1,taicpu(p).oper[0]^.ref^);
+                                DebugMsg('Peephole FoldLea done',hp1);
+                              end
                             else
                               taicpu(hp1).loadRef(1,taicpu(p).oper[0]^.ref^);
                           end;
@@ -2108,6 +2292,7 @@ end;
 procedure PostPeepHoleOpts(asml: TAsmList; BlockStart, BlockEnd: tai);
 var
   p,hp1,hp2: tai;
+  IsTestConstX: boolean;
 begin
   p := BlockStart;
   while (p <> BlockEnd) Do
@@ -2213,22 +2398,22 @@ See test/tgadint64 in the test suite.
               A_TEST, A_OR:
                 {removes the line marked with (x) from the sequence
                  and/or/xor/add/sub/... $x, %y
-                 test/or %y, %y   (x)
+                 test/or %y, %y  | test $-1, %y    (x)
                  j(n)z _Label
-                    as the first instruction already adjusts the ZF}
+                    as the first instruction already adjusts the ZF
+                    %y operand may also be a reference }
                  begin
-                   if OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) then
-                    if GetLastInstruction(p, hp1) and
+                   IsTestConstX:=(taicpu(p).opcode=A_TEST) and
+                     MatchOperand(taicpu(p).oper[0]^,-1);
+                   if (OpsEqual(taicpu(p).oper[0]^,taicpu(p).oper[1]^) or IsTestConstX) and
+                      GetLastInstruction(p, hp1) and
                       (tai(hp1).typ = ait_instruction) and
                       GetNextInstruction(p,hp2) and
-                      (hp2.typ = ait_instruction) and
-                      ((taicpu(hp2).opcode = A_SETcc) or
-                       (taicpu(hp2).opcode = A_Jcc) or
-                       (taicpu(hp2).opcode = A_CMOVcc)) then
+                      MatchInstruction(hp2,A_SETcc,A_Jcc,A_CMOVcc,[]) then
                      case taicpu(hp1).opcode Of
-                       A_ADD, A_SUB, A_OR, A_XOR, A_AND{, A_SHL, A_SHR}:
+                       A_ADD, A_SUB, A_OR, A_XOR, A_AND:
                          begin
-                           if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[0]^) and
+                           if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
                              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
                              { and in case of carry for A(E)/B(E)/C/NC                  }
                               ((taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) or
@@ -2242,9 +2427,28 @@ See test/tgadint64 in the test suite.
                                continue
                              end;
                          end;
+                       A_SHL, A_SAL, A_SHR, A_SAR:
+                         begin
+                           if OpsEqual(taicpu(hp1).oper[1]^,taicpu(p).oper[1]^) and
+                             { SHL/SAL/SHR/SAR with a value of 0 do not change the flags }
+                             { therefore, it's only safe to do this optimization for     }
+                             { shifts by a (nonzero) constant                            }
+                              (taicpu(hp1).oper[0]^.typ = top_const) and
+                              (taicpu(hp1).oper[0]^.val <> 0) and
+                             { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
+                             { and in case of carry for A(E)/B(E)/C/NC                  }
+                              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
+                             begin
+                               hp1 := tai(p.next);
+                               asml.remove(p);
+                               p.free;
+                               p := tai(hp1);
+                               continue
+                             end;
+                         end;
                        A_DEC, A_INC, A_NEG:
                          begin
-                           if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[0]^) and
+                           if OpsEqual(taicpu(hp1).oper[0]^,taicpu(p).oper[1]^) and
                              { does not work in case of overflow for G(E)/L(E)/C_O/C_NO }
                              { and in case of carry for A(E)/B(E)/C/NC                  }
                              (taicpu(hp2).condition in [C_Z,C_NZ,C_E,C_NE]) then
@@ -2269,7 +2473,15 @@ See test/tgadint64 in the test suite.
                                continue
                              end;
                          end
-                     end
+                     else
+                       { change "test  $-1,%reg" into "test %reg,%reg" }
+                       if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
+                         taicpu(p).loadoper(0,taicpu(p).oper[1]^);
+                     end { case }
+                   else
+                     { change "test  $-1,%reg" into "test %reg,%reg" }
+                     if IsTestConstX and (taicpu(p).oper[1]^.typ=top_reg) then
+                       taicpu(p).loadoper(0,taicpu(p).oper[1]^);
                  end;
             end;
           end;

+ 8 - 8
compiler/i386/r386ari.inc

@@ -8,22 +8,22 @@
 15,
 6,
 5,
-38,
 39,
 40,
 41,
+42,
 26,
 7,
 10,
 19,
 9,
-32,
 33,
 34,
 35,
 36,
 37,
-27,
+38,
+28,
 11,
 4,
 22,
@@ -31,13 +31,13 @@
 8,
 20,
 12,
+32,
 25,
-28,
 18,
 24,
-47,
-30,
 31,
+29,
+30,
 57,
 58,
 59,
@@ -48,7 +48,7 @@
 64,
 17,
 23,
-29,
+27,
 56,
 48,
 49,
@@ -58,11 +58,11 @@
 53,
 54,
 55,
-42,
 43,
 44,
 45,
 46,
+47,
 65,
 66,
 67,

+ 4 - 4
compiler/i386/r386att.inc

@@ -24,13 +24,14 @@
 '%ebp',
 '%sp',
 '%esp',
-'%eip',
-'%cs',
-'%ds',
 '%es',
+'%cs',
 '%ss',
+'%ds',
 '%fs',
 '%gs',
+'%flags',
+'%eip',
 '%dr0',
 '%dr1',
 '%dr2',
@@ -46,7 +47,6 @@
 '%tr5',
 '%tr6',
 '%tr7',
-'%flags',
 '%st(0)',
 '%st(1)',
 '%st(2)',

+ 22 - 22
compiler/i386/r386con.inc

@@ -24,29 +24,29 @@ NR_BP = tregister($01030006);
 NR_EBP = tregister($01040006);
 NR_SP = tregister($01030007);
 NR_ESP = tregister($01040007);
-NR_EIP = tregister($05040000);
+NR_ES = tregister($05000000);
 NR_CS = tregister($05000001);
-NR_DS = tregister($05000002);
-NR_ES = tregister($05000003);
-NR_SS = tregister($05000004);
-NR_FS = tregister($05000005);
-NR_GS = tregister($05000006);
-NR_DR0 = tregister($05000007);
-NR_DR1 = tregister($05000008);
-NR_DR2 = tregister($05000009);
-NR_DR3 = tregister($0500000a);
-NR_DR6 = tregister($0500000b);
-NR_DR7 = tregister($0500000c);
-NR_CR0 = tregister($0500000d);
-NR_CR2 = tregister($0500000e);
-NR_CR3 = tregister($0500000f);
-NR_CR4 = tregister($05000010);
-NR_TR3 = tregister($05000011);
-NR_TR4 = tregister($05000012);
-NR_TR5 = tregister($05000013);
-NR_TR6 = tregister($05000014);
-NR_TR7 = tregister($05000015);
-NR_FLAGS = tregister($05000016);
+NR_SS = tregister($05000002);
+NR_DS = tregister($05000003);
+NR_FS = tregister($05000004);
+NR_GS = tregister($05000005);
+NR_FLAGS = tregister($05000006);
+NR_EIP = tregister($05040007);
+NR_DR0 = tregister($05000008);
+NR_DR1 = tregister($05000009);
+NR_DR2 = tregister($0500000a);
+NR_DR3 = tregister($0500000b);
+NR_DR6 = tregister($0500000d);
+NR_DR7 = tregister($0500000e);
+NR_CR0 = tregister($05000010);
+NR_CR2 = tregister($05000012);
+NR_CR3 = tregister($05000013);
+NR_CR4 = tregister($05000014);
+NR_TR3 = tregister($0500001b);
+NR_TR4 = tregister($0500001c);
+NR_TR5 = tregister($0500001d);
+NR_TR6 = tregister($0500001e);
+NR_TR7 = tregister($0500001f);
 NR_ST0 = tregister($02000000);
 NR_ST1 = tregister($02000001);
 NR_ST2 = tregister($02000002);

+ 1 - 1
compiler/i386/r386dwrf.inc

@@ -24,7 +24,6 @@
 5,
 4,
 4,
-8,
 -1,
 -1,
 -1,
@@ -32,6 +31,7 @@
 -1,
 -1,
 -1,
+8,
 -1,
 -1,
 -1,

+ 4 - 4
compiler/i386/r386int.inc

@@ -24,13 +24,14 @@
 'ebp',
 'sp',
 'esp',
-'eip',
-'cs',
-'ds',
 'es',
+'cs',
 'ss',
+'ds',
 'fs',
 'gs',
+'flags',
+'eip',
 'dr0',
 'dr1',
 'dr2',
@@ -46,7 +47,6 @@
 'tr5',
 'tr6',
 'tr7',
-'flags',
 'st(0)',
 'st(1)',
 'st(2)',

+ 8 - 8
compiler/i386/r386iri.inc

@@ -9,22 +9,22 @@
 15,
 6,
 5,
-38,
 39,
 40,
 41,
+42,
 26,
 7,
 10,
 19,
 9,
-32,
 33,
 34,
 35,
 36,
 37,
-27,
+38,
+28,
 11,
 4,
 22,
@@ -32,13 +32,13 @@
 8,
 20,
 12,
+32,
 25,
-28,
 18,
 24,
-47,
-30,
 31,
+29,
+30,
 57,
 58,
 59,
@@ -49,7 +49,7 @@
 64,
 17,
 23,
-29,
+27,
 56,
 48,
 49,
@@ -59,11 +59,11 @@
 53,
 54,
 55,
-42,
 43,
 44,
 45,
 46,
+47,
 65,
 66,
 67,

+ 4 - 4
compiler/i386/r386nasm.inc

@@ -24,13 +24,14 @@
 'ebp',
 'sp',
 'esp',
-'eip',
-'cs',
-'ds',
 'es',
+'cs',
 'ss',
+'ds',
 'fs',
 'gs',
+'flags',
+'eip',
 'dr0',
 'dr1',
 'dr2',
@@ -46,7 +47,6 @@
 'tr5',
 'tr6',
 'tr7',
-'flags',
 'st0',
 'st1',
 'st2',

+ 8 - 8
compiler/i386/r386nri.inc

@@ -9,22 +9,22 @@
 15,
 6,
 5,
-38,
 39,
 40,
 41,
+42,
 26,
 7,
 10,
 19,
 9,
-32,
 33,
 34,
 35,
 36,
 37,
-27,
+38,
+28,
 11,
 4,
 22,
@@ -32,13 +32,13 @@
 8,
 20,
 12,
+32,
 25,
-28,
 18,
 24,
-47,
-30,
 31,
+29,
+30,
 57,
 58,
 59,
@@ -49,7 +49,7 @@
 64,
 17,
 23,
-29,
+27,
 56,
 48,
 49,
@@ -59,11 +59,11 @@
 53,
 54,
 55,
-42,
 43,
 44,
 45,
 46,
+47,
 65,
 66,
 67,

+ 7 - 7
compiler/i386/r386num.inc

@@ -24,29 +24,29 @@ tregister($01030006),
 tregister($01040006),
 tregister($01030007),
 tregister($01040007),
-tregister($05040000),
+tregister($05000000),
 tregister($05000001),
 tregister($05000002),
 tregister($05000003),
 tregister($05000004),
 tregister($05000005),
 tregister($05000006),
-tregister($05000007),
+tregister($05040007),
 tregister($05000008),
 tregister($05000009),
 tregister($0500000a),
 tregister($0500000b),
-tregister($0500000c),
 tregister($0500000d),
 tregister($0500000e),
-tregister($0500000f),
 tregister($05000010),
-tregister($05000011),
 tregister($05000012),
 tregister($05000013),
 tregister($05000014),
-tregister($05000015),
-tregister($05000016),
+tregister($0500001b),
+tregister($0500001c),
+tregister($0500001d),
+tregister($0500001e),
+tregister($0500001f),
 tregister($02000000),
 tregister($02000001),
 tregister($02000002),

+ 0 - 82
compiler/i386/r386op.inc

@@ -1,82 +0,0 @@
-{ don't edit, this file is generated from x86reg.dat }
-0,
-0,
-4,
-0,
-0,
-1,
-5,
-1,
-1,
-2,
-6,
-2,
-2,
-3,
-7,
-3,
-3,
-6,
-6,
-7,
-7,
-5,
-5,
-4,
-4,
-0,
-1,
-3,
-0,
-2,
-4,
-5,
-0,
-1,
-2,
-3,
-6,
-7,
-0,
-2,
-3,
-4,
-3,
-4,
-5,
-6,
-7,
-0,
-0,
-1,
-2,
-3,
-4,
-5,
-6,
-7,
-0,
-0,
-1,
-2,
-3,
-4,
-5,
-6,
-7,
-0,
-1,
-2,
-3,
-4,
-5,
-6,
-7,
-0,
-1,
-2,
-3,
-4,
-5,
-6,
-7

+ 3 - 3
compiler/i386/r386ot.inc

@@ -24,13 +24,14 @@ OT_REG16,
 OT_REG32,
 OT_REG16,
 OT_REG32,
-OT_NONE,
-OT_REG_CS,
 OT_REG_DESS,
+OT_REG_CS,
 OT_REG_DESS,
 OT_REG_DESS,
 OT_REG_FSGS,
 OT_REG_FSGS,
+OT_NONE,
+OT_NONE,
 OT_REG_DREG,
 OT_REG_DREG,
 OT_REG_DREG,
@@ -46,7 +47,6 @@ OT_REG_TREG,
 OT_REG_TREG,
 OT_REG_TREG,
 OT_REG_TREG,
-OT_NONE,
 OT_FPU0,
 OT_FPUREG,
 OT_FPUREG,

+ 2 - 2
compiler/i386/r386rni.inc

@@ -57,13 +57,13 @@
 78,
 79,
 80,
+25,
 26,
 27,
 28,
 29,
 30,
 31,
-32,
 33,
 34,
 35,
@@ -79,4 +79,4 @@
 45,
 46,
 47,
-25
+32

+ 8 - 8
compiler/i386/r386sri.inc

@@ -9,22 +9,22 @@
 15,
 6,
 5,
-38,
 39,
 40,
 41,
+42,
 26,
 7,
 10,
 19,
 9,
-32,
 33,
 34,
 35,
 36,
 37,
-27,
+38,
+28,
 11,
 4,
 22,
@@ -32,13 +32,13 @@
 8,
 20,
 12,
+32,
 25,
-28,
 18,
 24,
-47,
-30,
 31,
+29,
+30,
 57,
 58,
 59,
@@ -49,7 +49,7 @@
 64,
 17,
 23,
-29,
+27,
 56,
 48,
 49,
@@ -59,11 +59,11 @@
 53,
 54,
 55,
-42,
 43,
 44,
 45,
 46,
+47,
 65,
 66,
 67,

+ 4 - 4
compiler/i386/r386std.inc

@@ -24,13 +24,14 @@
 'ebp',
 'sp',
 'esp',
-'eip',
-'cs',
-'ds',
 'es',
+'cs',
 'ss',
+'ds',
 'fs',
 'gs',
+'flags',
+'eip',
 'dr0',
 'dr1',
 'dr2',
@@ -46,7 +47,6 @@
 'tr5',
 'tr6',
 'tr7',
-'flags',
 'st(0)',
 'st(1)',
 'st(2)',

+ 1 - 0
compiler/i386/rropt386.pas

@@ -206,6 +206,7 @@ begin
   sequenceEnd := false;
   reg1Modified := false;
   reg2Modified := false;
+  switchLast := false;
   endP := start;
   while tmpResult and not sequenceEnd do
     begin

+ 211 - 0
compiler/i386/symcpu.pas

@@ -0,0 +1,211 @@
+{
+    Copyright (c) 2014 by Florian Klaempfl
+
+    Symbol table overrides for i386
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+ ****************************************************************************
+}
+unit symcpu;
+
+{$i fpcdefs.inc}
+
+interface
+
+uses
+  symtype,symdef,symsym,symx86,symi86;
+
+type
+  { defs }
+  tcpufiledef = class(tfiledef)
+  end;
+  tcpufiledefclass = class of tcpufiledef;
+
+  tcpuvariantdef = class(tvariantdef)
+  end;
+  tcpuvariantdefclass = class of tcpuvariantdef;
+
+  tcpuformaldef = class(tformaldef)
+  end;
+  tcpuformaldefclass = class of tcpuformaldef;
+
+  tcpuforwarddef = class(tforwarddef)
+  end;
+  tcpuforwarddefclass = class of tcpuforwarddef;
+
+  tcpuundefineddef = class(tundefineddef)
+  end;
+  tcpuundefineddefclass = class of tcpuundefineddef;
+
+  tcpuerrordef = class(terrordef)
+  end;
+  tcpuerrordefclass = class of tcpuerrordef;
+
+  tcpupointerdef = class(tx86pointerdef)
+  end;
+  tcpupointerdefclass = class of tcpupointerdef;
+
+  tcpurecorddef = class(trecorddef)
+  end;
+  tcpurecorddefclass = class of tcpurecorddef;
+
+  tcpuimplementedinterface = class(timplementedinterface)
+  end;
+  tcpuimplementedinterfaceclass = class of tcpuimplementedinterface;
+
+  tcpuobjectdef = class(tobjectdef)
+  end;
+  tcpuobjectdefclass = class of tcpuobjectdef;
+
+  tcpuclassrefdef = class(tclassrefdef)
+  end;
+  tcpuclassrefdefclass = class of tcpuclassrefdef;
+
+  tcpuarraydef = class(tarraydef)
+  end;
+  tcpuarraydefclass = class of tcpuarraydef;
+
+  tcpuorddef = class(torddef)
+  end;
+  tcpuorddefclass = class of tcpuorddef;
+
+  tcpufloatdef = class(tfloatdef)
+  end;
+  tcpufloatdefclass = class of tcpufloatdef;
+
+  tcpuprocvardef = class(ti86procvardef)
+  end;
+  tcpuprocvardefclass = class of tcpuprocvardef;
+
+  tcpuprocdef = class(ti86procdef)
+  end;
+  tcpuprocdefclass = class of tcpuprocdef;
+
+  tcpustringdef = class(tstringdef)
+  end;
+  tcpustringdefclass = class of tcpustringdef;
+
+  tcpuenumdef = class(tenumdef)
+  end;
+  tcpuenumdefclass = class of tcpuenumdef;
+
+  tcpusetdef = class(tsetdef)
+  end;
+  tcpusetdefclass = class of tcpusetdef;
+
+  { syms }
+  tcpulabelsym = class(tlabelsym)
+  end;
+  tcpulabelsymclass = class of tcpulabelsym;
+
+  tcpuunitsym = class(tunitsym)
+  end;
+  tcpuunitsymclass = class of tcpuunitsym;
+
+  tcpunamespacesym = class(tnamespacesym)
+  end;
+  tcpunamespacesymclass = class of tcpunamespacesym;
+
+  tcpuprocsym = class(tprocsym)
+  end;
+  tcpuprocsymclass = class of tcpuprocsym;
+
+  tcputypesym = class(ttypesym)
+  end;
+  tcpuypesymclass = class of tcputypesym;
+
+  tcpufieldvarsym = class(tfieldvarsym)
+  end;
+  tcpufieldvarsymclass = class of tcpufieldvarsym;
+
+  tcpulocalvarsym = class(tlocalvarsym)
+  end;
+  tcpulocalvarsymclass = class of tcpulocalvarsym;
+
+  tcpuparavarsym = class(tparavarsym)
+  end;
+  tcpuparavarsymclass = class of tcpuparavarsym;
+
+  tcpustaticvarsym = class(tstaticvarsym)
+  end;
+  tcpustaticvarsymclass = class of tcpustaticvarsym;
+
+  tcpuabsolutevarsym = class(ti86absolutevarsym)
+  end;
+  tcpuabsolutevarsymclass = class of tcpuabsolutevarsym;
+
+  tcpupropertysym = class(tpropertysym)
+  end;
+  tcpupropertysymclass = class of tcpupropertysym;
+
+  tcpuconstsym = class(tconstsym)
+  end;
+  tcpuconstsymclass = class of tcpuconstsym;
+
+  tcpuenumsym = class(tenumsym)
+  end;
+  tcpuenumsymclass = class of tcpuenumsym;
+
+  tcpusyssym = class(tsyssym)
+  end;
+  tcpusyssymclass = class of tcpusyssym;
+
+
+const
+  pbestrealtype : ^tdef = @s80floattype;
+
+
+implementation
+
+begin
+  { used tdef classes }
+  cfiledef:=tcpufiledef;
+  cvariantdef:=tcpuvariantdef;
+  cformaldef:=tcpuformaldef;
+  cforwarddef:=tcpuforwarddef;
+  cundefineddef:=tcpuundefineddef;
+  cerrordef:=tcpuerrordef;
+  cpointerdef:=tcpupointerdef;
+  crecorddef:=tcpurecorddef;
+  cimplementedinterface:=tcpuimplementedinterface;
+  cobjectdef:=tcpuobjectdef;
+  cclassrefdef:=tcpuclassrefdef;
+  carraydef:=tcpuarraydef;
+  corddef:=tcpuorddef;
+  cfloatdef:=tcpufloatdef;
+  cprocvardef:=tcpuprocvardef;
+  cprocdef:=tcpuprocdef;
+  cstringdef:=tcpustringdef;
+  cenumdef:=tcpuenumdef;
+  csetdef:=tcpusetdef;
+
+  { used tsym classes }
+  clabelsym:=tcpulabelsym;
+  cunitsym:=tcpuunitsym;
+  cnamespacesym:=tcpunamespacesym;
+  cprocsym:=tcpuprocsym;
+  ctypesym:=tcputypesym;
+  cfieldvarsym:=tcpufieldvarsym;
+  clocalvarsym:=tcpulocalvarsym;
+  cparavarsym:=tcpuparavarsym;
+  cstaticvarsym:=tcpustaticvarsym;
+  cabsolutevarsym:=tcpuabsolutevarsym;
+  cpropertysym:=tcpupropertysym;
+  cconstsym:=tcpuconstsym;
+  cenumsym:=tcpuenumsym;
+  csyssym:=tcpusyssym;
+end.
+