Browse Source

merge with trunk

git-svn-id: branches/tg74/avx2@25604 -
tg74 12 years ago
parent
commit
d0344d7d0e

+ 175 - 76
compiler/x86/aasmcpu.pas

@@ -181,11 +181,13 @@ interface
       OT_UNITY     = OT_IMMEDIATE or OT_ONENESS;  { for shift/rotate instructions  }
 
       { Size of the instruction table converted by nasmconv.pas }
-{$ifdef x86_64}
+{$if defined(x86_64)}
       instabentries = {$i x8664nop.inc}
-{$else x86_64}
+{$elseif defined(i386)}
       instabentries = {$i i386nop.inc}
-{$endif x86_64}
+{$elseif defined(i8086)}
+      instabentries = {$i i8086nop.inc}
+{$endif}
       maxinfolen    = 8;
       MaxInsChanges = 3; { Max things a instruction can change }
 
@@ -244,11 +246,13 @@ interface
 
 
       InsProp : array[tasmop] of TInsProp =
-{$ifdef x86_64}
+{$if defined(x86_64)}
         {$i x8664pro.inc}
-{$else x86_64}
+{$elseif defined(i386)}
         {$i i386prop.inc}
-{$endif x86_64}
+{$elseif defined(i8086)}
+        {$i i8086prop.inc}
+{$endif}
 
     type
       TOperandOrder = (op_intel,op_att);
@@ -292,7 +296,7 @@ interface
          constructor op_reg_reg_reg(op : tasmop;_size : topsize;_op1,_op2,_op3 : tregister);
          constructor op_const_reg_reg(op : tasmop;_size : topsize;_op1 : aint;_op2 : tregister;_op3 : tregister);
          constructor op_const_ref_reg(op : tasmop;_size : topsize;_op1 : aint;const _op2 : treference;_op3 : tregister);
-         constructor op_reg_reg_ref(op : tasmop;_size : topsize;_op1,_op2 : tregister; const _op3 : treference);
+         constructor op_ref_reg_reg(op : tasmop;_size : topsize;const _op1 : treference;_op2,_op3 : tregister);
          constructor op_const_reg_ref(op : tasmop;_size : topsize;_op1 : aint;_op2 : tregister;const _op3 : treference);
 
          { this is for Jmp instructions }
@@ -306,7 +310,21 @@ interface
          procedure changeopsize(siz:topsize);
 
          function  GetString:string;
-         procedure CheckNonCommutativeOpcodes;
+
+         { This is a workaround for the GAS non commutative fpu instruction braindamage.
+           Early versions of the UnixWare assembler had a bug where some fpu instructions
+           were reversed and GAS still keeps this "feature" for compatibility.
+           for details: http://sourceware.org/binutils/docs/as/i386_002dBugs.html#i386_002dBugs
+                        http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=372528
+                        http://en.wikibooks.org/wiki/X86_Assembly/GAS_Syntax#Caveats
+
+           Since FPC is "GAS centric" due to its history, it generates instructions with the same operand order, so
+           when generating output for other assemblers, the opcodes must be fixed before writing them.
+           This function returns the fixed opcode. Changing the opcode permanently is not a good idea
+           because with smartlinking the assembler output is generated twice, so the second run would
+           produce wrong assembler.
+           }
+         function FixNonCommutativeOpcodes: tasmop;
       private
          FOperandOrder : TOperandOrder;
          procedure init(_size : topsize); { this need to be called by all constructor }
@@ -357,7 +375,8 @@ implementation
        systems,
        procinfo,
        itcpugas,
-       symsym;
+       symsym,
+       cpuinfo;
 
 {*****************************************************************************
                               Instruction table
@@ -409,6 +428,8 @@ implementation
        IF_SSE42  = $00200000;
        IF_AVX    = $00200000;
        IF_SANDYBRIDGE = $00200000;
+       IF_BMI1 = $00200000;
+       IF_BMI2 = $00200000;
 
        IF_8086   = $00000000;  { 8086 instruction  }
        IF_186    = $01000000;  { 186+ instruction  }
@@ -438,16 +459,18 @@ implementation
        PInsTabMemRefSizeInfoCache=^TInsTabMemRefSizeInfoCache;
 
      const
-{$ifdef x86_64}
+{$if defined(x86_64)}
        InsTab:array[0..instabentries-1] of TInsEntry={$i x8664tab.inc}
-{$else x86_64}
+{$elseif defined(i386)}
        InsTab:array[0..instabentries-1] of TInsEntry={$i i386tab.inc}
-{$endif x86_64}
+{$elseif defined(i8086)}
+       InsTab:array[0..instabentries-1] of TInsEntry={$i i8086tab.inc}
+{$endif}
      var
        InsTabCache : PInsTabCache;
        InsTabMemRefSizeInfoCache: PInsTabMemRefSizeInfoCache;
      const
-{$ifdef x86_64}
+{$if defined(x86_64)}
        { Intel style operands ! }
        opsize_2_type:array[0..2,topsize] of longint=(
          (OT_NONE,
@@ -485,7 +508,7 @@ implementation
       reg_ot_table : array[tregisterindex] of longint = (
         {$i r8664ot.inc}
       );
-{$else x86_64}
+{$elseif defined(i386)}
        { Intel style operands ! }
        opsize_2_type:array[0..2,topsize] of longint=(
          (OT_NONE,
@@ -523,7 +546,45 @@ implementation
       reg_ot_table : array[tregisterindex] of longint = (
         {$i r386ot.inc}
       );
-{$endif x86_64}
+{$elseif defined(i8086)}
+       { Intel style operands ! }
+       opsize_2_type:array[0..2,topsize] of longint=(
+         (OT_NONE,
+          OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS16,OT_BITS32,OT_BITS32,
+          OT_BITS16,OT_BITS32,OT_BITS64,
+          OT_BITS32,OT_BITS64,OT_BITS80,OT_BITS64,OT_NONE,
+          OT_BITS64,
+          OT_NEAR,OT_FAR,OT_SHORT,
+          OT_NONE,
+          OT_BITS128,
+          OT_BITS256
+         ),
+         (OT_NONE,
+          OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,
+          OT_BITS16,OT_BITS32,OT_BITS64,
+          OT_BITS32,OT_BITS64,OT_BITS80,OT_BITS64,OT_NONE,
+          OT_BITS64,
+          OT_NEAR,OT_FAR,OT_SHORT,
+          OT_NONE,
+          OT_BITS128,
+          OT_BITS256
+         ),
+         (OT_NONE,
+          OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,
+          OT_BITS16,OT_BITS32,OT_BITS64,
+          OT_BITS32,OT_BITS64,OT_BITS80,OT_BITS64,OT_NONE,
+          OT_BITS64,
+          OT_NEAR,OT_FAR,OT_SHORT,
+          OT_NONE,
+          OT_BITS128,
+          OT_BITS256
+         )
+      );
+
+      reg_ot_table : array[tregisterindex] of longint = (
+        {$i r8086ot.inc}
+      );
+{$endif}
 
     function MemRefInfo(aAsmop: TAsmOp): TInsTabMemRefSizeInfoRec;
     begin
@@ -755,14 +816,14 @@ implementation
       end;
 
 
-    constructor taicpu.op_reg_reg_ref(op : tasmop;_size : topsize;_op1,_op2 : tregister;const _op3 : treference);
+    constructor taicpu.op_ref_reg_reg(op : tasmop;_size : topsize;const _op1 : treference;_op2,_op3 : tregister);  { three-operand form: reference, register, register }
       begin
          inherited create(op);
          init(_size);
          ops:=3;  { this instruction carries three operands }
-         loadreg(0,_op1);
+         loadref(0,_op1);  { operand 0 is the memory reference }
          loadreg(1,_op2);  { operand 1 is a register }
-         loadref(2,_op3);
+         loadreg(2,_op3);  { operand 2 is a register }
       end;
 
 
@@ -961,8 +1022,10 @@ implementation
       end;
 
 
-    procedure taicpu.CheckNonCommutativeOpcodes;
+    function taicpu.FixNonCommutativeOpcodes: tasmop;
       begin
+        result:=opcode;
+
         { we need ATT order }
         SetOperandOrder(op_att);
 
@@ -981,21 +1044,21 @@ implementation
            (ops=0) then
           begin
             if opcode=A_FSUBR then
-              opcode:=A_FSUB
+              result:=A_FSUB
             else if opcode=A_FSUB then
-              opcode:=A_FSUBR
+              result:=A_FSUBR
             else if opcode=A_FDIVR then
-              opcode:=A_FDIV
+              result:=A_FDIV
             else if opcode=A_FDIV then
-              opcode:=A_FDIVR
+              result:=A_FDIVR
             else if opcode=A_FSUBRP then
-              opcode:=A_FSUBP
+              result:=A_FSUBP
             else if opcode=A_FSUBP then
-              opcode:=A_FSUBRP
+              result:=A_FSUBRP
             else if opcode=A_FDIVRP then
-              opcode:=A_FDIVP
+              result:=A_FDIVP
             else if opcode=A_FDIVP then
-              opcode:=A_FDIVRP;
+              result:=A_FDIVRP;
           end;
         if (
             (ops=1) and
@@ -1005,13 +1068,13 @@ implementation
            ) then
          begin
            if opcode=A_FSUBRP then
-             opcode:=A_FSUBP
+             result:=A_FSUBP
            else if opcode=A_FSUBP then
-             opcode:=A_FSUBRP
+             result:=A_FSUBRP
            else if opcode=A_FDIVRP then
-             opcode:=A_FDIVP
+             result:=A_FDIVP
            else if opcode=A_FDIVP then
-             opcode:=A_FDIVRP;
+             result:=A_FDIVRP;
          end;
       end;
 
@@ -1492,15 +1555,19 @@ implementation
 
     function regval(r:Tregister):byte;
       const
-    {$ifdef x86_64}
+    {$if defined(x86_64)}
         opcode_table:array[tregisterindex] of tregisterindex = (
           {$i r8664op.inc}
         );
-    {$else x86_64}
+    {$elseif defined(i386)}
         opcode_table:array[tregisterindex] of tregisterindex = (
           {$i r386op.inc}
         );
-    {$endif x86_64}
+    {$elseif defined(i8086)}
+        opcode_table:array[tregisterindex] of tregisterindex = (
+          {$i r8086op.inc}
+        );
+    {$endif}
       var
         regidx : tregisterindex;
       begin
@@ -2039,7 +2106,7 @@ implementation
             else
               rex:=rex and $F7;
           end;
-        if not(exists_vex) then 
+        if not(exists_vex) then
         begin
           if rex<>0 then
             Inc(len);
@@ -2473,8 +2540,11 @@ implementation
             24,25,26 :     // 030..032
               begin
                 getvalsym(c-24);
+{$ifndef i8086}
+                { currval is an aint, so this cannot happen on i8086; the check would only cause a warning }
                 if (currval<-65536) or (currval>65535) then
                  Message2(asmw_e_value_exceeds_bounds,'word',tostr(currval));
+{$endif i8086}
                 if assigned(currsym) then
                  objdata_writereloc(currval,2,currsym,currabsreloc)
                 else
@@ -2807,7 +2877,9 @@ implementation
                  (oper[0]^.reg=oper[1]^.reg)
                 ) or
                 (((opcode=A_MOVSS) or (opcode=A_MOVSD) or (opcode=A_MOVQ) or
-                  (opcode=A_MOVAPS) or (OPCODE=A_MOVAPD)) and
+                  (opcode=A_MOVAPS) or (OPCODE=A_MOVAPD) or
+                  (opcode=A_VMOVSS) or (opcode=A_VMOVSD) or (opcode=A_VMOVQ) or
+                  (opcode=A_VMOVAPS) or (OPCODE=A_VMOVAPD)) and
                  (regtype = R_MMREGISTER) and
                  (ops=2) and
                  (oper[0]^.typ=top_reg) and
@@ -2862,8 +2934,11 @@ implementation
       begin
         { the information in the instruction table is made for the string copy
           operation MOVSD so hack here (FK)
+
+          VMOVSS and VMOVSD have two- and three-operand flavours; this cannot be modelled by x86ins.dat,
+          so fix it here (FK)
         }
-        if (opcode=A_MOVSD) and (ops=2) then
+        if ((opcode=A_MOVSD) or (opcode=A_VMOVSS) or (opcode=A_VMOVSD)) and (ops=2) then
           begin
             case opnr of
               0:
@@ -2880,23 +2955,44 @@ implementation
 
 
     function spilling_create_load(const ref:treference;r:tregister):Taicpu;
+      var
+        tmpref: treference;
       begin
         case getregtype(r) of
           R_INTREGISTER :
-            { we don't need special code here for 32 bit loads on x86_64, since
-              those will automatically zero-extend the upper 32 bits. }
-            result:=taicpu.op_ref_reg(A_MOV,reg2opsize(r),ref,r);
-          R_MMREGISTER :
-            case getsubreg(r) of
-              R_SUBMMD:
-                result:=taicpu.op_ref_reg(A_MOVSD,reg2opsize(r),ref,r);
-              R_SUBMMS:
-                result:=taicpu.op_ref_reg(A_MOVSS,reg2opsize(r),ref,r);
-              R_SUBMMWHOLE:
-                result:=taicpu.op_ref_reg(A_MOVQ,S_NO,ref,r);
-              else
-                internalerror(200506043);
+            begin
+              tmpref:=ref;
+              if getsubreg(r)=R_SUBH then
+                inc(tmpref.offset);
+              { we don't need special code here for 32 bit loads on x86_64, since
+                those will automatically zero-extend the upper 32 bits. }
+              result:=taicpu.op_ref_reg(A_MOV,reg2opsize(r),tmpref,r);
             end;
+          R_MMREGISTER :
+            if current_settings.fputype in fpu_avx_instructionsets then
+              case getsubreg(r) of
+                R_SUBMMD:
+                  result:=taicpu.op_ref_reg(A_VMOVSD,reg2opsize(r),ref,r);
+                R_SUBMMS:
+                  result:=taicpu.op_ref_reg(A_VMOVSS,reg2opsize(r),ref,r);
+                R_SUBQ,
+                R_SUBMMWHOLE:
+                  result:=taicpu.op_ref_reg(A_VMOVQ,S_NO,ref,r);
+                else
+                  internalerror(200506043);
+              end
+            else
+              case getsubreg(r) of
+                R_SUBMMD:
+                  result:=taicpu.op_ref_reg(A_MOVSD,reg2opsize(r),ref,r);
+                R_SUBMMS:
+                  result:=taicpu.op_ref_reg(A_MOVSS,reg2opsize(r),ref,r);
+                R_SUBQ,
+                R_SUBMMWHOLE:
+                  result:=taicpu.op_ref_reg(A_MOVQ,S_NO,ref,r);
+                else
+                  internalerror(200506043);
+              end;
           else
             internalerror(200401041);
         end;
@@ -2906,10 +3002,14 @@ implementation
     function spilling_create_store(r:tregister; const ref:treference):Taicpu;
       var
         size: topsize;
+        tmpref: treference;
       begin
         case getregtype(r) of
           R_INTREGISTER :
             begin
+              tmpref:=ref;
+              if getsubreg(r)=R_SUBH then
+                inc(tmpref.offset);
               size:=reg2opsize(r);
 {$ifdef x86_64}
               { even if it's a 32 bit reg, we still have to spill 64 bits
@@ -2920,19 +3020,33 @@ implementation
                   r:=newreg(getregtype(r),getsupreg(r),R_SUBWHOLE);
                 end;
 {$endif x86_64}
-              result:=taicpu.op_reg_ref(A_MOV,size,r,ref);
+              result:=taicpu.op_reg_ref(A_MOV,size,r,tmpref);
             end;
           R_MMREGISTER :
-            case getsubreg(r) of
-              R_SUBMMD:
-                result:=taicpu.op_reg_ref(A_MOVSD,reg2opsize(r),r,ref);
-              R_SUBMMS:
-                result:=taicpu.op_reg_ref(A_MOVSS,reg2opsize(r),r,ref);
-              R_SUBMMWHOLE:
-                result:=taicpu.op_reg_ref(A_MOVQ,S_NO,r,ref);
-              else
-                internalerror(200506042);
-            end;
+            if current_settings.fputype in fpu_avx_instructionsets then
+              case getsubreg(r) of
+                R_SUBMMD:
+                  result:=taicpu.op_reg_ref(A_VMOVSD,reg2opsize(r),r,ref);
+                R_SUBMMS:
+                  result:=taicpu.op_reg_ref(A_VMOVSS,reg2opsize(r),r,ref);
+                R_SUBQ,
+                R_SUBMMWHOLE:
+                  result:=taicpu.op_reg_ref(A_VMOVQ,S_NO,r,ref);
+                else
+                  internalerror(200506042);
+              end
+            else
+              case getsubreg(r) of
+                R_SUBMMD:
+                  result:=taicpu.op_reg_ref(A_MOVSD,reg2opsize(r),r,ref);
+                R_SUBMMS:
+                  result:=taicpu.op_reg_ref(A_MOVSS,reg2opsize(r),r,ref);
+                R_SUBQ,
+                R_SUBMMWHOLE:
+                  result:=taicpu.op_reg_ref(A_MOVQ,S_NO,r,ref);
+                else
+                  internalerror(200506042);
+              end;
           else
             internalerror(200401041);
         end;
@@ -2975,19 +3089,11 @@ implementation
       actRegTypes  : int64;
       actRegMemTypes: int64;
       NewRegSize: int64;
-      NewMemSize: int64;
-      NewConstSize: int64;
-      RegSize: int64;
-      MemSize: int64;
-      ConstSize: int64;
       RegMMXSizeMask: int64;
       RegXMMSizeMask: int64;
       RegYMMSizeMask: int64;
 
       bitcount: integer;
-      IsRegSizeMemSize: boolean;
-      ExistsRegMem: boolean;
-      s: string;
 
       function bitcnt(aValue: int64): integer;
       var
@@ -3020,10 +3126,6 @@ implementation
           InsTabMemRefSizeInfoCache^[AsmOp].ConstSize    := csiUnkown;
           InsTabMemRefSizeInfoCache^[AsmOp].ExistsSSEAVX := false;
 
-          RegSize := 0;
-          IsRegSizeMemSize := true;
-          ExistsRegMem     := false;
-
           insentry:=@instab[i];
           RegMMXSizeMask := 0;
           RegXMMSizeMask := 0;
@@ -3041,12 +3143,9 @@ implementation
             actMemSize       := 0;
             actMemCount      := 0;
             actRegMemTypes   := 0;
-            NewMemSize       := 0;
 
             actConstSize     := 0;
             actConstCount    := 0;
-            NewConstSize     := 0;
-
 
             if asmop = a_movups then
             begin

+ 31 - 9
compiler/x86/agx86att.pas

@@ -53,6 +53,8 @@ interface
         procedure WriteOper_jmp(const o:toper);
        protected
         fskipPopcountSuffix: boolean;
+        { http://gcc.gnu.org/bugzilla/show_bug.cgi?id=56656 }
+        fNoInterUnitMovQ: boolean;
        public
         procedure WriteInstruction(hp: tai);override;
      end;
@@ -90,6 +92,8 @@ interface
         InstrWriter := Tx86InstrWriter.create(self);
         { Apple's assembler does not support a size suffix for popcount }
         Tx86InstrWriter(InstrWriter).fskipPopcountSuffix := true;
+        { Apple's assembler is broken regarding some movq suffix handling }
+        Tx86InstrWriter(InstrWriter).fNoInterUnitMovQ := true;
       end;
 
 {****************************************************************************
@@ -293,6 +297,23 @@ interface
                end;
            end;
 {$endif x86_64}
+        { see fNoInterUnitMovQ declaration comment }
+        if fNoInterUnitMovQ then
+          begin
+            if ((op=A_MOVQ) or
+                (op=A_VMOVQ)) and
+               (((taicpu(hp).oper[0]^.typ=top_reg) and
+                 (getregtype(taicpu(hp).oper[0]^.reg)=R_INTREGISTER)) or
+                ((taicpu(hp).oper[1]^.typ=top_reg) and
+                 (getregtype(taicpu(hp).oper[1]^.reg)=R_INTREGISTER))) then
+              begin
+                if op=A_MOVQ then
+                  op:=A_MOVD
+                else
+                  op:=A_VMOVD;
+                taicpu(hp).opcode:=op;
+              end;
+          end;
         owner.AsmWrite(#9);
         { movsd should not be translated to movsl when there
           are (xmm) arguments }
@@ -401,7 +422,7 @@ interface
             supported_targets : [system_x86_64_linux,system_x86_64_freebsd,
                                  system_x86_64_win64,system_x86_64_embedded,
                                  system_x86_64_openbsd,system_x86_64_netbsd];
-            flags : [af_allowdirect,af_needar,af_smartlink_sections,af_supports_dwarf];
+            flags : [af_needar,af_smartlink_sections,af_supports_dwarf];
             labelprefix : '.L';
             comment : '# ';
             dollarsign: '$';
@@ -414,7 +435,7 @@ interface
             asmbin : 'gas';
             asmcmd : '--64 -o $OBJ $ASM';
             supported_targets : [system_x86_64_solaris];
-            flags : [af_allowdirect,af_needar,af_smartlink_sections,af_supports_dwarf];
+            flags : [af_needar,af_smartlink_sections,af_supports_dwarf];
             labelprefix : '.L';
             comment : '# ';
             dollarsign: '$';
@@ -429,7 +450,7 @@ interface
             asmbin : 'as';
             asmcmd : '-o $OBJ $ASM -arch x86_64';
             supported_targets : [system_x86_64_darwin];
-            flags : [af_allowdirect,af_needar,af_smartlink_sections,af_supports_dwarf];
+            flags : [af_needar,af_smartlink_sections,af_supports_dwarf];
             labelprefix : 'L';
             comment : '# ';
             dollarsign: '$';
@@ -445,8 +466,8 @@ interface
             supported_targets : [system_i386_GO32V2,system_i386_linux,system_i386_Win32,system_i386_freebsd,system_i386_solaris,system_i386_beos,
                                 system_i386_netbsd,system_i386_Netware,system_i386_qnx,system_i386_wdosx,system_i386_openbsd,
                                 system_i386_netwlibc,system_i386_wince,system_i386_embedded,system_i386_symbian,system_i386_haiku,system_x86_6432_linux,
-                                system_i386_nativent];
-            flags : [af_allowdirect,af_needar,af_smartlink_sections,af_supports_dwarf];
+                                system_i386_nativent,system_i386_android];
+            flags : [af_needar,af_smartlink_sections,af_supports_dwarf];
             labelprefix : '.L';
             comment : '# ';
             dollarsign: '$';
@@ -460,7 +481,7 @@ interface
             asmbin : 'as';
             asmcmd : '-o $OBJ $ASM';
             supported_targets : [system_i386_linux,system_i386_OS2,system_i386_freebsd,system_i386_netbsd,system_i386_openbsd,system_i386_EMX,system_i386_embedded];
-            flags : [af_allowdirect,af_needar,af_stabs_use_function_absolute_addresses];
+            flags : [af_needar,af_stabs_use_function_absolute_addresses];
             labelprefix : 'L';
             comment : '# ';
             dollarsign: '$';
@@ -474,7 +495,7 @@ interface
             asmbin : 'as';
             asmcmd : '-o $OBJ $ASM -arch i386';
             supported_targets : [system_i386_darwin,system_i386_iphonesim];
-            flags : [af_allowdirect,af_needar,af_smartlink_sections,af_supports_dwarf,af_stabs_use_function_absolute_addresses];
+            flags : [af_needar,af_smartlink_sections,af_supports_dwarf,af_stabs_use_function_absolute_addresses];
             labelprefix : 'L';
             comment : '# ';
             dollarsign: '$';
@@ -488,8 +509,9 @@ interface
             asmcmd : '--32 -o $OBJ $ASM';
             supported_targets : [system_i386_GO32V2,system_i386_linux,system_i386_Win32,system_i386_freebsd,system_i386_solaris,system_i386_beos,
                                 system_i386_netbsd,system_i386_Netware,system_i386_qnx,system_i386_wdosx,system_i386_openbsd,
-                                system_i386_netwlibc,system_i386_wince,system_i386_embedded,system_i386_symbian,system_i386_haiku,system_x86_6432_linux];
-            flags : [af_allowdirect,af_needar,af_smartlink_sections,af_supports_dwarf];
+                                system_i386_netwlibc,system_i386_wince,system_i386_embedded,system_i386_symbian,system_i386_haiku,
+                                system_x86_6432_linux,system_i386_android];
+            flags : [af_needar,af_smartlink_sections,af_supports_dwarf];
             labelprefix : '.L';
             comment : '# ';
             dollarsign: '$';

+ 20 - 19
compiler/x86/agx86int.pas

@@ -457,6 +457,7 @@ implementation
       consttype : taiconst_type;
       do_line,DoNotSplitLine,
       quoted   : boolean;
+      fixed_opcode: TAsmOp;
     begin
       if not assigned(p) then
        exit;
@@ -696,7 +697,7 @@ implementation
              end;
            ait_instruction :
              begin
-               taicpu(hp).CheckNonCommutativeOpcodes;
+               fixed_opcode:=taicpu(hp).FixNonCommutativeOpcodes;
                taicpu(hp).SetOperandOrder(op_intel);
                { Reset }
                suffix:='';
@@ -707,8 +708,8 @@ implementation
                if (taicpu(hp).opsize=S_W) and
                    (
                     (
-                     (taicpu(hp).opcode=A_PUSH) or
-                     (taicpu(hp).opcode=A_POP)
+                     (fixed_opcode=A_PUSH) or
+                     (fixed_opcode=A_POP)
                     ) and
                     (taicpu(hp).oper[0]^.typ=top_reg) and
                     is_segment_reg(taicpu(hp).oper[0]^.reg)
@@ -717,14 +718,14 @@ implementation
 
                { added prefix instructions, must be on same line as opcode }
                if (taicpu(hp).ops = 0) and
-                  ((taicpu(hp).opcode = A_REP) or
-                   (taicpu(hp).opcode = A_LOCK) or
-                   (taicpu(hp).opcode =  A_REPE) or
-                   (taicpu(hp).opcode =  A_REPNZ) or
-                   (taicpu(hp).opcode =  A_REPZ) or
-                   (taicpu(hp).opcode = A_REPNE)) then
+                  ((fixed_opcode = A_REP) or
+                   (fixed_opcode = A_LOCK) or
+                   (fixed_opcode =  A_REPE) or
+                   (fixed_opcode =  A_REPNZ) or
+                   (fixed_opcode =  A_REPZ) or
+                   (fixed_opcode = A_REPNE)) then
                 Begin
-                  prefix:=std_op2str[taicpu(hp).opcode]+#9;
+                  prefix:=std_op2str[fixed_opcode]+#9;
                   { there can be a stab inbetween when the opcode was on
                     a different line in the source code }
                   repeat
@@ -750,20 +751,20 @@ implementation
                 prefix:= '';
                if (target_asm.id = as_i386_wasm) and
                  (taicpu(hp).opsize=S_W) and
-                 (taicpu(hp).opcode=A_PUSH) and
+                 (fixed_opcode=A_PUSH) and
                  (taicpu(hp).oper[0]^.typ=top_const) then
                  begin
                    AsmWriteln(#9#9'DB 66h,68h ; pushw imm16');
                    AsmWrite(#9#9'DW');
                  end
                else if (target_asm.id=as_x86_64_masm) and
-                 (taicpu(hp).opcode=A_MOVQ) then
+                 (fixed_opcode=A_MOVQ) then
                  AsmWrite(#9#9'mov')
                else
-                 AsmWrite(#9#9+prefix+std_op2str[taicpu(hp).opcode]+cond2str[taicpu(hp).condition]+suffix);
+                 AsmWrite(#9#9+prefix+std_op2str[fixed_opcode]+cond2str[taicpu(hp).condition]+suffix);
                if taicpu(hp).ops<>0 then
                 begin
-                  if is_calljmp(taicpu(hp).opcode) then
+                  if is_calljmp(fixed_opcode) then
                    begin
                      AsmWrite(#9);
                      WriteOper_jmp(taicpu(hp).oper[0]^,taicpu(hp).opsize);
@@ -776,7 +777,7 @@ implementation
                          AsmWrite(#9)
                         else
                          AsmWrite(',');
-                        WriteOper(taicpu(hp).oper[i]^,taicpu(hp).opsize,taicpu(hp).opcode,(i=2));
+                        WriteOper(taicpu(hp).oper[i]^,taicpu(hp).opsize,fixed_opcode,(i=2));
                       end;
                    end;
                 end;
@@ -963,7 +964,7 @@ implementation
             asmbin : 'tasm';
             asmcmd : '/m2 /ml $ASM $OBJ';
             supported_targets : [system_i386_GO32V2,system_i386_Win32,system_i386_wdosx,system_i386_watcom,system_i386_wince];
-            flags : [af_allowdirect,af_needar,af_labelprefix_only_inside_procedure];
+            flags : [af_needar,af_labelprefix_only_inside_procedure];
             labelprefix : '@@';
             comment : '; ';
             dollarsign: '$';
@@ -976,7 +977,7 @@ implementation
             asmbin : 'masm';
             asmcmd : '/c /Cp $ASM /Fo$OBJ';
             supported_targets : [system_i386_GO32V2,system_i386_Win32,system_i386_wdosx,system_i386_watcom,system_i386_wince];
-            flags : [af_allowdirect,af_needar];
+            flags : [af_needar];
             labelprefix : '@@';
             comment : '; ';
             dollarsign: '$';
@@ -989,7 +990,7 @@ implementation
             asmbin : 'wasm';
             asmcmd : '$ASM -6s -fp6 -ms -zq -Fo=$OBJ';
             supported_targets : [system_i386_watcom];
-            flags : [af_allowdirect,af_needar];
+            flags : [af_needar];
             labelprefix : '@@';
             comment : '; ';
             dollarsign: '$';
@@ -1003,7 +1004,7 @@ implementation
             asmbin : 'ml64';
             asmcmd : '/c /Cp $ASM /Fo$OBJ';
             supported_targets : [system_x86_64_win64];
-            flags : [af_allowdirect,af_needar];
+            flags : [af_needar];
             labelprefix : '@@';
             comment : '; ';
             dollarsign: '$';

+ 166 - 35
compiler/x86/agx86nsm.pas

@@ -31,8 +31,13 @@ interface
       aasmbase,aasmtai,aasmdata,aasmcpu,assemble,cgutils;
 
     type
+
+      { T386NasmAssembler }
+
       T386NasmAssembler = class(texternalassembler)
       private
+        function CodeSectionName: string;
+
         procedure WriteReference(var ref : treference);
         procedure WriteOper(const o:toper;s : topsize; opcode: tasmop;ops:longint;dest : boolean);
         procedure WriteOper_jmp(const o:toper; op : tasmop);
@@ -42,6 +47,7 @@ interface
         procedure WriteAsmList;override;
         procedure WriteExternals;
         procedure WriteSmartExternals;
+        procedure WriteHeader;
       end;
 
 
@@ -64,7 +70,13 @@ interface
 
       nasm_regname_table : array[tregisterindex] of string[7] = (
         {r386nasm.inc contains the Nasm name of each register.}
+{$if defined(x86_64)}
+        {$fatal nasm support not yet implemented for x86_64 }
+{$elseif defined(i386)}
         {$i r386nasm.inc}
+{$elseif defined(i8086)}
+        {$i r8086nasm.inc}
+{$endif}
       );
 
     function nasm_regname(r:Tregister):string;
@@ -281,6 +293,18 @@ interface
                                T386NasmAssembler
  ****************************************************************************}
 
+
+    function T386NasmAssembler.CodeSectionName: string;  { name written in place of '.text' when a code section is emitted }
+      begin
+{$ifdef i8086}
+        if current_settings.x86memorymodel in x86_far_code_models then  { i8086 builds only: far-code memory models }
+          result:=current_module.modulename^ + '_TEXT'  { section is named <modulename>_TEXT }
+        else
+{$endif}
+          result:='.text';  { every other case, including all non-i8086 builds }
+      end;
+
+
     procedure T386NasmAssembler.WriteReference(var ref : treference);
       var
         first : boolean;
@@ -352,14 +376,33 @@ interface
                           (opcode = A_LSS) or (opcode = A_LFS) or
                           (opcode = A_LES) or (opcode = A_LDS) or
                          // (opcode = A_SHR) or (opcode = A_SHL) or
-                          (opcode = A_SAR) or (opcode = A_SAL) or
+                         // (opcode = A_SAR) or (opcode = A_SAL) or
                           (opcode = A_OUT) or (opcode = A_IN)) then
                     AsmWrite(sizestr(s,dest));
                   WriteReference(o.ref^);
                 end
+{$ifdef i8086}
+              else if o.ref^.refaddr=addr_dgroup then
+                begin
+                  AsmWrite('dgroup');
+                end
+{$endif i8086}
               else
                 begin
+{$ifdef x86_64}
+                  asmwrite('qword ');
+{$endif}
+{$ifdef i386}
                   asmwrite('dword ');
+{$endif i386}
+{$ifdef i8086}
+                  if o.ref^.refaddr=addr_far then
+                    asmwrite('far ')
+                  else if o.ref^.refaddr=addr_seg then
+                    asmwrite('SEG ')
+                  else
+                    asmwrite('word ');
+{$endif i8086}
                   if assigned(o.ref^.symbol) then
                    begin
                     if SmartAsm then
@@ -385,20 +428,33 @@ interface
           top_reg :
             AsmWrite(nasm_regname(o.reg));
           top_ref :
-            if o.ref^.refaddr=addr_no then
-              WriteReference(o.ref^)
+            if o.ref^.refaddr in [addr_no{$ifdef i8086},addr_far_ref{$endif}] then
+              begin
+{$ifdef i8086}
+                if o.ref^.refaddr=addr_far_ref then
+                  AsmWrite('far ');
+{$endif i8086}
+                WriteReference(o.ref^);
+              end
             else
               begin
+{ NEAR forces NASM to emit near jumps, which are 386+ }
+{$ifndef i8086}
                 if not(
                        (op=A_JCXZ) or (op=A_JECXZ) or
-{$ifdef x86_64}
+    {$ifdef x86_64}
                        (op=A_JRCXZ) or
-{$endif x86_64}
+    {$endif x86_64}
                        (op=A_LOOP) or (op=A_LOOPE) or
                        (op=A_LOOPNE) or (op=A_LOOPNZ) or
                        (op=A_LOOPZ)
                       ) then
                   AsmWrite('NEAR ');
+{$endif i8086}
+{$ifdef i8086}
+                if o.ref^.refaddr=addr_far then
+                  AsmWrite('far ');
+{$endif i8086}
                 AsmWrite(o.ref^.symbol.name);
                 if SmartAsm then
                   AddSymbol(o.ref^.symbol.name,false);
@@ -487,6 +543,8 @@ interface
         if (atype in [sec_rodata,sec_rodata_norel]) and
           (target_info.system=system_i386_go32v2) then
           AsmWrite('.data')
+        else if secnames[atype]='.text' then
+          AsmWrite(CodeSectionName)
         else
           AsmWrite(secnames[atype]);
         if create_smartlink_sections and
@@ -521,6 +579,7 @@ interface
 {$ifdef cpuextended}
       e        : extended;
 {$endif cpuextended}
+      fixed_opcode: TAsmOp;
     begin
       if not assigned(p) then
        exit;
@@ -552,7 +611,7 @@ interface
            ait_regalloc :
              begin
                if (cs_asm_regalloc in current_settings.globalswitches) then
-                 AsmWriteLn(#9#9+target_asm.comment+'Register '+nasm_regname(tai_regalloc(hp).reg)+
+                 AsmWriteLn(#9#9+target_asm.comment+'Register '+nasm_regname(tai_regalloc(hp).reg)+' '+
                    regallocstr[tai_regalloc(hp).ratype]);
              end;
 
@@ -580,7 +639,7 @@ interface
 
            ait_datablock :
              begin
-               if tai_datablock(hp).is_global then
+               if tai_datablock(hp).is_global or SmartAsm then
                 begin
                   AsmWrite(#9'GLOBAL ');
                   AsmWriteLn(tai_datablock(hp).sym.name);
@@ -610,6 +669,28 @@ interface
                  aitconst_128bit:
                     begin
                     end;
+{$ifdef i8086}
+                 aitconst_farptr:
+                   begin
+                     AsmWrite(ait_const2str[aitconst_16bit]);
+                     if assigned(tai_const(hp).sym) then
+                       begin
+                         if SmartAsm then
+                           AddSymbol(tai_const(hp).sym.name,false);
+                         AsmWrite(tai_const(hp).sym.name);
+                         if tai_const(hp).value<>0 then
+                           AsmWrite(tostr_with_plus(tai_const(hp).value));
+                         AsmLn;
+                         AsmWrite(ait_const2str[aitconst_16bit]);
+                         AsmWrite('SEG ');
+                         AsmWrite(tai_const(hp).sym.name);
+                       end
+                     else
+                       AsmWrite(tostr(lo(longint(tai_const(hp).value)))+','+
+                                tostr(hi(longint(tai_const(hp).value))));
+                     AsmLn;
+                   end;
+{$endif i8086}
                  aitconst_32bit,
                  aitconst_16bit,
                  aitconst_8bit,
@@ -834,7 +915,14 @@ interface
            ait_label :
              begin
                if tai_label(hp).labsym.is_used then
-                AsmWriteLn(tai_label(hp).labsym.name+':');
+                 begin
+                   if SmartAsm and (tai_label(hp).labsym.bind=AB_GLOBAL) then
+                     begin
+                       AsmWrite(#9'GLOBAL ');
+                       AsmWriteLn(tai_label(hp).labsym.name);
+                     end;
+                   AsmWriteLn(tai_label(hp).labsym.name+':');
+                 end;
                if SmartAsm then
                  AddSymbol(tai_label(hp).labsym.name,true);
              end;
@@ -843,7 +931,7 @@ interface
              begin
                if tai_symbol(hp).has_value then
                  internalerror(2009090803);
-               if tai_symbol(hp).is_global then
+               if tai_symbol(hp).is_global or SmartAsm then
                 begin
                   AsmWrite(#9'GLOBAL ');
                   AsmWriteLn(tai_symbol(hp).sym.name);
@@ -861,12 +949,12 @@ interface
 
            ait_instruction :
              begin
-               taicpu(hp).CheckNonCommutativeOpcodes;
+               fixed_opcode:=taicpu(hp).FixNonCommutativeOpcodes;
                { We need intel order, no At&t }
                taicpu(hp).SetOperandOrder(op_intel);
                s:='';
-               if ((taicpu(hp).opcode=A_FADDP) or
-                   (taicpu(hp).opcode=A_FMULP))
+               if ((fixed_opcode=A_FADDP) or
+                   (fixed_opcode=A_FMULP))
                   and (taicpu(hp).ops=0) then
                  begin
                    taicpu(hp).allocate_oper(2);
@@ -875,26 +963,28 @@ interface
                    taicpu(hp).oper[1]^.typ:=top_reg;
                    taicpu(hp).oper[1]^.reg:=NR_ST;
                  end;
-               if taicpu(hp).opcode=A_FWAIT then
+               if fixed_opcode=A_FWAIT then
                 AsmWriteln(#9#9'DB'#9'09bh')
                else
                 begin
+{$ifndef i8086}
                   { We need to explicitly set
                     word prefix to get selectors
                     to be pushed in 2 bytes  PM }
                   if (taicpu(hp).opsize=S_W) and
-                     ((taicpu(hp).opcode=A_PUSH) or
-                      (taicpu(hp).opcode=A_POP)) and
+                     ((fixed_opcode=A_PUSH) or
+                      (fixed_opcode=A_POP)) and
                       (taicpu(hp).oper[0]^.typ=top_reg) and
                       (is_segment_reg(taicpu(hp).oper[0]^.reg)) then
                     AsmWriteln(#9#9'DB'#9'066h');
-                  AsmWrite(#9#9+std_op2str[taicpu(hp).opcode]+cond2str[taicpu(hp).condition]);
+{$endif not i8086}
+                  AsmWrite(#9#9+std_op2str[fixed_opcode]+cond2str[taicpu(hp).condition]);
                   if taicpu(hp).ops<>0 then
                    begin
-                     if is_calljmp(taicpu(hp).opcode) then
+                     if is_calljmp(fixed_opcode) then
                       begin
                         AsmWrite(#9);
-                        WriteOper_jmp(taicpu(hp).oper[0]^,taicpu(hp).opcode);
+                        WriteOper_jmp(taicpu(hp).oper[0]^,fixed_opcode);
                       end
                      else
                       begin
@@ -904,7 +994,7 @@ interface
                             AsmWrite(#9)
                            else
                             AsmWrite(',');
-                           WriteOper(taicpu(hp).oper[i]^,taicpu(hp).opsize,taicpu(hp).opcode,taicpu(hp).ops,(i=2));
+                           WriteOper(taicpu(hp).oper[i]^,taicpu(hp).opsize,fixed_opcode,taicpu(hp).ops,(i=2));
                          end;
                       end;
                    end;
@@ -933,6 +1023,7 @@ interface
                     AsmClose;
                     DoAssemble;
                     AsmCreate(tai_cutobject(hp).place);
+                    WriteHeader;
                   end;
                { avoid empty files }
                  while assigned(hp.next) and (tai(hp.next).typ in [ait_cutobject,ait_section,ait_comment]) do
@@ -1009,6 +1100,46 @@ interface
           end;
       end;
 
+    { Writes the directives that open a generated NASM source file: the
+      operand-size mode and, on i8086, the CPU level plus the section and
+      group layout. }
+    procedure T386NasmAssembler.WriteHeader;
+{$ifdef i8086}
+      var
+        cpudirective : string;
+{$endif i8086}
+      begin
+{$ifdef i8086}
+        AsmWriteLn('BITS 16');
+
+        { select the CPU directive that matches the configured cpu type;
+          any cpu type not listed here is reported as an internal error }
+        cpudirective:='';
+        case current_settings.cputype of
+          cpu_8086:
+            cpudirective:='CPU 8086';
+          cpu_186:
+            cpudirective:='CPU 186';
+          cpu_286:
+            cpudirective:='CPU 286';
+          cpu_386:
+            cpudirective:='CPU 386';
+          cpu_Pentium:
+            cpudirective:='CPU PENTIUM';
+          cpu_Pentium2:
+            cpudirective:='CPU P2';
+          cpu_Pentium3:
+            cpudirective:='CPU P3';
+          { Pentium 4 and Pentium M share the P4 level }
+          cpu_Pentium4,
+          cpu_PentiumM:
+            cpudirective:='CPU P4';
+          else
+            internalerror(2013050101);
+        end;
+        AsmWriteLn(cpudirective);
+
+        { declare the code section together with its attributes }
+        AsmWriteLn('SECTION ' + CodeSectionName + ' use16 class=code');
+        if current_settings.x86memorymodel in x86_near_data_models then
+          begin
+            { NASM rejects a GROUP directive that names a section it has not
+              seen yet, so every data section is declared here first, even
+              if it ends up empty }
+            AsmWriteLn('SECTION .rodata');
+            AsmWriteLn('SECTION .data');
+            AsmWriteLn('SECTION .fpc');
+            { WLINK requires class=bss in order to leave the BSS section out of the executable }
+            AsmWriteLn('SECTION .bss class=bss');
+            { put these sections into one segment; the tiny model also
+              includes the code }
+            if current_settings.x86memorymodel=mm_tiny then
+              AsmWriteLn('GROUP dgroup text rodata data fpc bss')
+            else
+              AsmWriteLn('GROUP dgroup rodata data fpc bss');
+          end;
+        { switch back to the code section for whatever is written next }
+        AsmWriteLn('SECTION ' + CodeSectionName);
+{$else i8086}
+        AsmWriteLn('BITS 32');
+{$endif i8086}
+      end;
+
 
     procedure T386NasmAssembler.WriteAsmList;
     var
@@ -1018,7 +1149,7 @@ interface
       if current_module.mainsource<>'' then
        comment(v_info,'Start writing nasm-styled assembler output for '+current_module.mainsource);
 {$endif}
-      AsmWriteLn('BITS 32');
+      WriteHeader;
       AsmLn;
 
       WriteExternals;
@@ -1053,9 +1184,9 @@ interface
             id           : as_i386_nasmcoff;
             idtxt  : 'NASMCOFF';
             asmbin : 'nasm';
-            asmcmd : '-f coff -o $OBJ $ASM';
+            asmcmd : '-f coff -o $OBJ -w-orphan-labels $ASM';
             supported_targets : [system_i386_go32v2];
-            flags : [af_allowdirect,af_needar,af_no_debug];
+            flags : [af_needar,af_no_debug];
             labelprefix : '..@';
             comment : '; ';
             dollarsign: '$';
@@ -1066,9 +1197,9 @@ interface
             id           : as_i386_nasmwin32;
             idtxt  : 'NASMWIN32';
             asmbin : 'nasm';
-            asmcmd : '-f win32 -o $OBJ $ASM';
+            asmcmd : '-f win32 -o $OBJ -w-orphan-labels $ASM';
             supported_targets : [system_i386_win32];
-            flags : [af_allowdirect,af_needar,af_no_debug];
+            flags : [af_needar,af_no_debug];
             labelprefix : '..@';
             comment : '; ';
             dollarsign: '$';
@@ -1079,9 +1210,9 @@ interface
             id           : as_i386_nasmobj;
             idtxt  : 'NASMOBJ';
             asmbin : 'nasm';
-            asmcmd : '-f obj -o $OBJ $ASM';
-            supported_targets : [system_i386_embedded];
-            flags : [af_allowdirect,af_needar,af_no_debug];
+            asmcmd : '-f obj -o $OBJ -w-orphan-labels $ASM';
+            supported_targets : [system_i386_embedded, system_i8086_msdos];
+            flags : [af_needar,af_no_debug];
             labelprefix : '..@';
             comment : '; ';
             dollarsign: '$';
@@ -1092,9 +1223,9 @@ interface
             id           : as_i386_nasmwdosx;
             idtxt  : 'NASMWDOSX';
             asmbin : 'nasm';
-            asmcmd : '-f win32 -o $OBJ $ASM';
+            asmcmd : '-f win32 -o $OBJ -w-orphan-labels $ASM';
             supported_targets : [system_i386_wdosx];
-            flags : [af_allowdirect,af_needar,af_no_debug];
+            flags : [af_needar,af_no_debug];
             labelprefix : '..@';
             comment : '; ';
             dollarsign: '$';
@@ -1106,9 +1237,9 @@ interface
             id           : as_i386_nasmelf;
             idtxt  : 'NASMELF';
             asmbin : 'nasm';
-            asmcmd : '-f elf -o $OBJ $ASM';
+            asmcmd : '-f elf -o $OBJ -w-orphan-labels $ASM';
             supported_targets : [system_i386_linux];
-            flags : [af_allowdirect,af_needar,af_no_debug];
+            flags : [af_needar,af_no_debug];
             labelprefix : '..@';
             comment : '; ';
             dollarsign: '$';
@@ -1119,9 +1250,9 @@ interface
             id           : as_i386_nasmbeos;
             idtxt  : 'NASMELF';
             asmbin : 'nasm';
-            asmcmd : '-f elf -o $OBJ $ASM';
+            asmcmd : '-f elf -o $OBJ -w-orphan-labels $ASM';
             supported_targets : [system_i386_beos];
-            flags : [af_allowdirect,af_needar,af_no_debug];
+            flags : [af_needar,af_no_debug];
             labelprefix : '..@';
             comment : '; ';
             dollarsign: '$';
@@ -1132,9 +1263,9 @@ interface
             id           : as_i386_nasmhaiku;
             idtxt  : 'NASMELF';
             asmbin : 'nasm';
-            asmcmd : '-f elf -o $OBJ $ASM';
+            asmcmd : '-f elf -o $OBJ -w-orphan-labels $ASM';
             supported_targets : [system_i386_haiku];
-            flags : [af_allowdirect,af_needar,af_no_debug];
+            flags : [af_needar,af_no_debug];
             labelprefix : '..@';
             comment : '; ';
             dollarsign: '$';

+ 481 - 118
compiler/x86/cgx86.pas

@@ -35,6 +35,9 @@ unit cgx86;
        symconst,symtype,symdef;
 
     type
+
+      { tcgx86 }
+
       tcgx86 = class(tcg)
         rgfpu   : Trgx86fpu;
         procedure done_register_allocators;override;
@@ -53,9 +56,13 @@ unit cgx86;
         procedure inc_fpu_stack;
 
         procedure a_call_name(list : TAsmList;const s : string; weak: boolean);override;
+        procedure a_call_name_near(list : TAsmList;const s : string; weak: boolean);
+        procedure a_call_name_static(list : TAsmList;const s : string);override;
+        procedure a_call_name_static_near(list : TAsmList;const s : string);
         procedure a_call_reg(list : TAsmList;reg : tregister);override;
+        procedure a_call_reg_near(list : TAsmList;reg : tregister);
         procedure a_call_ref(list : TAsmList;ref : treference);override;
-        procedure a_call_name_static(list : TAsmList;const s : string);override;
+        procedure a_call_ref_near(list : TAsmList;ref : treference);
 
         procedure a_op_const_reg(list : TAsmList; Op: TOpCG; size: TCGSize; a: tcgint; reg: TRegister); override;
         procedure a_op_const_ref(list : TAsmList; Op: TOpCG; size: TCGSize; a: tcgint; const ref: TReference); override;
@@ -85,6 +92,8 @@ unit cgx86;
         procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize : tcgsize;reg: tregister; const ref: treference;shuffle : pmmshuffle); override;
         procedure a_opmm_ref_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle); override;
         procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src,dst: tregister;shuffle : pmmshuffle);override;
+        procedure a_opmm_ref_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;const ref : treference;src,dst : tregister;shuffle : pmmshuffle);override;
+        procedure a_opmm_reg_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;src1,src2,dst : tregister;shuffle : pmmshuffle);override;
 
         {  comparison operations }
         procedure a_cmp_const_reg_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;a : tcgint;reg : tregister;
@@ -108,6 +117,8 @@ unit cgx86;
         procedure g_profilecode(list : TAsmList);override;
         procedure g_stackpointer_alloc(list : TAsmList;localsize : longint);override;
         procedure g_proc_entry(list : TAsmList;localsize : longint;nostackframe:boolean);override;
+        procedure g_save_registers(list: TAsmList); override;
+        procedure g_restore_registers(list: TAsmList); override;
 
         procedure g_overflowcheck(list: TAsmList; const l:tlocation;def:tdef);override;
 
@@ -119,9 +130,9 @@ unit cgx86;
         procedure check_register_size(size:tcgsize;reg:tregister);
 
         procedure opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;dst: tregister; shuffle : pmmshuffle);
+        procedure opmm_loc_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;loc : tlocation;src,dst : tregister;shuffle : pmmshuffle);
 
         function get_darwin_call_stub(const s: string; weak: boolean): tasmsymbol;
-      private
         procedure sizes2load(s1,s2 : tcgsize;var op: tasmop; var s3: topsize);
 
         procedure floatload(list: TAsmList; t : tcgsize;const ref : treference);
@@ -129,22 +140,29 @@ unit cgx86;
         procedure floatloadops(t : tcgsize;var op : tasmop;var s : topsize);
         procedure floatstoreops(t : tcgsize;var op : tasmop;var s : topsize);
 
+        procedure internal_restore_regs(list: TAsmList; use_pop: boolean);
       end;
 
    const
+      { TCGSize2OpSize maps every tcgsize value to the topsize used for the
+        instruction operand. Exactly one of the three tables below is
+        compiled in, depending on whether x86_64, i386 or i8086 is defined.
+        The i8086 table holds S_W in the positions where the i386 table
+        holds S_L. }
-{$ifdef x86_64}
+{$if defined(x86_64)}
       TCGSize2OpSize: Array[tcgsize] of topsize =
         (S_NO,S_B,S_W,S_L,S_Q,S_XMM,S_B,S_W,S_L,S_Q,S_XMM,
          S_FS,S_FL,S_FX,S_IQ,S_FXX,
          S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
          S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
-{$else x86_64}
+{$elseif defined(i386)}
       TCGSize2OpSize: Array[tcgsize] of topsize =
         (S_NO,S_B,S_W,S_L,S_L,S_T,S_B,S_W,S_L,S_L,S_L,
          S_FS,S_FL,S_FX,S_IQ,S_FXX,
          S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
          S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
-{$endif x86_64}
+{$elseif defined(i8086)}
+      TCGSize2OpSize: Array[tcgsize] of topsize =
+        (S_NO,S_B,S_W,S_W,S_W,S_T,S_B,S_W,S_W,S_W,S_W,
+         S_FS,S_FL,S_FX,S_IQ,S_FXX,
+         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
+         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
+{$endif}
 
 {$ifndef NOTARGETWIN}
       winstackpagesize = 4096;
@@ -152,6 +170,8 @@ unit cgx86;
 
     function UseAVX: boolean;
 
+    function UseIncDec: boolean;
+
   implementation
 
     uses
@@ -162,9 +182,24 @@ unit cgx86;
 
     function UseAVX: boolean;
+      { returns true when the currently configured FPU type is a member of
+        the set fpu_avx_instructionsets }
       begin
-        Result:=current_settings.fputype in [fpu_avx];
+        Result:=current_settings.fputype in fpu_avx_instructionsets;
       end;
 
+
+    { Tells whether inc/dec should be used instead of add/sub with a constant of 1.
+      Modern CPUs prefer add/sub: these modify all flags and therefore break
+      the dependency of later instructions on the previous flags value. inc/dec
+      are still chosen when optimizing for size and, on the 32 and 16 bit
+      targets, for the older cpu types listed below. }
+    function UseIncDec: boolean;
+      begin
+        { optimizing for size selects inc/dec on every target }
+        Result:=cs_opt_size in current_settings.optimizerswitches;
+{$if defined(i386)}
+        Result:=Result or (current_settings.cputype in [cpu_386]);
+{$elseif defined(i8086)}
+        Result:=Result or (current_settings.cputype in [cpu_8086..cpu_386]);
+{$endif}
+      end;
+
+
     const
       TOpCG2AsmOp: Array[topcg] of TAsmOp = (A_NONE,A_MOV,A_ADD,A_AND,A_DIV,
                             A_IDIV,A_IMUL,A_MUL,A_NEG,A_NOT,A_OR,
@@ -206,7 +241,8 @@ unit cgx86;
             result:=rg[R_MMREGISTER].getregister(list,R_SUBMMD);
           OS_F32:
             result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS);
-          OS_M64,
+          OS_M64:
+            result:=rg[R_MMREGISTER].getregister(list,R_SUBQ);
           OS_M128:
             result:=rg[R_MMREGISTER].getregister(list,R_SUBMMWHOLE);
           else
@@ -368,9 +404,15 @@ unit cgx86;
         if (ref.refaddr in [addr_pic,addr_pic_no_got]) then
           exit;
 
-{$ifdef x86_64}
+{$if defined(x86_64)}
         { Only 32bit is allowed }
-        if ((ref.offset<low(longint)) or (ref.offset>high(longint))) then
+        { Note that this isn't entirely correct: for RIP-relative targets/memory models,
+          it is actually (offset+@symbol-RIP) that should fit into 32 bits. Since two last
+          members aren't known until link time, ABIs place very pessimistic limits
+          on offset values, e.g. SysV AMD64 allows +/-$1000000 (16 megabytes) }
+        if ((ref.offset<low(longint)) or (ref.offset>high(longint))) or
+           { absolute address is not a common thing in x64, but nevertheless a possible one }
+           ((ref.base=NR_NO) and (ref.index=NR_NO) and (ref.symbol=nil)) then
           begin
             { Load constant value to register }
             hreg:=GetAddressRegister(list);
@@ -382,7 +424,9 @@ unit cgx86;
                 ref.symbol:=nil;
               end;}
             { Add register to reference }
-            if ref.index=NR_NO then
+            if ref.base=NR_NO then
+              ref.base:=hreg
+            else if ref.index=NR_NO then
               ref.index:=hreg
             else
               begin
@@ -496,7 +540,7 @@ unit cgx86;
 
                 end;
           end;
-{$else x86_64}
+{$elseif defined(i386)}
         add_hreg:=false;
         if (target_info.system in [system_i386_darwin,system_i386_iphonesim]) then
           begin
@@ -552,7 +596,25 @@ unit cgx86;
                 ref.base:=hreg;
               end;
           end;
-{$endif x86_64}
+{$elseif defined(i8086)}
+        { i8086 does not support stack relative addressing }
+        if ref.base = NR_STACK_POINTER_REG then
+          begin
+            href:=ref;
+            href.base:=getaddressregister(list);
+            { let the register allocator find a suitable register for the reference }
+            list.Concat(Taicpu.op_reg_reg(A_MOV, S_W, NR_SP, href.base));
+            ref:=href;
+          end;
+
+        { if there is a segment in an int register, move it to ES }
+        if (ref.segment<>NR_NO) and (not is_segment_reg(ref.segment)) then
+          begin
+            list.concat(taicpu.op_reg(A_PUSH,S_W,ref.segment));
+            list.concat(taicpu.op_reg(A_POP,S_W,NR_ES));
+            ref.segment:=NR_ES;
+          end;
+{$endif}
       end;
 
 
@@ -694,7 +756,7 @@ unit cgx86;
           current_asmdata.asmlists[al_imports]:=TAsmList.create;
 
         new_section(current_asmdata.asmlists[al_imports],sec_stub,'',0);
-        result := current_asmdata.RefAsmSymbol(stubname);
+        result := current_asmdata.DefineAsmSymbol(stubname,AB_LOCAL,AT_FUNCTION);
         current_asmdata.asmlists[al_imports].concat(Tai_symbol.Create(result,0));
         { register as a weak symbol if necessary }
         if weak then
@@ -709,6 +771,12 @@ unit cgx86;
 
 
     procedure tcgx86.a_call_name(list : TAsmList;const s : string; weak: boolean);
+      { forwards unchanged to a_call_name_near, which holds the actual
+        implementation }
+      begin
+        a_call_name_near(list,s,weak);
+      end;
+
+
+    procedure tcgx86.a_call_name_near(list : TAsmList;const s : string; weak: boolean);
       var
         sym : tasmsymbol;
         r : treference;
@@ -743,6 +811,12 @@ unit cgx86;
 
 
     procedure tcgx86.a_call_name_static(list : TAsmList;const s : string);
+      { forwards unchanged to a_call_name_static_near, which holds the
+        actual implementation }
+      begin
+        a_call_name_static_near(list,s);
+      end;
+
+
+    procedure tcgx86.a_call_name_static_near(list : TAsmList;const s : string);
       var
         sym : tasmsymbol;
         r : treference;
@@ -755,12 +829,24 @@ unit cgx86;
 
 
     procedure tcgx86.a_call_reg(list : TAsmList;reg : tregister);
+      { forwards unchanged to a_call_reg_near }
+      begin
+        a_call_reg_near(list,reg);
+      end;
+
+
+    procedure tcgx86.a_call_reg_near(list: TAsmList; reg: tregister);
+      { appends a CALL instruction with reg as its only operand to list;
+        no operand size is given (S_NO) }
       begin
         list.concat(taicpu.op_reg(A_CALL,S_NO,reg));
       end;
 
 
     procedure tcgx86.a_call_ref(list : TAsmList;ref : treference);
+      { forwards unchanged to a_call_ref_near }
+      begin
+        a_call_ref_near(list,ref);
+      end;
+
+
+    procedure tcgx86.a_call_ref_near(list: TAsmList; ref: treference);
+      { appends a CALL instruction with the memory reference ref as its only
+        operand to list; no operand size is given (S_NO) }
       begin
         list.concat(taicpu.op_ref(A_CALL,S_NO,ref));
       end;
@@ -995,7 +1081,7 @@ unit cgx86;
                     { Convert thread local address to a process global address
                       as we cannot handle far pointers.}
                     case target_info.system of
-                      system_i386_linux:
+                      system_i386_linux,system_i386_android:
                         if segment=NR_GS then
                           begin
                             reference_reset_symbol(tmpref,current_asmdata.RefAsmSymbol('___fpc_threadvar_offset'),0,ref.alignment);
@@ -1004,17 +1090,6 @@ unit cgx86;
                           end
                         else
                           cgmessage(cg_e_cant_use_far_pointer_there);
-                      system_i386_win32:
-                        if segment=NR_FS then
-                          begin
-                            allocallcpuregisters(list);
-                            a_call_name(list,'GetTls',false);
-                            deallocallcpuregisters(list);
-                            list.concat(Taicpu.op_reg_reg(A_ADD,tcgsize2opsize[OS_ADDR],NR_EAX,r));
-                          end
-                        else
-                          cgmessage(cg_e_cant_use_far_pointer_there);
-
                       else
                         cgmessage(cg_e_cant_use_far_pointer_there);
                     end;
@@ -1091,12 +1166,18 @@ unit cgx86;
 
     function get_scalar_mm_op(fromsize,tosize : tcgsize) : tasmop;
       const
-        convertop : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
+        convertopsse : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
           (A_MOVSS,A_CVTSS2SD,A_NONE,A_NONE,A_NONE),
           (A_CVTSD2SS,A_MOVSD,A_NONE,A_NONE,A_NONE),
           (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
           (A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
           (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
+        convertopavx : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
+          (A_VMOVSS,A_VCVTSS2SD,A_NONE,A_NONE,A_NONE),
+          (A_VCVTSD2SS,A_VMOVSD,A_NONE,A_NONE,A_NONE),
+          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
+          (A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
+          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
       begin
         { we can have OS_F32/OS_F64 (record in function result/LOC_MMREGISTER) to
           OS_32/OS_64 (record in memory/LOC_REFERENCE) }
@@ -1108,14 +1189,24 @@ unit cgx86;
             OS_64:
               tosize:=OS_F64;
           end;
-        if (fromsize in [low(convertop)..high(convertop)]) and
-           (tosize in [low(convertop)..high(convertop)]) then
-          result:=convertop[fromsize,tosize]
+        if (fromsize in [low(convertopsse)..high(convertopsse)]) and
+           (tosize in [low(convertopsse)..high(convertopsse)]) then
+          begin
+            if UseAVX then
+              result:=convertopavx[fromsize,tosize]
+            else
+              result:=convertopsse[fromsize,tosize];
+          end
         { we can have OS_M64 (record in function result/LOC_MMREGISTER) to
           OS_64 (record in memory/LOC_REFERENCE) }
         else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) and
                 (fromsize=OS_M64) then
-          result:=A_MOVQ
+          begin
+            if UseAVX then
+              result:=A_VMOVQ
+            else
+              result:=A_MOVQ;
+          end
         else
           internalerror(2010060104);
         if result=A_NONE then
@@ -1126,6 +1217,7 @@ unit cgx86;
     procedure tcgx86.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize : tcgsize;reg1, reg2: tregister;shuffle : pmmshuffle);
       var
         instr : taicpu;
+        op : TAsmOp;
       begin
         if shuffle=nil then
           begin
@@ -1147,8 +1239,33 @@ unit cgx86;
           end
         else if shufflescalar(shuffle) then
           begin
-            instr:=taicpu.op_reg_reg(get_scalar_mm_op(fromsize,tosize),S_NO,reg1,reg2);
-            case get_scalar_mm_op(fromsize,tosize) of
+            op:=get_scalar_mm_op(fromsize,tosize);
+
+            { MOVAPD/MOVAPS are normally faster }
+            if op=A_MOVSD then
+              op:=A_MOVAPD
+            else if op=A_MOVSS then
+              op:=A_MOVAPS
+            { VMOVSD/SS is not available with two register operands }
+            else if op=A_VMOVSD then
+              op:=A_VMOVAPD
+            else if op=A_VMOVSS then
+              op:=A_VMOVAPS;
+
+            { A_VCVTSD2SS and A_VCVTSS2SD require always three operands }
+            if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
+              instr:=taicpu.op_reg_reg_reg(op,S_NO,reg1,reg2,reg2)
+            else
+              instr:=taicpu.op_reg_reg(op,S_NO,reg1,reg2);
+
+            case op of
+              A_VMOVAPD,
+              A_VMOVAPS,
+              A_VMOVSS,
+              A_VMOVSD,
+              A_VMOVQ,
+              A_MOVAPD,
+              A_MOVAPS,
               A_MOVSS,
               A_MOVSD,
               A_MOVQ:
@@ -1164,6 +1281,7 @@ unit cgx86;
     procedure tcgx86.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle);
        var
          tmpref  : treference;
+         op : tasmop;
        begin
          tmpref:=ref;
          make_simple_ref(list,tmpref);
@@ -1180,7 +1298,15 @@ unit cgx86;
 {$endif x86_64}
            end
          else if shufflescalar(shuffle) then
-           list.concat(taicpu.op_ref_reg(get_scalar_mm_op(fromsize,tosize),S_NO,tmpref,reg))
+           begin
+             op:=get_scalar_mm_op(fromsize,tosize);
+
+             { A_VCVTSD2SS and A_VCVTSS2SD require always three operands }
+             if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
+               list.concat(taicpu.op_ref_reg_reg(op,S_NO,tmpref,reg,reg))
+             else
+               list.concat(taicpu.op_ref_reg(op,S_NO,tmpref,reg))
+           end
          else
            internalerror(200312252);
        end;
@@ -1190,6 +1316,7 @@ unit cgx86;
        var
          hreg : tregister;
          tmpref  : treference;
+         op : tasmop;
        begin
          tmpref:=ref;
          make_simple_ref(list,tmpref);
@@ -1210,8 +1337,15 @@ unit cgx86;
              if tcgsize2size[tosize]<>tcgsize2size[fromsize] then
                begin
                  hreg:=getmmregister(list,tosize);
-                 list.concat(taicpu.op_reg_reg(get_scalar_mm_op(fromsize,tosize),S_NO,reg,hreg));
-                 list.concat(taicpu.op_reg_ref(get_scalar_mm_op(tosize,tosize),S_NO,hreg,tmpref));
+                 op:=get_scalar_mm_op(fromsize,tosize);
+
+                 { A_VCVTSD2SS and A_VCVTSS2SD require always three operands }
+                 if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
+                   list.concat(taicpu.op_reg_reg_reg(op,S_NO,reg,hreg,hreg))
+                 else
+                   list.concat(taicpu.op_reg_reg(op,S_NO,reg,hreg));
+
+                 list.concat(taicpu.op_reg_ref(get_scalar_mm_op(tosize,tosize),S_NO,hreg,tmpref))
                end
              else
                list.concat(taicpu.op_reg_ref(get_scalar_mm_op(fromsize,tosize),S_NO,reg,tmpref));
@@ -1243,6 +1377,103 @@ unit cgx86;
      end;
 
 
+    { Emits one three operand AVX instruction for the operation op:
+      the first operand is taken from loc (a memory reference or an mm
+      register), the second is src and the last is dst.
+
+        list    - assembler list receiving the generated instruction(s)
+        op      - generic operation, translated through opmm2asmop
+        size    - element size; must be equal to loc.size
+        loc     - location of the first operand
+        src,dst - register operands
+        shuffle - nil selects the packed instruction, a scalar shuffle
+                  selects the scalar instruction; every other shuffle is
+                  reported as an internal error
+
+      NOTE(review): size indexes opmm2asmop directly and the table only has
+      entries for OS_F32 and OS_F64; callers are assumed to pass one of
+      these two - TODO confirm. }
+    procedure tcgx86.opmm_loc_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;src,dst: tregister; shuffle : pmmshuffle);
+      const
+        { first index: 0 = scalar, 1 = packed; A_NOP marks an operation
+          without a matching instruction }
+        opmm2asmop : array[0..1,OS_F32..OS_F64,topcg] of tasmop = (
+          ( { scalar }
+            ( { OS_F32 }
+              A_NOP,A_NOP,A_VADDSS,A_NOP,A_VDIVSS,A_NOP,A_NOP,A_VMULSS,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBSS,A_NOP,A_NOP,A_NOP
+            ),
+            ( { OS_F64 }
+              A_NOP,A_NOP,A_VADDSD,A_NOP,A_VDIVSD,A_NOP,A_NOP,A_VMULSD,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBSD,A_NOP,A_NOP,A_NOP
+            )
+          ),
+          ( { vectorized/packed }
+            { because the logical packed single instructions have shorter op codes, we use always
+              these
+            }
+            ( { OS_F32 }
+              A_NOP,A_NOP,A_VADDPS,A_NOP,A_VDIVPS,A_NOP,A_NOP,A_VMULPS,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBPS,A_VXORPS,A_NOP,A_NOP
+            ),
+            ( { OS_F64 }
+              A_NOP,A_NOP,A_VADDPD,A_NOP,A_VDIVPD,A_NOP,A_NOP,A_VMULPD,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBPD,A_VXORPD,A_NOP,A_NOP
+            )
+          )
+        );
+
+      var
+        resultreg : tregister;
+        asmop : tasmop;
+      begin
+        { this is an internally used procedure so the parameters have
+          some constraints
+        }
+        if loc.size<>size then
+          internalerror(2013061108);
+        resultreg:=dst;
+        { deshuffle }
+        //!!!
+        if (shuffle<>nil) and not(shufflescalar(shuffle)) then
+          begin
+            internalerror(2013061107);
+          end
+        else if (shuffle=nil) then
+          asmop:=opmm2asmop[1,size,op]
+        else if shufflescalar(shuffle) then
+          begin
+            asmop:=opmm2asmop[0,size,op];
+            { no scalar operation available? }
+            if asmop=A_NOP then
+              begin
+                { do vectorized and shuffle finally }
+                internalerror(2010060102);
+              end;
+          end
+        else
+          internalerror(2013061106);
+        if asmop=A_NOP then
+          internalerror(2013061105);
+        case loc.loc of
+          LOC_CREFERENCE,LOC_REFERENCE:
+            begin
+              { any instruction needed to simplify the reference must go to
+                the same list as the instruction that uses the reference,
+                so pass list here and not the current global list }
+              make_simple_ref(list,loc.reference);
+              list.concat(taicpu.op_ref_reg_reg(asmop,S_NO,loc.reference,src,resultreg));
+            end;
+          LOC_CMMREGISTER,LOC_MMREGISTER:
+            list.concat(taicpu.op_reg_reg_reg(asmop,S_NO,loc.register,src,resultreg));
+          else
+            internalerror(2013061104);
+        end;
+        { shuffle }
+        if resultreg<>dst then
+          begin
+            internalerror(2013061103);
+          end;
+      end;
+
+
+    procedure tcgx86.a_opmm_reg_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src1,src2,dst: tregister;shuffle : pmmshuffle);
+      var
+        srcloc : tlocation;
+      begin
+        { describe src1 as an mm register location so that the location
+          based emitter can do the work }
+        srcloc.size:=size;
+        srcloc.loc:=LOC_MMREGISTER;
+        srcloc.register:=src1;
+        opmm_loc_reg_reg(list,op,size,srcloc,src2,dst,shuffle);
+      end;
+
+
+    procedure tcgx86.a_opmm_ref_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; src,dst: tregister;shuffle : pmmshuffle);
+      var
+        srcloc : tlocation;
+      begin
+        { describe ref as a memory location so that the location based
+          emitter can do the work }
+        srcloc.size:=size;
+        srcloc.loc:=LOC_REFERENCE;
+        srcloc.reference:=ref;
+        opmm_loc_reg_reg(list,op,size,srcloc,src,dst,shuffle);
+      end;
+
+
     procedure tcgx86.opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;dst: tregister; shuffle : pmmshuffle);
       const
         opmm2asmop : array[0..1,OS_F32..OS_F64,topcg] of tasmop = (
@@ -1266,7 +1497,6 @@ unit cgx86;
             )
           )
         );
-
       var
         resultreg : tregister;
         asmop : tasmop;
@@ -1386,11 +1616,14 @@ unit cgx86;
           OP_ADD, OP_AND, OP_OR, OP_SUB, OP_XOR:
             if not(cs_check_overflow in current_settings.localswitches) and
                (a = 1) and
-               (op in [OP_ADD,OP_SUB]) then
-              if op = OP_ADD then
-                list.concat(taicpu.op_reg(A_INC,TCgSize2OpSize[size],reg))
-              else
-                list.concat(taicpu.op_reg(A_DEC,TCgSize2OpSize[size],reg))
+               (op in [OP_ADD,OP_SUB]) and
+               UseIncDec then
+               begin
+                 if op = OP_ADD then
+                   list.concat(taicpu.op_reg(A_INC,TCgSize2OpSize[size],reg))
+                 else
+                   list.concat(taicpu.op_reg(A_DEC,TCgSize2OpSize[size],reg))
+               end
             else if (a = 0) then
               if (op <> OP_AND) then
                 exit
@@ -1412,17 +1645,33 @@ unit cgx86;
               list.concat(taicpu.op_const_reg(TOpCG2AsmOp[op],TCgSize2OpSize[size],aint(a),reg));
           OP_SHL,OP_SHR,OP_SAR,OP_ROL,OP_ROR:
             begin
-{$ifdef x86_64}
+{$if defined(x86_64)}
               if (a and 63) <> 0 Then
                 list.concat(taicpu.op_const_reg(TOpCG2AsmOp[op],TCgSize2OpSize[size],a and 63,reg));
               if (a shr 6) <> 0 Then
                 internalerror(200609073);
-{$else x86_64}
+{$elseif defined(i386)}
               if (a and 31) <> 0 Then
                 list.concat(taicpu.op_const_reg(TOpCG2AsmOp[op],TCgSize2OpSize[size],a and 31,reg));
               if (a shr 5) <> 0 Then
                 internalerror(200609071);
-{$endif x86_64}
+{$elseif defined(i8086)}
+              if (a shr 5) <> 0 Then
+                internalerror(2013043002);
+              a := a and 31;
+              if a <> 0 Then
+                begin
+                  if (current_settings.cputype < cpu_186) and (a <> 1) then
+                    begin
+                      getcpuregister(list,NR_CL);
+                      a_load_const_reg(list,OS_8,a,NR_CL);
+                      list.concat(taicpu.op_reg_reg(TOpCG2AsmOp[op],TCgSize2OpSize[size],NR_CL,reg));
+                      ungetcpuregister(list,NR_CL);
+                    end
+                  else
+                    list.concat(taicpu.op_const_reg(TOpCG2AsmOp[op],TCgSize2OpSize[size],a,reg));
+                end;
+{$endif}
             end
           else internalerror(200609072);
         end;
@@ -1501,11 +1750,14 @@ unit cgx86;
           OP_ADD, OP_AND, OP_OR, OP_SUB, OP_XOR:
             if not(cs_check_overflow in current_settings.localswitches) and
                (a = 1) and
-               (op in [OP_ADD,OP_SUB]) then
-              if op = OP_ADD then
-                list.concat(taicpu.op_ref(A_INC,TCgSize2OpSize[size],tmpref))
-              else
-                list.concat(taicpu.op_ref(A_DEC,TCgSize2OpSize[size],tmpref))
+               (op in [OP_ADD,OP_SUB]) and
+               UseIncDec then
+               begin
+                 if op = OP_ADD then
+                   list.concat(taicpu.op_ref(A_INC,TCgSize2OpSize[size],tmpref))
+                 else
+                   list.concat(taicpu.op_ref(A_DEC,TCgSize2OpSize[size],tmpref))
+               end
             else if (a = 0) then
               if (op <> OP_AND) then
                 exit
@@ -1540,6 +1792,14 @@ unit cgx86;
 
 
     procedure tcgx86.a_op_reg_reg(list : TAsmList; Op: TOpCG; size: TCGSize; src, dst: TRegister);
+      const
+{$if defined(cpu64bitalu) or defined(cpu32bitalu)}
+        REGCX=NR_ECX;
+        REGCX_Size = OS_32;
+{$elseif defined(cpu16bitalu)}
+        REGCX=NR_CX;
+        REGCX_Size = OS_16;
+{$endif}
       var
         dstsize: topsize;
         instr:Taicpu;
@@ -1561,10 +1821,10 @@ unit cgx86;
           OP_SHR,OP_SHL,OP_SAR,OP_ROL,OP_ROR:
             begin
               { Use ecx to load the value, that allows better coalescing }
-              getcpuregister(list,NR_ECX);
-              a_load_reg_reg(list,size,OS_32,src,NR_ECX);
+              getcpuregister(list,REGCX);
+              a_load_reg_reg(list,size,REGCX_Size,src,REGCX);
               list.concat(taicpu.op_reg_reg(Topcg2asmop[op],tcgsize2opsize[size],NR_CL,dst));
-              ungetcpuregister(list,NR_ECX);
+              ungetcpuregister(list,REGCX);
             end;
           else
             begin
@@ -1807,15 +2067,25 @@ unit cgx86;
     procedure Tcgx86.g_concatcopy(list:TAsmList;const source,dest:Treference;len:tcgint);
 
     const
-{$ifdef cpu64bitalu}
+{$if defined(cpu64bitalu)}
         REGCX=NR_RCX;
         REGSI=NR_RSI;
         REGDI=NR_RDI;
-{$else cpu64bitalu}
+        copy_len_sizes = [1, 2, 4, 8];
+        push_segment_size = S_L;
+{$elseif defined(cpu32bitalu)}
         REGCX=NR_ECX;
         REGSI=NR_ESI;
         REGDI=NR_EDI;
-{$endif cpu64bitalu}
+        copy_len_sizes = [1, 2, 4];
+        push_segment_size = S_L;
+{$elseif defined(cpu16bitalu)}
+        REGCX=NR_CX;
+        REGSI=NR_SI;
+        REGDI=NR_DI;
+        copy_len_sizes = [1, 2];
+        push_segment_size = S_W;
+{$endif}
 
     type  copymode=(copy_move,copy_mmx,copy_string);
 
@@ -1839,7 +2109,7 @@ unit cgx86;
         cm:=copy_string;
       if (cs_opt_size in current_settings.optimizerswitches) and
          not((len<=16) and (cm=copy_mmx)) and
-         not(len in [1,2,4{$ifdef x86_64},8{$endif x86_64}]) then
+         not(len in copy_len_sizes) then
         cm:=copy_string;
       if (source.segment<>NR_NO) or
          (dest.segment<>NR_NO) then
@@ -1863,11 +2133,13 @@ unit cgx86;
                     copysize:=2;
                     cgsize:=OS_16;
                   end
+{$if defined(cpu32bitalu) or defined(cpu64bitalu)}
                 else if len<8 then
                   begin
                     copysize:=4;
                     cgsize:=OS_32;
                   end
+{$endif cpu32bitalu or cpu64bitalu}
 {$ifdef cpu64bitalu}
                 else if len<16 then
                   begin
@@ -1929,15 +2201,23 @@ unit cgx86;
           begin
             getcpuregister(list,REGDI);
             if (dest.segment=NR_NO) then
-              a_loadaddr_ref_reg(list,dest,REGDI)
+              begin
+                a_loadaddr_ref_reg(list,dest,REGDI);
+{$ifdef volatile_es}
+                list.concat(taicpu.op_reg(A_PUSH,push_segment_size,NR_DS));
+                list.concat(taicpu.op_reg(A_POP,push_segment_size,NR_ES));
+{$endif volatile_es}
+              end
             else
               begin
                 dstref:=dest;
                 dstref.segment:=NR_NO;
                 a_loadaddr_ref_reg(list,dstref,REGDI);
-                list.concat(taicpu.op_reg(A_PUSH,S_L,NR_ES));
-                list.concat(taicpu.op_reg(A_PUSH,S_L,dest.segment));
-                list.concat(taicpu.op_reg(A_POP,S_L,NR_ES));
+{$ifndef volatile_es}
+                list.concat(taicpu.op_reg(A_PUSH,push_segment_size,NR_ES));
+{$endif not volatile_es}
+                list.concat(taicpu.op_reg(A_PUSH,push_segment_size,dest.segment));
+                list.concat(taicpu.op_reg(A_POP,push_segment_size,NR_ES));
               end;
             getcpuregister(list,REGSI);
             if (source.segment=NR_NO) then
@@ -1953,9 +2233,8 @@ unit cgx86;
               end;
 
             getcpuregister(list,REGCX);
-{$ifdef i386}
-           list.concat(Taicpu.op_none(A_CLD,S_NO));
-{$endif i386}
+            if ts_cld in current_settings.targetswitches then
+              list.concat(Taicpu.op_none(A_CLD,S_NO));
             if (cs_opt_size in current_settings.optimizerswitches) and
                (len>sizeof(aint)+(sizeof(aint) div 2)) then
               begin
@@ -1974,11 +2253,13 @@ unit cgx86;
                   end;
                 if helpsize>0 then
                   begin
-{$ifdef cpu64bitalu}
+{$if defined(cpu64bitalu)}
                     list.concat(Taicpu.op_none(A_MOVSQ,S_NO))
-{$else}
+{$elseif defined(cpu32bitalu)}
                     list.concat(Taicpu.op_none(A_MOVSD,S_NO));
-{$endif cpu64bitalu}
+{$elseif defined(cpu16bitalu)}
+                    list.concat(Taicpu.op_none(A_MOVSW,S_NO));
+{$endif}
                   end;
                 if len>=4 then
                   begin
@@ -1997,9 +2278,11 @@ unit cgx86;
             ungetcpuregister(list,REGSI);
             ungetcpuregister(list,REGDI);
             if (source.segment<>NR_NO) then
-              list.concat(taicpu.op_reg(A_POP,S_L,NR_DS));
+              list.concat(taicpu.op_reg(A_POP,push_segment_size,NR_DS));
+{$ifndef volatile_es}
             if (dest.segment<>NR_NO) then
-              list.concat(taicpu.op_reg(A_POP,S_L,NR_ES));
+              list.concat(taicpu.op_reg(A_POP,push_segment_size,NR_ES));
+{$endif not volatile_es}
           end;
         end;
     end;
@@ -2060,6 +2343,16 @@ unit cgx86;
 
 
     procedure tcgx86.g_stackpointer_alloc(list : TAsmList;localsize : longint);
+
+      procedure decrease_sp(a : tcgint); { appends code to list that lowers the stack pointer by a }
+        var
+          href : treference;
+        begin
+          reference_reset_base(href,NR_STACK_POINTER_REG,-a,0); { href addresses stack pointer - a }
+          { normally, lea is a better choice than a sub to adjust the stack pointer }
+          list.concat(Taicpu.op_ref_reg(A_LEA,TCGSize2OpSize[OS_ADDR],href,NR_STACK_POINTER_REG)); { stack pointer := address of href }
+        end;
+
 {$ifdef x86}
 {$ifndef NOTARGETWIN}
       var
@@ -2080,7 +2373,7 @@ unit cgx86;
              begin
                if localsize div winstackpagesize<=5 then
                  begin
-                    list.concat(Taicpu.Op_const_reg(A_SUB,S_L,localsize-4,NR_ESP));
+                    decrease_sp(localsize-4);
                     for i:=1 to localsize div winstackpagesize do
                       begin
                          reference_reset_base(href,NR_ESP,localsize-i*winstackpagesize,4);
@@ -2091,18 +2384,25 @@ unit cgx86;
                else
                  begin
                     current_asmdata.getjumplabel(again);
-                    getcpuregister(list,NR_EDI);
+                    { Using a_reg_alloc instead of getcpuregister, so this procedure
+                      does not change "used_in_proc" state of EDI and therefore can be
+                      called after saving registers with "push" instruction
+                      without creating an unbalanced "pop edi" in epilogue }
+                    a_reg_alloc(list,NR_EDI);
                     list.concat(Taicpu.op_reg(A_PUSH,S_L,NR_EDI));
                     list.concat(Taicpu.op_const_reg(A_MOV,S_L,localsize div winstackpagesize,NR_EDI));
                     a_label(list,again);
-                    list.concat(Taicpu.op_const_reg(A_SUB,S_L,winstackpagesize-4,NR_ESP));
+                    decrease_sp(winstackpagesize-4);
                     list.concat(Taicpu.op_reg(A_PUSH,S_L,NR_EAX));
-                    list.concat(Taicpu.op_reg(A_DEC,S_L,NR_EDI));
+                    if UseIncDec then
+                      list.concat(Taicpu.op_reg(A_DEC,S_L,NR_EDI))
+                    else
+                      list.concat(Taicpu.op_const_reg(A_SUB,S_L,1,NR_EDI));
                     a_jmp_cond(list,OC_NE,again);
-                    list.concat(Taicpu.op_const_reg(A_SUB,S_L,localsize mod winstackpagesize - 4,NR_ESP));
+                    decrease_sp(localsize mod winstackpagesize-4);
                     reference_reset_base(href,NR_ESP,localsize-4,4);
                     list.concat(Taicpu.op_ref_reg(A_MOV,S_L,href,NR_EDI));
-                    ungetcpuregister(list,NR_EDI);
+                    a_reg_dealloc(list,NR_EDI);
                  end
              end
            else
@@ -2117,7 +2417,7 @@ unit cgx86;
              begin
                if localsize div winstackpagesize<=5 then
                  begin
-                    list.concat(Taicpu.Op_const_reg(A_SUB,S_Q,localsize,NR_RSP));
+                    decrease_sp(localsize);
                     for i:=1 to localsize div winstackpagesize do
                       begin
                          reference_reset_base(href,NR_RSP,localsize-i*winstackpagesize+4,4);
@@ -2132,19 +2432,22 @@ unit cgx86;
                     getcpuregister(list,NR_R10);
                     list.concat(Taicpu.op_const_reg(A_MOV,S_Q,localsize div winstackpagesize,NR_R10));
                     a_label(list,again);
-                    list.concat(Taicpu.op_const_reg(A_SUB,S_Q,winstackpagesize,NR_RSP));
+                    decrease_sp(winstackpagesize);
                     reference_reset_base(href,NR_RSP,0,4);
                     list.concat(Taicpu.op_reg_ref(A_MOV,S_L,NR_EAX,href));
-                    list.concat(Taicpu.op_reg(A_DEC,S_Q,NR_R10));
+                    if UseIncDec then
+                      list.concat(Taicpu.op_reg(A_DEC,S_Q,NR_R10))
+                    else
+                      list.concat(Taicpu.op_const_reg(A_SUB,S_Q,1,NR_R10));
                     a_jmp_cond(list,OC_NE,again);
-                    list.concat(Taicpu.op_const_reg(A_SUB,S_Q,localsize mod winstackpagesize,NR_RSP));
+                    decrease_sp(localsize mod winstackpagesize);
                     ungetcpuregister(list,NR_R10);
                  end
              end
            else
 {$endif NOTARGETWIN}
 {$endif x86_64}
-            list.concat(Taicpu.Op_const_reg(A_SUB,tcgsize2opsize[OS_ADDR],localsize,NR_STACK_POINTER_REG));
+            decrease_sp(localsize);
          end;
       end;
 
@@ -2153,12 +2456,48 @@ unit cgx86;
       var
         stackmisalignment: longint;
         para: tparavarsym;
+        regsize: longint;
+{$ifdef i8086}
+        dgroup: treference;
+{$endif i8086}
+
+      { Emits one PUSH for each entry of saved_standard_registers that the
+        integer register allocator reports as used in this procedure, and
+        leaves the total number of bytes pushed in the enclosing routine's
+        regsize variable. }
+      procedure push_regs;
+        var
+          regidx: longint;
+        begin
+          regsize:=0;
+          for regidx:=low(saved_standard_registers) to high(saved_standard_registers) do
+            begin
+              { registers the procedure never touched need no saving }
+              if not(saved_standard_registers[regidx] in rg[R_INTREGISTER].used_in_proc) then
+                continue;
+              list.concat(Taicpu.Op_reg(A_PUSH,tcgsize2opsize[OS_ADDR],newreg(R_INTREGISTER,saved_standard_registers[regidx],R_SUBWHOLE)));
+              { account for the slot that was just pushed }
+              inc(regsize,sizeof(aint));
+            end;
+        end;
+
       begin
+{$ifdef i8086}
+        { interrupt support for i8086 }
+        if po_interrupt in current_procinfo.procdef.procoptions then
+          begin
+            list.concat(Taicpu.Op_reg(A_PUSH,S_W,NR_AX));
+            list.concat(Taicpu.Op_reg(A_PUSH,S_W,NR_BX));
+            list.concat(Taicpu.Op_reg(A_PUSH,S_W,NR_CX));
+            list.concat(Taicpu.Op_reg(A_PUSH,S_W,NR_DX));
+            list.concat(Taicpu.Op_reg(A_PUSH,S_W,NR_SI));
+            list.concat(Taicpu.Op_reg(A_PUSH,S_W,NR_DI));
+            list.concat(Taicpu.Op_reg(A_PUSH,S_W,NR_DS));
+            list.concat(Taicpu.Op_reg(A_PUSH,S_W,NR_ES));
+            reference_reset(dgroup,0);
+            dgroup.refaddr:=addr_dgroup;
+            list.concat(Taicpu.Op_ref_reg(A_MOV,S_W,dgroup,NR_AX));
+            list.concat(Taicpu.Op_reg_reg(A_MOV,S_W,NR_AX,NR_DS));
+          end;
+{$endif i8086}
 {$ifdef i386}
         { interrupt support for i386 }
         if (po_interrupt in current_procinfo.procdef.procoptions) and
            { this messes up stack alignment }
-           not(target_info.system in [system_i386_darwin,system_i386_iphonesim]) then
+           not(target_info.system in [system_i386_darwin,system_i386_iphonesim,system_i386_android]) then
           begin
             { .... also the segment registers }
             list.concat(Taicpu.Op_reg(A_PUSH,S_W,NR_GS));
@@ -2182,48 +2521,24 @@ unit cgx86;
             stackmisalignment := sizeof(pint);
             list.concat(tai_regalloc.alloc(current_procinfo.framepointer,nil));
             if current_procinfo.framepointer=NR_STACK_POINTER_REG then
-              CGmessage(cg_d_stackframe_omited)
+              begin
+{$ifdef i386}
+                if (not paramanager.use_fixed_stack) then
+                  push_regs;
+{$endif i386}
+                CGmessage(cg_d_stackframe_omited);
+              end
             else
               begin
                 { push <frame_pointer> }
                 inc(stackmisalignment,sizeof(pint));
                 include(rg[R_INTREGISTER].preserved_by_proc,RS_FRAME_POINTER_REG);
                 list.concat(Taicpu.op_reg(A_PUSH,tcgsize2opsize[OS_ADDR],NR_FRAME_POINTER_REG));
-                if (target_info.system=system_x86_64_win64) then
-                  begin
-                    list.concat(cai_seh_directive.create_reg(ash_pushreg,NR_FRAME_POINTER_REG));
-                    include(current_procinfo.flags,pi_has_unwind_info);
-                  end;
                 { Return address and FP are both on stack }
                 current_asmdata.asmcfi.cfa_def_cfa_offset(list,2*sizeof(pint));
                 current_asmdata.asmcfi.cfa_offset(list,NR_FRAME_POINTER_REG,-(2*sizeof(pint)));
-                if current_procinfo.procdef.proctypeoption<>potype_exceptfilter then
-                  list.concat(Taicpu.op_reg_reg(A_MOV,tcgsize2opsize[OS_ADDR],NR_STACK_POINTER_REG,NR_FRAME_POINTER_REG))
-                else
-                  begin
-                    { load framepointer from hidden $parentfp parameter }
-                    para:=tparavarsym(current_procinfo.procdef.paras[0]);
-                    if not (vo_is_parentfp in para.varoptions) then
-                      InternalError(201201142);
-                    if (para.paraloc[calleeside].location^.loc<>LOC_REGISTER) or
-                       (para.paraloc[calleeside].location^.next<>nil) then
-                      InternalError(201201143);
-                    list.concat(Taicpu.op_reg_reg(A_MOV,tcgsize2opsize[OS_ADDR],
-                      para.paraloc[calleeside].location^.register,NR_FRAME_POINTER_REG));
-                    { Need only as much stack space as necessary to do the calls.
-                      Exception filters don't have own local vars, and temps are 'mapped'
-                      to the parent procedure.
-                      maxpushedparasize is already aligned at least on x86_64. }
-                    localsize:=current_procinfo.maxpushedparasize;
-                  end;
+                list.concat(Taicpu.op_reg_reg(A_MOV,tcgsize2opsize[OS_ADDR],NR_STACK_POINTER_REG,NR_FRAME_POINTER_REG));
                 current_asmdata.asmcfi.cfa_def_cfa_register(list,NR_FRAME_POINTER_REG);
-                {
-                  TODO: current framepointer handling is not compatible with Win64 at all:
-                  Win64 expects FP to point to the top or into the middle of local area.
-                  In FPC it points to the bottom, making it impossible to generate
-                  UWOP_SET_FPREG unwind code if local area is > 240 bytes.
-                  So for now pretend we never have a framepointer.
-                }
               end;
 
             { allocate stackframe space }
@@ -2239,17 +2554,65 @@ unit cgx86;
                 if current_procinfo.framepointer=NR_STACK_POINTER_REG then
                   current_asmdata.asmcfi.cfa_def_cfa_offset(list,localsize+sizeof(pint));
                 current_procinfo.final_localsize:=localsize;
-                if (target_info.system=system_x86_64_win64) then
-                  begin
-                    if localsize<>0 then
-                      list.concat(cai_seh_directive.create_offset(ash_stackalloc,localsize));
-                    include(current_procinfo.flags,pi_has_unwind_info);
-                  end;
               end;
+
+{$ifdef i386}
+            if (not paramanager.use_fixed_stack) and
+               (current_procinfo.framepointer<>NR_STACK_POINTER_REG) then
+              begin
+                regsize:=0;
+                push_regs;
+                reference_reset_base(current_procinfo.save_regs_ref,
+                  current_procinfo.framepointer,
+                  -(localsize+regsize),sizeof(aint));
+              end;
+{$endif i386}
           end;
       end;
 
 
+    procedure tcgx86.g_save_registers(list: TAsmList); { x86 front end of the inherited register saving code }
+      begin
+{$ifdef i386}
+        if paramanager.use_fixed_stack then { i386 only: without a fixed stack nothing is emitted here
+          (NOTE(review): presumably the registers are then pushed by push_regs in the entry code - confirm) }
+{$endif i386}
+          inherited g_save_registers(list); { every other x86 target always takes the inherited path }
+      end;
+
+
+    procedure tcgx86.g_restore_registers(list: TAsmList); { x86 front end of the inherited register restoring code }
+      begin
+{$ifdef i386}
+        if paramanager.use_fixed_stack then { i386 only: without a fixed stack nothing is emitted here
+          (NOTE(review): presumably internal_restore_regs is used for the restore in that case - confirm) }
+{$endif i386}
+          inherited g_restore_registers(list); { every other x86 target always takes the inherited path }
+      end;
+
+
+    { Restores the entries of saved_standard_registers that were used in the
+      current procedure, walking the table from its last entry to its first.
+      With use_pop set each register is reloaded with a POP; otherwise it is
+      loaded from consecutive slots that start at
+      current_procinfo.save_regs_ref. }
+    procedure tcgx86.internal_restore_regs(list: TAsmList; use_pop: boolean);
+      var
+        regidx: longint;
+        restorereg: tregister;
+        slotref: treference;
+      begin
+        slotref:=current_procinfo.save_regs_ref;
+        for regidx:=high(saved_standard_registers) downto low(saved_standard_registers) do
+          begin
+            { registers the procedure never touched were not saved either }
+            if not(saved_standard_registers[regidx] in rg[R_INTREGISTER].used_in_proc) then
+              continue;
+            restorereg:=newreg(R_INTREGISTER,saved_standard_registers[regidx],R_SUBWHOLE);
+            { Allocate register so the optimizer does not remove the load }
+            a_reg_alloc(list,restorereg);
+            if not use_pop then
+              begin
+                a_load_ref_reg(list,OS_ADDR,OS_ADDR,slotref,restorereg);
+                { step on to the slot of the next register }
+                inc(slotref.offset,sizeof(aint));
+              end
+            else
+              list.concat(Taicpu.Op_reg(A_POP,tcgsize2opsize[OS_ADDR],restorereg));
+          end;
+      end;
+
+
     { produces if necessary overflowcode }
     procedure tcgx86.g_overflowcheck(list: TAsmList; const l:tlocation;def:tdef);
       var

+ 131 - 35
compiler/x86/cpubase.pas

@@ -1,7 +1,7 @@
 {
     Copyright (c) 1998-2002 by Florian Klaempfl and Peter Vreman
 
-    Contains the base types for the i386 and x86-64 architecture
+    Contains the base types for the i8086, i386 and x86-64 architecture
 
     * This code was inspired by the NASM sources
       The Netwide Assembler is Copyright (c) 1996 Simon Tatham and
@@ -35,7 +35,7 @@ interface
 
 uses
   cutils,cclasses,
-  globtype,
+  globtype,globals,
   cgbase
   ;
 
@@ -45,11 +45,13 @@ uses
 *****************************************************************************}
 
     type
-{$ifdef x86_64}
+{$if defined(x86_64)}
       TAsmOp={$i x8664op.inc}
-{$else x86_64}
+{$elseif defined(i386)}
       TAsmOp={$i i386op.inc}
-{$endif x86_64}
+{$elseif defined(i8086)}
+      TAsmOp={$i i8086op.inc}
+{$endif}
 
       { This should define the array of instructions as string }
         op2strtable=array[tasmop] of string[16];
@@ -91,6 +93,15 @@ uses
       RS_EDI        = RS_RDI;
       RS_EBP        = RS_RBP;
       RS_ESP        = RS_RSP;
+      { create aliases to allow code sharing between i386 and i8086 }
+      RS_AX        = RS_RAX;
+      RS_BX        = RS_RBX;
+      RS_CX        = RS_RCX;
+      RS_DX        = RS_RDX;
+      RS_SI        = RS_RSI;
+      RS_DI        = RS_RDI;
+      RS_BP        = RS_RBP;
+      RS_SP        = RS_RSP;
 
       { Number of first imaginary register }
       first_int_imreg     = $10;
@@ -136,57 +147,71 @@ uses
 {$endif x86_64}
 
       { The subregister that specifies the entire register and an address }
-{$ifdef x86_64}
+{$if defined(x86_64)}
       { Hammer }
       R_SUBWHOLE    = R_SUBQ;
       R_SUBADDR     = R_SUBQ;
-{$else x86_64}
+{$elseif defined(i386)}
       { i386 }
       R_SUBWHOLE    = R_SUBD;
       R_SUBADDR     = R_SUBD;
-{$endif x86_64}
+{$elseif defined(i8086)}
+      { i8086 }
+      R_SUBWHOLE    = R_SUBW;
+      R_SUBADDR     = R_SUBW;
+{$endif}
 
       { Available Registers }
-{$ifdef x86_64}
+{$if defined(x86_64)}
       {$i r8664con.inc}
-{$else x86_64}
+{$elseif defined(i386)}
       {$i r386con.inc}
-{$endif x86_64}
+{$elseif defined(i8086)}
+      {$i r8086con.inc}
+{$endif}
 
     type
       { Number of registers used for indexing in tables }
-{$ifdef x86_64}
+{$if defined(x86_64)}
       tregisterindex=0..{$i r8664nor.inc}-1;
-{$else x86_64}
+{$elseif defined(i386)}
       tregisterindex=0..{$i r386nor.inc}-1;
-{$endif x86_64}
+{$elseif defined(i8086)}
+      tregisterindex=0..{$i r8086nor.inc}-1;
+{$endif}
 
     const
 { TODO: Calculate bsstart}
       regnumber_count_bsstart = 64;
 
       regnumber_table : array[tregisterindex] of tregister = (
-{$ifdef x86_64}
+{$if defined(x86_64)}
         {$i r8664num.inc}
-{$else x86_64}
+{$elseif defined(i386)}
         {$i r386num.inc}
-{$endif x86_64}
+{$elseif defined(i8086)}
+        {$i r8086num.inc}
+{$endif}
       );
 
       regstabs_table : array[tregisterindex] of shortint = (
-{$ifdef x86_64}
+{$if defined(x86_64)}
         {$i r8664stab.inc}
-{$else x86_64}
+{$elseif defined(i386)}
         {$i r386stab.inc}
-{$endif x86_64}
+{$elseif defined(i8086)}
+        {$i r8086stab.inc}
+{$endif}
       );
 
       regdwarf_table : array[tregisterindex] of shortint = (
-{$ifdef x86_64}
+{$if defined(x86_64)}
         {$i r8664dwrf.inc}
-{$else x86_64}
+{$elseif defined(i386)}
         {$i r386dwrf.inc}
-{$endif x86_64}
+{$elseif defined(i8086)}
+        {$i r8086dwrf.inc}
+{$endif}
       );
 
       RS_DEFAULTFLAGS = RS_FLAGS;
@@ -261,14 +286,22 @@ uses
     function inverse_cond(const c: TAsmCond): TAsmCond; {$ifdef USEINLINE}inline;{$endif USEINLINE}
     function conditions_equal(const c1, c2: TAsmCond): boolean; {$ifdef USEINLINE}inline;{$endif USEINLINE}
 
+    { checks whether two segment registers are normally equal in the current memory model }
+    function segment_regs_equal(r1,r2:tregister):boolean;
+
+{$ifdef i8086}
+    { returns the next virtual register }
+    function GetNextReg(const r : TRegister) : TRegister;
+{$endif i8086}
+
 implementation
 
     uses
       rgbase,verbose;
 
     const
-    {$ifdef x86_64}
-      std_regname_table : array[tregisterindex] of string[7] = (
+    {$if defined(x86_64)}
+      std_regname_table : TRegNameTable = (
         {$i r8664std.inc}
       );
 
@@ -278,8 +311,8 @@ implementation
       std_regname_index : array[tregisterindex] of tregisterindex = (
         {$i r8664sri.inc}
       );
-    {$else x86_64}
-      std_regname_table : array[tregisterindex] of string[7] = (
+    {$elseif defined(i386)}
+      std_regname_table : TRegNameTable = (
         {$i r386std.inc}
       );
 
@@ -290,7 +323,19 @@ implementation
       std_regname_index : array[tregisterindex] of tregisterindex = (
         {$i r386sri.inc}
       );
-    {$endif x86_64}
+    {$elseif defined(i8086)}
+      std_regname_table : TRegNameTable = (
+        {$i r8086std.inc}
+      );
+
+      regnumber_index : array[tregisterindex] of tregisterindex = (
+        {$i r8086rni.inc}
+      );
+
+      std_regname_index : array[tregisterindex] of tregisterindex = (
+        {$i r8086sri.inc}
+      );
+    {$endif}
 
 
 {*****************************************************************************
@@ -398,9 +443,9 @@ implementation
       begin
         case o of
           A_CALL,
-{$ifdef i386}
+{$if defined(i386) or defined(i8086)}
           A_JCXZ,
-{$endif i386}
+{$endif defined(i386) or defined(i8086)}
           A_JECXZ,
 {$ifdef x86_64}
           A_JRCXZ,
@@ -458,10 +503,9 @@ implementation
       begin
         { for the name the sub reg doesn't matter }
         hr:=r;
-        case getsubreg(hr) of
-          R_SUBMMS,R_SUBMMD,R_SUBMMWHOLE:
-            setsubreg(hr,R_SUBMMX);
-        end;
+        if (getregtype(hr)=R_MMREGISTER) and
+           (getsubreg(hr)<>R_SUBMMY) then
+          setsubreg(hr,R_SUBMMX);
         result:=findreg_by_number_table(hr,regnumber_index);
       end;
 
@@ -478,7 +522,7 @@ implementation
       begin
         if getregtype(r) in [R_MMREGISTER,R_MMXREGISTER] then
           r:=newreg(getregtype(r),getsupreg(r),R_SUBNONE);
-        p:=findreg_by_number_table(r,regnumber_index);
+        p:=findreg_by_number(r);
         if p<>0 then
           result:=std_regname_table[p]
         else
@@ -512,4 +556,56 @@ implementation
       end;
 
 
+    { Tells whether two segment registers are normally equal in the memory
+      model being compiled for. Internalerror 2013062301 is raised when either
+      argument is not a segment register. }
+    function segment_regs_equal(r1, r2: tregister): boolean;
+      begin
+        if not(is_segment_reg(r1) and is_segment_reg(r2)) then
+          internalerror(2013062301);
+        { a segment register always matches itself }
+        if r1=r2 then
+          begin
+            result:=true;
+            exit;
+          end;
+{$if defined(i8086)}
+        case current_settings.x86memorymodel of
+          mm_tiny:
+            { CS=DS=SS, every other pairing is distinct }
+            result:=((r1=NR_CS) or (r1=NR_DS) or (r1=NR_SS)) and
+                    ((r2=NR_CS) or (r2=NR_DS) or (r2=NR_SS));
+          mm_small,mm_medium:
+            { DS=SS, every other pairing is distinct }
+            result:=((r1=NR_DS) or (r1=NR_SS)) and
+                    ((r2=NR_DS) or (r2=NR_SS));
+          mm_compact,mm_large,mm_huge:
+            internalerror(2013062303);
+          else
+            internalerror(2013062302);
+        end;
+{$elseif defined(i386) or defined(x86_64)}
+        { DS=SS=ES, every other pairing is distinct }
+        result:=((r1=NR_DS) or (r1=NR_SS) or (r1=NR_ES)) and
+                ((r2=NR_DS) or (r2=NR_SS) or (r2=NR_ES));
+{$endif}
+      end;
+
+
+{$ifdef i8086}
+    function GetNextReg(const r: TRegister): TRegister;
+      begin { returns the register whose encoded value is one above that of r }
+        if getsupreg(r)<first_int_imreg then { super registers below the first imaginary one are rejected }
+          internalerror(2013051401);
+        result:=TRegister(longint(r)+1); { plain +1 on the numeric register value }
+      end;
+{$endif i8086}
+
+
 end.

+ 22 - 9
compiler/x86/itcpugas.pas

@@ -32,15 +32,17 @@ interface
       TAttSuffix = (AttSufNONE,AttSufINT,AttSufFPU,AttSufFPUint,AttSufINTdual,AttSufMM);
 
     const
-{$ifdef x86_64}
-      {x86att.inc contains the name for each x86-64 mnemonic}
+      { include mnemonic strings }
+{$if defined(x86_64)}
       gas_op2str:op2strtable={$i x8664att.inc}
       gas_needsuffix:array[tasmop] of TAttSuffix={$i x8664ats.inc}
-{$else x86_64}
-      {x86att.inc contains the name for each i386 mnemonic}
+{$elseif defined(i386)}
       gas_op2str:op2strtable={$i i386att.inc}
       gas_needsuffix:array[tasmop] of TAttSuffix={$i i386atts.inc}
-{$endif x86_64}
+{$elseif defined(i8086)}
+      gas_op2str:op2strtable={$i i8086att.inc}
+      gas_needsuffix:array[tasmop] of TAttSuffix={$i i8086atts.inc}
+{$endif}
 
 {$ifdef x86_64}
      gas_opsize2str : array[topsize] of string[2] = ('',
@@ -78,7 +80,7 @@ interface
        '','','',
        't',
         'x',
-        'y'   
+        'y'
      );
      { suffix-to-opsize conversion tables, used in asmreadrer }
      att_sizesuffixstr : array[0..11] of string[2] = (
@@ -106,7 +108,7 @@ implementation
       cutils,verbose;
 
     const
-    {$ifdef x86_64}
+    {$if defined(x86_64)}
       att_regname_table : array[tregisterindex] of string[7] = (
         {r8664att.inc contains the AT&T name of each register.}
         {$i r8664att.inc}
@@ -117,7 +119,7 @@ implementation
          ATT name.}
         {$i r8664ari.inc}
       );
-    {$else x86_64}
+    {$elseif defined(i386)}
       att_regname_table : array[tregisterindex] of string[7] = (
         {r386att.inc contains the AT&T name of each register.}
         {$i r386att.inc}
@@ -128,7 +130,18 @@ implementation
          ATT name.}
         {$i r386ari.inc}
       );
-    {$endif x86_64}
+    {$elseif defined(i8086)}
+      att_regname_table : array[tregisterindex] of string[7] = (
+        {r8086att.inc contains the AT&T name of each register.}
+        {$i r8086att.inc}
+      );
+
+      att_regname_index : array[tregisterindex] of tregisterindex = (
+        {r8086ari.inc contains an index which sorts att_regname_table by
+         ATT name.}
+        {$i r8086ari.inc}
+      );
+    {$endif}
 
     function findreg_by_attname(const s:string):byte;
       var

+ 21 - 18
compiler/x86/itx86int.pas

@@ -39,7 +39,7 @@ implementation
       cpubase;
 
     const
-    {$ifdef x86_64}
+    {$if defined(x86_64)}
       int_regname_table : array[tregisterindex] of string[7] = (
         {$i r8664int.inc}
       );
@@ -47,7 +47,7 @@ implementation
       int_regname_index : array[tregisterindex] of tregisterindex = (
         {$i r8664iri.inc}
       );
-    {$else x86_64}
+    {$elseif defined(i386)}
       int_regname_table : array[tregisterindex] of string[7] = (
         {$i r386int.inc}
       );
@@ -55,29 +55,32 @@ implementation
       int_regname_index : array[tregisterindex] of tregisterindex = (
         {$i r386iri.inc}
       );
-    {$endif x86_64}
+    {$elseif defined(i8086)}
+      int_regname_table : array[tregisterindex] of string[7] = (
+        {$i r8086int.inc}
+      );
+
+      int_regname_index : array[tregisterindex] of tregisterindex = (
+        {$i r8086iri.inc}
+      );
+    {$endif}
 
 
     function findreg_by_intname(const s:string):integer;
       var
-        i,p : integer;
-        s1: string;
-
         l,r,m: integer;
       begin
         {Binary search.}
-        p:=0;
-        i := (high(tregisterindex) + 1) shr 1;
-           l := 0;
-           r := high(tregisterindex) + 1;
-           while l < r do
-           begin
-              m := (l + r) div 2;
-              if int_regname_table[int_regname_index[m]] < s then l := m + 1
-              else r := m;
-           end;
-
-        if int_regname_table[int_regname_index[r]]=s then
+        l := 0;
+        r := high(tregisterindex) + 1;
+        while l < r do
+          begin
+            m := (l + r) div 2;
+            if int_regname_table[int_regname_index[m]] < s then l := m + 1
+            else r := m;
+          end;
+
+        if (r<=high(tregisterindex)) and (int_regname_table[int_regname_index[r]]=s) then
           findreg_by_intname:=int_regname_index[r]
         else
           findreg_by_intname:=0;

+ 283 - 64
compiler/x86/nx86add.pas

@@ -41,10 +41,15 @@ unit nx86add;
         procedure emit_generic_code(op:TAsmOp;opsize:TCgSize;unsigned,extra_not,mboverflow:boolean);
 
         procedure second_cmpfloatsse;
+        procedure second_cmpfloatavx;
+
         procedure second_addfloatsse;
+        procedure second_addfloatavx;
       public
         procedure second_addfloat;override;
+{$ifndef i8086}
         procedure second_addsmallset;override;
+{$endif not i8086}
         procedure second_add64bit;override;
         procedure second_cmpfloat;override;
         procedure second_cmpsmallset;override;
@@ -112,7 +117,7 @@ unit nx86add;
               if extra_not then
                 cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_NOT,opsize,left.location.register,left.location.register);
               r:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
-              hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,hlcg.tcgsize2orddef(opsize),right.location,r);
+              hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,cgsize_orddef(opsize),right.location,r);
               emit_reg_reg(op,TCGSize2Opsize[opsize],left.location.register,r);
               cg.a_load_reg_reg(current_asmdata.CurrAsmList,opsize,opsize,r,left.location.register);
             end
@@ -138,7 +143,8 @@ unit nx86add;
                  if (op=A_SUB) and
                     (right.location.loc=LOC_CONSTANT) and
                     (right.location.value=1) and
-                    not(cs_check_overflow in current_settings.localswitches) then
+                    not(cs_check_overflow in current_settings.localswitches) and
+                    UseIncDec then
                   begin
                     emit_reg(A_DEC,TCGSize2Opsize[opsize],left.location.register);
                   end
@@ -155,7 +161,7 @@ unit nx86add;
                    if extra_not then
                      begin
                         r:=cg.getintregister(current_asmdata.CurrAsmList,opsize);
-                        hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,hlcg.tcgsize2orddef(opsize),right.location,r);
+                        hlcg.a_load_loc_reg(current_asmdata.CurrAsmList,right.resultdef,cgsize_orddef(opsize),right.location,r);
                         emit_reg(A_NOT,TCGSize2Opsize[opsize],r);
                         emit_reg_reg(A_AND,TCGSize2Opsize[opsize],r,left.location.register);
                      end
@@ -332,6 +338,7 @@ unit nx86add;
                                 AddSmallSet
 *****************************************************************************}
 
+{$ifndef i8086}
     procedure tx86addnode.second_addsmallset;
       var
         setbase : aint;
@@ -437,8 +444,9 @@ unit nx86add;
 
         { fix the changed opsize we did above because of the missing btsb }
         if opsize<>int_cgsize(resultdef.size) then
-          hlcg.location_force_reg(current_asmdata.CurrAsmList,location,opdef,hlcg.tcgsize2orddef(int_cgsize(resultdef.size)),false);
+          hlcg.location_force_reg(current_asmdata.CurrAsmList,location,opdef,cgsize_orddef(int_cgsize(resultdef.size)),false);
       end;
+{$endif not i8086}
 
 
     procedure tx86addnode.second_cmpsmallset;
@@ -722,8 +730,8 @@ unit nx86add;
             if nf_swapped in flags then
               swapleftright;
 
-            location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
-            location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,true);
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
             location:=left.location;
             if is_double(resultdef) then
               begin
@@ -762,7 +770,8 @@ unit nx86add;
         { we can use only right as left operand if the operation is commutative }
         else if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
           begin
-            location.register:=right.location.register;
+            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+            cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.location.size,location.size,right.location.register,location.register,mms_movescalar);
             { force floating point reg. location to be written to memory,
               we don't force it to mm register because writing to memory
               allows probably shorter code because there is no direct fpu->mm register
@@ -774,11 +783,20 @@ unit nx86add;
           end
         else
           begin
-            if (nf_swapped in flags) then
+            if nf_swapped in flags then
               swapleftright;
 
-            location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
-            location.register:=left.location.register;
+            { force floating point reg. location to be written to memory,
+              we don't force it to mm register because writing to memory
+              allows probably shorter code because there is no direct fpu->mm register
+              copy instruction
+            }
+            if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
+
+            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+            cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
+
             { force floating point reg. location to be written to memory,
               we don't force it to mm register because writing to memory
               allows probably shorter code because there is no direct fpu->mm register
@@ -786,11 +804,170 @@ unit nx86add;
             }
             if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
               hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
+
             cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,op,location.size,right.location,location.register,mms_movescalar);
           end;
       end;
 
 
+    procedure tx86addnode.second_addfloatavx;
+      var
+        op : topcg;
+        sqr_sum : boolean;
+        tmp : tnode;
+      begin
+        sqr_sum:=false;
+{$ifdef dummy}
+        if (current_settings.fputype>=fpu_sse3) and
+           use_vectorfpu(resultdef) and
+           (nodetype in [addn,subn]) and
+          (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
+          (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
+          begin
+            sqr_sum:=true;
+            tmp:=tinlinenode(left).left;
+            tinlinenode(left).left:=nil;
+            left.free;
+            left:=tmp;
+
+            tmp:=tinlinenode(right).left;
+            tinlinenode(right).left:=nil;
+            right.free;
+            right:=tmp;
+          end;
+{$endif dummy}
+
+        pass_left_right;
+        check_left_and_right_fpureg(false);
+
+        if (nf_swapped in flags) then
+          { can't use swapleftright if both are on the fpu stack, since then }
+          { both are "R_ST" -> nothing would change -> manually switch       }
+          if (left.location.loc = LOC_FPUREGISTER) and
+             (right.location.loc = LOC_FPUREGISTER) then
+            emit_none(A_FXCH,S_NO)
+          else
+            swapleftright;
+
+        case nodetype of
+          addn :
+            op:=OP_ADD;
+          muln :
+            op:=OP_MUL;
+          subn :
+            op:=OP_SUB;
+          slashn :
+            op:=OP_DIV;
+          else
+            internalerror(200312231);
+        end;
+
+        location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+
+        if sqr_sum then
+          begin
+            if nf_swapped in flags then
+              swapleftright;
+
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
+            location:=left.location;
+            if is_double(resultdef) then
+              begin
+                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
+                case nodetype of
+                  addn:
+                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register));
+                  subn:
+                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register));
+                  else
+                    internalerror(201108162);
+                end;
+              end
+            else
+              begin
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
+                { ensure that bits 64..127 contain valid values }
+                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
+                { the data is now in bits 0..32 and 64..95 }
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
+                case nodetype of
+                  addn:
+                    begin
+                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register));
+                    end;
+                  subn:
+                    begin
+                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register));
+                    end;
+                  else
+                    internalerror(201108163);
+                end;
+              end
+          end
+        { left*2 ? }
+        else if (nodetype=muln) and is_constrealnode(right) and is_number_float(trealconstnode(right).value_real) and (trealconstnode(right).value_real=2) then
+          begin
+            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+            cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_ADD,location.size,
+              left.location.register,
+              left.location.register,
+              location.register,
+              mms_movescalar);
+          end
+        { right*2 ? }
+        else if (nodetype=muln) and is_constrealnode(left) and is_number_float(trealconstnode(left).value_real) and (trealconstnode(left).value_real=2) then
+          begin
+            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,right.location.size);
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
+            cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_ADD,location.size,
+              right.location.register,
+              right.location.register,
+              location.register,
+              mms_movescalar);
+          end
+        { we can use only right as left operand if the operation is commutative }
+        else if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
+          begin
+            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
+            { force floating point reg. location to be written to memory,
+              we don't force it to mm register because writing to memory
+              allows probably shorter code because there is no direct fpu->mm register
+              copy instruction
+            }
+            if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
+            cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,location.size,
+              left.location,
+              right.location.register,
+              location.register,
+              mms_movescalar);
+          end
+        else
+          begin
+            if (nf_swapped in flags) then
+              swapleftright;
+
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
+            { force floating point reg. location to be written to memory,
+              we don't force it to mm register because writing to memory
+              allows probably shorter code because there is no direct fpu->mm register
+              copy instruction
+            }
+            if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
+            cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,location.size,
+              right.location,
+              left.location.register,
+              location.register,
+              mms_movescalar);
+          end;
+      end;
+
+
     procedure tx86addnode.second_cmpfloatsse;
       var
         op : tasmop;
@@ -832,7 +1009,72 @@ unit nx86add;
           end
         else
           begin
-            location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
+            { force floating point reg. location to be written to memory,
+              we don't force it to mm register because writing to memory
+              allows probably shorter code because there is no direct fpu->mm register
+              copy instruction
+            }
+            if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
+            case right.location.loc of
+              LOC_REFERENCE,LOC_CREFERENCE:
+                begin
+                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
+                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,S_NO,right.location.reference,left.location.register));
+                end;
+              LOC_MMREGISTER,LOC_CMMREGISTER:
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,right.location.register,left.location.register));
+              else
+                internalerror(200402223);
+            end;
+          end;
+        location.resflags:=getresflags(true);
+      end;
+
+
+    procedure tx86addnode.second_cmpfloatavx;
+      var
+        op : tasmop;
+      begin
+        if is_single(left.resultdef) then
+          op:=A_VCOMISS
+        else if is_double(left.resultdef) then
+          op:=A_VCOMISD
+        else
+          internalerror(200402222);
+        pass_left_right;
+
+        location_reset(location,LOC_FLAGS,def_cgsize(resultdef));
+        { we can use only right as left operand if the operation is commutative }
+        if (right.location.loc=LOC_MMREGISTER) then
+          begin
+            { force floating point reg. location to be written to memory,
+              we don't force it to mm register because writing to memory
+              allows probably shorter code because there is no direct fpu->mm register
+              copy instruction
+            }
+            if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
+            case left.location.loc of
+              LOC_REFERENCE,LOC_CREFERENCE:
+                begin
+                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
+                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,S_NO,left.location.reference,right.location.register));
+                end;
+              LOC_MMREGISTER,LOC_CMMREGISTER:
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,left.location.register,right.location.register));
+              else
+                internalerror(200402221);
+            end;
+            if nf_swapped in flags then
+              exclude(flags,nf_swapped)
+            else
+              include(flags,nf_swapped)
+          end
+        else
+          begin
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
             { force floating point reg. location to be written to memory,
               we don't force it to mm register because writing to memory
               allows probably shorter code because there is no direct fpu->mm register
@@ -908,7 +1150,10 @@ unit nx86add;
       begin
         if use_vectorfpu(resultdef) then
           begin
-            second_addfloatsse;
+            if UseAVX then
+              second_addfloatavx
+            else
+              second_addfloatsse;
             exit;
           end;
 
@@ -947,12 +1192,17 @@ unit nx86add;
 
 
     procedure tx86addnode.second_cmpfloat;
+{$ifdef i8086}
       var
-        resflags   : tresflags;
+        tmpref: treference;
+{$endif i8086}
       begin
         if use_vectorfpu(left.resultdef) or use_vectorfpu(right.resultdef) then
           begin
-            second_cmpfloatsse;
+            if UseAVX then
+              second_cmpfloatavx
+            else
+              second_cmpfloatsse;
             exit;
           end;
 
@@ -967,32 +1217,25 @@ unit nx86add;
             tcgx86(cg).dec_fpu_stack;
 
             { load fpu flags }
-            cg.getcpuregister(current_asmdata.CurrAsmList,NR_AX);
-            emit_reg(A_FSTSW,S_NO,NR_AX);
-            emit_none(A_SAHF,S_NO);
-            cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_AX);
-            if nf_swapped in flags then
-             begin
-               case nodetype of
-                   equaln : resflags:=F_E;
-                 unequaln : resflags:=F_NE;
-                      ltn : resflags:=F_A;
-                     lten : resflags:=F_AE;
-                      gtn : resflags:=F_B;
-                     gten : resflags:=F_BE;
-               end;
-             end
+{$ifdef i8086}
+            if current_settings.cputype < cpu_286 then
+              begin
+                tg.gettemp(current_asmdata.CurrAsmList,2,2,tt_normal,tmpref);
+                emit_ref(A_FNSTSW,S_NO,tmpref);
+                cg.getcpuregister(current_asmdata.CurrAsmList,NR_AX);
+                emit_ref_reg(A_MOV,S_W,tmpref,NR_AX);
+                emit_none(A_SAHF,S_NO);
+                cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_AX);
+                tg.ungettemp(current_asmdata.CurrAsmList,tmpref);
+              end
             else
-             begin
-               case nodetype of
-                   equaln : resflags:=F_E;
-                 unequaln : resflags:=F_NE;
-                      ltn : resflags:=F_B;
-                     lten : resflags:=F_BE;
-                      gtn : resflags:=F_A;
-                     gten : resflags:=F_AE;
-               end;
-             end;
+{$endif i8086}
+              begin
+                cg.getcpuregister(current_asmdata.CurrAsmList,NR_AX);
+                emit_reg(A_FNSTSW,S_NO,NR_AX);
+                emit_none(A_SAHF,S_NO);
+                cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_AX);
+              end;
           end
         else
 {$endif x86_64}
@@ -1002,34 +1245,10 @@ unit nx86add;
             current_asmdata.CurrAsmList.concat(taicpu.op_reg(A_FSTP,S_NO,NR_ST0));
             tcgx86(cg).dec_fpu_stack;
             tcgx86(cg).dec_fpu_stack;
-
-            { load fpu flags }
-            if nf_swapped in flags then
-             begin
-               case nodetype of
-                   equaln : resflags:=F_E;
-                 unequaln : resflags:=F_NE;
-                      ltn : resflags:=F_A;
-                     lten : resflags:=F_AE;
-                      gtn : resflags:=F_B;
-                     gten : resflags:=F_BE;
-               end;
-             end
-            else
-             begin
-               case nodetype of
-                   equaln : resflags:=F_E;
-                 unequaln : resflags:=F_NE;
-                      ltn : resflags:=F_B;
-                     lten : resflags:=F_BE;
-                      gtn : resflags:=F_A;
-                     gten : resflags:=F_AE;
-               end;
-             end;
           end;
 
         location_reset(location,LOC_FLAGS,OS_NO);
-        location.resflags:=resflags;
+        location.resflags:=getresflags(true);
       end;
 
 

+ 16 - 1
compiler/x86/nx86cal.pas

@@ -28,6 +28,7 @@ interface
 { $define AnsiStrRef}
 
     uses
+      symdef,
       ncgcal;
 
     type
@@ -37,6 +38,7 @@ interface
        tx86callnode = class(tcgcallnode)
         protected
          procedure do_release_unused_return_value;override;
+         procedure set_result_location(realresdef: tstoreddef);override;
        end;
 
 
@@ -44,7 +46,7 @@ implementation
 
     uses
       cgobj,
-      cgbase,cpubase,cgx86,cga;
+      cgbase,cgutils,cpubase,cgx86,cga;
 
 
 {*****************************************************************************
@@ -66,4 +68,17 @@ implementation
       end;
 
 
+  procedure tx86callnode.set_result_location(realresdef: tstoreddef);
+    begin
+      if (retloc.location^.loc=LOC_FPUREGISTER) then
+        begin
+          tcgx86(cg).inc_fpu_stack;
+          location_reset(location,LOC_FPUREGISTER,retloc.location^.size);
+          location.register:=retloc.location^.register;
+        end
+      else
+        inherited set_result_location(realresdef);
+    end;
+
+
 end.

+ 105 - 26
compiler/x86/nx86cnv.pas

@@ -63,7 +63,7 @@ implementation
       symconst,symdef,
       cgbase,cga,procinfo,pass_1,pass_2,
       ncon,ncal,ncnv,
-      cpubase,
+      cpubase,cpuinfo,
       cgutils,cgobj,hlcgobj,cgx86,ncgutil,
       tgobj;
 
@@ -89,6 +89,7 @@ implementation
         hreg2,
         hregister : tregister;
         href      : treference;
+        i         : integer;
 {$endif not cpu64bitalu}
         resflags  : tresflags;
         hlabel,oldTrueLabel,oldFalseLabel : tasmlabel;
@@ -130,13 +131,16 @@ implementation
             LOC_REFERENCE :
               begin
 {$ifndef cpu64bitalu}
-                if left.location.size in [OS_64,OS_S64] then
+                if left.location.size in [OS_64,OS_S64{$ifdef cpu16bitalu},OS_32,OS_S32{$endif}] then
                  begin
                    hregister:=cg.getintregister(current_asmdata.CurrAsmList,OS_INT);
-                   cg.a_load_ref_reg(current_asmdata.CurrAsmList,OS_32,OS_32,left.location.reference,hregister);
+                   cg.a_load_ref_reg(current_asmdata.CurrAsmList,OS_INT,OS_INT,left.location.reference,hregister);
                    href:=left.location.reference;
-                   inc(href.offset,4);
-                   cg.a_op_ref_reg(current_asmdata.CurrAsmList,OP_OR,OS_32,href,hregister);
+                   for i:=2 to tcgsize2size[left.location.size] div tcgsize2size[OS_INT] do
+                     begin
+                       inc(href.offset,tcgsize2size[OS_INT]);
+                       cg.a_op_ref_reg(current_asmdata.CurrAsmList,OP_OR,OS_INT,href,hregister);
+                     end;
                  end
                 else
 {$endif not cpu64bitalu}
@@ -151,7 +155,7 @@ implementation
               end;
             LOC_REGISTER,LOC_CREGISTER :
               begin
-{$ifndef cpu64bitalu}
+{$if defined(cpu32bitalu)}
                 if left.location.size in [OS_64,OS_S64] then
                  begin
                    hregister:=cg.getintregister(current_asmdata.CurrAsmList,OS_32);
@@ -159,7 +163,20 @@ implementation
                    cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_OR,OS_32,left.location.register64.reghi,hregister);
                  end
                 else
-{$endif not cpu64bitalu}
+{$elseif defined(cpu16bitalu)}
+                if left.location.size in [OS_64,OS_S64] then
+                 begin
+                   hregister:=cg.getintregister(current_asmdata.CurrAsmList,OS_16);
+                   cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_16,OS_16,left.location.register64.reglo,hregister);
+                   cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_OR,OS_16,GetNextReg(left.location.register64.reglo),hregister);
+                   cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_OR,OS_16,left.location.register64.reghi,hregister);
+                   cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_OR,OS_16,GetNextReg(left.location.register64.reghi),hregister);
+                 end
+                else
+                  if left.location.size in [OS_32,OS_S32] then
+                    cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_OR,OS_16,left.location.register,GetNextReg(left.location.register))
+                else
+{$endif}
                   cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_OR,left.location.size,left.location.register,left.location.register);
               end;
             LOC_JUMP :
@@ -241,7 +258,13 @@ implementation
          op: tasmop;
          opsize: topsize;
          signtested : boolean;
+         use_bt: boolean;  { true = use BT (386+), false = use TEST (286-) }
       begin
+{$ifdef i8086}
+        use_bt:=current_settings.cputype>=cpu_386;
+{$else i8086}
+        use_bt:=true;
+{$endif i8086}
         if not(left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_REFERENCE,LOC_CREFERENCE]) then
           hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,left.resultdef,false);
         if use_vectorfpu(resultdef) and
@@ -253,14 +276,25 @@ implementation
           begin
             location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
-            case location.size of
-              OS_F32:
-                op:=A_CVTSI2SS;
-              OS_F64:
-                op:=A_CVTSI2SD;
-              else
-                internalerror(2007120902);
-            end;
+            if UseAVX then
+              case location.size of
+                OS_F32:
+                  op:=A_VCVTSI2SS;
+                OS_F64:
+                  op:=A_VCVTSI2SD;
+                else
+                  internalerror(2007120902);
+              end
+            else
+              case location.size of
+                OS_F32:
+                  op:=A_CVTSI2SS;
+                OS_F64:
+                  op:=A_CVTSI2SD;
+                else
+                  internalerror(2007120902);
+              end;
+
             { don't use left.location.size, because that one may be OS_32/OS_64
               if the lower bound of the orddef >= 0
             }
@@ -278,11 +312,19 @@ implementation
                 begin
                   href:=left.location.reference;
                   tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,href);
-                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,opsize,href,location.register));
+                  if UseAVX then
+                    { VCVTSI2.. requires a second source operand to copy bits 64..127 }
+                    current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg_reg(op,opsize,href,location.register,location.register))
+                  else
+                    current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,opsize,href,location.register));
                 end;
               LOC_REGISTER,
               LOC_CREGISTER:
-                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,opsize,left.location.register,location.register));
+                if UseAVX then
+                    { VCVTSI2.. requires a second source operand to copy bits 64..127 }
+                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,opsize,left.location.register,location.register,location.register))
+                else
+                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,opsize,left.location.register,location.register));
             end;
           end
         else
@@ -290,11 +332,24 @@ implementation
             location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
             if (left.location.loc=LOC_REGISTER) and (torddef(left.resultdef).ordtype=u64bit) then
               begin
-    {$ifdef cpu64bitalu}
-                emit_const_reg(A_BT,S_Q,63,left.location.register);
-    {$else cpu64bitalu}
-                emit_const_reg(A_BT,S_L,31,left.location.register64.reghi);
-    {$endif cpu64bitalu}
+                if use_bt then
+                  begin
+    {$if defined(cpu64bitalu)}
+                    emit_const_reg(A_BT,S_Q,63,left.location.register);
+    {$elseif defined(cpu32bitalu)}
+                    emit_const_reg(A_BT,S_L,31,left.location.register64.reghi);
+    {$elseif defined(cpu16bitalu)}
+                    emit_const_reg(A_BT,S_W,15,GetNextReg(left.location.register64.reghi));
+    {$endif}
+                  end
+                else
+                  begin
+    {$ifdef i8086}
+                    emit_const_reg(A_TEST,S_W,aint($8000),GetNextReg(left.location.register64.reghi));
+    {$else i8086}
+                    internalerror(2013052510);
+    {$endif i8086}
+                  end;
                 signtested:=true;
               end
             else
@@ -341,13 +396,37 @@ implementation
     
                    if not(signtested) then
                      begin
-                       inc(leftref.offset,4);
-                       emit_const_ref(A_BT,S_L,31,leftref);
-                       dec(leftref.offset,4);
+                       if use_bt then
+                         begin
+           {$if defined(cpu64bitalu) or defined(cpu32bitalu)}
+                           inc(leftref.offset,4);
+                           emit_const_ref(A_BT,S_L,31,leftref);
+                           dec(leftref.offset,4);
+           {$elseif defined(cpu16bitalu)}
+                           inc(leftref.offset,6);
+                           emit_const_ref(A_BT,S_W,15,leftref);
+                           dec(leftref.offset,6);
+           {$endif}
+                         end
+                       else
+                         begin
+           {$ifdef i8086}
+                           { reading a byte, instead of word is faster on a true }
+                           { 8088, because of the 8-bit data bus }
+                           inc(leftref.offset,7);
+                           emit_const_ref(A_TEST,S_B,aint($80),leftref);
+                           dec(leftref.offset,7);
+           {$else i8086}
+                           internalerror(2013052511);
+           {$endif i8086}
+                         end;
                      end;
     
                    current_asmdata.CurrAsmList.concat(taicpu.op_ref(A_FILD,S_IQ,leftref));
-                   cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NC,l2);
+                   if use_bt then
+                     cg.a_jmp_flags(current_asmdata.CurrAsmList,F_NC,l2)
+                   else
+                     cg.a_jmp_flags(current_asmdata.CurrAsmList,F_E,l2);
                    new_section(current_asmdata.asmlists[al_typedconsts],sec_rodata_norel,l1.name,const_align(sizeof(pint)));
                    current_asmdata.asmlists[al_typedconsts].concat(Tai_label.Create(l1));
                    { I got this constant from a test program (FK) }

+ 3 - 3
compiler/x86/nx86con.pas

@@ -1,5 +1,5 @@
 {
-    Copyright (c) 1998-2002 by Florian Klaempfl
+    Copyright (c) 1998-2012 by Florian Klaempfl and others
 
     Generate i386 assembler for constants
 
@@ -44,13 +44,13 @@ implementation
       cga,cgx86,cgobj,cgbase,cgutils;
 
 {*****************************************************************************
-                           TI386REALCONSTNODE
+                           TX86REALCONSTNODE
 *****************************************************************************}
 
     function tx86realconstnode.pass_1 : tnode;
       begin
          result:=nil;
-         if is_number_float(value_real) and not(use_vectorfpu(resultdef)) and (value_real=1.0) or (value_real=0.0) then
+         if is_number_float(value_real) and not(use_vectorfpu(resultdef)) and ((value_real=1.0) or (value_real=0.0)) then
            expectloc:=LOC_FPUREGISTER
          else
            expectloc:=LOC_CREFERENCE;

+ 140 - 54
compiler/x86/nx86inl.pas

@@ -60,10 +60,12 @@ interface
 
           procedure second_prefetch;override;
 
+{$ifndef i8086}
           procedure second_abs_long;override;
+{$endif not i8086}
           procedure second_popcnt;override;
        private
-          procedure load_fpu_location;
+          procedure load_fpu_location(lnode: tnode);
        end;
 
 implementation
@@ -129,12 +131,28 @@ implementation
 
     function tx86inlinenode.first_cos_real : tnode;
       begin
+{$ifdef i8086}
+        { FCOS is 387+: when the selected cpu type is below the 386, hand
+          the node to the inherited implementation and stop here }
+        if current_settings.cputype < cpu_386 then
+          begin
+            result := inherited;
+            exit;
+          end;
+{$endif i8086}
         expectloc:=LOC_FPUREGISTER;  { the cosine is expected in an fpu register }
         first_cos_real := nil;
       end;
 
     function tx86inlinenode.first_sin_real : tnode;
       begin
+{$ifdef i8086}
+        { FSIN is 387+: when the selected cpu type is below the 386, hand
+          the node to the inherited implementation and stop here }
+        if current_settings.cputype < cpu_386 then
+          begin
+            result := inherited;
+            exit;
+          end;
+{$endif i8086}
         expectloc:=LOC_FPUREGISTER;  { the sine is expected in an fpu register }
         first_sin_real := nil;
       end;
@@ -196,28 +214,28 @@ implementation
        end;
 
      { load the value of a node into an fpu register }
-     procedure tx86inlinenode.load_fpu_location;
+     procedure tx86inlinenode.load_fpu_location(lnode: tnode);
        begin
          location_reset(location,LOC_FPUREGISTER,def_cgsize(resultdef));
          location.register:=NR_FPU_RESULT_REG;
-         secondpass(left);
-         case left.location.loc of
+         secondpass(lnode);
+         case lnode.location.loc of
            LOC_FPUREGISTER:
              ;
            LOC_CFPUREGISTER:
              begin
-               cg.a_loadfpu_reg_reg(current_asmdata.CurrAsmList,left.location.size,
-                 left.location.size,left.location.register,location.register);
+               cg.a_loadfpu_reg_reg(current_asmdata.CurrAsmList,lnode.location.size,
+                 lnode.location.size,lnode.location.register,location.register);
              end;
            LOC_REFERENCE,LOC_CREFERENCE:
              begin
                cg.a_loadfpu_ref_reg(current_asmdata.CurrAsmList,
-                  left.location.size,left.location.size,
-                  left.location.reference,location.register);
+                  lnode.location.size,lnode.location.size,
+                  lnode.location.reference,location.register);
              end;
            LOC_MMREGISTER,LOC_CMMREGISTER:
              begin
-               location:=left.location;
+               location:=lnode.location;
                location_force_fpureg(current_asmdata.CurrAsmList,location,false);
              end;
            else
@@ -228,7 +246,7 @@ implementation
 
     procedure tx86inlinenode.second_arctan_real;
       begin
-         load_fpu_location;
+         load_fpu_location(left);
          emit_none(A_FLD1,S_NO);  { push 1.0; presumably the operand is then in ST(1) - TODO confirm }
          emit_none(A_FPATAN,S_NO);  { FPATAN yields arctan(ST(1)/ST(0)), i.e. arctan(operand/1.0) }
       end;
@@ -241,7 +259,7 @@ implementation
          if use_vectorfpu(resultdef) then
            begin
              secondpass(left);
-             location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
+             hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
              location:=left.location;
              case tfloatdef(resultdef).floattype of
                s32real:
@@ -256,7 +274,7 @@ implementation
            end
          else
            begin
-             load_fpu_location;
+             load_fpu_location(left);
              emit_none(A_FABS,S_NO);
            end;
        end;
@@ -268,22 +286,32 @@ implementation
          if use_vectorfpu(left.resultdef) then
            begin
              secondpass(left);
-             location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
+             hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
              location_reset(location,LOC_REGISTER,OS_S64);
              location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
-             case left.location.size of
-               OS_F32:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSS2SI,S_Q,left.location.register,location.register));
-               OS_F64:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSD2SI,S_Q,left.location.register,location.register));
-               else
-                 internalerror(2007031402);
-             end;
+             if UseAVX then
+               case left.location.size of
+                 OS_F32:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTSS2SI,S_Q,left.location.register,location.register));
+                 OS_F64:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTSD2SI,S_Q,left.location.register,location.register));
+                 else
+                   internalerror(2007031402);
+               end
+             else
+               case left.location.size of
+                 OS_F32:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSS2SI,S_Q,left.location.register,location.register));
+                 OS_F64:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSD2SI,S_Q,left.location.register,location.register));
+                 else
+                   internalerror(2007031402);
+               end;
            end
          else
 {$endif x86_64}
           begin
-            load_fpu_location;
+            load_fpu_location(left);
             location_reset_ref(location,LOC_REFERENCE,OS_S64,0);
             tg.GetTemp(current_asmdata.CurrAsmList,resultdef.size,resultdef.alignment,tt_normal,location.reference);
             emit_ref(A_FISTP,S_IQ,location.reference);
@@ -302,24 +330,34 @@ implementation
            not((left.location.loc=LOC_FPUREGISTER) and (current_settings.fputype>=fpu_sse3)) then
            begin
              secondpass(left);
-             location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
+             hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
              location_reset(location,LOC_REGISTER,OS_S64);
              location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
-             case left.location.size of
-               OS_F32:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSS2SI,S_Q,left.location.register,location.register));
-               OS_F64:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSD2SI,S_Q,left.location.register,location.register));
-               else
-                 internalerror(2007031401);
-             end;
+             if UseAVX then
+               case left.location.size of
+                 OS_F32:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTTSS2SI,S_Q,left.location.register,location.register));
+                 OS_F64:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTTSD2SI,S_Q,left.location.register,location.register));
+                 else
+                   internalerror(2007031401);
+               end
+             else
+               case left.location.size of
+                 OS_F32:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSS2SI,S_Q,left.location.register,location.register));
+                 OS_F64:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSD2SI,S_Q,left.location.register,location.register));
+                 else
+                   internalerror(2007031401);
+               end;
            end
          else
 {$endif x86_64}
           begin
             if (current_settings.fputype>=fpu_sse3) then
               begin
-                load_fpu_location;
+                load_fpu_location(left);
                 location_reset_ref(location,LOC_REFERENCE,OS_S64,0);
                 tg.GetTemp(current_asmdata.CurrAsmList,resultdef.size,resultdef.alignment,tt_normal,location.reference);
                 emit_ref(A_FISTTP,S_IQ,location.reference);
@@ -332,7 +370,7 @@ implementation
                 emit_ref(A_FNSTCW,S_NO,newcw);
                 emit_ref(A_FNSTCW,S_NO,oldcw);
                 emit_const_ref(A_OR,S_W,$0f00,newcw);
-                load_fpu_location;
+                load_fpu_location(left);
                 emit_ref(A_FLDCW,S_NO,newcw);
                 location_reset_ref(location,LOC_REFERENCE,OS_S64,0);
                 tg.GetTemp(current_asmdata.CurrAsmList,resultdef.size,resultdef.alignment,tt_normal,location.reference);
@@ -353,13 +391,24 @@ implementation
          if use_vectorfpu(resultdef) then
            begin
              secondpass(left);
-             location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
-             location:=left.location;
-             cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,left.location,left.location.register,mms_movescalar);
+             location_reset(location,LOC_MMREGISTER,left.location.size);
+             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+             if UseAVX then
+               begin
+                 hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+                 cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,left.location.register,left.location.register,location.register,mms_movescalar);
+               end
+             else
+               begin
+                 if left.location.loc in [LOC_CFPUREGISTER,LOC_FPUREGISTER] then
+                   hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+                 cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
+                 cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,location.register,location.register,mms_movescalar);
+               end;
            end
          else
            begin
-             load_fpu_location;
+             load_fpu_location(left);
              emit_reg_reg(A_FMUL,S_NO,NR_ST0,NR_ST0);
            end;
        end;
@@ -370,27 +419,38 @@ implementation
          if use_vectorfpu(resultdef) then
            begin
              secondpass(left);
-             location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
-             location:=left.location;
-             case tfloatdef(resultdef).floattype of
-               s32real:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSS,S_XMM,location.register,location.register));
-               s64real:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSD,S_XMM,location.register,location.register));
-               else
-                 internalerror(200510031);
-             end;
+             hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+             location_reset(location,LOC_MMREGISTER,left.location.size);
+             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+             if UseAVX then
+               case tfloatdef(resultdef).floattype of
+                 s32real:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSS,S_XMM,left.location.register,location.register,location.register));
+                 s64real:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSD,S_XMM,left.location.register,location.register,location.register));
+                 else
+                   internalerror(200510031);
+               end
+             else
+               case tfloatdef(resultdef).floattype of
+                 s32real:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSS,S_XMM,left.location.register,location.register));
+                 s64real:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSD,S_XMM,left.location.register,location.register));
+                 else
+                   internalerror(200510031);
+               end;
            end
          else
            begin
-             load_fpu_location;
+             load_fpu_location(left);
              emit_none(A_FSQRT,S_NO);
            end;
        end;
 
      procedure tx86inlinenode.second_ln_real;
        begin
-         load_fpu_location;
+         load_fpu_location(left);
          emit_none(A_FLDLN2,S_NO);
          emit_none(A_FXCH,S_NO);
          emit_none(A_FYL2X,S_NO);
@@ -398,13 +458,29 @@ implementation
 
     procedure tx86inlinenode.second_cos_real;
       begin
-         load_fpu_location;
+{$ifdef i8086}
+       { FCOS is 387+: when the selected cpu type is below the 386, let the
+         inherited code generation handle the node and emit nothing here }
+       if current_settings.cputype < cpu_386 then
+         begin
+           inherited;
+           exit;
+         end;
+{$endif i8086}
+         load_fpu_location(left);
          emit_none(A_FCOS,S_NO);  { FCOS takes no explicit operands }
       end;
 
     procedure tx86inlinenode.second_sin_real;
       begin
-         load_fpu_location;
+{$ifdef i8086}
+       { FSIN is 387+: when the selected cpu type is below the 386, let the
+         inherited code generation handle the node and emit nothing here }
+       if current_settings.cputype < cpu_386 then
+         begin
+           inherited;
+           exit;
+         end;
+{$endif i8086}
+         load_fpu_location(left);
          emit_none(A_FSIN,S_NO)  { FSIN takes no explicit operands }
       end;
 
@@ -413,9 +489,9 @@ implementation
          ref : treference;
          r : tregister;
        begin
-{$ifdef i386}
+{$if defined(i386) or defined(i8086)}
          if current_settings.cputype>=cpu_Pentium3 then
-{$endif i386}
+{$endif i386 or i8086}
            begin
              secondpass(left);
              case left.location.loc of
@@ -434,6 +510,7 @@ implementation
        end;
 
 
+{$ifndef i8086}
     procedure tx86inlinenode.second_abs_long;
       var
         hregister : tregister;
@@ -470,6 +547,7 @@ implementation
             current_asmdata.CurrAsmList.concat(hp);
           end;
       end;
+{$endif not i8086}
 
 {*****************************************************************************
                      INCLUDE/EXCLUDE GENERIC HANDLING
@@ -487,6 +565,14 @@ implementation
          opsize,
          orgsize: tcgsize;
         begin
+{$ifdef i8086}
+          { BTS and BTR are 386+ }
+          if current_settings.cputype < cpu_386 then
+            begin
+              inherited;
+              exit;
+            end;
+{$endif i8086}
           if is_smallset(tcallparanode(left).resultdef) then
             begin
               opdef:=tcallparanode(left).resultdef;
@@ -572,7 +658,7 @@ implementation
 
         if not(left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_REFERENCE,LOC_CREFERENCE]) or
            (left.location.size<>opsize) then
-          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,hlcg.tcgsize2orddef(opsize),true);
+          hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,cgsize_orddef(opsize),true);
 
         location_reset(location,LOC_REGISTER,opsize);
         location.register:=cg.getintregister(current_asmdata.CurrAsmList,opsize);

+ 9 - 5
compiler/x86/nx86mat.pas

@@ -154,14 +154,11 @@ interface
 
         if expectloc=LOC_MMREGISTER then
           begin
-            location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,false);
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
             location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
 
             { make life of register allocator easier }
             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
-            cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),left.location.register,location.register,mms_movescalar);
-
-            reg:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
 
             current_asmdata.getdatalabel(l1);
             new_section(current_asmdata.asmlists[al_typedconsts],sec_rodata_norel,l1.name,const_align(sizeof(pint)));
@@ -179,9 +176,16 @@ interface
             end;
 
             reference_reset_symbol(href,l1,0,resultdef.alignment);
+            reg:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
             cg.a_loadmm_ref_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),href,reg,mms_movescalar);
 
-            cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,reg,location.register,nil);
+            if UseAVX then
+              cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,reg,left.location.register,location.register,nil)
+            else
+              begin
+                cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),left.location.register,location.register,mms_movescalar);
+                cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,reg,location.register,nil);
+              end;
           end
         else
           begin

+ 39 - 1
compiler/x86/nx86mem.pas

@@ -30,8 +30,14 @@ interface
       node,nmem,ncgmem;
 
     type
+      tx86derefnode = class(tcgderefnode)
+        procedure pass_generate_code;override;
+      end;
+
       tx86vecnode = class(tcgvecnode)
+{$ifndef i8086}
         procedure update_reference_reg_mul(maybe_const_reg:tregister;l:aint);override;
+{$endif not i8086}
       end;
 
 implementation
@@ -39,12 +45,42 @@ implementation
     uses
       cutils,verbose,
       aasmtai,aasmdata,
-      cgutils,cgobj;
+      cgutils,cgobj,
+      symconst,symdef;
+
+{*****************************************************************************
+                           TX86DEREFNODE
+*****************************************************************************}
+
+     procedure tx86derefnode.pass_generate_code;
+       begin
+         inherited pass_generate_code;  { inherited code generation runs first; only the segment is adjusted below }
+         case tpointerdef(left.resultdef).x86pointertyp of
+           x86pt_near: ;  { plain near pointer: the reference segment is left untouched }
+           x86pt_near_cs: location.reference.segment:=NR_CS;  { near_xx kinds: store the matching segment register in the reference }
+           x86pt_near_ds: location.reference.segment:=NR_DS;
+           x86pt_near_ss: location.reference.segment:=NR_SS;
+           x86pt_near_es: location.reference.segment:=NR_ES;
+           x86pt_near_fs: location.reference.segment:=NR_FS;
+           x86pt_near_gs: location.reference.segment:=NR_GS;
+{$ifdef i8086}
+           x86pt_far,
+           x86pt_huge: {do nothing here; far and huge pointers are handled in ti8086derefnode};
+{$else i8086}
+           x86pt_far: internalerror(2013050401);  { far pointers raise an internal error on non-i8086 targets }
+           x86pt_huge: internalerror(2013050402);  { huge pointers likewise }
+{$endif i8086}
+           else
+             internalerror(2013050403);  { any other pointer kind is an internal error }
+         end;
+       end;
+
 
 {*****************************************************************************
                              TX86VECNODE
 *****************************************************************************}
 
+{$ifndef i8086}
      { this routine must, like any other routine, not change the contents }
      { of base/index registers of references, as these may be regvars.    }
      { The register allocator can coalesce one LOC_REGISTER being moved   }
@@ -102,7 +138,9 @@ implementation
          end;
          location.reference.index:=hreg;
        end;
+{$endif not i8086}
 
 begin
+   cderefnode:=tx86derefnode;  { deref nodes use the x86 class declared in this unit }
    cvecnode:=tx86vecnode;  { vec nodes use the x86 class declared in this unit }
 end.

+ 219 - 14
compiler/x86/nx86set.pas

@@ -61,11 +61,7 @@ implementation
 
     function tx86casenode.has_jumptable : boolean;
       begin
-{$ifdef i386}
         has_jumptable:=true;  { jump tables are reported as available }
-{$else}
-        has_jumptable:=false;
-{$endif}
       end;
 
 
@@ -89,13 +85,21 @@ implementation
             i:=last.svalue+1;
             while i<=t^._low.svalue-1 do
               begin
+{$ifdef i8086}
+                list.concat(Tai_const.Create_sym_near(elselabel));
+{$else i8086}
                 list.concat(Tai_const.Create_sym(elselabel));
+{$endif i8086}
                 inc(i);
               end;
             i:=t^._low.svalue;
             while i<=t^._high.svalue do
               begin
+{$ifdef i8086}
+                list.concat(Tai_const.Create_sym_near(blocklabel(t^.blockid)));
+{$else i8086}
                 list.concat(Tai_const.Create_sym(blocklabel(t^.blockid)));
+{$endif i8086}
                 inc(i);
               end;
             last:=t^._high;
@@ -122,7 +126,11 @@ implementation
         reference_reset_symbol(href,table,0,sizeof(pint));
         href.offset:=(-aint(min_))*sizeof(aint);
         href.index:=indexreg;
+{$ifdef i8086}
+        cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SHL,OS_INT,1,indexreg);
+{$else i8086}
         href.scalefactor:=sizeof(aint);
+{$endif i8086}
         emit_ref(A_JMP,S_NO,href);
         { generate jump table }
         if (target_info.system in [system_i386_darwin,system_i386_iphonesim]) then
@@ -186,16 +194,27 @@ implementation
                       { present label then the lower limit can be checked    }
                       { immediately. else check the range in between:       }
 
-                      cg.a_op_const_reg(current_asmdata.CurrAsmList, OP_SUB, opcgsize, aint(t^._low.svalue-last.svalue), hregister);
-                      { no jump necessary here if the new range starts at }
-                      { at the value following the previous one           }
+                      { if cond_lt depends on the carry flag we must emit A_SUB
+                        directly: a_op_const_reg(OP_SUB) can become A_DEC for a
+                        constant of 1, and A_DEC does not update the carry flag }
+                      if (cond_lt in [F_C,F_NC,F_A,F_AE,F_B,F_BE]) and (aint(t^._low.svalue-last.svalue)=1) then
+                        emit_const_reg(A_SUB,TCGSize2OpSize[opcgsize],aint(t^._low.svalue-last.svalue),hregister)
+                      else
+                        cg.a_op_const_reg(current_asmdata.CurrAsmList, OP_SUB, opcgsize, aint(t^._low.svalue-last.svalue), hregister);
+                      { no jump necessary here if the new range starts at
+                        the value following the previous one              }
                       if ((t^._low-last) <> 1) or
                          (not lastrange) then
                         cg.a_jmp_flags(current_asmdata.CurrAsmList,cond_lt,elselabel);
                     end;
-                  {we need to use A_SUB, because A_DEC does not set the correct flags, therefor
-                   using a_op_const_reg(OP_SUB) is not possible }
-                  emit_const_reg(A_SUB,TCGSize2OpSize[opcgsize],aint(t^._high.svalue-t^._low.svalue),hregister);
+                  { if cond_le depends on the carry flag we must emit A_SUB
+                    directly: a_op_const_reg(OP_SUB) can become A_DEC for a
+                    constant of 1, and A_DEC does not update the carry flag }
+                  if (cond_le in [F_C,F_NC,F_A,F_AE,F_B,F_BE]) and (aint(t^._high.svalue-t^._low.svalue)=1) then
+                    emit_const_reg(A_SUB,TCGSize2OpSize[opcgsize],aint(t^._high.svalue-t^._low.svalue),hregister)
+                  else
+                    cg.a_op_const_reg(current_asmdata.CurrAsmList, OP_SUB, opcgsize, aint(t^._high.svalue-t^._low.svalue), hregister);
+
                   cg.a_jmp_flags(current_asmdata.CurrAsmList,cond_le,blocklabel(t^.blockid));
                   last:=t^._high;
                   lastrange:=true;
@@ -218,7 +237,11 @@ implementation
                 cond_le:=F_BE;
              end;
            { do we need to generate cmps? }
+{$ifdef i8086}
+           if (with_sign and (min_label<0)) or (opcgsize in [OS_32, OS_S32]) then
+{$else i8086}
            if (with_sign and (min_label<0)) then
+{$endif i8086}
              genlinearcmplist(hp)
            else
              begin
@@ -269,6 +292,9 @@ implementation
 {$ifdef CORRECT_SET_IN_FPC}
          AM         : tasmop;
 {$endif CORRECT_SET_IN_FPC}
+{$ifdef i8086}
+         extra_offset_reg: TRegister;
+{$endif i8086}
 
          function analizeset(Aset:pconstset;is_small:boolean):boolean;
            var
@@ -329,6 +355,25 @@ implementation
              analizeset:=true;
            end;
 
+{$ifdef i8086}
+         procedure add_extra_offset(offset_reg:TRegister;var ref:treference);  { makes ref additionally include offset_reg }
+           var
+             reg: TRegister;
+           begin
+             if ref.index=NR_NO then
+               ref.index:=offset_reg  { index slot is free: use it directly }
+             else if ref.base=NR_NO then
+               ref.base:=offset_reg  { base slot is free: use it instead }
+             else
+               begin
+                 reg:=cg.getaddressregister(current_asmdata.CurrAsmList);  { both slots taken: sum old index and offset in a new register }
+                 cg.a_load_reg_reg(current_asmdata.CurrAsmList,OS_ADDR,OS_ADDR,ref.index,reg);
+                 cg.a_op_reg_reg(current_asmdata.CurrAsmList,OP_ADD,OS_ADDR,offset_reg,reg);
+                 ref.index:=reg;  { the original index register itself is not modified }
+               end;
+           end;
+{$endif i8086}
+
        begin
          { We check first if we can generate jumps, this can be done
            because the resultdef is already set in firstpass }
@@ -337,8 +382,8 @@ implementation
            to 32 bits, the left side may also not contain higher values or be signed !! }
          use_small:=is_smallset(right.resultdef) and
                     not is_signed(left.resultdef) and
-                    ((left.resultdef.typ=orddef) and (torddef(left.resultdef).high.svalue<32) or
-                     (left.resultdef.typ=enumdef) and (tenumdef(left.resultdef).max<32));
+                    ((left.resultdef.typ=orddef) and (torddef(left.resultdef).high.svalue<{$ifdef i8086}16{$else}32{$endif}) or
+                     (left.resultdef.typ=enumdef) and (tenumdef(left.resultdef).max<{$ifdef i8086}16{$else}32{$endif}));
 
          { Can we generate jumps? Possible for all types of sets }
          genjumps:=(right.nodetype=setconstn) and
@@ -364,10 +409,14 @@ implementation
           swapleftright;
 
          orgopsize := def_cgsize(left.resultdef);
+{$ifdef i8086}
+         opsize := OS_16;
+{$else i8086}
          opsize := OS_32;
+{$endif i8086}
          if is_signed(left.resultdef) then
            opsize := tcgsize(ord(opsize)+(ord(OS_S8)-ord(OS_8)));
-         opdef:=hlcg.tcgsize2orddef(opsize);
+         opdef:=cgsize_orddef(opsize);
 
          if not(left.location.loc in [LOC_REGISTER,LOC_CREGISTER,LOC_REFERENCE,LOC_CREFERENCE,LOC_CONSTANT]) then
            hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,true);
@@ -486,6 +535,38 @@ implementation
                 end
                else
                 begin
+{$ifdef i8086}
+                  cg.getcpuregister(current_asmdata.CurrAsmList,NR_CX);
+                  if TCGSize2Size[left.location.size] > 2 then
+                    left.location.size := OS_16;
+                  cg.a_load_loc_reg(current_asmdata.CurrAsmList,OS_16,left.location,NR_CX);
+
+                  register_maybe_adjust_setbase(current_asmdata.CurrAsmList,left.location,setbase);
+                  if (tcgsize2size[right.location.size] < 2) or
+                     (right.location.loc = LOC_CONSTANT) then
+                    hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,u16inttype,true);
+
+                  hreg:=cg.getintregister(current_asmdata.CurrAsmList,OS_16);
+                  emit_const_reg(A_MOV,S_W,1,hreg);
+                  emit_reg_reg(A_SHL,S_W,NR_CL,hreg);
+
+                  case right.location.loc of
+                    LOC_REGISTER,
+                    LOC_CREGISTER :
+                      begin
+                        emit_reg_reg(A_TEST,S_W,hreg,right.location.register);
+                      end;
+                     LOC_CREFERENCE,
+                     LOC_REFERENCE :
+                       begin
+                         emit_reg_ref(A_TEST,S_W,hreg,right.location.reference);
+                       end;
+                     else
+                       internalerror(2002032210);
+                  end;
+                  cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_CX);
+                  location.resflags:=F_NE;
+{$else i8086}
                   hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,u32inttype,true);
                   register_maybe_adjust_setbase(current_asmdata.CurrAsmList,left.location,setbase);
                   if (tcgsize2size[right.location.size] < 4) or
@@ -508,12 +589,44 @@ implementation
                        internalerror(2002032210);
                   end;
                   location.resflags:=F_C;
+{$endif i8086}
                 end;
              end
             else
              begin
                if right.location.loc=LOC_CONSTANT then
                 begin
+{$ifdef i8086}
+                  location.resflags:=F_NE;
+                  current_asmdata.getjumplabel(l);
+                  current_asmdata.getjumplabel(l2);
+
+                  { load constants to a register }
+                  if (left.location.loc=LOC_CONSTANT) or
+                     (setbase<>0) then
+                    begin
+                      hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,true);
+                      register_maybe_adjust_setbase(current_asmdata.CurrAsmList,left.location,setbase);
+                    end;
+
+                  cg.getcpuregister(current_asmdata.CurrAsmList,NR_CX);
+                  if TCGSize2Size[left.location.size] > 2 then
+                    left.location.size := OS_16;
+                  cg.a_load_loc_reg(current_asmdata.CurrAsmList,OS_16,left.location,NR_CX);
+                  cg.a_cmp_const_reg_label(current_asmdata.CurrAsmList,opsize,OC_BE,15,NR_CX,l);
+                  { set the zero flag }
+                  current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_TEST,S_B,0,NR_AL));
+                  cg.a_jmp_always(current_asmdata.CurrAsmList,l2);
+
+                  hreg:=cg.getintregister(current_asmdata.CurrAsmList,OS_16);
+                  cg.a_label(current_asmdata.CurrAsmList,l);
+                  emit_const_reg(A_MOV,S_W,1,hreg);
+                  emit_reg_reg(A_SHL,S_W,NR_CL,hreg);
+                  cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_CX);
+                  emit_const_reg(A_TEST,S_W,right.location.value,hreg);
+
+                  cg.a_label(current_asmdata.CurrAsmList,l2);
+{$else i8086}
                   location.resflags:=F_C;
                   current_asmdata.getjumplabel(l);
                   current_asmdata.getjumplabel(l2);
@@ -561,6 +674,7 @@ implementation
                        end;
                   end;
                   cg.a_label(current_asmdata.CurrAsmList,l2);
+{$endif i8086}
                 end { of right.location.loc=LOC_CONSTANT }
                { do search in a normal set which could have >32 elements,
                  but also used if the left side contains values > 32 or < 0 }
@@ -575,7 +689,7 @@ implementation
                     LOC_REFERENCE,LOC_CREFERENCE:
                       begin
                         inc(right.location.reference.offset,(left.location.value-setbase) shr 3);
-                        emit_const_ref(A_TEST,S_B,1 shl (left.location.value and 7),right.location.reference);
+                        emit_const_ref(A_TEST,S_B,1 shl ((left.location.value-setbase) and 7),right.location.reference);
                       end;
                     LOC_REGISTER,LOC_CREGISTER:
                       begin
@@ -587,6 +701,96 @@ implementation
                 end
                else
                 begin
+{$ifdef i8086}
+                  hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
+                  register_maybe_adjust_setbase(current_asmdata.CurrAsmList,left.location,setbase);
+
+                  if TCGSize2Size[left.location.size] > 2 then
+                    left.location.size := OS_16;
+
+                  if not use_small then
+                    begin
+                      extra_offset_reg:=cg.getintregister(current_asmdata.CurrAsmList,OS_16);
+                      cg.a_load_loc_reg(current_asmdata.CurrAsmList,OS_16,left.location,extra_offset_reg);
+                      cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SHR,OS_16,4,extra_offset_reg);
+                      cg.a_op_const_reg(current_asmdata.CurrAsmList,OP_SHL,OS_16,1,extra_offset_reg);
+                    end
+                  else
+                    extra_offset_reg:=NR_NO;
+
+                  cg.getcpuregister(current_asmdata.CurrAsmList,NR_CX);
+                  cg.a_load_loc_reg(current_asmdata.CurrAsmList,OS_16,left.location,NR_CX);
+                  if not use_small then
+                    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_AND,S_B,15,NR_CL));
+
+                  pleftreg:=cg.getintregister(current_asmdata.CurrAsmList,OS_16);
+
+                  if (right.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
+                    hlcg.location_force_reg(current_asmdata.CurrAsmList,right.location,right.resultdef,opdef,true);
+
+                  if (opsize >= OS_S8) or { = if signed }
+                     ((left.resultdef.typ=orddef) and
+                      ((torddef(left.resultdef).low < int64(tsetdef(right.resultdef).setbase)) or
+                       (torddef(left.resultdef).high > int64(tsetdef(right.resultdef).setmax)))) or
+                     ((left.resultdef.typ=enumdef) and
+                      ((tenumdef(left.resultdef).min < aint(tsetdef(right.resultdef).setbase)) or
+                       (tenumdef(left.resultdef).max > aint(tsetdef(right.resultdef).setmax)))) then
+                   begin
+
+                    { we have to check if the value is < 0 or > setmax }
+
+                    current_asmdata.getjumplabel(l);
+                    current_asmdata.getjumplabel(l2);
+
+                    { BE will be false for negative values }
+                    cg.a_cmp_const_reg_label(current_asmdata.CurrAsmList,opsize,OC_BE,tsetdef(right.resultdef).setmax-tsetdef(right.resultdef).setbase,pleftreg,l);
+                    { set the zero flag }
+                    current_asmdata.CurrAsmList.concat(taicpu.op_const_reg(A_TEST,S_B,0,NR_AL));
+                    cg.a_jmp_always(current_asmdata.CurrAsmList,l2);
+
+                    cg.a_label(current_asmdata.CurrAsmList,l);
+
+                    emit_const_reg(A_MOV,S_W,1,pleftreg);
+                    emit_reg_reg(A_SHL,S_W,NR_CL,pleftreg);
+                    cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_CX);
+                    case right.location.loc of
+                      LOC_REGISTER, LOC_CREGISTER :
+                        emit_reg_reg(A_TEST,S_W,pleftreg,right.location.register);
+                      LOC_CREFERENCE, LOC_REFERENCE :
+                        begin
+                          if not use_small then
+                            add_extra_offset(extra_offset_reg,right.location.reference);
+                          emit_reg_ref(A_TEST,S_W,pleftreg,right.location.reference);
+                        end;
+                    else
+                      internalerror(2007020301);
+                    end;
+
+                    cg.a_label(current_asmdata.CurrAsmList,l2);
+
+                    location.resflags:=F_NE;
+
+                   end
+                  else
+                   begin
+                      emit_const_reg(A_MOV,S_W,1,pleftreg);
+                      emit_reg_reg(A_SHL,S_W,NR_CL,pleftreg);
+                      cg.ungetcpuregister(current_asmdata.CurrAsmList,NR_CX);
+                      case right.location.loc of
+                        LOC_REGISTER, LOC_CREGISTER :
+                          emit_reg_reg(A_TEST,S_W,pleftreg,right.location.register);
+                        LOC_CREFERENCE, LOC_REFERENCE :
+                          begin
+                            if not use_small then
+                              add_extra_offset(extra_offset_reg,right.location.reference);
+                            emit_reg_ref(A_TEST,S_W,pleftreg,right.location.reference);
+                          end;
+                      else
+                        internalerror(2007020302);
+                      end;
+                      location.resflags:=F_NE;
+                   end;
+{$else i8086}
                   hlcg.location_force_reg(current_asmdata.CurrAsmList,left.location,left.resultdef,opdef,false);
                   register_maybe_adjust_setbase(current_asmdata.CurrAsmList,left.location,setbase);
                   if (right.location.loc in [LOC_REGISTER,LOC_CREGISTER]) then
@@ -642,6 +846,7 @@ implementation
                       end;
                       location.resflags:=F_C;
                    end;
+{$endif i8086}
                 end;
              end;
           end;

+ 9 - 2
compiler/x86/rax86.pas

@@ -345,7 +345,6 @@ var
   memrefsize: integer;
   memopsize: integer;
   memoffset: asizeint;
-  s1: string;
 begin
   ExistsMemRefNoSize := false;
   ExistsMemRef       := false;
@@ -789,6 +788,10 @@ begin
                   case tx86operand(operands[2]).opsize of
                     S_L :
                       opsize:=S_WL;
+{$ifdef x86_64}
+                    S_Q :
+                      opsize:=S_WQ;
+{$endif}
                   end;
                 S_B :
                   begin
@@ -797,6 +800,10 @@ begin
                         opsize:=S_BW;
                       S_L :
                         opsize:=S_BL;
+{$ifdef x86_64}
+                      S_Q :
+                        opsize:=S_BQ;
+{$endif}
                     end;
                   end;
               end;
@@ -1099,7 +1106,7 @@ begin
      if someone uses this in assembler code
      FPC itself does not use it at all PM }
    if (opcode=A_ENTER) and
-      (target_info.system in [system_i386_linux,system_i386_FreeBSD]) then
+      (target_info.system in [system_i386_linux,system_i386_FreeBSD,system_i386_android]) then
      Message(asmr_w_enter_not_supported_by_linux);
 
 

+ 26 - 2
compiler/x86/rax86att.pas

@@ -334,6 +334,7 @@ Implementation
         relsym: string;
         asmsymtyp: tasmsymtype;
         l: aint;
+        sym: tasmsymbol;
       begin
         case actasmtoken of
           AS_AT:
@@ -352,7 +353,27 @@ Implementation
 {$ifdef i386}
                   if actasmpattern='GOT' then
 {$endif i386}
+{$ifdef i8086}
+                  if actasmpattern='GOT' then
+{$endif i8086}
                     begin
+                      case oper.opr.typ of
+                        OPR_SYMBOL:
+                          begin
+                            sym:=oper.opr.symbol;
+                            if oper.opr.symofs<>0 then
+                              Message(asmr_e_invalid_reference_syntax);
+                            oper.opr.typ:=OPR_REFERENCE;
+                            fillchar(oper.opr.ref,sizeof(oper.opr.ref),0);
+                            oper.opr.ref.symbol:=sym;
+                          end;
+                        OPR_REFERENCE:
+                          begin
+                            { ok }
+                          end;
+                        else
+                          Message(asmr_e_invalid_reference_syntax)
+                      end;
                       oper.opr.ref.refaddr:=addr_pic;
                       consume(AS_ID);
                     end
@@ -882,8 +903,11 @@ Implementation
                 begin
                   actopcode:=tasmop(PtrUInt(iasmops.Find(copy(s,1,len))));
 
-                  if (actopcode = A_NONE) and
-                     (upper(s) = 'MOVSD') then actopcode := A_MOVSD;
+                  { movsd needs special handling because it has two namings in at&t syntax (movsl for string handling and
+                    movsd for the sse instruction) while only one in intel syntax (movsd, both string and sse)
+                    this cannot be expressed by the instruction table format so we have to hack around this here }
+                  if (actopcode = A_NONE) and (upper(s) = 'MOVSD') then
+                    actopcode := A_MOVSD;
 
                   { two-letter suffix is allowed by just a few instructions (movsx,movzx),
                     and it is always required whenever allowed }

+ 16 - 6
compiler/x86/rax86int.pas

@@ -51,6 +51,7 @@ Unit Rax86int;
          actasmtoken : tasmtoken;
          prevasmtoken : tasmtoken;
          ActOpsize : topsize;
+         inexpression : boolean;
          constructor create;override;
          function is_asmopcode(const s: string):boolean;
          function is_asmoperator(const s: string):boolean;
@@ -80,13 +81,13 @@ Unit Rax86int;
        cutils,
        { global }
        globals,verbose,
-       systems,
+       systems,cpuinfo,
        { aasm }
        aasmtai,aasmdata,aasmcpu,
        { symtable }
        symconst,symbase,symtype,symsym,symdef,symtable,
        { parser }
-       scanner,
+       scanner,pbase,
        { register allocator }
        rabase,rautils,itx86int,
        { codegen }
@@ -129,9 +130,6 @@ Unit Rax86int;
         'and','or','xor','wrt','..gotpcrel'
       );
 
-    var
-      inexpression   : boolean;
-
     constructor tx86intreader.create;
       var
         i : tasmop;
@@ -2130,6 +2128,17 @@ Unit Rax86int;
         if (instr.ops=1) and
            (instr.operands[1].typesize<>0) then
           instr.operands[1].setsize(instr.operands[1].typesize,false);
+{$ifdef i8086}
+        { convert 'call symbol' to 'call far symbol' for memory models with far code }
+        for i:=1 to operandnum do
+          with instr.operands[i].opr do
+            if (instr.opcode=A_CALL) and (typ=OPR_SYMBOL) and (symbol<>nil) and (symbol.typ<>AT_DATA) then
+              if current_settings.x86memorymodel in x86_far_code_models then
+                begin
+                  instr.operands[i].InitRef;
+                  ref.refaddr:=addr_far;
+                end;
+{$endif i8086}
       end;
 
 
@@ -2222,7 +2231,8 @@ Unit Rax86int;
       { setup label linked list }
       LocalLabelList:=TLocalLabelList.Create;
       { we might need to know which parameters are passed in registers }
-      current_procinfo.generate_parameter_info;
+      if not parse_generic then
+        current_procinfo.generate_parameter_info;
       { start tokenizer }
       c:=current_scanner.asmgetcharstart;
       gettoken;

+ 149 - 120
compiler/x86/rgx86.pas

@@ -115,6 +115,7 @@ implementation
 
       var
         n,replaceoper : longint;
+        is_subh: Boolean;
       begin
         result:=false;
         with instr do
@@ -133,156 +134,184 @@ implementation
                 end;
               2,3 :
                 begin
-                  { We can handle opcodes with 2 and 3 operands the same way. The opcodes
-                    with 3 registers are shrd/shld, where the 3rd operand is const or CL,
-                    that doesn't need spilling.
-                    However, due to AT&T order inside the compiler, the 3rd operand is
-                    numbered 0, so look at operand no. 1 and 2 if we have 3 operands by
-                    adding a "n". }
-                  n:=0;
-                  if ops=3 then
-                    n:=1;
-                  if (oper[n+0]^.typ=top_reg) and
-                     (oper[n+1]^.typ=top_reg) and
-                     ((getregtype(oper[n+0]^.reg)<>regtype) or
-                      (getregtype(oper[n+1]^.reg)<>regtype) or
-                      (get_alias(getsupreg(oper[n+0]^.reg))<>get_alias(getsupreg(oper[n+1]^.reg)))) then
+                  { avx instruction?
+                    currently this rule is sufficient but it might be extended }
+                  if (ops=3) and (opcode<>A_SHRD) and (opcode<>A_SHLD) then
                     begin
-                      if (getregtype(oper[n+0]^.reg)=regtype) and
-                         (get_alias(getsupreg(oper[n+0]^.reg))=orgreg) then
-                        replaceoper:=0+n
-                      else if (getregtype(oper[n+1]^.reg)=regtype) and
-                         (get_alias(getsupreg(oper[n+1]^.reg))=orgreg) then
-                        replaceoper:=1+n;
+                      { avx instructions allow only the first operand (at&t counting) to be a memory operand }
+                      { all operands must be registers ... }
+                      if (oper[0]^.typ=top_reg) and
+                         (oper[1]^.typ=top_reg) and
+                         (oper[2]^.typ=top_reg) and
+                         { but they must be different }
+                         ((getregtype(oper[1]^.reg)<>regtype) or
+                          (get_alias(getsupreg(oper[0]^.reg))<>get_alias(getsupreg(oper[1]^.reg)))
+                         ) and
+                         ((getregtype(oper[2]^.reg)<>regtype) or
+                          (get_alias(getsupreg(oper[0]^.reg))<>get_alias(getsupreg(oper[2]^.reg)))
+                         ) and
+                         (get_alias(getsupreg(oper[0]^.reg))=orgreg) then
+                        replaceoper:=0;
                     end
-                  else if (oper[n+0]^.typ=top_reg) and
-                     (oper[n+1]^.typ=top_const) then
-                    begin
-                      if (getregtype(oper[0+n]^.reg)=regtype) and
-                         (get_alias(getsupreg(oper[0+n]^.reg))=orgreg) then
-                        replaceoper:=0+n
-                      else
-                        internalerror(200704282);
-                    end
-                  else if (oper[n+0]^.typ=top_const) and
-                     (oper[n+1]^.typ=top_reg) then
+                  else
                     begin
-                      if (getregtype(oper[1+n]^.reg)=regtype) and
-                         (get_alias(getsupreg(oper[1+n]^.reg))=orgreg) then
-                        replaceoper:=1+n
-                      else
-                        internalerror(200704283);
-                    end;
-                  case replaceoper of
-                    0 :
-                      begin
-                        { Some instructions don't allow memory references
-                          for source }
-                        case instr.opcode of
-                          A_BT,
-                          A_BTS,
-                          A_BTC,
-                          A_BTR,
-
-                          { shufp* would require 16 byte alignment for memory locations so we force the source
-                            operand into a register }
-                          A_SHUFPD,
-                          A_SHUFPS :
-                            replaceoper:=-1;
+                      { We can handle opcodes with 2 operands and shrd/shld the same way; for shrd/shld
+                        the 3rd operand is const or CL, which doesn't need spilling.
+                        However, due to AT&T order inside the compiler, the 3rd operand is
+                        numbered 0, so look at operand no. 1 and 2 if we have 3 operands by
+                        adding a "n". }
+                      n:=0;
+                      if ops=3 then
+                        n:=1;
+                      if (oper[n+0]^.typ=top_reg) and
+                         (oper[n+1]^.typ=top_reg) and
+                         ((getregtype(oper[n+0]^.reg)<>regtype) or
+                          (getregtype(oper[n+1]^.reg)<>regtype) or
+                          (get_alias(getsupreg(oper[n+0]^.reg))<>get_alias(getsupreg(oper[n+1]^.reg)))) then
+                        begin
+                          if (getregtype(oper[n+0]^.reg)=regtype) and
+                             (get_alias(getsupreg(oper[n+0]^.reg))=orgreg) then
+                            replaceoper:=0+n
+                          else if (getregtype(oper[n+1]^.reg)=regtype) and
+                             (get_alias(getsupreg(oper[n+1]^.reg))=orgreg) then
+                            replaceoper:=1+n;
+                        end
+                      else if (oper[n+0]^.typ=top_reg) and
+                         (oper[n+1]^.typ=top_const) then
+                        begin
+                          if (getregtype(oper[0+n]^.reg)=regtype) and
+                             (get_alias(getsupreg(oper[0+n]^.reg))=orgreg) then
+                            replaceoper:=0+n
+                          else
+                            internalerror(200704282);
+                        end
+                      else if (oper[n+0]^.typ=top_const) and
+                         (oper[n+1]^.typ=top_reg) then
+                        begin
+                          if (getregtype(oper[1+n]^.reg)=regtype) and
+                             (get_alias(getsupreg(oper[1+n]^.reg))=orgreg) then
+                            replaceoper:=1+n
+                          else
+                            internalerror(200704283);
                         end;
-                      end;
-                    1 :
-                      begin
-                        { Some instructions don't allow memory references
-                          for destination }
-                        case instr.opcode of
-                          A_CMOVcc,
-                          A_MOVZX,
-                          A_MOVSX,
-                          A_MOVSXD,
-                          A_MULSS,
-                          A_MULSD,
-                          A_SUBSS,
-                          A_SUBSD,
-                          A_ADDSD,
-                          A_ADDSS,
-                          A_DIVSD,
-                          A_DIVSS,
-                          A_SHLD,
-                          A_SHRD,
-                          A_COMISD,
-                          A_COMISS,
-                          A_CVTDQ2PD,
-                          A_CVTDQ2PS,
-                          A_CVTPD2DQ,
-                          A_CVTPD2PI,
-                          A_CVTPD2PS,
-                          A_CVTPI2PD,
-                          A_CVTPS2DQ,
-                          A_CVTPS2PD,
-                          A_CVTSD2SI,
-                          A_CVTSD2SS,
-                          A_CVTSI2SD,
-                          A_CVTSS2SD,
-                          A_CVTTPD2PI,
-                          A_CVTTPD2DQ,
-                          A_CVTTPS2DQ,
-                          A_CVTTSD2SI,
-                          A_CVTPI2PS,
-                          A_CVTPS2PI,
-                          A_CVTSI2SS,
-                          A_CVTSS2SI,
-                          A_CVTTPS2PI,
-                          A_CVTTSS2SI,
-                          A_IMUL,
-                          A_XORPD,
-                          A_XORPS,
-                          A_ORPD,
-                          A_ORPS,
-                          A_ANDPD,
-                          A_ANDPS,
-                          A_UNPCKLPS,
-                          A_UNPCKHPS,
-                          A_SHUFPD,
-                          A_SHUFPS:
-
-                            replaceoper:=-1;
+                      case replaceoper of
+                        0 :
+                          begin
+                            { Some instructions don't allow memory references
+                              for source }
+                            case instr.opcode of
+                              A_BT,
+                              A_BTS,
+                              A_BTC,
+                              A_BTR,
+
+                              { shufp* would require 16 byte alignment for memory locations so we force the source
+                                operand into a register }
+                              A_SHUFPD,
+                              A_SHUFPS :
+                                replaceoper:=-1;
+                            end;
+                          end;
+                        1 :
+                          begin
+                            { Some instructions don't allow memory references
+                              for destination }
+                            case instr.opcode of
+                              A_CMOVcc,
+                              A_MOVZX,
+                              A_MOVSX,
+                              A_MOVSXD,
+                              A_MULSS,
+                              A_MULSD,
+                              A_SUBSS,
+                              A_SUBSD,
+                              A_ADDSD,
+                              A_ADDSS,
+                              A_DIVSD,
+                              A_DIVSS,
+                              A_SHLD,
+                              A_SHRD,
+                              A_COMISD,
+                              A_COMISS,
+                              A_CVTDQ2PD,
+                              A_CVTDQ2PS,
+                              A_CVTPD2DQ,
+                              A_CVTPD2PI,
+                              A_CVTPD2PS,
+                              A_CVTPI2PD,
+                              A_CVTPS2DQ,
+                              A_CVTPS2PD,
+                              A_CVTSD2SI,
+                              A_CVTSD2SS,
+                              A_CVTSI2SD,
+                              A_CVTSS2SD,
+                              A_CVTTPD2PI,
+                              A_CVTTPD2DQ,
+                              A_CVTTPS2DQ,
+                              A_CVTTSD2SI,
+                              A_CVTPI2PS,
+                              A_CVTPS2PI,
+                              A_CVTSI2SS,
+                              A_CVTSS2SI,
+                              A_CVTTPS2PI,
+                              A_CVTTSS2SI,
+                              A_IMUL,
+                              A_XORPD,
+                              A_XORPS,
+                              A_ORPD,
+                              A_ORPS,
+                              A_ANDPD,
+                              A_ANDPS,
+                              A_UNPCKLPS,
+                              A_UNPCKHPS,
+                              A_SHUFPD,
+                              A_SHUFPS:
+
+                                replaceoper:=-1;
 {$ifdef x86_64}
-                          A_MOV:
-                             { 64 bit constants can only be moved into registers }
-                             if (oper[0]^.typ=top_const) and
-                                (oper[1]^.typ=top_reg) and
-                                ((oper[0]^.val<low(longint)) or
-                                 (oper[0]^.val>high(longint))) then
-                               replaceoper:=-1;
+                              A_MOV:
+                                 { 64 bit constants can only be moved into registers }
+                                 if (oper[0]^.typ=top_const) and
+                                    (oper[1]^.typ=top_reg) and
+                                    ((oper[0]^.val<low(longint)) or
+                                     (oper[0]^.val>high(longint))) then
+                                   replaceoper:=-1;
 {$endif x86_64}
+                            end;
+                          end;
                         end;
-                      end;
                     end;
                 end;
              end;
 
-            {$ifdef x86_64}
+{$ifdef x86_64}
             { 32 bit operations on 32 bit registers on x86_64 can result in
               zeroing the upper 32 bits of the register. This does not happen
               with memory operations, so we have to perform these calculations
               in registers.  }
             if (instr.opsize=S_L) then
               replaceoper:=-1;
-            {$endif x86_64}
+{$endif x86_64}
 
             { Replace register with spill reference }
             if replaceoper<>-1 then
               begin
+                is_subh:=getsubreg(oper[replaceoper]^.reg)=R_SUBH;
                 oper[replaceoper]^.typ:=top_ref;
                 new(oper[replaceoper]^.ref);
                 oper[replaceoper]^.ref^:=spilltemp;
+                if is_subh then
+                  inc(oper[replaceoper]^.ref^.offset);
                 { memory locations aren't guaranteed to be aligned }
                 case opcode of
                   A_MOVAPS:
                     opcode:=A_MOVSS;
                   A_MOVAPD:
                     opcode:=A_MOVSD;
+                  A_VMOVAPS:
+                    opcode:=A_VMOVSS;
+                  A_VMOVAPD:
+                    opcode:=A_VMOVSD;
                 end;
                 result:=true;
               end;

+ 49 - 10
compiler/x86/x86ins.dat

@@ -3453,22 +3453,22 @@ void                   \326\1\xA7                                    X86_64
 
 
 [VADDPD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmrm                  \361\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \361\362\364\370\1\x58\75\120        AVX,SANDYBRIDGE
 
 [VADDPS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmrm                  \362\370\1\x58\75\120                AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \362\364\370\1\x58\75\120            AVX,SANDYBRIDGE
 
 [VADDSD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,mem64                  \334\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 
 [VADDSS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,mem32                  \333\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 
@@ -3919,7 +3919,7 @@ rm64,xmmreg                          \361\362\363\370\1\x7E\101           AVX,SA
 xmmreg,rm64                          \361\362\363\370\1\x6E\110           AVX,SANDYBRIDGE
 
 [VMOVSD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x10\75\120            AVX,SANDYBRIDGE
 xmmreg,mem64                         \334\362\370\1\x10\110               AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x11\75\102            AVX,SANDYBRIDGE
@@ -3936,7 +3936,7 @@ xmmreg,xmmrm                         \333\362\370\1\x12\110               AVX,SA
 ymmreg,ymmrm                         \333\362\364\370\1\x12\110           AVX,SANDYBRIDGE
 
 [VMOVSS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x10\75\120            AVX,SANDYBRIDGE
 xmmreg,mem64                         \333\362\370\1\x10\110               AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x11\75\102            AVX,SANDYBRIDGE
@@ -3961,22 +3961,22 @@ ymmrm,ymmreg                         \362\364\370\1\x11\101               AVX,SA
 xmmreg,xmmreg,xmmrm,imm8             \361\362\372\1\x42\75\120\27         AVX,SANDYBRIDGE
 
 [VMULPD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmrm                  \361\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \361\362\364\370\1\x59\75\120        AVX,SANDYBRIDGE
 
 [VMULPS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmrm                  \362\370\1\x59\75\120                AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \362\364\370\1\x59\75\120            AVX,SANDYBRIDGE
 
 [VMULSD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,mem64                  \334\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 
 [VMULSS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,mem32                  \333\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 
@@ -4673,3 +4673,42 @@ void                                 \362\364\370\1\x77                   AVX,SA
 [VZEROUPPER]
 (Ch_All, Ch_None, Ch_None)
 void                                 \362\370\1\x77                       AVX,SANDYBRIDGE
+
+;*******************************************************************************
+;********** BMI1 ***************************************************************
+;*******************************************************************************
+
+[ANDN]
+(Ch_Wop1, Ch_Rop2, Ch_Rop3)
+reg32,reg32,rm32                      \362\371\1\xf2\75\120               BMI1
+reg64,reg64,rm64                      \362\363\371\1\xf2\75\120           BMI1,X86_64
+
+[BEXTR]
+(Ch_Wop1, Ch_Rop2, Ch_Rop3)
+reg32,rm32,reg32                      \362\371\1\xf7\76\110               BMI1
+reg64,rm64,reg64                      \362\363\371\1\xf7\76\110           BMI1,X86_64
+
+;*******************************************************************************
+;********** BMI2 ***************************************************************
+;*******************************************************************************
+
+[RORX]
+(Ch_Wop1, Ch_Rop2, Ch_None)
+reg32,rm32,imm8                      \334\362\372\1\xf0\110\26            BMI2
+reg64,rm64,imm8                      \334\362\363\372\1\xf0\110\26        BMI2,X86_64
+
+[SARX]
+(Ch_Wop1, Ch_Rop2, Ch_Rop3)
+reg32,rm32,reg32                      \333\362\371\1\xf7\76\110           BMI2
+reg64,rm64,reg64                      \333\362\363\371\1\xf7\76\110       BMI2,X86_64
+
+[SHLX]
+(Ch_Wop1, Ch_Rop2, Ch_Rop3)
+reg32,rm32,reg32                      \361\362\371\1\xf7\76\110           BMI2
+reg64,rm64,reg64                      \361\362\363\371\1\xf7\76\110       BMI2,X86_64
+
+[SHRX]
+(Ch_Wop1, Ch_Rop2, Ch_Rop3)
+reg32,rm32,reg32                      \334\362\371\1\xf7\76\110           BMI2
+reg64,rm64,reg64                      \334\362\363\371\1\xf7\76\110       BMI2,X86_64
+

+ 2 - 1
compiler/x86/x86reg.dat

@@ -85,7 +85,6 @@ NR_ES,$05000003,es,%es,es,es,-1,-1,-1,OT_REG_DESS,0
 NR_SS,$05000004,ss,%ss,ss,ss,-1,-1,-1,OT_REG_DESS,2
 NR_FS,$05000005,fs,%fs,fs,fs,-1,-1,-1,OT_REG_FSGS,4
 NR_GS,$05000006,gs,%gs,gs,gs,-1,-1,-1,OT_REG_FSGS,5
-NR_FLAGS,$05000007,flags,%flags,flags,flags,-1,-1,-1,OT_NONE,0
 
 NR_DR0,$05000007,dr0,%dr0,dr0,dr0,-1,-1,-1,OT_REG_DREG,0
 NR_DR1,$05000008,dr1,%dr1,dr1,dr1,-1,-1,-1,OT_REG_DREG,1
@@ -103,6 +102,8 @@ NR_TR5,$05000013,tr5,%tr5,tr5,tr5,-1,-1,-1,OT_REG_TREG,5
 NR_TR6,$05000014,tr6,%tr6,tr6,tr6,-1,-1,-1,OT_REG_TREG,6
 NR_TR7,$05000015,tr7,%tr7,tr7,tr7,-1,-1,-1,OT_REG_TREG,7
 
+NR_FLAGS,$05000016,flags,%flags,flags,flags,-1,-1,-1,OT_NONE,0
+
 NR_ST0,$02000000,st(0),%st(0),st(0),st0,12,11,33,OT_FPU0,0
 NR_ST1,$02000001,st(1),%st(1),st(1),st1,13,12,34,OT_FPUREG,1
 NR_ST2,$02000002,st(2),%st(2),st(2),st2,14,13,35,OT_FPUREG,2