2
0
Эх сурвалжийг харах

* basic avx support for floating point operations (use -Cfavx to activate)

git-svn-id: trunk@24896 -
florian 12 жил өмнө
parent
commit
e81d2d1f3b

+ 33 - 1
compiler/cgobj.pas

@@ -52,8 +52,10 @@ unit cgobj;
           by Free Pascal. For 32-bit processors, the base class
           by Free Pascal. For 32-bit processors, the base class
           should be @link(tcg64f32) and not @var(tcg).
           should be @link(tcg64f32) and not @var(tcg).
        }
        }
+
+       { tcg }
+
        tcg = class
        tcg = class
-       public
           { how many times is this current code executed }
           { how many times is this current code executed }
           executionweight : longint;
           executionweight : longint;
           alignment : talignment;
           alignment : talignment;
@@ -271,6 +273,9 @@ unit cgobj;
           procedure a_opmm_ref_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle); virtual;
           procedure a_opmm_ref_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle); virtual;
           procedure a_opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const loc: tlocation; reg: tregister;shuffle : pmmshuffle); virtual;
           procedure a_opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const loc: tlocation; reg: tregister;shuffle : pmmshuffle); virtual;
           procedure a_opmm_reg_ref(list: TAsmList; Op: TOpCG; size : tcgsize;reg: tregister;const ref: treference; shuffle : pmmshuffle); virtual;
           procedure a_opmm_reg_ref(list: TAsmList; Op: TOpCG; size : tcgsize;reg: tregister;const ref: treference; shuffle : pmmshuffle); virtual;
+          procedure a_opmm_loc_reg_reg(list: TAsmList;Op : TOpCG;size : tcgsize;const loc : tlocation;src,dst : tregister;shuffle : pmmshuffle); virtual;
+          procedure a_opmm_reg_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src1,src2,dst: tregister;shuffle : pmmshuffle); virtual;
+          procedure a_opmm_ref_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; src,dst: tregister;shuffle : pmmshuffle); virtual;
 
 
           procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); virtual;
           procedure a_loadmm_intreg_reg(list: TAsmList; fromsize, tosize : tcgsize; intreg, mmreg: tregister; shuffle: pmmshuffle); virtual;
           procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize; mmreg, intreg: tregister; shuffle : pmmshuffle); virtual;
           procedure a_loadmm_reg_intreg(list: TAsmList; fromsize, tosize : tcgsize; mmreg, intreg: tregister; shuffle : pmmshuffle); virtual;
@@ -2061,6 +2066,33 @@ implementation
       end;
       end;
 
 
 
 
+    { Dispatches a three-operand MM operation on the kind of location held in
+      loc: MM register locations are forwarded to a_opmm_reg_reg_reg, memory
+      references to a_opmm_ref_reg_reg. Any other location kind raises an
+      internal error. }
+    procedure tcg.a_opmm_loc_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const loc: tlocation; src,dst: tregister;shuffle : pmmshuffle);
+      begin
+        if loc.loc in [LOC_CMMREGISTER,LOC_MMREGISTER] then
+          a_opmm_reg_reg_reg(list,op,size,loc.register,src,dst,shuffle)
+        else if loc.loc in [LOC_CREFERENCE,LOC_REFERENCE] then
+          a_opmm_ref_reg_reg(list,op,size,loc.reference,src,dst,shuffle)
+        else
+          internalerror(200312232);
+      end;
+
+
+    { Generic version of the three-register MM operation: it emits nothing and
+      always raises an internal error. }
+    procedure tcg.a_opmm_reg_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;src1,src2,dst : tregister;shuffle : pmmshuffle);
+      begin
+        internalerror(2013061102);
+      end;
+
+
+    { Generic version of the MM operation taking a memory reference and two
+      registers: it emits nothing and always raises an internal error. }
+    procedure tcg.a_opmm_ref_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;const ref : treference;src,dst : tregister;shuffle : pmmshuffle);
+      begin
+        internalerror(2013061101);
+      end;
+
+
     procedure tcg.g_concatcopy_unaligned(list : TAsmList;const source,dest : treference;len : tcgint);
     procedure tcg.g_concatcopy_unaligned(list : TAsmList;const source,dest : treference;len : tcgint);
       begin
       begin
         g_concatcopy(list,source,dest,len);
         g_concatcopy(list,source,dest,len);

+ 8 - 4
compiler/i386/cpuinfo.pas

@@ -59,7 +59,8 @@ Type
       fpu_ssse3,
       fpu_ssse3,
       fpu_sse41,
       fpu_sse41,
       fpu_sse42,
       fpu_sse42,
-      fpu_avx
+      fpu_avx,
+      fpu_avx2
      );
      );
 
 
 
 
@@ -96,11 +97,14 @@ Const
      'SSSE3',
      'SSSE3',
      'SSE41',
      'SSE41',
      'SSE42',
      'SSE42',
-     'AVX'
+     'AVX',
+     'AVX2'
    );
    );
 
 
-   sse_singlescalar : set of tfputype = [fpu_sse,fpu_sse2,fpu_sse3];
-   sse_doublescalar : set of tfputype = [fpu_sse2,fpu_sse3];
+   sse_singlescalar = [fpu_sse..fpu_avx2];
+   sse_doublescalar = [fpu_sse2..fpu_avx2];
+
+   fpu_avx_instructionsets = [fpu_avx,fpu_avx2];
 
 
    { Supported optimizations, only used for information }
    { Supported optimizations, only used for information }
    supported_optimizerswitches = genericlevel1optimizerswitches+
    supported_optimizerswitches = genericlevel1optimizerswitches+

+ 10 - 10
compiler/i386/i386prop.inc

@@ -685,6 +685,10 @@
 (Ch: (Ch_RRAX, Ch_WMemEDI, Ch_RWRDI)),
 (Ch: (Ch_RRAX, Ch_WMemEDI, Ch_RWRDI)),
 (Ch: (Ch_WRAX, Ch_RWRSI, Ch_None)),
 (Ch: (Ch_WRAX, Ch_RWRSI, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -772,21 +776,17 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),

+ 61 - 29
compiler/x86/aasmcpu.pas

@@ -296,7 +296,7 @@ interface
          constructor op_reg_reg_reg(op : tasmop;_size : topsize;_op1,_op2,_op3 : tregister);
          constructor op_reg_reg_reg(op : tasmop;_size : topsize;_op1,_op2,_op3 : tregister);
          constructor op_const_reg_reg(op : tasmop;_size : topsize;_op1 : aint;_op2 : tregister;_op3 : tregister);
          constructor op_const_reg_reg(op : tasmop;_size : topsize;_op1 : aint;_op2 : tregister;_op3 : tregister);
          constructor op_const_ref_reg(op : tasmop;_size : topsize;_op1 : aint;const _op2 : treference;_op3 : tregister);
          constructor op_const_ref_reg(op : tasmop;_size : topsize;_op1 : aint;const _op2 : treference;_op3 : tregister);
-         constructor op_reg_reg_ref(op : tasmop;_size : topsize;_op1,_op2 : tregister; const _op3 : treference);
+         constructor op_ref_reg_reg(op : tasmop;_size : topsize;const _op1 : treference;_op2,_op3 : tregister);
          constructor op_const_reg_ref(op : tasmop;_size : topsize;_op1 : aint;_op2 : tregister;const _op3 : treference);
          constructor op_const_reg_ref(op : tasmop;_size : topsize;_op1 : aint;_op2 : tregister;const _op3 : treference);
 
 
          { this is for Jmp instructions }
          { this is for Jmp instructions }
@@ -375,7 +375,8 @@ implementation
        systems,
        systems,
        procinfo,
        procinfo,
        itcpugas,
        itcpugas,
-       symsym;
+       symsym,
+       cpuinfo;
 
 
 {*****************************************************************************
 {*****************************************************************************
                               Instruction table
                               Instruction table
@@ -813,14 +814,14 @@ implementation
       end;
       end;
 
 
 
 
-    constructor taicpu.op_reg_reg_ref(op : tasmop;_size : topsize;_op1,_op2 : tregister;const _op3 : treference);
+    constructor taicpu.op_ref_reg_reg(op : tasmop;_size : topsize;const _op1 : treference;_op2,_op3 : tregister);
       begin
       begin
          inherited create(op);
          inherited create(op);
          init(_size);
          init(_size);
          ops:=3;
          ops:=3;
-         loadreg(0,_op1);
+         loadref(0,_op1);
          loadreg(1,_op2);
          loadreg(1,_op2);
-         loadref(2,_op3);
+         loadreg(2,_op3);
       end;
       end;
 
 
 
 
@@ -2874,7 +2875,9 @@ implementation
                  (oper[0]^.reg=oper[1]^.reg)
                  (oper[0]^.reg=oper[1]^.reg)
                 ) or
                 ) or
                 (((opcode=A_MOVSS) or (opcode=A_MOVSD) or (opcode=A_MOVQ) or
                 (((opcode=A_MOVSS) or (opcode=A_MOVSD) or (opcode=A_MOVQ) or
-                  (opcode=A_MOVAPS) or (OPCODE=A_MOVAPD)) and
+                  (opcode=A_MOVAPS) or (OPCODE=A_MOVAPD) or
+                  (opcode=A_VMOVSS) or (opcode=A_VMOVSD) or (opcode=A_VMOVQ) or
+                  (opcode=A_VMOVAPS) or (OPCODE=A_VMOVAPD)) and
                  (regtype = R_MMREGISTER) and
                  (regtype = R_MMREGISTER) and
                  (ops=2) and
                  (ops=2) and
                  (oper[0]^.typ=top_reg) and
                  (oper[0]^.typ=top_reg) and
@@ -2929,8 +2932,11 @@ implementation
       begin
       begin
         { the information in the instruction table is made for the string copy
         { the information in the instruction table is made for the string copy
           operation MOVSD so hack here (FK)
           operation MOVSD so hack here (FK)
+
+          VMOVSS and VMOVSD have two- and three-operand flavours; this cannot be modelled by x86ins.dat,
+          so fix it here (FK)
         }
         }
-        if (opcode=A_MOVSD) and (ops=2) then
+        if ((opcode=A_MOVSD) or (opcode=A_VMOVSS) or (opcode=A_VMOVSD)) and (ops=2) then
           begin
           begin
             case opnr of
             case opnr of
               0:
               0:
@@ -2961,17 +2967,30 @@ implementation
               result:=taicpu.op_ref_reg(A_MOV,reg2opsize(r),tmpref,r);
               result:=taicpu.op_ref_reg(A_MOV,reg2opsize(r),tmpref,r);
             end;
             end;
           R_MMREGISTER :
           R_MMREGISTER :
-            case getsubreg(r) of
-              R_SUBMMD:
-                result:=taicpu.op_ref_reg(A_MOVSD,reg2opsize(r),ref,r);
-              R_SUBMMS:
-                result:=taicpu.op_ref_reg(A_MOVSS,reg2opsize(r),ref,r);
-              R_SUBQ,
-              R_SUBMMWHOLE:
-                result:=taicpu.op_ref_reg(A_MOVQ,S_NO,ref,r);
-              else
-                internalerror(200506043);
-            end;
+            if current_settings.fputype in fpu_avx_instructionsets then
+              case getsubreg(r) of
+                R_SUBMMD:
+                  result:=taicpu.op_ref_reg(A_VMOVSD,reg2opsize(r),ref,r);
+                R_SUBMMS:
+                  result:=taicpu.op_ref_reg(A_VMOVSS,reg2opsize(r),ref,r);
+                R_SUBQ,
+                R_SUBMMWHOLE:
+                  result:=taicpu.op_ref_reg(A_VMOVQ,S_NO,ref,r);
+                else
+                  internalerror(200506043);
+              end
+            else
+              case getsubreg(r) of
+                R_SUBMMD:
+                  result:=taicpu.op_ref_reg(A_MOVSD,reg2opsize(r),ref,r);
+                R_SUBMMS:
+                  result:=taicpu.op_ref_reg(A_MOVSS,reg2opsize(r),ref,r);
+                R_SUBQ,
+                R_SUBMMWHOLE:
+                  result:=taicpu.op_ref_reg(A_MOVQ,S_NO,ref,r);
+                else
+                  internalerror(200506043);
+              end;
           else
           else
             internalerror(200401041);
             internalerror(200401041);
         end;
         end;
@@ -3002,17 +3021,30 @@ implementation
               result:=taicpu.op_reg_ref(A_MOV,size,r,tmpref);
               result:=taicpu.op_reg_ref(A_MOV,size,r,tmpref);
             end;
             end;
           R_MMREGISTER :
           R_MMREGISTER :
-            case getsubreg(r) of
-              R_SUBMMD:
-                result:=taicpu.op_reg_ref(A_MOVSD,reg2opsize(r),r,ref);
-              R_SUBMMS:
-                result:=taicpu.op_reg_ref(A_MOVSS,reg2opsize(r),r,ref);
-              R_SUBQ,
-              R_SUBMMWHOLE:
-                result:=taicpu.op_reg_ref(A_MOVQ,S_NO,r,ref);
-              else
-                internalerror(200506042);
-            end;
+            if current_settings.fputype in fpu_avx_instructionsets then
+              case getsubreg(r) of
+                R_SUBMMD:
+                  result:=taicpu.op_reg_ref(A_VMOVSD,reg2opsize(r),r,ref);
+                R_SUBMMS:
+                  result:=taicpu.op_reg_ref(A_VMOVSS,reg2opsize(r),r,ref);
+                R_SUBQ,
+                R_SUBMMWHOLE:
+                  result:=taicpu.op_reg_ref(A_VMOVQ,S_NO,r,ref);
+                else
+                  internalerror(200506042);
+              end
+            else
+              case getsubreg(r) of
+                R_SUBMMD:
+                  result:=taicpu.op_reg_ref(A_MOVSD,reg2opsize(r),r,ref);
+                R_SUBMMS:
+                  result:=taicpu.op_reg_ref(A_MOVSS,reg2opsize(r),r,ref);
+                R_SUBQ,
+                R_SUBMMWHOLE:
+                  result:=taicpu.op_reg_ref(A_MOVQ,S_NO,r,ref);
+                else
+                  internalerror(200506042);
+              end;
           else
           else
             internalerror(200401041);
             internalerror(200401041);
         end;
         end;

+ 162 - 12
compiler/x86/cgx86.pas

@@ -92,6 +92,8 @@ unit cgx86;
         procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize : tcgsize;reg: tregister; const ref: treference;shuffle : pmmshuffle); override;
         procedure a_loadmm_reg_ref(list: TAsmList; fromsize, tosize : tcgsize;reg: tregister; const ref: treference;shuffle : pmmshuffle); override;
         procedure a_opmm_ref_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle); override;
         procedure a_opmm_ref_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle); override;
         procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src,dst: tregister;shuffle : pmmshuffle);override;
         procedure a_opmm_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src,dst: tregister;shuffle : pmmshuffle);override;
+        procedure a_opmm_ref_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;const ref : treference;src,dst : tregister;shuffle : pmmshuffle);override;
+        procedure a_opmm_reg_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;src1,src2,dst : tregister;shuffle : pmmshuffle);override;
 
 
         {  comparison operations }
         {  comparison operations }
         procedure a_cmp_const_reg_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;a : tcgint;reg : tregister;
         procedure a_cmp_const_reg_label(list : TAsmList;size : tcgsize;cmp_op : topcmp;a : tcgint;reg : tregister;
@@ -126,9 +128,9 @@ unit cgx86;
         procedure check_register_size(size:tcgsize;reg:tregister);
         procedure check_register_size(size:tcgsize;reg:tregister);
 
 
         procedure opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;dst: tregister; shuffle : pmmshuffle);
         procedure opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;dst: tregister; shuffle : pmmshuffle);
+        procedure opmm_loc_reg_reg(list : TAsmList;Op : TOpCG;size : tcgsize;loc : tlocation;src,dst : tregister;shuffle : pmmshuffle);
 
 
         function get_darwin_call_stub(const s: string; weak: boolean): tasmsymbol;
         function get_darwin_call_stub(const s: string; weak: boolean): tasmsymbol;
-      private
         procedure sizes2load(s1,s2 : tcgsize;var op: tasmop; var s3: topsize);
         procedure sizes2load(s1,s2 : tcgsize;var op: tasmop; var s3: topsize);
 
 
         procedure floatload(list: TAsmList; t : tcgsize;const ref : treference);
         procedure floatload(list: TAsmList; t : tcgsize;const ref : treference);
@@ -175,7 +177,7 @@ unit cgx86;
 
 
     function UseAVX: boolean;
     function UseAVX: boolean;
       begin
       begin
-        Result:=current_settings.fputype in [fpu_avx];
+        Result:=current_settings.fputype in fpu_avx_instructionsets;
       end;
       end;
 
 
     const
     const
@@ -1144,12 +1146,18 @@ unit cgx86;
 
 
     function get_scalar_mm_op(fromsize,tosize : tcgsize) : tasmop;
     function get_scalar_mm_op(fromsize,tosize : tcgsize) : tasmop;
       const
       const
-        convertop : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
+        convertopsse : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
           (A_MOVSS,A_CVTSS2SD,A_NONE,A_NONE,A_NONE),
           (A_MOVSS,A_CVTSS2SD,A_NONE,A_NONE,A_NONE),
           (A_CVTSD2SS,A_MOVSD,A_NONE,A_NONE,A_NONE),
           (A_CVTSD2SS,A_MOVSD,A_NONE,A_NONE,A_NONE),
           (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
           (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
           (A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
           (A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
           (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
           (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
+        convertopavx : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
+          (A_VMOVSS,A_VCVTSS2SD,A_NONE,A_NONE,A_NONE),
+          (A_VCVTSD2SS,A_VMOVSD,A_NONE,A_NONE,A_NONE),
+          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
+          (A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
+          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
       begin
       begin
         { we can have OS_F32/OS_F64 (record in function result/LOC_MMREGISTER) to
         { we can have OS_F32/OS_F64 (record in function result/LOC_MMREGISTER) to
           OS_32/OS_64 (record in memory/LOC_REFERENCE) }
           OS_32/OS_64 (record in memory/LOC_REFERENCE) }
@@ -1161,14 +1169,24 @@ unit cgx86;
             OS_64:
             OS_64:
               tosize:=OS_F64;
               tosize:=OS_F64;
           end;
           end;
-        if (fromsize in [low(convertop)..high(convertop)]) and
-           (tosize in [low(convertop)..high(convertop)]) then
-          result:=convertop[fromsize,tosize]
+        if (fromsize in [low(convertopsse)..high(convertopsse)]) and
+           (tosize in [low(convertopsse)..high(convertopsse)]) then
+          begin
+            if UseAVX then
+              result:=convertopavx[fromsize,tosize]
+            else
+              result:=convertopsse[fromsize,tosize];
+          end
         { we can have OS_M64 (record in function result/LOC_MMREGISTER) to
         { we can have OS_M64 (record in function result/LOC_MMREGISTER) to
           OS_64 (record in memory/LOC_REFERENCE) }
           OS_64 (record in memory/LOC_REFERENCE) }
         else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) and
         else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) and
                 (fromsize=OS_M64) then
                 (fromsize=OS_M64) then
-          result:=A_MOVQ
+          begin
+            if UseAVX then
+              result:=A_VMOVQ
+            else
+              result:=A_MOVQ;
+          end
         else
         else
           internalerror(2010060104);
           internalerror(2010060104);
         if result=A_NONE then
         if result=A_NONE then
@@ -1179,6 +1197,7 @@ unit cgx86;
     procedure tcgx86.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize : tcgsize;reg1, reg2: tregister;shuffle : pmmshuffle);
     procedure tcgx86.a_loadmm_reg_reg(list: TAsmList; fromsize, tosize : tcgsize;reg1, reg2: tregister;shuffle : pmmshuffle);
       var
       var
         instr : taicpu;
         instr : taicpu;
+        op : TAsmOp;
       begin
       begin
         if shuffle=nil then
         if shuffle=nil then
           begin
           begin
@@ -1200,8 +1219,26 @@ unit cgx86;
           end
           end
         else if shufflescalar(shuffle) then
         else if shufflescalar(shuffle) then
           begin
           begin
-            instr:=taicpu.op_reg_reg(get_scalar_mm_op(fromsize,tosize),S_NO,reg1,reg2);
+            op:=get_scalar_mm_op(fromsize,tosize);
+
+            { VMOVSD/SS is not available with two register operands }
+            if op=A_VMOVSD then
+              op:=A_VMOVAPD
+            else if op=A_VMOVSS then
+              op:=A_VMOVAPS;
+
+            { A_VCVTSD2SS and A_VCVTSS2SD always require three operands }
+            if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
+              instr:=taicpu.op_reg_reg_reg(op,S_NO,reg1,reg2,reg2)
+            else
+              instr:=taicpu.op_reg_reg(op,S_NO,reg1,reg2);
+
             case get_scalar_mm_op(fromsize,tosize) of
             case get_scalar_mm_op(fromsize,tosize) of
+              A_VMOVAPD,
+              A_VMOVAPS,
+              A_VMOVSS,
+              A_VMOVSD,
+              A_VMOVQ,
               A_MOVSS,
               A_MOVSS,
               A_MOVSD,
               A_MOVSD,
               A_MOVQ:
               A_MOVQ:
@@ -1217,6 +1254,7 @@ unit cgx86;
     procedure tcgx86.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle);
     procedure tcgx86.a_loadmm_ref_reg(list: TAsmList; fromsize, tosize : tcgsize;const ref: treference; reg: tregister;shuffle : pmmshuffle);
        var
        var
          tmpref  : treference;
          tmpref  : treference;
+         op : tasmop;
        begin
        begin
          tmpref:=ref;
          tmpref:=ref;
          make_simple_ref(list,tmpref);
          make_simple_ref(list,tmpref);
@@ -1233,7 +1271,15 @@ unit cgx86;
 {$endif x86_64}
 {$endif x86_64}
            end
            end
          else if shufflescalar(shuffle) then
          else if shufflescalar(shuffle) then
-           list.concat(taicpu.op_ref_reg(get_scalar_mm_op(fromsize,tosize),S_NO,tmpref,reg))
+           begin
+             op:=get_scalar_mm_op(fromsize,tosize);
+
+             { A_VCVTSD2SS and A_VCVTSS2SD always require three operands }
+             if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
+               list.concat(taicpu.op_ref_reg_reg(op,S_NO,tmpref,reg,reg))
+             else
+               list.concat(taicpu.op_ref_reg(op,S_NO,tmpref,reg))
+           end
          else
          else
            internalerror(200312252);
            internalerror(200312252);
        end;
        end;
@@ -1243,6 +1289,7 @@ unit cgx86;
        var
        var
          hreg : tregister;
          hreg : tregister;
          tmpref  : treference;
          tmpref  : treference;
+         op : tasmop;
        begin
        begin
          tmpref:=ref;
          tmpref:=ref;
          make_simple_ref(list,tmpref);
          make_simple_ref(list,tmpref);
@@ -1263,8 +1310,15 @@ unit cgx86;
              if tcgsize2size[tosize]<>tcgsize2size[fromsize] then
              if tcgsize2size[tosize]<>tcgsize2size[fromsize] then
                begin
                begin
                  hreg:=getmmregister(list,tosize);
                  hreg:=getmmregister(list,tosize);
-                 list.concat(taicpu.op_reg_reg(get_scalar_mm_op(fromsize,tosize),S_NO,reg,hreg));
-                 list.concat(taicpu.op_reg_ref(get_scalar_mm_op(tosize,tosize),S_NO,hreg,tmpref));
+                 op:=get_scalar_mm_op(fromsize,tosize);
+
+                 { A_VCVTSD2SS and A_VCVTSS2SD always require three operands }
+                 if (op=A_VCVTSD2SS) or (op=A_VCVTSS2SD) then
+                   list.concat(taicpu.op_reg_reg_reg(op,S_NO,reg,hreg,hreg))
+                 else
+                   list.concat(taicpu.op_reg_reg(op,S_NO,reg,hreg));
+
+                 list.concat(taicpu.op_reg_ref(get_scalar_mm_op(tosize,tosize),S_NO,hreg,tmpref))
                end
                end
              else
              else
                list.concat(taicpu.op_reg_ref(get_scalar_mm_op(fromsize,tosize),S_NO,reg,tmpref));
                list.concat(taicpu.op_reg_ref(get_scalar_mm_op(fromsize,tosize),S_NO,reg,tmpref));
@@ -1296,6 +1350,103 @@ unit cgx86;
      end;
      end;
 
 
 
 
+    { Emits one three-operand AVX floating point instruction into list.
+
+      loc     : first operand, either an MM register or a memory reference;
+                loc.size must equal size
+      src     : second operand (register)
+      dst     : third operand (register), passed last to the taicpu constructor
+      shuffle : nil selects the packed instruction, a scalar shuffle selects
+                the scalar instruction; any other shuffle is not implemented
+
+      Unsupported shuffles, operations without a table entry and locations
+      that are neither MM register nor reference raise an internal error. }
+    procedure tcgx86.opmm_loc_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;src,dst: tregister; shuffle : pmmshuffle);
+      const
+        { indexed by [0 = scalar / 1 = packed, operand size, operation];
+          A_NOP marks an operation without an instruction in this table }
+        opmm2asmop : array[0..1,OS_F32..OS_F64,topcg] of tasmop = (
+          ( { scalar }
+            ( { OS_F32 }
+              A_NOP,A_NOP,A_VADDSS,A_NOP,A_VDIVSS,A_NOP,A_NOP,A_VMULSS,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBSS,A_NOP,A_NOP,A_NOP
+            ),
+            ( { OS_F64 }
+              A_NOP,A_NOP,A_VADDSD,A_NOP,A_VDIVSD,A_NOP,A_NOP,A_VMULSD,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBSD,A_NOP,A_NOP,A_NOP
+            )
+          ),
+          ( { vectorized/packed }
+            { because the logical packed single instructions have shorter op codes, we always use
+              these
+              NOTE(review): the OS_F64 row below lists A_VXORPD, not the packed
+              single form - confirm which one is intended
+            }
+            ( { OS_F32 }
+              A_NOP,A_NOP,A_VADDPS,A_NOP,A_VDIVPS,A_NOP,A_NOP,A_VMULPS,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBPS,A_VXORPS,A_NOP,A_NOP
+            ),
+            ( { OS_F64 }
+              A_NOP,A_NOP,A_VADDPD,A_NOP,A_VDIVPD,A_NOP,A_NOP,A_VMULPD,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_NOP,A_VSUBPD,A_VXORPD,A_NOP,A_NOP
+            )
+          )
+        );
+
+      var
+        resultreg : tregister;
+        asmop : tasmop;
+      begin
+        { this is an internally used procedure so the parameters have
+          some constraints
+        }
+        if loc.size<>size then
+          internalerror(2013061108);
+        resultreg:=dst;
+        { deshuffle }
+        //!!!
+        if (shuffle<>nil) and not(shufflescalar(shuffle)) then
+          begin
+            { general (non-scalar) shuffles are not implemented }
+            internalerror(2013061107);
+          end
+        else if (shuffle=nil) then
+          asmop:=opmm2asmop[1,size,op]
+        else if shufflescalar(shuffle) then
+          begin
+            asmop:=opmm2asmop[0,size,op];
+            { no scalar operation available? }
+            if asmop=A_NOP then
+              begin
+                { do vectorized and shuffle finally }
+                internalerror(2010060102);
+              end;
+          end
+        else
+          internalerror(2013061106);
+        { the packed lookup can yield A_NOP as well }
+        if asmop=A_NOP then
+          internalerror(2013061105);
+        case loc.loc of
+          LOC_CREFERENCE,LOC_REFERENCE:
+            begin
+              { simplify the reference through the same list that receives
+                the instruction, so both end up in one place; loc is a value
+                parameter, so the caller's copy is not modified }
+              make_simple_ref(list,loc.reference);
+              list.concat(taicpu.op_ref_reg_reg(asmop,S_NO,loc.reference,src,resultreg));
+            end;
+          LOC_CMMREGISTER,LOC_MMREGISTER:
+            list.concat(taicpu.op_reg_reg_reg(asmop,S_NO,loc.register,src,resultreg));
+          else
+            internalerror(2013061104);
+        end;
+        { shuffle }
+        if resultreg<>dst then
+          begin
+            { moving the result back through a shuffle is not implemented }
+            internalerror(2013061103);
+          end;
+      end;
+
+
+    { Three-register form: wraps src1 in a LOC_MMREGISTER location of the
+      given size and hands it to opmm_loc_reg_reg together with src2 and dst. }
+    procedure tcgx86.a_opmm_reg_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;src1,src2,dst: tregister;shuffle : pmmshuffle);
+      var
+        firstoperand : tlocation;
+      begin
+        { only the fields that opmm_loc_reg_reg reads are filled in }
+        firstoperand.loc:=LOC_MMREGISTER;
+        firstoperand.register:=src1;
+        firstoperand.size:=size;
+
+        opmm_loc_reg_reg(list,op,size,firstoperand,src2,dst,shuffle);
+      end;
+
+
+    { Memory-operand form: wraps ref in a LOC_REFERENCE location of the given
+      size and hands it to opmm_loc_reg_reg together with src and dst. }
+    procedure tcgx86.a_opmm_ref_reg_reg(list: TAsmList; Op: TOpCG; size : tcgsize;const ref: treference; src,dst: tregister;shuffle : pmmshuffle);
+      var
+        firstoperand : tlocation;
+      begin
+        { only the fields that opmm_loc_reg_reg reads are filled in }
+        firstoperand.loc:=LOC_REFERENCE;
+        firstoperand.reference:=ref;
+        firstoperand.size:=size;
+
+        opmm_loc_reg_reg(list,op,size,firstoperand,src,dst,shuffle);
+      end;
+
+
     procedure tcgx86.opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;dst: tregister; shuffle : pmmshuffle);
     procedure tcgx86.opmm_loc_reg(list: TAsmList; Op: TOpCG; size : tcgsize;loc : tlocation;dst: tregister; shuffle : pmmshuffle);
       const
       const
         opmm2asmop : array[0..1,OS_F32..OS_F64,topcg] of tasmop = (
         opmm2asmop : array[0..1,OS_F32..OS_F64,topcg] of tasmop = (
@@ -1319,7 +1470,6 @@ unit cgx86;
             )
             )
           )
           )
         );
         );
-
       var
       var
         resultreg : tregister;
         resultreg : tregister;
         asmop : tasmop;
         asmop : tasmop;

+ 212 - 2
compiler/x86/nx86add.pas

@@ -41,7 +41,10 @@ unit nx86add;
         procedure emit_generic_code(op:TAsmOp;opsize:TCgSize;unsigned,extra_not,mboverflow:boolean);
         procedure emit_generic_code(op:TAsmOp;opsize:TCgSize;unsigned,extra_not,mboverflow:boolean);
 
 
         procedure second_cmpfloatsse;
         procedure second_cmpfloatsse;
+        procedure second_cmpfloatavx;
+
         procedure second_addfloatsse;
         procedure second_addfloatsse;
+        procedure second_addfloatavx;
       public
       public
         procedure second_addfloat;override;
         procedure second_addfloat;override;
 {$ifndef i8086}
 {$ifndef i8086}
@@ -794,6 +797,141 @@ unit nx86add;
           end;
           end;
       end;
       end;
 
 
+    { Generates code for a scalar floating point addn/subn/muln/slashn node
+      through the three-operand helper cg.a_opmm_loc_reg_reg. The result is
+      always delivered in a newly allocated mm register (LOC_MMREGISTER).
+
+      Any other node type raises internalerror(200312231).
+
+      The sqr(a)+sqr(b) / sqr(a)-sqr(b) special case is only compiled in when
+      the symbol "dummy" is defined; otherwise sqr_sum stays false and that
+      branch is never taken. }
+    procedure tx86addnode.second_addfloatavx;
+      var
+        op : topcg;
+        sqr_sum : boolean;
+        { only used by the code guarded by "dummy" below }
+        tmp : tnode;
+      begin
+        sqr_sum:=false;
+        { detect sqr(x) op sqr(y) with op in [addn,subn] and strip both sqr
+          nodes so that only their arguments are evaluated by pass_left_right }
+{$ifdef dummy}
+        if (current_settings.fputype>=fpu_sse3) and
+           use_vectorfpu(resultdef) and
+           (nodetype in [addn,subn]) and
+          (left.nodetype=inlinen) and (tinlinenode(left).inlinenumber=in_sqr_real) and
+          (right.nodetype=inlinen) and (tinlinenode(right).inlinenumber=in_sqr_real) then
+          begin
+            sqr_sum:=true;
+            { detach the argument before freeing the inline node so it survives }
+            tmp:=tinlinenode(left).left;
+            tinlinenode(left).left:=nil;
+            left.free;
+            left:=tmp;
+
+            tmp:=tinlinenode(right).left;
+            tinlinenode(right).left:=nil;
+            right.free;
+            right:=tmp;
+          end;
+{$endif dummy}
+
+        pass_left_right;
+        check_left_and_right_fpureg(false);
+
+        if (nf_swapped in flags) then
+          { can't use swapleftright if both are on the fpu stack, since then }
+          { both are "R_ST" -> nothing would change -> manually switch       }
+          if (left.location.loc = LOC_FPUREGISTER) and
+             (right.location.loc = LOC_FPUREGISTER) then
+            emit_none(A_FXCH,S_NO)
+          else
+            swapleftright;
+
+        { map the node type to the generic code generator operation }
+        case nodetype of
+          addn :
+            op:=OP_ADD;
+          muln :
+            op:=OP_MUL;
+          subn :
+            op:=OP_SUB;
+          slashn :
+            op:=OP_DIV;
+          else
+            internalerror(200312231);
+        end;
+
+        location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
+
+        { NOTE(review): this branch is only reachable when "dummy" is defined.
+          It emits opcodes without the V prefix (SHUFPD, MULPD, HADDPD, ...)
+          and reuses left's register as the result - confirm this is intended
+          for the AVX path before enabling it. }
+        if sqr_sum then
+          begin
+            if nf_swapped in flags then
+              swapleftright;
+
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,right.location,right.resultdef,true);
+            location:=left.location;
+            if is_double(resultdef) then
+              begin
+                { pack both doubles into one register, square them pairwise and
+                  combine the two squares horizontally }
+                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,right.location.register,location.register));
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPD,S_NO,location.register,location.register));
+                case nodetype of
+                  addn:
+                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPD,S_NO,location.register,location.register));
+                  subn:
+                    current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPD,S_NO,location.register,location.register));
+                  else
+                    internalerror(201108162);
+                end;
+              end
+            else
+              begin
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_UNPCKLPS,S_NO,right.location.register,location.register));
+                { ensure that bits 64..127 contain valid values }
+                current_asmdata.CurrAsmList.concat(taicpu.op_const_reg_reg(A_SHUFPD,S_NO,%00,location.register,location.register));
+                { the data is now in bits 0..31 and 64..95 }
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_MULPS,S_NO,location.register,location.register));
+                case nodetype of
+                  addn:
+                    begin
+                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HADDPS,S_NO,location.register,location.register));
+                    end;
+                  subn:
+                    begin
+                      current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_HSUBPS,S_NO,location.register,location.register));
+                    end;
+                  else
+                    internalerror(201108163);
+                end;
+              end
+          end
+        { right is already in an mm register: it can only take the role of the
+          register source (with left as the location operand) if the operation
+          is commutative }
+        else if (right.location.loc=LOC_MMREGISTER) and (op in [OP_ADD,OP_MUL]) then
+          begin
+            { allocate a separate destination register for the result }
+            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
+            { force a floating point register location to be written to memory;
+              we don't force it into an mm register because going through memory
+              probably allows shorter code, as there is no direct fpu->mm register
+              copy instruction
+            }
+            if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
+            cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,location.size,
+              left.location,
+              right.location.register,
+              location.register,
+              mms_movescalar);
+          end
+        else
+          begin
+            { general case: left becomes the register source, right is passed
+              as the location operand }
+            if (nf_swapped in flags) then
+              swapleftright;
+
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+            { allocate a separate destination register for the result }
+            location.register:=cg.getmmregister(current_asmdata.CurrAsmList,left.location.size);
+            { force a floating point register location to be written to memory;
+              we don't force it into an mm register because going through memory
+              probably allows shorter code, as there is no direct fpu->mm register
+              copy instruction
+            }
+            if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
+            cg.a_opmm_loc_reg_reg(current_asmdata.CurrAsmList,op,location.size,
+              right.location,
+              left.location.register,
+              location.register,
+              mms_movescalar);
+          end;
+      end;
+
 
 
     procedure tx86addnode.second_cmpfloatsse;
     procedure tx86addnode.second_cmpfloatsse;
       var
       var
@@ -860,6 +998,72 @@ unit nx86add;
       end;
       end;
 
 
 
 
+
+    { Generates an AVX scalar floating point comparison: emits VCOMISS for
+      single and VCOMISD for double operands and returns the result in the
+      CPU flags (LOC_FLAGS).
+
+      Operand types other than single/double raise internalerror(200402222);
+      operand locations that are neither a memory reference nor an mm register
+      (after fpu registers have been spilled to memory) raise
+      internalerror(200402221) or internalerror(200402223). }
+    procedure tx86addnode.second_cmpfloatavx;
+      var
+        op : tasmop;
+      begin
+        { select the compare opcode from the type of the left operand }
+        if is_single(left.resultdef) then
+          op:=A_VCOMISS
+        else if is_double(left.resultdef) then
+          op:=A_VCOMISD
+        else
+          internalerror(200402222);
+        pass_left_right;
+
+        location_reset(location,LOC_FLAGS,def_cgsize(resultdef));
+        { right is already in an mm register: use it as the register operand of
+          the compare and let left be the reg/mem operand; this reverses the
+          operand order, which is recorded by toggling nf_swapped below }
+        if (right.location.loc=LOC_MMREGISTER) then
+          begin
+            { force a floating point register location to be written to memory;
+              we don't force it into an mm register because going through memory
+              probably allows shorter code, as there is no direct fpu->mm register
+              copy instruction
+            }
+            if left.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,left.location,left.resultdef);
+            case left.location.loc of
+              LOC_REFERENCE,LOC_CREFERENCE:
+                begin
+                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,left.location.reference);
+                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,S_NO,left.location.reference,right.location.register));
+                end;
+              LOC_MMREGISTER,LOC_CMMREGISTER:
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,left.location.register,right.location.register));
+              else
+                internalerror(200402221);
+            end;
+            { the operands were compared in reversed order: flip nf_swapped so
+              that getresflags picks the matching condition }
+            if nf_swapped in flags then
+              exclude(flags,nf_swapped)
+            else
+              include(flags,nf_swapped)
+          end
+        else
+          begin
+            { normal order: left is forced into an mm register, right is the
+              reg/mem operand }
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+            { force a floating point register location to be written to memory;
+              we don't force it into an mm register because going through memory
+              probably allows shorter code, as there is no direct fpu->mm register
+              copy instruction
+            }
+            if right.location.loc in [LOC_FPUREGISTER,LOC_CFPUREGISTER] then
+              hlcg.location_force_mem(current_asmdata.CurrAsmList,right.location,right.resultdef);
+            case right.location.loc of
+              LOC_REFERENCE,LOC_CREFERENCE:
+                begin
+                  tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,right.location.reference);
+                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,S_NO,right.location.reference,left.location.register));
+                end;
+              LOC_MMREGISTER,LOC_CMMREGISTER:
+                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,S_NO,right.location.register,left.location.register));
+              else
+                internalerror(200402223);
+            end;
+          end;
+        { NOTE(review): the "true" argument presumably selects the unsigned-style
+          condition codes - confirm against getresflags }
+        location.resflags:=getresflags(true);
+      end;
+
+
     procedure tx86addnode.second_opvector;
     procedure tx86addnode.second_opvector;
       var
       var
         op : topcg;
         op : topcg;
@@ -912,7 +1116,10 @@ unit nx86add;
       begin
       begin
         if use_vectorfpu(resultdef) then
         if use_vectorfpu(resultdef) then
           begin
           begin
-            second_addfloatsse;
+            if UseAVX then
+              second_addfloatavx
+            else
+              second_addfloatsse;
             exit;
             exit;
           end;
           end;
 
 
@@ -959,7 +1166,10 @@ unit nx86add;
       begin
       begin
         if use_vectorfpu(left.resultdef) or use_vectorfpu(right.resultdef) then
         if use_vectorfpu(left.resultdef) or use_vectorfpu(right.resultdef) then
           begin
           begin
-            second_cmpfloatsse;
+            if UseAVX then
+              second_cmpfloatavx
+            else
+              second_cmpfloatsse;
             exit;
             exit;
           end;
           end;
 
 

+ 29 - 10
compiler/x86/nx86cnv.pas

@@ -276,14 +276,25 @@ implementation
           begin
           begin
             location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
             location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
-            case location.size of
-              OS_F32:
-                op:=A_CVTSI2SS;
-              OS_F64:
-                op:=A_CVTSI2SD;
-              else
-                internalerror(2007120902);
-            end;
+            if UseAVX then
+              case location.size of
+                OS_F32:
+                  op:=A_VCVTSI2SS;
+                OS_F64:
+                  op:=A_VCVTSI2SD;
+                else
+                  internalerror(2007120902);
+              end
+            else
+              case location.size of
+                OS_F32:
+                  op:=A_CVTSI2SS;
+                OS_F64:
+                  op:=A_CVTSI2SD;
+                else
+                  internalerror(2007120902);
+              end;
+
             { don't use left.location.size, because that one may be OS_32/OS_64
             { don't use left.location.size, because that one may be OS_32/OS_64
               if the lower bound of the orddef >= 0
               if the lower bound of the orddef >= 0
             }
             }
@@ -301,11 +312,19 @@ implementation
                 begin
                 begin
                   href:=left.location.reference;
                   href:=left.location.reference;
                   tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,href);
                   tcgx86(cg).make_simple_ref(current_asmdata.CurrAsmList,href);
-                  current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,opsize,href,location.register));
+                  if UseAVX then
+                    { VCVTSI2.. requires a second source operand to copy bits 64..127 }
+                    current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg_reg(op,opsize,href,location.register,location.register))
+                  else
+                    current_asmdata.CurrAsmList.concat(taicpu.op_ref_reg(op,opsize,href,location.register));
                 end;
                 end;
               LOC_REGISTER,
               LOC_REGISTER,
               LOC_CREGISTER:
               LOC_CREGISTER:
-                current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,opsize,left.location.register,location.register));
+                if UseAVX then
+                    { VCVTSI2.. requires a second source operand to copy bits 64..127 }
+                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(op,opsize,left.location.register,location.register,location.register))
+                else
+                  current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(op,opsize,left.location.register,location.register));
             end;
             end;
           end
           end
         else
         else

+ 68 - 28
compiler/x86/nx86inl.pas

@@ -289,14 +289,24 @@ implementation
              hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
              hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
              location_reset(location,LOC_REGISTER,OS_S64);
              location_reset(location,LOC_REGISTER,OS_S64);
              location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
              location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
-             case left.location.size of
-               OS_F32:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSS2SI,S_Q,left.location.register,location.register));
-               OS_F64:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSD2SI,S_Q,left.location.register,location.register));
-               else
-                 internalerror(2007031402);
-             end;
+             if UseAVX then
+               case left.location.size of
+                 OS_F32:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTSS2SI,S_Q,left.location.register,location.register));
+                 OS_F64:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTSD2SI,S_Q,left.location.register,location.register));
+                 else
+                   internalerror(2007031402);
+               end
+             else
+               case left.location.size of
+                 OS_F32:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSS2SI,S_Q,left.location.register,location.register));
+                 OS_F64:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTSD2SI,S_Q,left.location.register,location.register));
+                 else
+                   internalerror(2007031402);
+               end;
            end
            end
          else
          else
 {$endif x86_64}
 {$endif x86_64}
@@ -323,14 +333,24 @@ implementation
              hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
              hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
              location_reset(location,LOC_REGISTER,OS_S64);
              location_reset(location,LOC_REGISTER,OS_S64);
              location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
              location.register:=cg.getintregister(current_asmdata.CurrAsmList,OS_S64);
-             case left.location.size of
-               OS_F32:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSS2SI,S_Q,left.location.register,location.register));
-               OS_F64:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSD2SI,S_Q,left.location.register,location.register));
-               else
-                 internalerror(2007031401);
-             end;
+             if UseAVX then
+               case left.location.size of
+                 OS_F32:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTTSS2SI,S_Q,left.location.register,location.register));
+                 OS_F64:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_VCVTTSD2SI,S_Q,left.location.register,location.register));
+                 else
+                   internalerror(2007031401);
+               end
+             else
+               case left.location.size of
+                 OS_F32:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSS2SI,S_Q,left.location.register,location.register));
+                 OS_F64:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_CVTTSD2SI,S_Q,left.location.register,location.register));
+                 else
+                   internalerror(2007031401);
+               end;
            end
            end
          else
          else
 {$endif x86_64}
 {$endif x86_64}
@@ -371,9 +391,18 @@ implementation
          if use_vectorfpu(resultdef) then
          if use_vectorfpu(resultdef) then
            begin
            begin
              secondpass(left);
              secondpass(left);
-             hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
-             location:=left.location;
-             cg.a_opmm_loc_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,left.location,left.location.register,mms_movescalar);
+             location_reset(location,LOC_MMREGISTER,left.location.size);
+             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+             if UseAVX then
+               begin
+                 hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
+                 cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,left.location.register,left.location.register,location.register,mms_movescalar);
+               end
+             else
+               begin
+                 cg.a_loadmm_loc_reg(current_asmdata.CurrAsmList,location.size,left.location,location.register,mms_movescalar);
+                 cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_MUL,left.location.size,location.register,location.register,mms_movescalar);
+               end;
            end
            end
          else
          else
            begin
            begin
@@ -389,15 +418,26 @@ implementation
            begin
            begin
              secondpass(left);
              secondpass(left);
              hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
              hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
-             location:=left.location;
-             case tfloatdef(resultdef).floattype of
-               s32real:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSS,S_XMM,location.register,location.register));
-               s64real:
-                 current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSD,S_XMM,location.register,location.register));
-               else
-                 internalerror(200510031);
-             end;
+             location_reset(location,LOC_MMREGISTER,left.location.size);
+             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,location.size);
+             if UseAVX then
+               case tfloatdef(resultdef).floattype of
+                 s32real:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSS,S_XMM,left.location.register,location.register,location.register));
+                 s64real:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg_reg(A_VSQRTSD,S_XMM,left.location.register,location.register,location.register));
+                 else
+                   internalerror(200510031);
+               end
+             else
+               case tfloatdef(resultdef).floattype of
+                 s32real:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSS,S_XMM,left.location.register,location.register));
+                 s64real:
+                   current_asmdata.CurrAsmList.concat(taicpu.op_reg_reg(A_SQRTSD,S_XMM,left.location.register,location.register));
+                 else
+                   internalerror(200510031);
+               end;
            end
            end
          else
          else
            begin
            begin

+ 9 - 5
compiler/x86/nx86mat.pas

@@ -154,14 +154,11 @@ interface
 
 
         if expectloc=LOC_MMREGISTER then
         if expectloc=LOC_MMREGISTER then
           begin
           begin
-            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,false);
+            hlcg.location_force_mmregscalar(current_asmdata.CurrAsmList,left.location,left.resultdef,true);
             location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
             location_reset(location,LOC_MMREGISTER,def_cgsize(resultdef));
 
 
             { make life of register allocator easier }
             { make life of register allocator easier }
             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
             location.register:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
-            cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),left.location.register,location.register,mms_movescalar);
-
-            reg:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
 
 
             current_asmdata.getdatalabel(l1);
             current_asmdata.getdatalabel(l1);
             new_section(current_asmdata.asmlists[al_typedconsts],sec_rodata_norel,l1.name,const_align(sizeof(pint)));
             new_section(current_asmdata.asmlists[al_typedconsts],sec_rodata_norel,l1.name,const_align(sizeof(pint)));
@@ -179,9 +176,16 @@ interface
             end;
             end;
 
 
             reference_reset_symbol(href,l1,0,resultdef.alignment);
             reference_reset_symbol(href,l1,0,resultdef.alignment);
+            reg:=cg.getmmregister(current_asmdata.CurrAsmList,def_cgsize(resultdef));
             cg.a_loadmm_ref_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),href,reg,mms_movescalar);
             cg.a_loadmm_ref_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),href,reg,mms_movescalar);
 
 
-            cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,reg,location.register,nil);
+            if UseAVX then
+              cg.a_opmm_reg_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,reg,left.location.register,location.register,nil)
+            else
+              begin
+                cg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,def_cgsize(resultdef),def_cgsize(resultdef),left.location.register,location.register,mms_movescalar);
+                cg.a_opmm_reg_reg(current_asmdata.CurrAsmList,OP_XOR,left.location.size,reg,location.register,nil);
+              end;
           end
           end
         else
         else
           begin
           begin

+ 145 - 120
compiler/x86/rgx86.pas

@@ -134,143 +134,164 @@ implementation
                 end;
                 end;
               2,3 :
               2,3 :
                 begin
                 begin
-                  { We can handle opcodes with 2 and 3 operands the same way. The opcodes
-                    with 3 registers are shrd/shld, where the 3rd operand is const or CL,
-                    that doesn't need spilling.
-                    However, due to AT&T order inside the compiler, the 3rd operand is
-                    numbered 0, so look at operand no. 1 and 2 if we have 3 operands by
-                    adding a "n". }
-                  n:=0;
-                  if ops=3 then
-                    n:=1;
-                  if (oper[n+0]^.typ=top_reg) and
-                     (oper[n+1]^.typ=top_reg) and
-                     ((getregtype(oper[n+0]^.reg)<>regtype) or
-                      (getregtype(oper[n+1]^.reg)<>regtype) or
-                      (get_alias(getsupreg(oper[n+0]^.reg))<>get_alias(getsupreg(oper[n+1]^.reg)))) then
+                  { avx instruction?
+                    currently this rule is sufficient but it might be extended }
+                  if (ops=3) and (opcode<>A_SHRD) and (opcode<>A_SHLD) then
                     begin
                     begin
-                      if (getregtype(oper[n+0]^.reg)=regtype) and
-                         (get_alias(getsupreg(oper[n+0]^.reg))=orgreg) then
-                        replaceoper:=0+n
-                      else if (getregtype(oper[n+1]^.reg)=regtype) and
-                         (get_alias(getsupreg(oper[n+1]^.reg))=orgreg) then
-                        replaceoper:=1+n;
+                      { avx instructions allow only the first operand (at&t counting) to be a memory operand, so only it can be replaced by a spilled reference }
+                      { all operands must be registers ... }
+                      if (oper[0]^.typ=top_reg) and
+                         (oper[1]^.typ=top_reg) and
+                         (oper[2]^.typ=top_reg) and
+                         { but they must be different }
+                         ((getregtype(oper[1]^.reg)<>regtype) or
+                          (get_alias(getsupreg(oper[0]^.reg))<>get_alias(getsupreg(oper[1]^.reg)))
+                         ) and
+                         ((getregtype(oper[2]^.reg)<>regtype) or
+                          (get_alias(getsupreg(oper[0]^.reg))<>get_alias(getsupreg(oper[2]^.reg)))
+                         ) and
+                         (get_alias(getsupreg(oper[0]^.reg))=orgreg) then
+                        replaceoper:=0;
                     end
                     end
-                  else if (oper[n+0]^.typ=top_reg) and
-                     (oper[n+1]^.typ=top_const) then
-                    begin
-                      if (getregtype(oper[0+n]^.reg)=regtype) and
-                         (get_alias(getsupreg(oper[0+n]^.reg))=orgreg) then
-                        replaceoper:=0+n
-                      else
-                        internalerror(200704282);
-                    end
-                  else if (oper[n+0]^.typ=top_const) and
-                     (oper[n+1]^.typ=top_reg) then
+                  else
                     begin
                     begin
-                      if (getregtype(oper[1+n]^.reg)=regtype) and
-                         (get_alias(getsupreg(oper[1+n]^.reg))=orgreg) then
-                        replaceoper:=1+n
-                      else
-                        internalerror(200704283);
-                    end;
-                  case replaceoper of
-                    0 :
-                      begin
-                        { Some instructions don't allow memory references
-                          for source }
-                        case instr.opcode of
-                          A_BT,
-                          A_BTS,
-                          A_BTC,
-                          A_BTR,
-
-                          { shufp* would require 16 byte alignment for memory locations so we force the source
-                            operand into a register }
-                          A_SHUFPD,
-                          A_SHUFPS :
-                            replaceoper:=-1;
+                      { We can handle opcodes with 2 and shrd/shld the same way, where the 3rd operand is const or CL,
+                        that doesn't need spilling.
+                        However, due to AT&T order inside the compiler, the 3rd operand is
+                        numbered 0, so look at operand no. 1 and 2 if we have 3 operands by
+                        adding a "n". }
+                      n:=0;
+                      if ops=3 then
+                        n:=1;
+                      if (oper[n+0]^.typ=top_reg) and
+                         (oper[n+1]^.typ=top_reg) and
+                         ((getregtype(oper[n+0]^.reg)<>regtype) or
+                          (getregtype(oper[n+1]^.reg)<>regtype) or
+                          (get_alias(getsupreg(oper[n+0]^.reg))<>get_alias(getsupreg(oper[n+1]^.reg)))) then
+                        begin
+                          if (getregtype(oper[n+0]^.reg)=regtype) and
+                             (get_alias(getsupreg(oper[n+0]^.reg))=orgreg) then
+                            replaceoper:=0+n
+                          else if (getregtype(oper[n+1]^.reg)=regtype) and
+                             (get_alias(getsupreg(oper[n+1]^.reg))=orgreg) then
+                            replaceoper:=1+n;
+                        end
+                      else if (oper[n+0]^.typ=top_reg) and
+                         (oper[n+1]^.typ=top_const) then
+                        begin
+                          if (getregtype(oper[0+n]^.reg)=regtype) and
+                             (get_alias(getsupreg(oper[0+n]^.reg))=orgreg) then
+                            replaceoper:=0+n
+                          else
+                            internalerror(200704282);
+                        end
+                      else if (oper[n+0]^.typ=top_const) and
+                         (oper[n+1]^.typ=top_reg) then
+                        begin
+                          if (getregtype(oper[1+n]^.reg)=regtype) and
+                             (get_alias(getsupreg(oper[1+n]^.reg))=orgreg) then
+                            replaceoper:=1+n
+                          else
+                            internalerror(200704283);
                         end;
                         end;
-                      end;
-                    1 :
-                      begin
-                        { Some instructions don't allow memory references
-                          for destination }
-                        case instr.opcode of
-                          A_CMOVcc,
-                          A_MOVZX,
-                          A_MOVSX,
-                          A_MOVSXD,
-                          A_MULSS,
-                          A_MULSD,
-                          A_SUBSS,
-                          A_SUBSD,
-                          A_ADDSD,
-                          A_ADDSS,
-                          A_DIVSD,
-                          A_DIVSS,
-                          A_SHLD,
-                          A_SHRD,
-                          A_COMISD,
-                          A_COMISS,
-                          A_CVTDQ2PD,
-                          A_CVTDQ2PS,
-                          A_CVTPD2DQ,
-                          A_CVTPD2PI,
-                          A_CVTPD2PS,
-                          A_CVTPI2PD,
-                          A_CVTPS2DQ,
-                          A_CVTPS2PD,
-                          A_CVTSD2SI,
-                          A_CVTSD2SS,
-                          A_CVTSI2SD,
-                          A_CVTSS2SD,
-                          A_CVTTPD2PI,
-                          A_CVTTPD2DQ,
-                          A_CVTTPS2DQ,
-                          A_CVTTSD2SI,
-                          A_CVTPI2PS,
-                          A_CVTPS2PI,
-                          A_CVTSI2SS,
-                          A_CVTSS2SI,
-                          A_CVTTPS2PI,
-                          A_CVTTSS2SI,
-                          A_IMUL,
-                          A_XORPD,
-                          A_XORPS,
-                          A_ORPD,
-                          A_ORPS,
-                          A_ANDPD,
-                          A_ANDPS,
-                          A_UNPCKLPS,
-                          A_UNPCKHPS,
-                          A_SHUFPD,
-                          A_SHUFPS:
-
-                            replaceoper:=-1;
+                      case replaceoper of
+                        0 :
+                          begin
+                            { Some instructions don't allow memory references
+                              for source }
+                            case instr.opcode of
+                              A_BT,
+                              A_BTS,
+                              A_BTC,
+                              A_BTR,
+
+                              { shufp* would require 16 byte alignment for memory locations so we force the source
+                                operand into a register }
+                              A_SHUFPD,
+                              A_SHUFPS :
+                                replaceoper:=-1;
+                            end;
+                          end;
+                        1 :
+                          begin
+                            { Some instructions don't allow memory references
+                              for destination }
+                            case instr.opcode of
+                              A_CMOVcc,
+                              A_MOVZX,
+                              A_MOVSX,
+                              A_MOVSXD,
+                              A_MULSS,
+                              A_MULSD,
+                              A_SUBSS,
+                              A_SUBSD,
+                              A_ADDSD,
+                              A_ADDSS,
+                              A_DIVSD,
+                              A_DIVSS,
+                              A_SHLD,
+                              A_SHRD,
+                              A_COMISD,
+                              A_COMISS,
+                              A_CVTDQ2PD,
+                              A_CVTDQ2PS,
+                              A_CVTPD2DQ,
+                              A_CVTPD2PI,
+                              A_CVTPD2PS,
+                              A_CVTPI2PD,
+                              A_CVTPS2DQ,
+                              A_CVTPS2PD,
+                              A_CVTSD2SI,
+                              A_CVTSD2SS,
+                              A_CVTSI2SD,
+                              A_CVTSS2SD,
+                              A_CVTTPD2PI,
+                              A_CVTTPD2DQ,
+                              A_CVTTPS2DQ,
+                              A_CVTTSD2SI,
+                              A_CVTPI2PS,
+                              A_CVTPS2PI,
+                              A_CVTSI2SS,
+                              A_CVTSS2SI,
+                              A_CVTTPS2PI,
+                              A_CVTTSS2SI,
+                              A_IMUL,
+                              A_XORPD,
+                              A_XORPS,
+                              A_ORPD,
+                              A_ORPS,
+                              A_ANDPD,
+                              A_ANDPS,
+                              A_UNPCKLPS,
+                              A_UNPCKHPS,
+                              A_SHUFPD,
+                              A_SHUFPS:
+
+                                replaceoper:=-1;
 {$ifdef x86_64}
 {$ifdef x86_64}
-                          A_MOV:
-                             { 64 bit constants can only be moved into registers }
-                             if (oper[0]^.typ=top_const) and
-                                (oper[1]^.typ=top_reg) and
-                                ((oper[0]^.val<low(longint)) or
-                                 (oper[0]^.val>high(longint))) then
-                               replaceoper:=-1;
+                              A_MOV:
+                                 { 64 bit constants can only be moved into registers }
+                                 if (oper[0]^.typ=top_const) and
+                                    (oper[1]^.typ=top_reg) and
+                                    ((oper[0]^.val<low(longint)) or
+                                     (oper[0]^.val>high(longint))) then
+                                   replaceoper:=-1;
 {$endif x86_64}
 {$endif x86_64}
+                            end;
+                          end;
                         end;
                         end;
-                      end;
                     end;
                     end;
                 end;
                 end;
              end;
              end;
 
 
-            {$ifdef x86_64}
+{$ifdef x86_64}
             { 32 bit operations on 32 bit registers on x86_64 can result in
             { 32 bit operations on 32 bit registers on x86_64 can result in
               zeroing the upper 32 bits of the register. This does not happen
               zeroing the upper 32 bits of the register. This does not happen
               with memory operations, so we have to perform these calculations
               with memory operations, so we have to perform these calculations
               in registers.  }
               in registers.  }
             if (instr.opsize=S_L) then
             if (instr.opsize=S_L) then
               replaceoper:=-1;
               replaceoper:=-1;
-            {$endif x86_64}
+{$endif x86_64}
 
 
             { Replace register with spill reference }
             { Replace register with spill reference }
             if replaceoper<>-1 then
             if replaceoper<>-1 then
@@ -287,6 +308,10 @@ implementation
                     opcode:=A_MOVSS;
                     opcode:=A_MOVSS;
                   A_MOVAPD:
                   A_MOVAPD:
                     opcode:=A_MOVSD;
                     opcode:=A_MOVSD;
+                  A_VMOVAPS:
+                    opcode:=A_VMOVSS;
+                  A_VMOVAPD:
+                    opcode:=A_VMOVSD;
                 end;
                 end;
                 result:=true;
                 result:=true;
               end;
               end;

+ 10 - 10
compiler/x86/x86ins.dat

@@ -3453,22 +3453,22 @@ void                   \326\1\xA7                                    X86_64
 
 
 
 
 [VADDPD]
 [VADDPD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmrm                  \361\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmrm                  \361\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \361\362\364\370\1\x58\75\120        AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \361\362\364\370\1\x58\75\120        AVX,SANDYBRIDGE
 
 
 [VADDPS]
 [VADDPS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmrm                  \362\370\1\x58\75\120                AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmrm                  \362\370\1\x58\75\120                AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \362\364\370\1\x58\75\120            AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \362\364\370\1\x58\75\120            AVX,SANDYBRIDGE
 
 
 [VADDSD]
 [VADDSD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,mem64                  \334\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,mem64                  \334\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 
 
 [VADDSS]
 [VADDSS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,mem32                  \333\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,mem32                  \333\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x58\75\120            AVX,SANDYBRIDGE
 
 
@@ -3919,7 +3919,7 @@ rm64,xmmreg                          \361\362\363\370\1\x7E\101           AVX,SA
 xmmreg,rm64                          \361\362\363\370\1\x6E\110           AVX,SANDYBRIDGE
 xmmreg,rm64                          \361\362\363\370\1\x6E\110           AVX,SANDYBRIDGE
 
 
 [VMOVSD]
 [VMOVSD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x10\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x10\75\120            AVX,SANDYBRIDGE
 xmmreg,mem64                         \334\362\370\1\x10\110               AVX,SANDYBRIDGE
 xmmreg,mem64                         \334\362\370\1\x10\110               AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x11\75\102            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x11\75\102            AVX,SANDYBRIDGE
@@ -3936,7 +3936,7 @@ xmmreg,xmmrm                         \333\362\370\1\x12\110               AVX,SA
 ymmreg,ymmrm                         \333\362\364\370\1\x12\110           AVX,SANDYBRIDGE
 ymmreg,ymmrm                         \333\362\364\370\1\x12\110           AVX,SANDYBRIDGE
 
 
 [VMOVSS]
 [VMOVSS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x10\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x10\75\120            AVX,SANDYBRIDGE
 xmmreg,mem64                         \333\362\370\1\x10\110               AVX,SANDYBRIDGE
 xmmreg,mem64                         \333\362\370\1\x10\110               AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x11\75\102            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x11\75\102            AVX,SANDYBRIDGE
@@ -3961,22 +3961,22 @@ ymmrm,ymmreg                         \362\364\370\1\x11\101               AVX,SA
 xmmreg,xmmreg,xmmrm,imm8             \361\362\372\1\x42\75\120\27         AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmrm,imm8             \361\362\372\1\x42\75\120\27         AVX,SANDYBRIDGE
 
 
 [VMULPD]
 [VMULPD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmrm                  \361\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmrm                  \361\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \361\362\364\370\1\x59\75\120        AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \361\362\364\370\1\x59\75\120        AVX,SANDYBRIDGE
 
 
 [VMULPS]
 [VMULPS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,xmmrm                  \362\370\1\x59\75\120                AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmrm                  \362\370\1\x59\75\120                AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \362\364\370\1\x59\75\120            AVX,SANDYBRIDGE
 ymmreg,ymmreg,ymmrm                  \362\364\370\1\x59\75\120            AVX,SANDYBRIDGE
 
 
 [VMULSD]
 [VMULSD]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,mem64                  \334\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,mem64                  \334\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \334\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 
 
 [VMULSS]
 [VMULSS]
-(Ch_All, Ch_None, Ch_None)
+(Ch_Wop3, Ch_Rop2, Ch_Rop1)
 xmmreg,xmmreg,mem32                  \333\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,mem32                  \333\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 xmmreg,xmmreg,xmmreg                 \333\362\370\1\x59\75\120            AVX,SANDYBRIDGE
 
 

+ 8 - 4
compiler/x86_64/cpuinfo.pas

@@ -51,7 +51,8 @@ Type
       fpu_ssse3,
       fpu_ssse3,
       fpu_sse41,
       fpu_sse41,
       fpu_sse42,
       fpu_sse42,
-      fpu_avx
+      fpu_avx,
+      fpu_avx2
      );
      );
 
 
 Const
 Const
@@ -86,11 +87,14 @@ Const
      'SSSE3',
      'SSSE3',
      'SSE41',
      'SSE41',
      'SSE42',
      'SSE42',
-     'AVX'
+     'AVX',
+     'AVX2'
    );
    );
 
 
-   sse_singlescalar : set of tfputype = [fpu_sse64,fpu_sse3];
-   sse_doublescalar : set of tfputype = [fpu_sse64,fpu_sse3];
+   sse_singlescalar = [fpu_sse64..fpu_avx2];
+   sse_doublescalar = [fpu_sse64..fpu_avx2];
+
+   fpu_avx_instructionsets = [fpu_avx,fpu_avx2];
 
 
    { Supported optimizations, only used for information }
    { Supported optimizations, only used for information }
    supported_optimizerswitches = genericlevel1optimizerswitches+
    supported_optimizerswitches = genericlevel1optimizerswitches+

+ 10 - 10
compiler/x86_64/x8664pro.inc

@@ -685,6 +685,10 @@
 (Ch: (Ch_RRAX, Ch_WMemEDI, Ch_RWRDI)),
 (Ch: (Ch_RRAX, Ch_WMemEDI, Ch_RWRDI)),
 (Ch: (Ch_WRAX, Ch_RWRSI, Ch_None)),
 (Ch: (Ch_WRAX, Ch_RWRSI, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
@@ -772,21 +776,17 @@
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
-(Ch: (Ch_All, Ch_None, Ch_None)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
+(Ch: (Ch_Wop3, Ch_Rop2, Ch_Rop1)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),
 (Ch: (Ch_All, Ch_None, Ch_None)),