2
0
Эх сурвалжийг харах

+ implementation of the vectorcall calling convention by J. Gareth Moreton
+ tests

git-svn-id: trunk@38206 -

florian 7 жил өмнө
parent
commit
31f78ea2b6

+ 3 - 0
.gitattributes

@@ -11999,6 +11999,9 @@ tests/test/cg/ttryfin4.pp svneol=native#text/plain
 tests/test/cg/ttryfin5.pp svneol=native#text/plain
 tests/test/cg/tumin.pp svneol=native#text/plain
 tests/test/cg/tvec.pp svneol=native#text/plain
+tests/test/cg/tvectorcall1.pp svneol=native#text/pascal
+tests/test/cg/tvectorcall2.pp svneol=native#text/pascal
+tests/test/cg/tvectorcall3.pp svneol=native#text/pascal
 tests/test/cg/uandorxorassign.pp svneol=native#text/plain
 tests/test/cg/unegnotassign.pp svneol=native#text/plain
 tests/test/cg/uprintf3.pp svneol=native#text/plain

+ 83 - 25
compiler/cgbase.pas

@@ -164,14 +164,18 @@ interface
        { OS_NO is also used memory references with large data that can
          not be loaded in a register directly }
        TCgSize = (OS_NO,
-                 { integer registers }
-                  OS_8,OS_16,OS_32,OS_64,OS_128,OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,
-                 { single,double,extended,comp,float128 }
-                  OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
+                  OS_8,   OS_16,   OS_32,   OS_64,   OS_128,
+                  OS_S8,  OS_S16,  OS_S32,  OS_S64,  OS_S128,
+                 { single, double, extended, comp, float128 }
+                  OS_F32, OS_F64,  OS_F80,  OS_C64,  OS_F128,
                  { multi-media sizes: split in byte, word, dword, ... }
                  { entities, then the signed counterparts             }
-                  OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,  
-                  OS_MS8,OS_MS16,OS_MS32,OS_MS64,OS_MS128,OS_MS256 );  
+                  OS_M8,  OS_M16,  OS_M32,  OS_M64,  OS_M128,  OS_M256,  OS_M512,
+                  OS_MS8, OS_MS16, OS_MS32, OS_MS64, OS_MS128, OS_MS256, OS_MS512,
+                 { multi-media sizes: single-precision floating-point }
+                  OS_MF32, OS_MF128, OS_MF256, OS_MF512,
+                 { multi-media sizes: double-precision floating-point }
+                  OS_MD64, OS_MD128, OS_MD256, OS_MD512);
 
       { Register types }
       TRegisterType = (
@@ -205,15 +209,16 @@ interface
         { For Intel X86 AVX-Register }
         R_SUBMMX,     { = 12; 128 BITS }
         R_SUBMMY,     { = 13; 256 BITS }
+        R_SUBMMZ,     { = 14; 512 BITS }
         { Subregisters for the flags register (x86) }
-        R_SUBFLAGCARRY,     { = 14; Carry flag }
-        R_SUBFLAGPARITY,    { = 15; Parity flag }
-        R_SUBFLAGAUXILIARY, { = 16; Auxiliary flag }
-        R_SUBFLAGZERO,      { = 17; Zero flag }
-        R_SUBFLAGSIGN,      { = 18; Sign flag }
-        R_SUBFLAGOVERFLOW,  { = 19; Overflow flag }
-        R_SUBFLAGINTERRUPT, { = 20; Interrupt enable flag }
-        R_SUBFLAGDIRECTION  { = 21; Direction flag }
+        R_SUBFLAGCARRY,     { = 15; Carry flag }
+        R_SUBFLAGPARITY,    { = 16; Parity flag }
+        R_SUBFLAGAUXILIARY, { = 17; Auxiliary flag }
+        R_SUBFLAGZERO,      { = 18; Zero flag }
+        R_SUBFLAGSIGN,      { = 19; Sign flag }
+        R_SUBFLAGOVERFLOW,  { = 20; Overflow flag }
+        R_SUBFLAGINTERRUPT, { = 21; Interrupt enable flag }
+        R_SUBFLAGDIRECTION  { = 22; Direction flag }
       );
       TSubRegisterSet = set of TSubRegister;
 
@@ -307,12 +312,19 @@ interface
        NR_INVALID    = tregister($fffffffff);
 
        tcgsize2size : Array[tcgsize] of integer =
+        (0,
          { integer values }
-        (0,1,2,4,8,16,1,2,4,8,16,
+         1,  2,  4,  8, 16,
+         1,  2,  4,  8, 16,
          { floating point values }
-         4,8,10,8,16,
+         4,  8, 10,  8, 16,
          { multimedia values }
-         1,2,4,8,16,32,1,2,4,8,16,32); 
+         1,  2,  4,  8, 16, 32, 64,
+         1,  2,  4,  8, 16, 32, 64,
+         { single-precision multimedia values }
+         4, 16, 32, 64,
+         { double-precision multimedia values }
+         8, 16, 32, 64);
 
        tfloat2tcgsize: array[tfloattype] of tcgsize =
          (OS_F32,OS_F64,OS_F80,OS_F80,OS_C64,OS_C64,OS_F128);
@@ -348,16 +360,25 @@ interface
        { Table to convert tcgsize variables to the correspondending
          unsigned types }
        tcgsize2unsigned : array[tcgsize] of tcgsize = (OS_NO,
-          OS_8,OS_16,OS_32,OS_64,OS_128,OS_8,OS_16,OS_32,OS_64,OS_128,
-          OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
-          OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,OS_M8,OS_M16,OS_M32,
-          OS_M64,OS_M128,OS_M256);
+         OS_8,    OS_16,   OS_32,   OS_64,   OS_128,
+         OS_8,    OS_16,   OS_32,   OS_64,   OS_128,
+
+         OS_F32,  OS_F64,  OS_F80,  OS_C64,  OS_F128,
+         OS_M8,   OS_M16,  OS_M32,  OS_M64,  OS_M128, OS_M256, OS_M512,
+         OS_M8,   OS_M16,  OS_M32,  OS_M64,  OS_M128, OS_M256, OS_M512,
+         OS_MF32, OS_MF128,OS_MF256,OS_MF512,
+         OS_MD64, OS_MD128,OS_MD256,OS_MD512);
+
 
        tcgsize2signed : array[tcgsize] of tcgsize = (OS_NO,
-          OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,
-          OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
-          OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,OS_M8,OS_M16,OS_M32,
-          OS_M64,OS_M128,OS_M256);
+         OS_S8,   OS_S16,  OS_S32,  OS_S64,  OS_S128,
+         OS_S8,   OS_S16,  OS_S32,  OS_S64,  OS_S128,
+
+         OS_F32,  OS_F64,  OS_F80,  OS_C64,  OS_F128,
+         OS_MS8,  OS_MS16, OS_MS32, OS_MS64, OS_MS128,OS_MS256,OS_MS512,
+         OS_MS8,  OS_MS16, OS_MS32, OS_MS64, OS_MS128,OS_MS256,OS_MS512,
+         OS_MF32, OS_MF128,OS_MF256,OS_MF512,
+         OS_MD64, OS_MD128,OS_MD256,OS_MD512);
 
 
        tcgloc2str : array[TCGLoc] of string[12] = (
@@ -404,6 +425,8 @@ interface
     }
     function int_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
     function int_float_cgsize(const a: tcgint): tcgsize;
+    function float_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
+    function double_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
 
     function tcgsize2str(cgsize: tcgsize):string;
 
@@ -660,6 +683,8 @@ implementation
             result:=result+'mx';
           R_SUBMMY:
             result:=result+'my';
+          R_SUBMMZ:
+            result:=result+'mz';
           else
             internalerror(200308252);
         end;
@@ -701,6 +726,39 @@ implementation
       end;
 
 
+    function float_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
+      begin
+        case a of
+          4:
+            result := OS_MF32;
+          16:
+            result := OS_MF128;
+          32:
+            result := OS_MF256;
+          64:
+            result := OS_MF512;
+          else
+            result := int_cgsize(a);
+        end;
+      end;
+
+    function double_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
+      begin
+        case a of
+          8:
+            result := OS_MD64;
+          16:
+            result := OS_MD128;
+          32:
+            result := OS_MD256;
+          64:
+            result := OS_MD512;
+          else
+            result := int_cgsize(a);
+        end;
+      end;
+
+
     function tcgsize2str(cgsize: tcgsize):string;
       begin
         Str(cgsize, Result);

+ 64 - 19
compiler/defutil.pas

@@ -1338,7 +1338,24 @@ implementation
           arraydef :
             begin
               if is_dynamic_array(def) or not is_special_array(def) then
-                result := int_cgsize(def.size)
+                begin
+                  if (cs_support_vectors in current_settings.globalswitches) and is_vector(def) and ((TArrayDef(def).elementdef.typ = floatdef) and not (cs_fp_emulation in current_settings.moduleswitches)) then
+                    begin
+                      { Determine if, based on the floating-point type and the size
+                        of the array, if it can be made into a vector }
+                      case TFloatDef(def).floattype of
+                        s32real:
+                          result := float_array_cgsize(def.size);
+                        s64real:
+                          result := double_array_cgsize(def.size);
+                        else
+                          { If not, fall back }
+                          result := int_cgsize(def.size);
+                      end;
+                    end
+                  else
+                    result := int_cgsize(def.size);
+                end
               else
                 result := OS_NO;
             end;
@@ -1379,25 +1396,53 @@ implementation
         case def.typ of
           arraydef:
             begin
-              if tarraydef(def).elementdef.typ in [orddef,floatdef] then
-                begin
-                  { this is not correct, OS_MX normally mean that the vector
-                    contains elements of size X. However, vectors themselves
-                    can also have different sizes (e.g. a vector of 2 singles on
-                    SSE) and the total size is currently more important }
-                  case def.size of
-                    1: result:=OS_M8;
-                    2: result:=OS_M16;
-                    4: result:=OS_M32;
-                    8: result:=OS_M64;
-                    16: result:=OS_M128;
-                    32: result:=OS_M256;
-                    else
-                      internalerror(2013060103);
+              case tarraydef(def).elementdef.typ of
+                orddef:
+                  begin
+                    { this is not correct, OS_MX normally mean that the vector
+                      contains elements of size X. However, vectors themselves
+                      can also have different sizes (e.g. a vector of 2 singles on
+                      SSE) and the total size is currently more important }
+                    case def.size of
+                      1: result:=OS_M8;
+                      2: result:=OS_M16;
+                      4: result:=OS_M32;
+                      8: result:=OS_M64;
+                      16: result:=OS_M128;
+                      32: result:=OS_M256;
+                      64: result:=OS_M512;
+                      else
+                        internalerror(2013060103);
+                    end;
                   end;
-                end
-              else
-                result:=def_cgsize(def);
+                floatdef:
+                  begin
+                    case TFloatDef(tarraydef(def).elementdef).floattype of
+                      s32real:
+                        case def.size of
+                          4:  result:=OS_MF32;
+                          16: result:=OS_MF128;
+                          32: result:=OS_MF256;
+                          64: result:=OS_MF512;
+                          else
+                            internalerror(2017121400);
+                        end;
+                      s64real:
+                        case def.size of
+                          8:  result:=OS_MD64;
+                          16: result:=OS_MD128;
+                          32: result:=OS_MD256;
+                          64: result:=OS_MD512;
+                          else
+                            internalerror(2017121401);
+                        end;
+                      else
+                        internalerror(2017121402);
+                    end;
+                  end;
+                else
+                  result:=def_cgsize(def);
+              end;
             end
           else
             result:=def_cgsize(def);

+ 2 - 1
compiler/globals.pas

@@ -1112,7 +1112,8 @@ implementation
          'SYSV_ABI_DEFAULT',
          'SYSV_ABI_CDECL',
          'MS_ABI_DEFAULT',
-         'MS_ABI_CDECL'
+         'MS_ABI_CDECL',
+         'VECTORCALL'
         );
       var
         t  : tproccalloption;

+ 6 - 3
compiler/globtype.pas

@@ -539,7 +539,9 @@ interface
          pocall_sysv_abi_cdecl,
          { for x86-64: forces Microsoft ABI (Pascal resp. C) }
          pocall_ms_abi_default,
-         pocall_ms_abi_cdecl
+         pocall_ms_abi_cdecl,
+         { for x86-64: Microsoft's "vectorcall" ABI }
+         pocall_vectorcall
        );
        tproccalloptions = set of tproccalloption;
 
@@ -560,9 +562,10 @@ interface
            'Interrupt',
            'HardFloat',
            'SysV_ABI_Default',
-           'MS_ABI_CDecl',
+           'MS_ABI_CDecl', { TODO: Is this correct? Shouldn't it be SysV_ABI_Default }
            'MS_ABI_Default',
-           'MS_ABI_CDecl'
+           'MS_ABI_CDecl',
+           'VectorCall'
          );
 
        { Default calling convention }

+ 2 - 0
compiler/hlcg2ll.pas

@@ -1538,6 +1538,8 @@ implementation
               result:=OS_F32;
             OS_64:
               result:=OS_F64;
+            OS_128:
+              result:=OS_M128;
           end;
         end;
     end;

+ 2 - 1
compiler/i386/cpubase.inc

@@ -35,7 +35,8 @@
         S_NEAR,S_FAR,S_SHORT,
         S_T,
         S_XMM,
-        S_YMM
+        S_YMM,
+        S_ZMM
       );
 
       TOpSizes = set of topsize;

+ 2 - 1
compiler/i8086/cpubase.inc

@@ -35,7 +35,8 @@
         S_NEAR,S_FAR,S_SHORT,
         S_T,
         S_XMM,
-        S_YMM
+        S_YMM,
+        S_ZMM
       );
 
       TOpSizes = set of topsize;

+ 15 - 15
compiler/ncgld.pas

@@ -682,6 +682,7 @@ implementation
 
     procedure tcgassignmentnode.pass_generate_code;
       var
+         shuffle : pmmshuffle;
          hlabel : tasmlabel;
          href : treference;
          releaseright : boolean;
@@ -968,22 +969,21 @@ implementation
               LOC_MMREGISTER,
               LOC_CMMREGISTER:
                 begin
-                  if left.resultdef.typ=arraydef then
-                    begin
-                    end
+                  if (is_vector(left.resultdef)) then
+                    shuffle := nil
                   else
-                    begin
-                      case left.location.loc of
-                        LOC_CMMREGISTER,
-                        LOC_MMREGISTER:
-                          hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.register,mms_movescalar);
-                        LOC_REFERENCE,
-                        LOC_CREFERENCE:
-                          hlcg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.reference,mms_movescalar);
-                        else
-                          internalerror(2009112601);
-                      end;
-                    end;
+                    shuffle := mms_movescalar;
+
+                  case left.location.loc of
+                    LOC_CMMREGISTER,
+                    LOC_MMREGISTER:
+                      hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.register, shuffle);
+                    LOC_REFERENCE,
+                    LOC_CREFERENCE:
+                      hlcg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.reference, shuffle);
+                    else
+                      internalerror(2009112601);
+                  end;
                 end;
               LOC_REGISTER,
               LOC_CREGISTER :

+ 2 - 1
compiler/ncgrtti.pas

@@ -335,7 +335,8 @@ implementation
          { pocall_sysv_abi_default } 14,
          { pocall_sysv_abi_cdecl }   15,
          { pocall_ms_abi_default }   16,
-         { pocall_ms_abi_cdecl }     17
+         { pocall_ms_abi_cdecl }     17,
+         { pocall_vectorcall }       18
         );
       begin
         tcb.emit_ord_const(ProcCallOptionToCallConv[def.proccalloption],u8inttype);

+ 10 - 1
compiler/pdecsub.pas

@@ -2382,7 +2382,7 @@ type
    end;
 const
   {Should contain the number of procedure directives we support.}
-  num_proc_directives=50;
+  num_proc_directives=51;
   proc_direcdata:array[1..num_proc_directives] of proc_dir_rec=
    (
     (
@@ -2849,6 +2849,15 @@ const
       mutexclpocall : [];
       mutexclpotype : [potype_constructor,potype_destructor,potype_class_constructor,potype_class_destructor];
       mutexclpo     : [po_interrupt]
+    ),(
+      idtok:_VECTORCALL;
+      pd_flags : [pd_interface,pd_implemen,pd_body,pd_procvar];
+      handler  : nil;
+      pocall   : pocall_vectorcall;
+      pooption : [];
+      mutexclpocall : [];
+      mutexclpotype : [potype_constructor,potype_destructor,potype_class_constructor,potype_class_destructor];
+      mutexclpo     : [po_interrupt]
     )
    );
 

+ 9 - 0
compiler/symtype.pas

@@ -82,6 +82,7 @@ interface
          function  alignment:shortint;virtual;abstract;
          { alignment when this type appears in a record/class/... }
          function  structalignment:shortint;virtual;
+         function  aggregatealignment:shortint;virtual;
          function  getvardef:longint;virtual;abstract;
          function  getparentdef:tdef;virtual;
          function  getsymtable(t:tgetsymtable):TSymtable;virtual;
@@ -379,6 +380,14 @@ implementation
         result:=alignment;
       end;
 
+    function tdef.aggregatealignment: shortint;
+      begin
+        if Assigned(Owner) and Assigned(Owner.defowner) and (Owner.defowner is TDef) and (Owner.defowner <> Self) then
+          Result := max(structalignment, TDef(Owner.defowner).aggregatealignment)
+        else
+          Result := structalignment;
+      end;
+
 
     procedure tdef.ChangeOwner(st:TSymtable);
       begin

+ 2 - 0
compiler/tokens.pas

@@ -289,6 +289,7 @@ type
     _OPENSTRING,
     _RIGHTSHIFT,
     _SPECIALIZE,
+    _VECTORCALL,
     _CONSTRUCTOR,
     _GREATERTHAN,
     _INTERNCONST,
@@ -628,6 +629,7 @@ const
       (str:'OPENSTRING'    ;special:false;keyword:[m_none];op:NOTOKEN),
       (str:'RIGHTSHIFT'    ;special:false;keyword:[m_none];op:NOTOKEN), { delphi operator name }
       (str:'SPECIALIZE'    ;special:false;keyword:[m_none];op:NOTOKEN),
+      (str:'VECTORCALL'    ;special:false;keyword:[m_none];op:NOTOKEN),
       (str:'CONSTRUCTOR'   ;special:false;keyword:alllanguagemodes-[m_iso,m_extpas];op:NOTOKEN),
       (str:'GREATERTHAN'   ;special:false;keyword:[m_none];op:NOTOKEN), { delphi operator name }
       (str:'INTERNCONST'   ;special:false;keyword:[m_none];op:NOTOKEN),

+ 19 - 9
compiler/x86/aasmcpu.pas

@@ -52,6 +52,7 @@ interface
       OT_BITS64    = $00000008;  { x86_64 and FPU }
       OT_BITS128   = $10000000;  { 16 byte SSE }
       OT_BITS256   = $20000000;  { 32 byte AVX }
+      OT_BITS512   = $40000000;  { 64 byte AVX512 }
       OT_BITS80    = $00000010;  { FPU only  }
       OT_FAR       = $00000020;  { this means 16:16 or 16:32, like in CALL/JMP }
       OT_NEAR      = $00000040;
@@ -612,7 +613,8 @@ implementation
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          ),
          (OT_NONE,
           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,OT_BITS8,OT_BITS16,OT_BITS32,
@@ -622,7 +624,8 @@ implementation
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          ),
          (OT_NONE,
           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,OT_NONE,OT_NONE,OT_NONE,
@@ -632,7 +635,8 @@ implementation
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          )
        );
 
@@ -650,7 +654,8 @@ implementation
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          ),
          (OT_NONE,
           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,
@@ -660,7 +665,8 @@ implementation
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          ),
          (OT_NONE,
           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,
@@ -670,7 +676,8 @@ implementation
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          )
       );
 
@@ -688,7 +695,8 @@ implementation
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          ),
          (OT_NONE,
           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,
@@ -698,7 +706,8 @@ implementation
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          ),
          (OT_NONE,
           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,
@@ -708,7 +717,8 @@ implementation
           OT_NEAR,OT_FAR,OT_SHORT,
           OT_NONE,
           OT_BITS128,
-          OT_BITS256
+          OT_BITS256,
+          OT_BITS512
          )
       );
 

+ 386 - 36
compiler/x86/cgx86.pas

@@ -158,20 +158,26 @@ unit cgx86;
       TCGSize2OpSize: Array[tcgsize] of topsize =
         (S_NO,S_B,S_W,S_L,S_Q,S_XMM,S_B,S_W,S_L,S_Q,S_XMM,
          S_FS,S_FL,S_FX,S_IQ,S_FXX,
-         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
-         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
+         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_XMM,S_YMM,S_ZMM);
 {$elseif defined(i386)}
       TCGSize2OpSize: Array[tcgsize] of topsize =
         (S_NO,S_B,S_W,S_L,S_L,S_T,S_B,S_W,S_L,S_L,S_L,
          S_FS,S_FL,S_FX,S_IQ,S_FXX,
-         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
-         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
+         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_XMM,S_YMM,S_ZMM);
 {$elseif defined(i8086)}
       TCGSize2OpSize: Array[tcgsize] of topsize =
         (S_NO,S_B,S_W,S_W,S_W,S_T,S_B,S_W,S_W,S_W,S_W,
          S_FS,S_FL,S_FX,S_IQ,S_FXX,
-         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
-         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
+         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_XMM,S_YMM,S_ZMM,
+         S_NO,S_XMM,S_YMM,S_ZMM);
 {$endif}
 
 {$ifndef NOTARGETWIN}
@@ -185,6 +191,9 @@ unit cgx86;
     { returns true, if the compiler should use leave instead of mov/pop }
     function UseLeave: boolean;
 
+    { Gets the byte alignment of a reference }
+    function GetRefAlignment(ref: treference): Byte;
+
   implementation
 
     uses
@@ -225,6 +234,22 @@ unit cgx86;
 {$endif}
       end;
 
+    function GetRefAlignment(ref: treference): Byte; {$IFDEF USEINLINE}inline;{$ENDIF}
+      begin
+{$ifdef x86_64}
+        { The stack pointer and base pointer will be aligned to 16-byte boundaries if the machine code is well-behaved }
+        if (ref.base = NR_RSP) or (ref.base = NR_RBP) then
+          begin
+            if (ref.index = NR_NO) and ((ref.offset mod 16) = 0) then
+              Result := 16
+            else
+              Result := ref.alignment;
+          end
+        else
+{$endif x86_64}
+          Result := ref.alignment;
+      end;
+
     const
       TOpCG2AsmOp: Array[topcg] of TAsmOp = (A_NONE,A_MOV,A_ADD,A_AND,A_DIV,
                             A_IDIV,A_IMUL,A_MUL,A_NEG,A_NOT,A_OR,
@@ -268,8 +293,19 @@ unit cgx86;
             result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS);
           OS_M64:
             result:=rg[R_MMREGISTER].getregister(list,R_SUBQ);
-          OS_M128:
-            result:=rg[R_MMREGISTER].getregister(list,R_SUBMMWHOLE);
+          OS_M128,
+          OS_F128,
+          OS_MF128,
+          OS_MD128:
+            result:=rg[R_MMREGISTER].getregister(list,R_SUBMMX); { R_SUBMMWHOLE seems a bit dangerous and ambiguous, so changed to R_SUBMMX. [Kit] }
+          OS_M256,
+          OS_MF256,
+          OS_MD256:
+            result:=rg[R_MMREGISTER].getregister(list,R_SUBMMY);
+          OS_M512,
+          OS_MF512,
+          OS_MD512:
+            result:=rg[R_MMREGISTER].getregister(list,R_SUBMMZ);
           else
             internalerror(200506041);
         end;
@@ -1260,13 +1296,13 @@ unit cgx86;
           (A_CVTSD2SS,A_MOVSD,A_NONE,A_NONE,A_NONE),
           (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
           (A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
-          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
+          (A_NONE,A_NONE,A_NONE,A_NONE,A_MOVAPS));
         convertopavx : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
           (A_VMOVSS,A_VCVTSS2SD,A_NONE,A_NONE,A_NONE),
           (A_VCVTSD2SS,A_VMOVSD,A_NONE,A_NONE,A_NONE),
           (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
           (A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
-          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
+          (A_NONE,A_NONE,A_NONE,A_NONE,A_VMOVAPS));
       begin
         { we can have OS_F32/OS_F64 (record in function result/LOC_MMREGISTER) to
           OS_32/OS_64 (record in memory/LOC_REFERENCE) }
@@ -1288,13 +1324,33 @@ unit cgx86;
           end
         { we can have OS_M64 (record in function result/LOC_MMREGISTER) to
           OS_64 (record in memory/LOC_REFERENCE) }
-        else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) and
-                (fromsize=OS_M64) then
+        else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) then
           begin
-            if UseAVX then
-              result:=A_VMOVQ
-            else
-              result:=A_MOVQ;
+            case fromsize of
+              OS_M64:
+                { we can have OS_M64 (record in function result/LOC_MMREGISTER) to
+                  OS_64 (record in memory/LOC_REFERENCE) }
+                if UseAVX then
+                  result:=A_VMOVQ
+                else
+                  result:=A_MOVQ;
+              OS_M128:
+                { 128-bit aligned vector }
+                if UseAVX then
+                  result:=A_VMOVAPS
+                else
+                  result:=A_MOVAPS;
+              OS_M256,
+              OS_M512:
+                { 256-bit aligned vector }
+                if UseAVX then
+                  result:=A_VMOVAPS
+                else
+                  { SSE does not support 256-bit or 512-bit vectors }
+                  InternalError(2018012930);
+              else
+                InternalError(2018012920);
+            end;
           end
         else
           internalerror(2010060104);
@@ -1313,12 +1369,14 @@ unit cgx86;
             if fromsize=tosize then
               { needs correct size in case of spilling }
               case fromsize of
-                OS_F32:
+                OS_F32,
+                OS_MF128:
                   if UseAVX then
                     instr:=taicpu.op_reg_reg(A_VMOVAPS,S_NO,reg1,reg2)
                   else
                     instr:=taicpu.op_reg_reg(A_MOVAPS,S_NO,reg1,reg2);
-                OS_F64:
+                OS_F64,
+                OS_MD128:
                   if UseAVX then
                     instr:=taicpu.op_reg_reg(A_VMOVAPD,S_NO,reg1,reg2)
                   else
@@ -1328,6 +1386,32 @@ unit cgx86;
                     instr:=taicpu.op_reg_reg(A_VMOVQ,S_NO,reg1,reg2)
                   else
                     instr:=taicpu.op_reg_reg(A_MOVQ,S_NO,reg1,reg2);
+                OS_M128, OS_MS128:
+                  if UseAVX then
+                    instr:=taicpu.op_reg_reg(A_VMOVDQA,S_NO,reg1,reg2)
+                  else
+                    instr:=taicpu.op_reg_reg(A_MOVDQA,S_NO,reg1,reg2);
+                OS_MF256,
+                OS_MF512:
+                  if UseAVX then
+                    instr:=taicpu.op_reg_reg(A_VMOVAPS,S_NO,reg1,reg2)
+                  else
+                    { SSE doesn't support 512-bit vectors }
+                    InternalError(2018012931);
+                OS_MD256,
+                OS_MD512:
+                  if UseAVX then
+                    instr:=taicpu.op_reg_reg(A_VMOVAPD,S_NO,reg1,reg2)
+                  else
+                    { SSE doesn't support 512-bit vectors }
+                    InternalError(2018012932);
+                OS_M256, OS_MS256,
+                OS_M512, OS_MS512:
+                  if UseAVX then
+                    instr:=taicpu.op_reg_reg(A_VMOVDQA,S_NO,reg1,reg2)
+                  else
+                    { SSE doesn't support 512-bit vectors }
+                    InternalError(2018012933);
                 else
                   internalerror(2006091201);
               end
@@ -1385,15 +1469,152 @@ unit cgx86;
          make_simple_ref(list,tmpref);
          if shuffle=nil then
            begin
-             if fromsize=OS_M64 then
-               list.concat(taicpu.op_ref_reg(A_MOVQ,S_NO,tmpref,reg))
-             else
-{$ifdef x86_64}
-               { x86-64 has always properly aligned data }
-               list.concat(taicpu.op_ref_reg(A_MOVDQA,S_NO,tmpref,reg));
-{$else x86_64}
-               list.concat(taicpu.op_ref_reg(A_MOVDQU,S_NO,tmpref,reg));
-{$endif x86_64}
+             case fromsize of
+               OS_F32:
+                 if UseAVX then
+                   op := A_VMOVSS
+                 else
+                   op := A_MOVSS;
+               OS_F64:
+                 if UseAVX then
+                   op := A_VMOVSD
+                 else
+                   op := A_MOVSD;
+               OS_M32, OS_32, OS_S32:
+                 if UseAVX then
+                   op := A_VMOVD
+                 else
+                   op := A_MOVD;
+               OS_M64, OS_64, OS_S64:
+                 if UseAVX then
+                   op := A_VMOVQ
+                 else
+                   op := A_MOVQ;
+               OS_MF128:
+                 { Use XMM transfer of packed singles }
+                 if UseAVX then
+                   begin
+                     if GetRefAlignment(tmpref) = 16 then
+                       op := A_VMOVAPS
+                     else
+                       op := A_VMOVUPS
+                   end
+                 else
+                   begin
+                     if GetRefAlignment(tmpref) = 16 then
+                       op := A_MOVAPS
+                     else
+                       op := A_MOVUPS
+                   end;
+               OS_MD128:
+                 { Use XMM transfer of packed doubles }
+                 if UseAVX then
+                   begin
+                     if GetRefAlignment(tmpref) = 16 then
+                       op := A_VMOVAPD
+                     else
+                       op := A_VMOVUPD
+                   end
+                 else
+                   begin
+                     if GetRefAlignment(tmpref) = 16 then
+                       op := A_MOVAPD
+                     else
+                       op := A_MOVUPD
+                   end;
+               OS_M128, OS_MS128:
+                 { Use XMM integer transfer }
+                 if UseAVX then
+                   begin
+                     if GetRefAlignment(tmpref) = 16 then
+                       op := A_VMOVDQA
+                     else
+                       op := A_VMOVDQU
+                   end
+                 else
+                   begin
+                     if GetRefAlignment(tmpref) = 16 then
+                       op := A_MOVDQA
+                     else
+                       op := A_MOVDQU
+                   end;
+               OS_MF256:
+                 { Use YMM transfer of packed singles }
+                 if UseAVX then
+                   begin
+                     if GetRefAlignment(tmpref) = 32 then
+                       op := A_VMOVAPS
+                     else
+                       op := A_VMOVUPS
+                   end
+                 else
+                   { SSE doesn't support 256-bit vectors }
+                   InternalError(2018012934);
+               OS_MD256:
+                 { Use YMM transfer of packed doubles }
+                 if UseAVX then
+                   begin
+                     if GetRefAlignment(tmpref) = 32 then
+                       op := A_VMOVAPD
+                     else
+                       op := A_VMOVUPD
+                   end
+                 else
+                   { SSE doesn't support 256-bit vectors }
+                   InternalError(2018012935);
+               OS_M256, OS_MS256:
+                 { Use YMM integer transfer }
+                 if UseAVX then
+                   begin
+                     if GetRefAlignment(tmpref) = 32 then
+                       op := A_VMOVDQA
+                     else
+                       op := A_VMOVDQU
+                   end
+                 else
+                   { SSE doesn't support 256-bit vectors }
+                   InternalError(2018012936);
+               OS_MF512:
+                 { Use ZMM transfer of packed singles }
+                 if UseAVX then
+                   begin
+                     if GetRefAlignment(tmpref) = 64 then
+                       op := A_VMOVAPS
+                     else
+                       op := A_VMOVUPS
+                   end
+                 else
+                   { SSE doesn't support 512-bit vectors }
+                   InternalError(2018012937);
+               OS_MD512:
+                 { Use ZMM transfer of packed doubles }
+                 if UseAVX then
+                   begin
+                     if GetRefAlignment(tmpref) = 64 then
+                       op := A_VMOVAPD
+                     else
+                       op := A_VMOVUPD
+                   end
+                 else
+                   { SSE doesn't support 512-bit vectors }
+                   InternalError(2018012938);
+               OS_M512, OS_MS512:
+                 { Use ZMM integer transfer }
+                 if UseAVX then
+                   begin
+                     if GetRefAlignment(tmpref) = 64 then
+                       op := A_VMOVDQA
+                     else
+                       op := A_VMOVDQU
+                   end
+                 else
+                   { SSE doesn't support 512-bit vectors }
+                   InternalError(2018012939);
+               else
+                 { No valid transfer command available }
+                 internalerror(2017121410);
+             end;
+             list.concat(taicpu.op_ref_reg(op,S_NO,tmpref,reg));
            end
          else if shufflescalar(shuffle) then
            begin
@@ -1415,20 +1636,149 @@ unit cgx86;
          hreg : tregister;
          tmpref  : treference;
          op : tasmop;
+
        begin
          tmpref:=ref;
          make_simple_ref(list,tmpref);
          if shuffle=nil then
            begin
-             if fromsize=OS_M64 then
-               list.concat(taicpu.op_reg_ref(A_MOVQ,S_NO,reg,tmpref))
-             else
-{$ifdef x86_64}
-               { x86-64 has always properly aligned data }
-               list.concat(taicpu.op_reg_ref(A_MOVDQA,S_NO,reg,tmpref))
-{$else x86_64}
-               list.concat(taicpu.op_reg_ref(A_MOVDQU,S_NO,reg,tmpref))
-{$endif x86_64}
+             case fromsize of
+               OS_F32:
+                 if UseAVX then
+                   op := A_VMOVSS
+                 else
+                   op := A_MOVSS;
+               OS_F64:
+                 if UseAVX then
+                   op := A_VMOVSD
+                 else
+                   op := A_MOVSD;
+               OS_M32, OS_32, OS_S32:
+                 if UseAVX then
+                   op := A_VMOVD
+                 else
+                   op := A_MOVD;
+               OS_M64, OS_64, OS_S64:
+                 if UseAVX then
+                   op := A_VMOVQ
+                 else
+                   op := A_MOVQ;
+               OS_MF128:
+                 { Use XMM transfer of packed singles }
+                 if UseAVX then
+                 begin
+                   if GetRefAlignment(tmpref) = 16 then
+                     op := A_VMOVAPS
+                   else
+                     op := A_VMOVUPS
+                 end else
+                 begin
+                   if GetRefAlignment(tmpref) = 16 then
+                     op := A_MOVAPS
+                   else
+                     op := A_MOVUPS
+                 end;
+               OS_MD128:
+                 { Use XMM transfer of packed doubles }
+                 if UseAVX then
+                 begin
+                   if GetRefAlignment(tmpref) = 16 then
+                     op := A_VMOVAPD
+                   else
+                     op := A_VMOVUPD
+                 end else
+                 begin
+                   if GetRefAlignment(tmpref) = 16 then
+                     op := A_MOVAPD
+                   else
+                     op := A_MOVUPD
+                 end;
+               OS_M128, OS_MS128:
+                 { Use XMM integer transfer }
+                 if UseAVX then
+                 begin
+                   if GetRefAlignment(tmpref) = 16 then
+                     op := A_VMOVDQA
+                   else
+                     op := A_VMOVDQU
+                 end else
+                 begin
+                   if GetRefAlignment(tmpref) = 16 then
+                     op := A_MOVDQA
+                   else
+                     op := A_MOVDQU
+                 end;
+               OS_MF256:
+                 { Use XMM transfer of packed singles }
+                 if UseAVX then
+                 begin
+                   if GetRefAlignment(tmpref) = 32 then
+                     op := A_VMOVAPS
+                   else
+                     op := A_VMOVUPS
+                 end else
+                   { SSE doesn't support 256-bit vectors }
+                   InternalError(2018012940);
+               OS_MD256:
+                 { Use XMM transfer of packed doubles }
+                 if UseAVX then
+                 begin
+                   if GetRefAlignment(tmpref) = 32 then
+                     op := A_VMOVAPD
+                   else
+                     op := A_VMOVUPD
+                 end else
+                   { SSE doesn't support 256-bit vectors }
+                   InternalError(2018012941);
+               OS_M256, OS_MS256:
+                 { Use XMM integer transfer }
+                 if UseAVX then
+                 begin
+                   if GetRefAlignment(tmpref) = 32 then
+                     op := A_VMOVDQA
+                   else
+                     op := A_VMOVDQU
+                 end else
+                   { SSE doesn't support 256-bit vectors }
+                   InternalError(2018012942);
+               OS_MF512:
+                 { Use XMM transfer of packed singles }
+                 if UseAVX then
+                 begin
+                   if GetRefAlignment(tmpref) = 64 then
+                     op := A_VMOVAPS
+                   else
+                     op := A_VMOVUPS
+                 end else
+                   { SSE doesn't support 512-bit vectors }
+                   InternalError(2018012943);
+               OS_MD512:
+                 { Use XMM transfer of packed doubles }
+                 if UseAVX then
+                 begin
+                   if GetRefAlignment(tmpref) = 64 then
+                     op := A_VMOVAPD
+                   else
+                     op := A_VMOVUPD
+                 end else
+                   { SSE doesn't support 512-bit vectors }
+                   InternalError(2018012944);
+               OS_M512, OS_MS512:
+                 { Use XMM integer transfer }
+                 if UseAVX then
+                 begin
+                   if GetRefAlignment(tmpref) = 64 then
+                     op := A_VMOVDQA
+                   else
+                     op := A_VMOVDQU
+                 end else
+                   { SSE doesn't support 512-bit vectors }
+                   InternalError(2018012945);
+               else
+                 { No valid transfer command available }
+                 internalerror(2017121411);
+             end;
+             list.concat(taicpu.op_reg_ref(op,S_NO,reg,tmpref));
            end
          else if shufflescalar(shuffle) then
            begin

+ 6 - 4
compiler/x86/cpubase.pas

@@ -419,10 +419,12 @@ implementation
               else
                 internalerror(2009071902);
             end;
-          OS_M128,OS_MS128:
+          OS_M128,OS_MS128,OS_MF128,OS_MD128:
             cgsize2subreg:=R_SUBMMX;
-          OS_M256,OS_MS256:
+          OS_M256,OS_MS256,OS_MF256,OS_MD256:
             cgsize2subreg:=R_SUBMMY;
+          OS_M512,OS_MS512,OS_MF512,OS_MD512:
+            cgsize2subreg:=R_SUBMMZ;
           OS_NO:
             { error message should have been thrown already before, so avoid only
               an internal error }
@@ -435,7 +437,7 @@ implementation
 
     function reg_cgsize(const reg: tregister): tcgsize;
       const subreg2cgsize:array[Tsubregister] of Tcgsize =
-            (OS_NO,OS_8,OS_8,OS_16,OS_32,OS_64,OS_NO,OS_NO,OS_NO,OS_F32,OS_F64,OS_NO,OS_M128,OS_M256,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO);
+            (OS_NO,OS_8,OS_8,OS_16,OS_32,OS_64,OS_NO,OS_NO,OS_NO,OS_F32,OS_F64,OS_NO,OS_M128,OS_M256,OS_M512,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO);
       begin
         case getregtype(reg) of
           R_INTREGISTER :
@@ -466,7 +468,7 @@ implementation
     function reg2opsize(r:Tregister):topsize;
       const
         subreg2opsize : array[tsubregister] of topsize =
-          (S_NO,S_B,S_B,S_W,S_L,S_Q,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO);
+          (S_NO,S_B,S_B,S_W,S_L,S_Q,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO);
       begin
         reg2opsize:=S_L;
         case getregtype(r) of

+ 26 - 24
compiler/x86/itcpugas.pas

@@ -52,27 +52,28 @@ interface
        'd',
        '','','',
        't',
-        'x',
-        'y'
+       'x',
+       'y',
+       'z'
      );
      { suffix-to-opsize conversion tables, used in asmreadrer }
      { !! S_LQ excluded: movzlq does not exist, movslq is processed
        as a separate instruction w/o suffix (aka movsxd), and there are
        no more instructions needing it. }
-     att_sizesuffixstr : array[0..13] of string[2] = (
-       '','BW','BL','WL','BQ','WQ',{'LQ',}'B','W','L','S','Q','T','X','Y'
+     att_sizesuffixstr : array[0..14] of string[2] = (
+       '','BW','BL','WL','BQ','WQ',{'LQ',}'B','W','L','S','Q','T','X','Y','Z'
      );
-     att_sizesuffix : array[0..13] of topsize = (
-       S_NO,S_BW,S_BL,S_WL,S_BQ,S_WQ,{S_LQ,}S_B,S_W,S_L,S_NO,S_Q,S_NO,S_NO,S_NO
+     att_sizesuffix : array[0..14] of topsize = (
+       S_NO,S_BW,S_BL,S_WL,S_BQ,S_WQ,{S_LQ,}S_B,S_W,S_L,S_NO,S_Q,S_NO,S_NO,S_NO,S_NO
      );
-     att_sizefpusuffix : array[0..13] of topsize = (
-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO
+     att_sizefpusuffix : array[0..14] of topsize = (
+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO,S_NO
      );
-     att_sizefpuintsuffix : array[0..13] of topsize = (
-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO
+     att_sizefpuintsuffix : array[0..14] of topsize = (
+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO,S_NO
      );
-     att_sizemmsuffix : array[0..13] of topsize = (
-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM
+     att_sizemmsuffix : array[0..14] of topsize = (
+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM
      );
 {$else x86_64}
      gas_opsize2str : array[topsize] of string[2] = ('',
@@ -82,24 +83,25 @@ interface
        'd',
        '','','',
        't',
-        'x',
-        'y'
+       'x',
+       'y',
+       'z'
      );
      { suffix-to-opsize conversion tables, used in asmreadrer }
-     att_sizesuffixstr : array[0..11] of string[2] = (
-       '','BW','BL','WL','B','W','L','S','Q','T','X','Y'
+     att_sizesuffixstr : array[0..12] of string[2] = (
+       '','BW','BL','WL','B','W','L','S','Q','T','X','Y','Z'
      );
-     att_sizesuffix : array[0..11] of topsize = (
-       S_NO,S_BW,S_BL,S_WL,S_B,S_W,S_L,S_NO,S_NO,S_NO,S_NO,S_NO
+     att_sizesuffix : array[0..12] of topsize = (
+       S_NO,S_BW,S_BL,S_WL,S_B,S_W,S_L,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO
      );
-     att_sizefpusuffix : array[0..11] of topsize = (
-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO
+     att_sizefpusuffix : array[0..12] of topsize = (
+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO,S_NO
      );
-     att_sizefpuintsuffix : array[0..11] of topsize = (
-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO
+     att_sizefpuintsuffix : array[0..12] of topsize = (
+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO,S_NO
      );
-     att_sizemmsuffix : array[0..11] of topsize = (
-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM
+     att_sizemmsuffix : array[0..12] of topsize = (
+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM
      );
 
 {$endif x86_64}

+ 4 - 2
compiler/x86/rax86.pas

@@ -343,7 +343,8 @@ const
      0,0,0,
      80,
      128,
-     256
+     256,
+     512
     );
 {$else}
 topsize2memsize: array[topsize] of integer =
@@ -354,7 +355,8 @@ topsize2memsize: array[topsize] of integer =
    0,0,0,
    80,
    128,
-   256
+   256,
+   512
   );
 {$endif}
 

+ 6 - 2
compiler/x86_64/aoptcpu.pas

@@ -74,10 +74,14 @@ uses
               A_MOVZX:
                 Result:=OptPass1Movx(p);
               A_VMOVAPS,
-              A_VMOVAPD:
+              A_VMOVAPD,
+              A_VMOVUPS,
+              A_VMOVUPD:
                 result:=OptPass1VMOVAP(p);
               A_MOVAPD,
-              A_MOVAPS:
+              A_MOVAPS,
+              A_MOVUPD,
+              A_MOVUPS:
                 result:=OptPass1MOVAP(p);
               A_VDIVSD,
               A_VDIVSS,

+ 2 - 1
compiler/x86_64/cpubase.inc

@@ -35,7 +35,8 @@ type
     S_NEAR,S_FAR,S_SHORT,
     S_T,
     S_XMM,
-    S_YMM
+    S_YMM,
+    S_ZMM
   );
 
   TOpSizes = set of topsize;

+ 2 - 1
compiler/x86_64/cpuinfo.pas

@@ -108,7 +108,8 @@ Const
      pocall_sysv_abi_default,
      pocall_sysv_abi_cdecl,
      pocall_ms_abi_default,
-     pocall_ms_abi_cdecl
+     pocall_ms_abi_cdecl,
+     pocall_vectorcall
    ];
 
    cputypestr : array[tcputype] of string[10] = ('',

Файлын зөрүү хэтэрхий том тул дарагдсан байна
+ 588 - 83
compiler/x86_64/cpupara.pas


+ 1 - 1
compiler/x86_64/cpupi.pas

@@ -173,7 +173,7 @@ implementation
         result:=
            ((target_info.system=system_x86_64_win64) and
             not(proccall in [pocall_sysv_abi_default,pocall_sysv_abi_cdecl])) or
-           (proccall in [pocall_ms_abi_default,pocall_ms_abi_cdecl]);
+            (proccall in [pocall_ms_abi_default,pocall_ms_abi_cdecl,pocall_vectorcall]);
       end;
 
 

+ 869 - 0
tests/test/cg/tvectorcall1.pp

@@ -0,0 +1,869 @@
+{ %CPU=x86_64 }
+program vectorcall_hva_test1;
+
+{$IFNDEF CPUX86_64}
+  {$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
+{$ENDIF}
+
+{$ASMMODE Intel}
+{$PUSH}
+{$CODEALIGN RECORDMIN=16}
+{$PACKRECORDS C}
+type
+  TM128 = record
+    case Byte of
+      0: (M128_F32: array[0..3] of Single);
+      1: (M128_F64: array[0..1] of Double);
+  end;
+{$POP}
+
+{ HFA test: field style. }
+
+{ NOTE: if the record falls on a 16-byte boundary, the 4-component entries will
+  turned into vectors rather than HFAs. }
+
+  THFA1_SF = packed record
+    F1: Single;
+  end;
+
+{$IFDEF WIN64}
+  THFA2_SF = packed record
+    F1, F2: Single;
+  end;
+
+  THFA3_SF = packed record
+    F1, F2, F3: Single;
+  end;
+
+  THFA4_SF = packed record
+    F1, F2, F3, F4: Single;
+  end;
+{$ENDIF}
+
+  THFA1_DF = packed record
+    F1: Double;
+  end;
+
+{$IFDEF WIN64}
+  THFA2_DF = packed record
+    F1, F2: Double;
+  end;
+
+  THFA3_DF = packed record
+    F1, F2, F3: Double;
+  end;
+
+  THFA4_DF = packed record
+    F1, F2, F3, F4: Double;
+  end;
+{$ENDIF}
+
+{ HFA test - array style }
+
+{ NOTE: if the record falls on a 16-byte boundary, the 4-component entries will
+  turned into vectors rather than HFAs. }
+
+  THFA1_SA = packed record
+    F: array[0..0] of Single;
+  end;
+
+{$IFDEF WIN64}
+  THFA2_SA = packed record
+    F: array[0..1] of Single;
+  end;
+
+  THFA3_SA = packed record
+    F: array[0..2] of Single;
+  end;
+
+  THFA4_SA = packed record
+    F: array[0..3] of Single;
+  end;
+{$ENDIF}
+
+  THFA1_DA = packed record
+    F: array[0..0] of Double;
+  end;
+
+{$IFDEF WIN64}
+  THFA2_DA = packed record
+    F: array[0..1] of Double;
+  end;
+
+  THFA3_DA = packed record
+    F: array[0..2] of Double;
+  end;
+
+  THFA4_DA = packed record
+    F: array[0..3] of Double;
+  end;
+{$ENDIF}
+
+{ Single-type vector }
+
+function HorizontalAddSingle(V: TM128): Single; vectorcall;
+begin
+  HorizontalAddSingle := V.M128_F32[0] + V.M128_F32[1] + V.M128_F32[2] + V.M128_F32[3];
+end;
+
+function HorizontalAddSingle_ASM(V: TM128): Single; vectorcall; assembler; nostackframe;
+asm
+  HADDPS XMM0, XMM0
+  HADDPS XMM0, XMM0
+end;
+
+{ Double-type vector }
+
+function HorizontalAddDouble(V: TM128): Double; vectorcall;
+begin
+  HorizontalAddDouble := V.M128_F64[0] + V.M128_F64[1];
+end;
+
+function HorizontalAddDouble_ASM(V: TM128): Double; vectorcall; assembler; nostackframe;
+asm
+  HADDPD XMM0, XMM0
+end;
+
+{ 3-element aggregate }
+
+function AddSingles1F(HFA: THFA1_SF): Single; vectorcall;
+begin
+  AddSingles1F := HFA.F1;
+end;
+
+function AddSingles1F_ASM(HFA: THFA1_SF): Single; vectorcall; assembler; nostackframe;
+asm
+  { Do absolutely nothing! }
+end;
+
+function AddDoubles1F(HFA: THFA1_DF): Double; vectorcall;
+begin
+  AddDoubles1F := HFA.F1;
+end;
+
+function AddDoubles1F_ASM(HFA: THFA1_DF): Double; vectorcall; assembler; nostackframe;
+asm
+  { Do absolutely nothing! }
+end;
+
+function AddSingles1A(HFA: THFA1_SA): Single; vectorcall;
+begin
+  AddSingles1A := HFA.F[0];
+end;
+
+function AddSingles1A_ASM(HFA: THFA1_SA): Single; vectorcall; assembler; nostackframe;
+asm
+  { Do absolutely nothing! }
+end;
+
+function AddDoubles1A(HFA: THFA1_DA): Double; vectorcall;
+begin
+  AddDoubles1A := HFA.F[0];
+end;
+
+function AddDoubles1A_ASM(HFA: THFA1_DA): Double; vectorcall; assembler; nostackframe;
+asm
+  { Do absolutely nothing! }
+end;
+
+{$IFDEF WIN64}
+{ 2-element aggregate }
+
+function AddSingles2F(HFA: THFA2_SF): Single; vectorcall;
+begin
+  AddSingles2F := HFA.F1 + HFA.F2;
+end;
+
+function AddSingles2F_ASM(HFA: THFA2_SF): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+end;
+
+function AddDoubles2F(HFA: THFA2_DF): Double; vectorcall;
+begin
+  AddDoubles2F := HFA.F1 + HFA.F2;
+end;
+
+function AddDoubles2F_ASM(HFA: THFA2_DF): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+end;
+
+function AddSingles2A(HFA: THFA2_SA): Single; vectorcall;
+begin
+  AddSingles2A := HFA.F[0] + HFA.F[1];
+end;
+
+function AddSingles2A_ASM(HFA: THFA2_SA): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+end;
+
+function AddDoubles2A(HFA: THFA2_DA): Double; vectorcall;
+begin
+  AddDoubles2A := HFA.F[0] + HFA.F[1];
+end;
+
+function AddDoubles2A_ASM(HFA: THFA2_DA): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+end;
+
+{ 3-element aggregate }
+
+function AddSingles3F(HFA: THFA3_SF): Single; vectorcall;
+begin
+  AddSingles3F := HFA.F1 + HFA.F2 + HFA.F3;
+end;
+
+function AddSingles3F_ASM(HFA: THFA3_SF): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+  ADDSS XMM0, XMM2
+end;
+
+function AddDoubles3F(HFA: THFA3_DF): Double; vectorcall;
+begin
+  AddDoubles3F := HFA.F1 + HFA.F2 + HFA.F3;
+end;
+
+function AddDoubles3F_ASM(HFA: THFA3_DF): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+  ADDSD XMM0, XMM2
+end;
+
+function AddSingles3A(HFA: THFA3_SA): Single; vectorcall;
+begin
+  AddSingles3A := HFA.F[0] + HFA.F[1] + HFA.F[2];
+end;
+
+function AddSingles3A_ASM(HFA: THFA3_SA): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+  ADDSS XMM0, XMM2
+end;
+
+function AddDoubles3A(HFA: THFA3_DA): Double; vectorcall;
+begin
+  AddDoubles3A := HFA.F[0] + HFA.F[1] + HFA.F[2];
+end;
+
+function AddDoubles3A_ASM(HFA: THFA3_DA): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+  ADDSD XMM0, XMM2
+end;
+
+{ 4-element aggregate }
+
+function AddSingles4F(HFA: THFA4_SF): Single; vectorcall;
+begin
+  AddSingles4F := HFA.F1 + HFA.F2 + HFA.F3 + HFA.F4;
+end;
+
+function AddSingles4F_ASM(HFA: THFA4_SF): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+  ADDSS XMM0, XMM2
+  ADDSS XMM0, XMM3
+end;
+
+function AddDoubles4F(HFA: THFA4_DF): Double; vectorcall;
+begin
+  AddDoubles4F := HFA.F1 + HFA.F2 + HFA.F3 + HFA.F4;
+end;
+
+function AddDoubles4F_ASM(HFA: THFA4_DF): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+  ADDSD XMM0, XMM2
+  ADDSD XMM0, XMM3
+end;
+
+function AddSingles4A(HFA: THFA4_SA): Single; vectorcall;
+begin
+  AddSingles4A := HFA.F[0] + HFA.F[1] + HFA.F[2] + HFA.F[3];
+end;
+
+function AddSingles4A_ASM(HFA: THFA4_SA): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+  ADDSS XMM0, XMM2
+  ADDSS XMM0, XMM3
+end;
+
+function AddDoubles4A(HFA: THFA4_DA): Double; vectorcall;
+begin
+  AddDoubles4A := HFA.F[0] + HFA.F[1] + HFA.F[2] + HFA.F[3];
+end;
+
+function AddDoubles4A_ASM(HFA: THFA4_DA): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+  ADDSD XMM0, XMM2
+  ADDSD XMM0, XMM3
+end;
+{$ENDIF}
+
+var
+  HVA: TM128;
+  HFA1_SF: THFA1_SF;
+  HFA1_DF: THFA1_DF;
+  HFA1_SA: THFA1_SA;
+  HFA1_DA: THFA1_DA;
+{$IFDEF WIN64}
+  HFA2_SF: THFA2_SF;
+  HFA2_DF: THFA2_DF;
+  HFA2_SA: THFA2_SA;
+  HFA2_DA: THFA2_DA;
+  HFA3_SF: THFA3_SF;
+  HFA3_DF: THFA3_DF;
+  HFA3_SA: THFA3_SA;
+  HFA3_DA: THFA3_DA;
+  HFA4_SF: THFA4_SF;
+  HFA4_DF: THFA4_DF;
+  HFA4_SA: THFA4_SA;
+  HFA4_DA: THFA4_DA;
+{$ENDIF}
+  TestPointer: PtrUInt;
+  I, J: Integer;
+  ResS, ResSA: Single;
+  ResD, ResDA: Double;
+  Addresses: array[0..3] of Pointer;
+  FieldAddresses: array[0..3, 0..3] of Pointer;
+const
+  AddressNames1: array[0..3] of ShortString = ('HFA1_SF', 'HFA1_DF', 'HFA1_SA', 'HFA1_DA');
+{$IFDEF WIN64}
+  AddressNames2: array[0..3] of ShortString = ('HFA2_SF', 'HFA2_DF', 'HFA2_SA', 'HFA2_DA');
+  AddressNames3: array[0..3] of ShortString = ('HFA3_SF', 'HFA3_DF', 'HFA3_SA', 'HFA3_DA');
+  AddressNames4: array[0..3] of ShortString = ('HFA4_SF', 'HFA4_DF', 'HFA4_SA', 'HFA4_DA');
+{$ENDIF}
+  FieldAddressNames: array[0..3] of ShortString = ('F1', 'F2', 'F3', 'F4');
+
+  ExpS1: Single = 5.0;
+{$IFDEF WIN64}
+  ExpS2: Single = -5.0;
+  ExpS3: Single = 10.0;
+{$ENDIF}
+  ExpS4: Single = -10.0;
+  ExpD1: Double = 5.0;
+  ExpD2: Double = -5.0;
+{$IFDEF WIN64}
+  ExpD3: Double = 10.0;
+  ExpD4: Double = -10.0;
+{$ENDIF}
+begin
+
+  if (PtrUInt(@HVA) and $F) <> 0 then
+  begin
+    WriteLn('FAIL: HVA is not correctly aligned.');
+    Halt(1);
+  end;
+
+  { array of singles }
+  WriteLn('- horizontal add (4 singles)');
+  HVA.M128_F32[0] := 5.0;
+  HVA.M128_F32[1] := -10.0;
+  HVA.M128_F32[2] := 15.0;
+  HVA.M128_F32[3] := -20.0;
+  ResS := HorizontalAddSingle(HVA);
+  ResSA := HorizontalAddSingle_ASM(HVA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: HorizontalAddSingle(HVA) has the vector in the wrong register.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS4 then
+    begin
+      WriteLn('FAIL: HorizontalAddSingle(HVA) returned ', ResS, ' instead of ', ExpS4);
+      Halt(1);
+    end;
+  end;
+
+  { array of doubles }
+  WriteLn('- horizontal add (2 doubles)');
+  HVA.M128_F64[0] := 5.0;
+  HVA.M128_F64[1] := -10.0;
+  ResD := HorizontalAddDouble(HVA);
+  ResDA := HorizontalAddDouble_ASM(HVA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: HorizontalAddDouble(HVA) has the vector in the wrong register.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD2 then
+    begin
+      WriteLn('FAIL: HorizontalAddDouble(HVA) returned ', ResD, ' instead of ', ExpD2);
+      Halt(1);
+    end;
+  end;
+
+  { 1-field aggregates }
+  WriteLn('- 1-field aggregates');
+
+  Addresses[0] := @HFA1_SF;
+  Addresses[1] := @HFA1_SA;
+  Addresses[2] := @HFA1_DF;
+  Addresses[3] := @HFA1_DA;
+  FieldAddresses[0][0] := @(HFA1_SF.F1);
+  FieldAddresses[1][0] := @(HFA1_SA.F[0]);
+  FieldAddresses[2][0] := @(HFA1_DF.F1);
+  FieldAddresses[3][0] := @(HFA1_DA.F[0]);
+
+  { Check alignment }
+  for I := 0 to 1 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    if Pointer(TestPointer) <> FieldAddresses[I][0] then
+    begin
+      WriteLn('FAIL: ', AddressNames1[I], ' is not correctly packed; field F1 is not in the expected place.');
+      Halt(1);
+    end;
+  end;
+
+  HFA1_SF.F1 := 5.0;
+  ResS := AddSingles1F(HFA1_SF);
+  ResSA := AddSingles1F_ASM(HFA1_SF);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles1F(', AddressNames1[I], ') is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS1 then
+    begin
+      WriteLn('FAIL: AddSingles1F(', AddressNames1[I], ') returned ', ResS, ' instead of ', ExpS1);
+      Halt(1);
+    end;
+  end;
+
+  HFA1_DF.F1 := 5.0;
+  ResD := AddDoubles1F(HFA1_DF);
+  ResDA := AddDoubles1F_ASM(HFA1_DF);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles1F(', AddressNames1[I], ') is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD1 then
+    begin
+      WriteLn('FAIL: AddDoubles1F(', AddressNames1[I], ') returned ', ResD, ' instead of ', ExpD1);
+      Halt(1);
+    end;
+  end;
+
+  HFA1_SA.F[0] := 5.0;
+  ResS := AddSingles1A(HFA1_SA);
+  ResSA := AddSingles1A_ASM(HFA1_SA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles1A(', AddressNames1[I], ') is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS1 then
+    begin
+      WriteLn('FAIL: AddSingles1A(', AddressNames1[I], ') returned ', ResS, ' instead of ', ExpS1);
+      Halt(1);
+    end;
+  end;
+
+  HFA1_DA.F[0] := 5.0;
+  ResD := AddDoubles1A(HFA1_DA);
+  ResDA := AddDoubles1A_ASM(HFA1_DA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles1A(', AddressNames1[I], ') is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD1 then
+    begin
+      WriteLn('FAIL: AddDoubles1A(', AddressNames1[I], ') returned ', ResD, ' instead of ', ExpD1);
+      Halt(1);
+    end;
+  end;
+
+{$IFDEF WIN64}
+  { 2-field aggregates }
+  WriteLn('- 2-field aggregates');
+
+  Addresses[0] := @HFA2_SF;
+  Addresses[1] := @HFA2_SA;
+  FieldAddresses[0][0] := @(HFA2_SF.F1);
+  FieldAddresses[0][1] := @(HFA2_SF.F2);
+  FieldAddresses[1][0] := @(HFA2_SA.F[0]);
+  FieldAddresses[1][1] := @(HFA2_SA.F[1]);
+
+  { Check alignment of Singles }
+  for I := 0 to 1 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 1 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames2[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $4);
+    end;
+  end;
+
+  Addresses[2] := @HFA2_DF;
+  Addresses[3] := @HFA2_DA;
+  FieldAddresses[2][0] := @(HFA2_DF.F1);
+  FieldAddresses[2][1] := @(HFA2_DF.F2);
+  FieldAddresses[3][0] := @(HFA2_DA.F[0]);
+  FieldAddresses[3][1] := @(HFA2_DA.F[1]);
+
+  { Check alignment of Doubles }
+  for I := 2 to 3 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 1 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames2[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $8);
+    end;
+  end;
+
+  HFA2_SF.F1 := 5.0;
+  HFA2_SF.F2 := -10.0;
+  ResS := AddSingles2F(HFA2_SF);
+  ResSA := AddSingles2F_ASM(HFA2_SF);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles2F(HFA2_SF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS2 then
+    begin
+      WriteLn('FAIL: AddSingles2F(HFA2_SF) returned ', ResS, ' instead of ', ExpS2);
+      Halt(1);
+    end;
+  end;
+
+  HFA2_DF.F1 := 5.0;
+  HFA2_DF.F2 := -10.0;
+  ResD := AddDoubles2F(HFA2_DF);
+  ResDA := AddDoubles2F_ASM(HFA2_DF);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles2F(HFA2_DF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD2 then
+    begin
+      WriteLn('FAIL: AddDoubles2F(HFA2_DF) returned ', ResD, ' instead of ', ExpD2);
+      Halt(1);
+    end;
+  end;
+
+  HFA2_SA.F[0] := 5.0;
+  HFA2_SA.F[1] := -10.0;
+  ResS := AddSingles2A(HFA2_SA);
+  ResSA := AddSingles2A_ASM(HFA2_SA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles2A(HFA2_SA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS2 then
+    begin
+      WriteLn('FAIL: AddSingles2A(HFA2_SA) returned ', ResS, ' instead of ', ExpS2);
+      Halt(1);
+    end;
+  end;
+
+  HFA2_DA.F[0] := 5.0;
+  HFA2_DA.F[1] := -10.0;
+  ResD := AddDoubles2A(HFA2_DA);
+  ResDA := AddDoubles2A_ASM(HFA2_DA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles2A(HFA2_DA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD2 then
+    begin
+      WriteLn('FAIL: AddDoubles2A(HFA2_DA) returned ', ResD, ' instead of ', ExpD2);
+      Halt(1);
+    end;
+  end;
+
+  { 3-field aggregates }
+  WriteLn('- 3-field aggregates');
+
+  Addresses[0] := @HFA3_SF;
+  Addresses[1] := @HFA3_SA;
+  FieldAddresses[0][0] := @(HFA3_SF.F1);
+  FieldAddresses[0][1] := @(HFA3_SF.F2);
+  FieldAddresses[0][2] := @(HFA3_SF.F3);
+  FieldAddresses[1][0] := @(HFA3_SA.F[0]);
+  FieldAddresses[1][1] := @(HFA3_SA.F[1]);
+  FieldAddresses[1][2] := @(HFA3_SA.F[2]);
+
+  { Check alignment of Singles }
+  for I := 0 to 1 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 2 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames3[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $4);
+    end;
+  end;
+
+  Addresses[2] := @HFA3_DF;
+  Addresses[3] := @HFA3_DA;
+  FieldAddresses[2][0] := @(HFA3_DF.F1);
+  FieldAddresses[2][1] := @(HFA3_DF.F2);
+  FieldAddresses[2][2] := @(HFA3_DF.F3);
+  FieldAddresses[3][0] := @(HFA3_DA.F[0]);
+  FieldAddresses[3][1] := @(HFA3_DA.F[1]);
+  FieldAddresses[3][2] := @(HFA3_DA.F[2]);
+
+  { Check alignment of Doubles }
+  for I := 2 to 3 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 2 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames3[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $8);
+    end;
+  end;
+
+  HFA3_SF.F1 := 5.0;
+  HFA3_SF.F2 := -10.0;
+  HFA3_SF.F3 := 15.0;
+  ResS := AddSingles3F(HFA3_SF);
+  ResSA := AddSingles3F_ASM(HFA3_SF);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles3F(HFA3_SF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS3 then
+    begin
+      WriteLn('FAIL: AddSingles3F(HFA3_SF) returned ', ResS, ' instead of ', ExpS3);
+      Halt(1);
+    end;
+  end;
+
+  HFA3_DF.F1 := 5.0;
+  HFA3_DF.F2 := -10.0;
+  HFA3_DF.F3 := 15.0;
+  ResD := AddDoubles3F(HFA3_DF);
+  ResDA := AddDoubles3F_ASM(HFA3_DF);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles3F(HFA3_DF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD3 then
+    begin
+      WriteLn('FAIL: AddDoubles3F(HFA3_DF) returned ', ResD, ' instead of ', ExpD3);
+      Halt(1);
+    end;
+  end;
+
+  HFA3_SA.F[0] := 5.0;
+  HFA3_SA.F[1] := -10.0;
+  HFA3_SA.F[2] := 15.0;
+  ResS := AddSingles3A(HFA3_SA);
+  ResSA := AddSingles3A_ASM(HFA3_SA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles3A(HFA3_SA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS3 then
+    begin
+      WriteLn('FAIL: AddSingles3A(HFA3_SA) returned ', ResS, ' instead of ', ExpS3);
+      Halt(1);
+    end;
+  end;
+
+  HFA3_DA.F[0] := 5.0;
+  HFA3_DA.F[1] := -10.0;
+  HFA3_DA.F[2] := 15.0;
+  ResD := AddDoubles3A(HFA3_DA);
+  ResDA := AddDoubles3A_ASM(HFA3_DA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles3A(HFA3_DA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD3 then
+    begin
+      WriteLn('FAIL: AddDoubles3A(HFA3_DA) returned ', ResD, ' instead of ', ExpD3);
+      Halt(1);
+    end;
+  end;
+
+  { 4-field aggregates }
+  WriteLn('- 4-field aggregates');
+
+  Addresses[0] := @HFA4_SF;
+  Addresses[1] := @HFA4_SA;
+  FieldAddresses[0][0] := @(HFA4_SF.F1);
+  FieldAddresses[0][1] := @(HFA4_SF.F2);
+  FieldAddresses[0][2] := @(HFA4_SF.F3);
+  FieldAddresses[0][3] := @(HFA4_SF.F4);
+  FieldAddresses[1][0] := @(HFA4_SA.F[0]);
+  FieldAddresses[1][1] := @(HFA4_SA.F[1]);
+  FieldAddresses[1][2] := @(HFA4_SA.F[2]);
+  FieldAddresses[1][3] := @(HFA4_SA.F[3]);
+
+  { Check alignment of Singles }
+  for I := 0 to 1 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 3 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames4[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $4);
+    end;
+  end;
+
+  Addresses[2] := @HFA4_DF;
+  Addresses[3] := @HFA4_DA;
+  FieldAddresses[2][0] := @(HFA4_DF.F1);
+  FieldAddresses[2][1] := @(HFA4_DF.F2);
+  FieldAddresses[2][2] := @(HFA4_DF.F3);
+  FieldAddresses[2][3] := @(HFA4_DF.F4);
+  FieldAddresses[3][0] := @(HFA4_DA.F[0]);
+  FieldAddresses[3][1] := @(HFA4_DA.F[1]);
+  FieldAddresses[3][2] := @(HFA4_DA.F[2]);
+  FieldAddresses[3][3] := @(HFA4_DA.F[3]);
+
+  { Check alignment of Doubles }
+  for I := 2 to 3 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 3 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames4[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $8);
+    end;
+  end;
+
+  HFA4_SF.F1 := 5.0;
+  HFA4_SF.F2 := -10.0;
+  HFA4_SF.F3 := 15.0;
+  HFA4_SF.F4 := -20.0;
+  ResS := AddSingles4F(HFA4_SF);
+  ResSA := AddSingles4F_ASM(HFA4_SF);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles4F(HFA4_SF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS4 then
+    begin
+      WriteLn('FAIL: AddSingles4F(HFA4_SF) returned ', ResS, ' instead of ', ExpS4);
+      Halt(1);
+    end;
+  end;
+
+  HFA4_DF.F1 := 5.0;
+  HFA4_DF.F2 := -10.0;
+  HFA4_DF.F3 := 15.0;
+  HFA4_DF.F4 := -20.0;
+  ResD := AddDoubles4F(HFA4_DF);
+  ResDA := AddDoubles4F_ASM(HFA4_DF);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles4F(HFA4_DF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD4 then
+    begin
+      WriteLn('FAIL: AddDoubles4F(HFA4_DF) returned ', ResD, ' instead of ', ExpD4);
+      Halt(1);
+    end;
+  end;
+
+  HFA4_SA.F[0] := 5.0;
+  HFA4_SA.F[1] := -10.0;
+  HFA4_SA.F[2] := 15.0;
+  HFA4_SA.F[3] := -20.0;
+  ResS := AddSingles4A(HFA4_SA);
+  ResSA := AddSingles4A_ASM(HFA4_SA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles4A(HFA4_SA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS4 then
+    begin
+      WriteLn('FAIL: AddSingles4A(HFA4_SA) returned ', ResS, ' instead of ', ExpS4);
+      Halt(1);
+    end;
+  end;
+
+  HFA4_DA.F[0] := 5.0;
+  HFA4_DA.F[1] := -10.0;
+  HFA4_DA.F[2] := 15.0;
+  HFA4_DA.F[3] := -20.0;
+  ResD := AddDoubles4A(HFA4_DA);
+  ResDA := AddDoubles4A_ASM(HFA4_DA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles4A(HFA4_DF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD4 then
+    begin
+      WriteLn('FAIL: AddDoubles4A(HFA4_DF) returned ', ResD, ' instead of ', ExpD4);
+      Halt(1);
+    end;
+  end;
+{$ENDIF}
+  WriteLn('ok');
+end.
+

+ 162 - 0
tests/test/cg/tvectorcall2.pp

@@ -0,0 +1,162 @@
+{ %CPU=x86_64 }
+program vectorcall_hva_test2;
+
+{$IFNDEF CPUX86_64}
+  {$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
+{$ENDIF}
+
+{$push}
+{$CODEALIGN RECORDMIN=16}
+{$PACKRECORDS C}
+type
+  TM128 = record
+    case Byte of
+      0: (M128_F32: array[0..3] of Single);
+      1: (M128_F64: array[0..1] of Double);
+  end;
+{$pop}
+
+{ HVA test }
+  THVA = record
+    V1, V2, V3, V4: TM128;
+  end;
+
+operator +(X, Y: TM128)Z: TM128; vectorcall;
+  var
+    I: Integer;
+  begin
+    for I := 0 to 3 do
+      Z.M128_F32[I] := X.M128_F32[I] + Y.M128_F32[I];
+  end;
+
+operator -(X, Y: TM128)Z: TM128; vectorcall;
+  var
+    I: Integer;
+  begin
+    for I := 0 to 3 do
+      Z.M128_F32[I] := X.M128_F32[I] - Y.M128_F32[I];
+  end;
+
+{ - InputHVA goes on the stack because there are not enough free XMM registers to contain the entire argument
+  - A4 does NOT go on the stack and goes into an XMM register.
+}
+function HVATest(A1, A2, A3: TM128; InputHVA: THVA; A4: TM128; Op: Integer): THVA; vectorcall;
+  begin
+    { FIXME: There is an internal stack misalignment for A4, necessitating the
+      use of (V)MOVDQU instead of (V)MOVDQA in the compiled code. }
+    case Op of
+      1:
+        begin
+          HVATest.V1 := InputHVA.V1 + A1;
+          HVATest.V2 := InputHVA.V2 + A2;
+          HVATest.V3 := InputHVA.V3 + A3;
+          HVATest.V4 := InputHVA.V4 + A4;
+        end;
+      2:
+        begin
+          HVATest.V1 := InputHVA.V1 - A1;
+          HVATest.V2 := InputHVA.V2 - A2;
+          HVATest.V3 := InputHVA.V3 - A3;
+          HVATest.V4 := InputHVA.V4 - A4;
+        end;
+      else
+        begin
+          HVATest.V1 := InputHVA.V1 + A1;
+          HVATest.V2 := InputHVA.V2 - A2;
+          HVATest.V3 := InputHVA.V3 + A3;
+          HVATest.V4 := InputHVA.V4 - A4;
+        end;
+    end;
+  end;
+
+var
+  B1, B2, B3, B4: TM128; HVA, AddRes, SubRes, MixRes, AddExp, SubExp, MixExp: THVA; I: Integer;
+begin
+  B1.M128_F32[0] := 1.0;        B1.M128_F32[1] := 2.0;        B1.M128_F32[2] := 3.0;        B1.M128_F32[3] := 4.0;
+  B2.M128_F32[0] := 5.0;        B2.M128_F32[1] := 6.0;        B2.M128_F32[2] := 7.0;        B2.M128_F32[3] := 8.0;
+  B3.M128_F32[0] := 9.0;        B3.M128_F32[1] := 10.0;       B3.M128_F32[2] := 11.0;       B3.M128_F32[3] := 12.0;
+  B4.M128_F32[0] := 13.0;       B4.M128_F32[1] := 14.0;       B4.M128_F32[2] := 15.0;       B4.M128_F32[3] := 16.0;
+
+  HVA.V1.M128_F32[0] := 10.0;   HVA.V1.M128_F32[1] := 20.0;   HVA.V1.M128_F32[2] := 30.0;   HVA.V1.M128_F32[3] := 40.0;
+  HVA.V2.M128_F32[0] := 50.0;   HVA.V2.M128_F32[1] := 60.0;   HVA.V2.M128_F32[2] := 70.0;   HVA.V2.M128_F32[3] := 80.0;
+  HVA.V3.M128_F32[0] := 90.0;   HVA.V3.M128_F32[1] := 100.0;  HVA.V3.M128_F32[2] := 110.0;  HVA.V3.M128_F32[3] := 120.0;
+  HVA.V4.M128_F32[0] := 130.0;  HVA.V4.M128_F32[1] := 140.0;  HVA.V4.M128_F32[2] := 150.0;  HVA.V4.M128_F32[3] := 160.0;
+
+  AddExp.V1.M128_F32[0] := 11.0;   AddExp.V1.M128_F32[1] := 22.0;   AddExp.V1.M128_F32[2] := 33.0;   AddExp.V1.M128_F32[3] := 44.0;
+  AddExp.V2.M128_F32[0] := 55.0;   AddExp.V2.M128_F32[1] := 66.0;   AddExp.V2.M128_F32[2] := 77.0;   AddExp.V2.M128_F32[3] := 88.0;
+  AddExp.V3.M128_F32[0] := 99.0;   AddExp.V3.M128_F32[1] := 110.0;  AddExp.V3.M128_F32[2] := 121.0;  AddExp.V3.M128_F32[3] := 132.0;
+  AddExp.V4.M128_F32[0] := 143.0;  AddExp.V4.M128_F32[1] := 154.0;  AddExp.V4.M128_F32[2] := 165.0;  AddExp.V4.M128_F32[3] := 176.0;
+
+  SubExp.V1.M128_F32[0] := 9.0;    SubExp.V1.M128_F32[1] := 18.0;   SubExp.V1.M128_F32[2] := 27.0;   SubExp.V1.M128_F32[3] := 36.0;
+  SubExp.V2.M128_F32[0] := 45.0;   SubExp.V2.M128_F32[1] := 54.0;   SubExp.V2.M128_F32[2] := 63.0;   SubExp.V2.M128_F32[3] := 72.0;
+  SubExp.V3.M128_F32[0] := 81.0;   SubExp.V3.M128_F32[1] := 90.0;   SubExp.V3.M128_F32[2] := 99.0;   SubExp.V3.M128_F32[3] := 108.0;
+  SubExp.V4.M128_F32[0] := 117.0;  SubExp.V4.M128_F32[1] := 126.0;  SubExp.V4.M128_F32[2] := 135.0;  SubExp.V4.M128_F32[3] := 144.0;
+
+  MixExp.V1.M128_F32[0] := 11.0;   MixExp.V1.M128_F32[1] := 22.0;   MixExp.V1.M128_F32[2] := 33.0;   MixExp.V1.M128_F32[3] := 44.0;
+  MixExp.V2.M128_F32[0] := 45.0;   MixExp.V2.M128_F32[1] := 54.0;   MixExp.V2.M128_F32[2] := 63.0;   MixExp.V2.M128_F32[3] := 72.0;
+  MixExp.V3.M128_F32[0] := 99.0;   MixExp.V3.M128_F32[1] := 110.0;  MixExp.V3.M128_F32[2] := 121.0;  MixExp.V3.M128_F32[3] := 132.0;
+  MixExp.V4.M128_F32[0] := 117.0;  MixExp.V4.M128_F32[1] := 126.0;  MixExp.V4.M128_F32[2] := 135.0;  MixExp.V4.M128_F32[3] := 144.0;
+
+  WriteLn('    B1: ', B1.M128_F32[0], ',', B1.M128_F32[1], ',', B1.M128_F32[2], ',', B1.M128_F32[3]);
+  WriteLn('    B2: ', B2.M128_F32[0], ',', B2.M128_F32[1], ',', B2.M128_F32[2], ',', B2.M128_F32[3]);
+  WriteLn('    B3: ', B3.M128_F32[0], ',', B3.M128_F32[1], ',', B3.M128_F32[2], ',', B3.M128_F32[3]);
+  WriteLn('    B4: ', B4.M128_F32[0], ',', B4.M128_F32[1], ',', B4.M128_F32[2], ',', B4.M128_F32[3]);
+  WriteLn('HVA.V1: ', HVA.V1.M128_F32[0], ',', HVA.V1.M128_F32[1], ',', HVA.V1.M128_F32[2], ',', HVA.V1.M128_F32[3]);
+  WriteLn('HVA.V2: ', HVA.V2.M128_F32[0], ',', HVA.V2.M128_F32[1], ',', HVA.V2.M128_F32[2], ',', HVA.V2.M128_F32[3]);
+  WriteLn('HVA.V3: ', HVA.V3.M128_F32[0], ',', HVA.V3.M128_F32[1], ',', HVA.V3.M128_F32[2], ',', HVA.V3.M128_F32[3]);
+  WriteLn('HVA.V4: ', HVA.V4.M128_F32[0], ',', HVA.V4.M128_F32[1], ',', HVA.V4.M128_F32[2], ',', HVA.V4.M128_F32[3]);
+  AddRes := HVATest(B1, B2, B3, HVA, B4, 1);
+  SubRes := HVATest(B1, B2, B3, HVA, B4, 2);
+  MixRes := HVATest(B1, B2, B3, HVA, B4, 0);
+  WriteLn('----');
+  WriteLn('AddRes.V1: ', AddRes.V1.M128_F32[0], ',', AddRes.V1.M128_F32[1], ',', AddRes.V1.M128_F32[2], ',', AddRes.V1.M128_F32[3]);
+  WriteLn('AddRes.V2: ', AddRes.V2.M128_F32[0], ',', AddRes.V2.M128_F32[1], ',', AddRes.V2.M128_F32[2], ',', AddRes.V2.M128_F32[3]);
+  WriteLn('AddRes.V3: ', AddRes.V3.M128_F32[0], ',', AddRes.V3.M128_F32[1], ',', AddRes.V3.M128_F32[2], ',', AddRes.V3.M128_F32[3]);
+  WriteLn('AddRes.V4: ', AddRes.V4.M128_F32[0], ',', AddRes.V4.M128_F32[1], ',', AddRes.V4.M128_F32[2], ',', AddRes.V4.M128_F32[3]);
+  WriteLn();
+  WriteLn('AddExp.V1: ', AddExp.V1.M128_F32[0], ',', AddExp.V1.M128_F32[1], ',', AddExp.V1.M128_F32[2], ',', AddExp.V1.M128_F32[3]);
+  WriteLn('AddExp.V2: ', AddExp.V2.M128_F32[0], ',', AddExp.V2.M128_F32[1], ',', AddExp.V2.M128_F32[2], ',', AddExp.V2.M128_F32[3]);
+  WriteLn('AddExp.V3: ', AddExp.V3.M128_F32[0], ',', AddExp.V3.M128_F32[1], ',', AddExp.V3.M128_F32[2], ',', AddExp.V3.M128_F32[3]);
+  WriteLn('AddExp.V4: ', AddExp.V4.M128_F32[0], ',', AddExp.V4.M128_F32[1], ',', AddExp.V4.M128_F32[2], ',', AddExp.V4.M128_F32[3]);
+  WriteLn('----');
+  WriteLn('SubRes.V1: ', SubRes.V1.M128_F32[0], ',', SubRes.V1.M128_F32[1], ',', SubRes.V1.M128_F32[2], ',', SubRes.V1.M128_F32[3]);
+  WriteLn('SubRes.V2: ', SubRes.V2.M128_F32[0], ',', SubRes.V2.M128_F32[1], ',', SubRes.V2.M128_F32[2], ',', SubRes.V2.M128_F32[3]);
+  WriteLn('SubRes.V3: ', SubRes.V3.M128_F32[0], ',', SubRes.V3.M128_F32[1], ',', SubRes.V3.M128_F32[2], ',', SubRes.V3.M128_F32[3]);
+  WriteLn('SubRes.V4: ', SubRes.V4.M128_F32[0], ',', SubRes.V4.M128_F32[1], ',', SubRes.V4.M128_F32[2], ',', SubRes.V4.M128_F32[3]);
+  WriteLn();
+  WriteLn('SubExp.V1: ', SubExp.V1.M128_F32[0], ',', SubExp.V1.M128_F32[1], ',', SubExp.V1.M128_F32[2], ',', SubExp.V1.M128_F32[3]);
+  WriteLn('SubExp.V2: ', SubExp.V2.M128_F32[0], ',', SubExp.V2.M128_F32[1], ',', SubExp.V2.M128_F32[2], ',', SubExp.V2.M128_F32[3]);
+  WriteLn('SubExp.V3: ', SubExp.V3.M128_F32[0], ',', SubExp.V3.M128_F32[1], ',', SubExp.V3.M128_F32[2], ',', SubExp.V3.M128_F32[3]);
+  WriteLn('SubExp.V4: ', SubExp.V4.M128_F32[0], ',', SubExp.V4.M128_F32[1], ',', SubExp.V4.M128_F32[2], ',', SubExp.V4.M128_F32[3]);
+  WriteLn('----');
+  WriteLn('MixRes.V1: ', MixRes.V1.M128_F32[0], ',', MixRes.V1.M128_F32[1], ',', MixRes.V1.M128_F32[2], ',', MixRes.V1.M128_F32[3]);
+  WriteLn('MixRes.V2: ', MixRes.V2.M128_F32[0], ',', MixRes.V2.M128_F32[1], ',', MixRes.V2.M128_F32[2], ',', MixRes.V2.M128_F32[3]);
+  WriteLn('MixRes.V3: ', MixRes.V3.M128_F32[0], ',', MixRes.V3.M128_F32[1], ',', MixRes.V3.M128_F32[2], ',', MixRes.V3.M128_F32[3]);
+  WriteLn('MixRes.V4: ', MixRes.V4.M128_F32[0], ',', MixRes.V4.M128_F32[1], ',', MixRes.V4.M128_F32[2], ',', MixRes.V4.M128_F32[3]);
+  WriteLn();
+  WriteLn('MixExp.V1: ', MixExp.V1.M128_F32[0], ',', MixExp.V1.M128_F32[1], ',', MixExp.V1.M128_F32[2], ',', MixExp.V1.M128_F32[3]);
+  WriteLn('MixExp.V2: ', MixExp.V2.M128_F32[0], ',', MixExp.V2.M128_F32[1], ',', MixExp.V2.M128_F32[2], ',', MixExp.V2.M128_F32[3]);
+  WriteLn('MixExp.V3: ', MixExp.V3.M128_F32[0], ',', MixExp.V3.M128_F32[1], ',', MixExp.V3.M128_F32[2], ',', MixExp.V3.M128_F32[3]);
+  WriteLn('MixExp.V4: ', MixExp.V4.M128_F32[0], ',', MixExp.V4.M128_F32[1], ',', MixExp.V4.M128_F32[2], ',', MixExp.V4.M128_F32[3]);
+
+  for I := 0 to 3 do
+    begin
+      if AddRes.V1.M128_F32[I] <> AddExp.V1.M128_F32[I] then
+        begin
+          WriteLn('FAILURE on AddRes.V1.M128_F32[', I, ']');
+          Halt(1);
+        end;
+      if SubRes.V1.M128_F32[I] <> SubExp.V1.M128_F32[I] then
+        begin
+          WriteLn('FAILURE on SubRes.V1.M128_F32[', I, ']');
+          Halt(1);
+        end;
+      if MixRes.V1.M128_F32[I] <> MixExp.V1.M128_F32[I] then
+        begin
+          WriteLn('FAILURE on MixRes.V1.M128_F32[', I, ']');
+          Halt(1);
+        end;
+    end;
+
+  WriteLn('ok');
+end.

+ 158 - 0
tests/test/cg/tvectorcall3.pp

@@ -0,0 +1,158 @@
+{ %CPU=x86_64 } 
+program vectorcall_stack_test;
+
+{$IFNDEF CPUX86_64}
+  {$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
+{$ENDIF}
+
+{ This program can be compiled on Linux, and all the vectorcall
+  routines should work the same, including the assembler routine.
+  'vectorcall' should be ignored by the compiler on this platform. }
+
+{$push}
+{$CODEALIGN RECORDMIN=16}
+{$PACKRECORDS C}
+type
+  TM128 = record
+    case Byte of
+      0: (M128_F32: array[0..3] of Single);
+      1: (M128_F64: array[0..1] of Double);
+  end;
+
+{$CODEALIGN RECORDMIN=32}
+{$PACKRECORDS C}
+type
+  TM256 = record
+    case Byte of
+      0: (M256_F32: array[0..7] of Single);
+      1: (M256_F64: array[0..3] of Double);
+      2: (M256_M128: array[0..1] of TM128);
+  end;
+{$pop}
+
+  TVector4f = packed record
+    case Byte of
+      0: (M128: TM128);
+      1: (X, Y, Z, W: Single);
+  end;
+
+  TVectorPair4f = packed record
+    case Byte of
+      0: (M256: TM256);
+      1: (V: array[0..1] of TVector4f);
+      2: (X1, Y1, Z1, W1, X2, Y2, Z2, W2: Single);
+  end;
+
+function TestFloat(TP: Single): Single; vectorcall; { vectorcall should have no effect on how this function behaves }
+begin
+  TestFloat := TP * 1.5;
+end;
+
+function AddVectors(V1, V2: TVector4f): TVector4f; vectorcall;
+begin
+  AddVectors.X := V1.X + V2.X;
+  AddVectors.Y := V1.Y + V2.Y;
+  AddVectors.Z := V1.Z + V2.Z;
+  AddVectors.W := V1.W + V2.W;
+end;
+
+{$ASMMODE Intel}
+function AddVectorsAsm(V1, V2: TVector4f): TVector4f; vectorcall; assembler; nostackframe; inline; { The inline is for a future test }
+asm
+  ADDPS XMM0, XMM1
+end;
+
+{ Note: V1, V2 and the result will go on the stack until FPC fully supports 256-bit vectors }
+function AddVectors(V1, V2: TVectorPair4f): TVectorPair4f; vectorcall;
+var
+  C: Integer;
+begin
+  for C := 0 to 1 do
+  begin
+    AddVectors.V[C].X := V1.V[C].X + V2.V[C].X;
+    AddVectors.V[C].Y := V1.V[C].Y + V2.V[C].Y;
+    AddVectors.V[C].Z := V1.V[C].Z + V2.V[C].Z;
+    AddVectors.V[C].W := V1.V[C].W + V2.V[C].W;
+  end;
+end;
+
+var
+  Vecs: array[0..1] of TVector4f; Res, ResAsm, Exp: TVector4f;
+  Pairs: array[0..1] of TVectorPair4f; ResPair, ExpPair: TVectorPair4f;
+  I: Integer;
+begin
+  FillDWord(Vecs[0], 0, 8);
+  Vecs[0].X := TestFloat(2.0);
+  Vecs[0].Y := 1.0;
+  Vecs[0].Z := -4.0;
+  Vecs[0].W := 1.0;
+
+  Vecs[1].X := 0.0;
+  Vecs[1].Y := -2.0;
+  Vecs[1].Z := TestFloat(4.0);
+  Vecs[1].W := 0.0;
+
+  Exp.X := 3.0;
+  Exp.Y := -1.0;
+  Exp.Z := 2.0;
+  Exp.W := 1.0;
+
+  Pairs[0].V[0].X := 1.0;     Pairs[0].V[1].X := 5.0;
+  Pairs[0].V[0].Y := 2.0;     Pairs[0].V[1].Y := 6.0;
+  Pairs[0].V[0].Z := 3.0;     Pairs[0].V[1].Z := 7.0;
+  Pairs[0].V[0].W := 4.0;     Pairs[0].V[1].W := 8.0;
+
+  Pairs[1].V[0].X := 9.0;     Pairs[1].V[1].X := 13.0;
+  Pairs[1].V[0].Y := 10.0;    Pairs[1].V[1].Y := 14.0;
+  Pairs[1].V[0].Z := 11.0;    Pairs[1].V[1].Z := 15.0;
+  Pairs[1].V[0].W := 12.0;    Pairs[1].V[1].W := 16.0;
+
+  ExpPair.V[0].X := 10.0;     ExpPair.V[1].X := 18.0;
+  ExpPair.V[0].Y := 12.0;     ExpPair.V[1].Y := 20.0;
+  ExpPair.V[0].Z := 14.0;     ExpPair.V[1].Z := 22.0;
+  ExpPair.V[0].W := 16.0;     ExpPair.V[1].W := 24.0;
+
+  WriteLn('Vecs[0]  = (', Vecs[0].X, ', ', Vecs[0].Y, ', ', Vecs[0].Z, ', ', Vecs[0].W, ')');
+  WriteLn('Vecs[1]  = (', Vecs[1].X, ', ', Vecs[1].Y, ', ', Vecs[1].Z, ', ', Vecs[1].W, ')');
+
+  Res := AddVectors(Vecs[0], Vecs[1]);
+  ResAsm := AddVectorsAsm(Vecs[0], Vecs[1]);
+
+  WriteLn('Result   = (', Res.X, ', ', Res.Y, ', ', Res.Z, ', ', Res.W, ')');
+  WriteLn('ResAsm   = (', ResAsm.X, ', ', ResAsm.Y, ', ', ResAsm.Z, ', ', ResAsm.W, ')');
+  WriteLn('Expected = (', Exp.X, ', ', Exp.Y, ', ', Exp.Z, ', ', Exp.W, ')');
+
+  WriteLn('Pairs[0] = (', Pairs[0].V[0].X, ', ', Pairs[0].V[0].Y, ', ', Pairs[0].V[0].Z, ', ', Pairs[0].V[0].W, ', ', Pairs[0].V[1].X, ', ', Pairs[0].V[1].Y, ', ', Pairs[0].V[1].Z, ', ', Pairs[0].V[1].W, ')');
+  WriteLn('Pairs[1] = (', Pairs[1].V[0].X, ', ', Pairs[1].V[0].Y, ', ', Pairs[1].V[0].Z, ', ', Pairs[1].V[0].W, ', ', Pairs[1].V[1].X, ', ', Pairs[1].V[1].Y, ', ', Pairs[1].V[1].Z, ', ', Pairs[1].V[1].W, ')');
+
+  ResPair := AddVectors(Pairs[0], Pairs[1]);
+
+  WriteLn('ResPair  = (', ResPair.V[0].X, ', ', ResPair.V[0].Y, ', ', ResPair.V[0].Z, ', ', ResPair.V[0].W, ', ', ResPair.V[1].X, ', ', ResPair.V[1].Y, ', ', ResPair.V[1].Z, ', ', ResPair.V[1].W, ')');
+  WriteLn('Expected = (', ExpPair.V[0].X, ', ', ExpPair.V[0].Y, ', ', ExpPair.V[0].Z, ', ', ExpPair.V[0].W, ', ', ExpPair.V[1].X, ', ', ExpPair.V[1].Y, ', ', ExpPair.V[1].Z, ', ', ExpPair.V[1].W, ')');
+
+  for I := 0 to 3 do
+  begin
+    if Res.M128.M128_F32[I] <> Exp.M128.M128_F32[I] then
+    begin
+      WriteLn('FAILURE on Res.M128.M128_F32[', I, ']');
+      Halt(1);
+    end;
+
+    if ResAsm.M128.M128_F32[I] <> Exp.M128.M128_F32[I] then
+    begin
+      WriteLn('FAILURE on ResAsm.M128.M128_F32[', I, ']');
+      Halt(1);
+    end;
+  end;
+
+  for I := 0 to 7 do
+  begin
+    if ResPair.M256.M256_F32[I] <> ExpPair.M256.M256_F32[I] then
+    begin
+      WriteLn('FAILURE on ResPair.M256.M256_F32[', I, ']');
+      Halt(1);
+    end;
+  end;
+
+  WriteLn('ok');
+end.

Энэ ялгаанд хэт олон файл өөрчлөгдсөн тул зарим файлыг харуулаагүй болно