@@ -158,20 +158,26 @@ unit cgx86;
TCGSize2OpSize: Array[tcgsize] of topsize =
(S_NO,S_B,S_W,S_L,S_Q,S_XMM,S_B,S_W,S_L,S_Q,S_XMM,
S_FS,S_FL,S_FX,S_IQ,S_FXX,
- S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
- S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
+ S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
+ S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
+ S_NO,S_XMM,S_YMM,S_ZMM,
+ S_NO,S_XMM,S_YMM,S_ZMM);
{$elseif defined(i386)}
TCGSize2OpSize: Array[tcgsize] of topsize =
(S_NO,S_B,S_W,S_L,S_L,S_T,S_B,S_W,S_L,S_L,S_L,
S_FS,S_FL,S_FX,S_IQ,S_FXX,
- S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
- S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
+ S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
+ S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
+ S_NO,S_XMM,S_YMM,S_ZMM,
+ S_NO,S_XMM,S_YMM,S_ZMM);
{$elseif defined(i8086)}
TCGSize2OpSize: Array[tcgsize] of topsize =
(S_NO,S_B,S_W,S_W,S_W,S_T,S_B,S_W,S_W,S_W,S_W,
S_FS,S_FL,S_FX,S_IQ,S_FXX,
- S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
- S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
+ S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
+ S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
+ S_NO,S_XMM,S_YMM,S_ZMM,
+ S_NO,S_XMM,S_YMM,S_ZMM);
{$endif}

{$ifndef NOTARGETWIN}
@@ -185,6 +191,9 @@ unit cgx86;
{ returns true, if the compiler should use leave instead of mov/pop }
function UseLeave: boolean;

+ { Gets the byte alignment of a reference }
+ function GetRefAlignment(ref: treference): Byte;
+
implementation

uses
@@ -225,6 +234,22 @@ unit cgx86;
{$endif}
end;

+ function GetRefAlignment(ref: treference): Byte; {$IFDEF USEINLINE}inline;{$ENDIF}
+ begin
+{$ifdef x86_64}
+ { The stack pointer and base pointer will be aligned to 16-byte boundaries if the machine code is well-behaved }
+ if (ref.base = NR_RSP) or (ref.base = NR_RBP) then
+ begin
+ if (ref.index = NR_NO) and ((ref.offset mod 16) = 0) then
+ Result := 16
+ else
+ Result := ref.alignment;
+ end
+ else
+{$endif x86_64}
+ Result := ref.alignment;
+ end;
+
const
TOpCG2AsmOp: Array[topcg] of TAsmOp = (A_NONE,A_MOV,A_ADD,A_AND,A_DIV,
A_IDIV,A_IMUL,A_MUL,A_NEG,A_NOT,A_OR,
@@ -268,8 +293,19 @@ unit cgx86;
result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS);
OS_M64:
result:=rg[R_MMREGISTER].getregister(list,R_SUBQ);
- OS_M128:
- result:=rg[R_MMREGISTER].getregister(list,R_SUBMMWHOLE);
+ OS_M128,
+ OS_F128,
+ OS_MF128,
+ OS_MD128:
+ result:=rg[R_MMREGISTER].getregister(list,R_SUBMMX); { R_SUBMMWHOLE seems a bit dangerous and ambiguous, so changed to R_SUBMMX. [Kit] }
+ OS_M256,
+ OS_MF256,
+ OS_MD256:
+ result:=rg[R_MMREGISTER].getregister(list,R_SUBMMY);
+ OS_M512,
+ OS_MF512,
+ OS_MD512:
+ result:=rg[R_MMREGISTER].getregister(list,R_SUBMMZ);
else
internalerror(200506041);
end;
@@ -1260,13 +1296,13 @@ unit cgx86;
(A_CVTSD2SS,A_MOVSD,A_NONE,A_NONE,A_NONE),
(A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
(A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
- (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
+ (A_NONE,A_NONE,A_NONE,A_NONE,A_MOVAPS));
convertopavx : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
(A_VMOVSS,A_VCVTSS2SD,A_NONE,A_NONE,A_NONE),
(A_VCVTSD2SS,A_VMOVSD,A_NONE,A_NONE,A_NONE),
(A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
(A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
- (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
+ (A_NONE,A_NONE,A_NONE,A_NONE,A_VMOVAPS));
begin
{ we can have OS_F32/OS_F64 (record in function result/LOC_MMREGISTER) to
OS_32/OS_64 (record in memory/LOC_REFERENCE) }
@@ -1288,13 +1324,33 @@ unit cgx86;
end
{ we can have OS_M64 (record in function result/LOC_MMREGISTER) to
OS_64 (record in memory/LOC_REFERENCE) }
- else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) and
- (fromsize=OS_M64) then
+ else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) then
begin
- if UseAVX then
- result:=A_VMOVQ
- else
- result:=A_MOVQ;
+ case fromsize of
+ OS_M64:
+ { we can have OS_M64 (record in function result/LOC_MMREGISTER) to
+ OS_64 (record in memory/LOC_REFERENCE) }
+ if UseAVX then
+ result:=A_VMOVQ
+ else
+ result:=A_MOVQ;
+ OS_M128:
+ { 128-bit aligned vector }
+ if UseAVX then
+ result:=A_VMOVAPS
+ else
+ result:=A_MOVAPS;
+ OS_M256,
+ OS_M512:
+ { 256-bit or 512-bit aligned vector }
+ if UseAVX then
+ result:=A_VMOVAPS
+ else
+ { SSE does not support 256-bit or 512-bit vectors }
+ InternalError(2018012930);
+ else
+ InternalError(2018012920);
+ end;
end
else
internalerror(2010060104);
@@ -1313,12 +1369,14 @@ unit cgx86;
if fromsize=tosize then
{ needs correct size in case of spilling }
case fromsize of
- OS_F32:
+ OS_F32,
+ OS_MF128:
if UseAVX then
instr:=taicpu.op_reg_reg(A_VMOVAPS,S_NO,reg1,reg2)
else
instr:=taicpu.op_reg_reg(A_MOVAPS,S_NO,reg1,reg2);
- OS_F64:
+ OS_F64,
+ OS_MD128:
if UseAVX then
instr:=taicpu.op_reg_reg(A_VMOVAPD,S_NO,reg1,reg2)
else
@@ -1328,6 +1386,32 @@ unit cgx86;
instr:=taicpu.op_reg_reg(A_VMOVQ,S_NO,reg1,reg2)
else
instr:=taicpu.op_reg_reg(A_MOVQ,S_NO,reg1,reg2);
+ OS_M128, OS_MS128:
+ if UseAVX then
+ instr:=taicpu.op_reg_reg(A_VMOVDQA,S_NO,reg1,reg2)
+ else
+ instr:=taicpu.op_reg_reg(A_MOVDQA,S_NO,reg1,reg2);
+ OS_MF256,
+ OS_MF512:
+ if UseAVX then
+ instr:=taicpu.op_reg_reg(A_VMOVAPS,S_NO,reg1,reg2)
+ else
+ { SSE doesn't support 256-bit or 512-bit vectors }
+ InternalError(2018012931);
+ OS_MD256,
+ OS_MD512:
+ if UseAVX then
+ instr:=taicpu.op_reg_reg(A_VMOVAPD,S_NO,reg1,reg2)
+ else
+ { SSE doesn't support 256-bit or 512-bit vectors }
+ InternalError(2018012932);
+ OS_M256, OS_MS256,
+ OS_M512, OS_MS512:
+ if UseAVX then
+ instr:=taicpu.op_reg_reg(A_VMOVDQA,S_NO,reg1,reg2)
+ else
+ { SSE doesn't support 256-bit or 512-bit vectors }
+ InternalError(2018012933);
else
internalerror(2006091201);
end
@@ -1385,15 +1469,152 @@ unit cgx86;
make_simple_ref(list,tmpref);
if shuffle=nil then
begin
- if fromsize=OS_M64 then
- list.concat(taicpu.op_ref_reg(A_MOVQ,S_NO,tmpref,reg))
- else
-{$ifdef x86_64}
- { x86-64 has always properly aligned data }
- list.concat(taicpu.op_ref_reg(A_MOVDQA,S_NO,tmpref,reg));
-{$else x86_64}
- list.concat(taicpu.op_ref_reg(A_MOVDQU,S_NO,tmpref,reg));
-{$endif x86_64}
+ case fromsize of
+ OS_F32:
+ if UseAVX then
+ op := A_VMOVSS
+ else
+ op := A_MOVSS;
+ OS_F64:
+ if UseAVX then
+ op := A_VMOVSD
+ else
+ op := A_MOVSD;
+ OS_M32, OS_32, OS_S32:
+ if UseAVX then
+ op := A_VMOVD
+ else
+ op := A_MOVD;
+ OS_M64, OS_64, OS_S64:
+ if UseAVX then
+ op := A_VMOVQ
+ else
+ op := A_MOVQ;
+ OS_MF128:
+ { Use XMM transfer of packed singles }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 16 then
+ op := A_VMOVAPS
+ else
+ op := A_VMOVUPS
+ end
+ else
+ begin
+ if GetRefAlignment(tmpref) = 16 then
+ op := A_MOVAPS
+ else
+ op := A_MOVUPS
+ end;
+ OS_MD128:
+ { Use XMM transfer of packed doubles }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 16 then
+ op := A_VMOVAPD
+ else
+ op := A_VMOVUPD
+ end
+ else
+ begin
+ if GetRefAlignment(tmpref) = 16 then
+ op := A_MOVAPD
+ else
+ op := A_MOVUPD
+ end;
+ OS_M128, OS_MS128:
+ { Use XMM integer transfer }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 16 then
+ op := A_VMOVDQA
+ else
+ op := A_VMOVDQU
+ end
+ else
+ begin
+ if GetRefAlignment(tmpref) = 16 then
+ op := A_MOVDQA
+ else
+ op := A_MOVDQU
+ end;
+ OS_MF256:
+ { Use YMM transfer of packed singles }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 32 then
+ op := A_VMOVAPS
+ else
+ op := A_VMOVUPS
+ end
+ else
+ { SSE doesn't support 256-bit vectors }
+ InternalError(2018012934);
+ OS_MD256:
+ { Use YMM transfer of packed doubles }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 32 then
+ op := A_VMOVAPD
+ else
+ op := A_VMOVUPD
+ end
+ else
+ { SSE doesn't support 256-bit vectors }
+ InternalError(2018012935);
+ OS_M256, OS_MS256:
+ { Use YMM integer transfer }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 32 then
+ op := A_VMOVDQA
+ else
+ op := A_VMOVDQU
+ end
+ else
+ { SSE doesn't support 256-bit vectors }
+ InternalError(2018012936);
+ OS_MF512:
+ { Use ZMM transfer of packed singles }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 64 then
+ op := A_VMOVAPS
+ else
+ op := A_VMOVUPS
+ end
+ else
+ { SSE doesn't support 512-bit vectors }
+ InternalError(2018012937);
+ OS_MD512:
+ { Use ZMM transfer of packed doubles }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 64 then
+ op := A_VMOVAPD
+ else
+ op := A_VMOVUPD
+ end
+ else
+ { SSE doesn't support 512-bit vectors }
+ InternalError(2018012938);
+ OS_M512, OS_MS512:
+ { Use ZMM integer transfer }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 64 then
+ op := A_VMOVDQA
+ else
+ op := A_VMOVDQU
+ end
+ else
+ { SSE doesn't support 512-bit vectors }
+ InternalError(2018012939);
+ else
+ { No valid transfer command available }
+ internalerror(2017121410);
+ end;
+ list.concat(taicpu.op_ref_reg(op,S_NO,tmpref,reg));
end
else if shufflescalar(shuffle) then
begin
@@ -1415,20 +1636,149 @@ unit cgx86;
hreg : tregister;
tmpref : treference;
op : tasmop;
+
begin
tmpref:=ref;
make_simple_ref(list,tmpref);
if shuffle=nil then
begin
- if fromsize=OS_M64 then
- list.concat(taicpu.op_reg_ref(A_MOVQ,S_NO,reg,tmpref))
- else
-{$ifdef x86_64}
- { x86-64 has always properly aligned data }
- list.concat(taicpu.op_reg_ref(A_MOVDQA,S_NO,reg,tmpref))
-{$else x86_64}
- list.concat(taicpu.op_reg_ref(A_MOVDQU,S_NO,reg,tmpref))
-{$endif x86_64}
+ case fromsize of
+ OS_F32:
+ if UseAVX then
+ op := A_VMOVSS
+ else
+ op := A_MOVSS;
+ OS_F64:
+ if UseAVX then
+ op := A_VMOVSD
+ else
+ op := A_MOVSD;
+ OS_M32, OS_32, OS_S32:
+ if UseAVX then
+ op := A_VMOVD
+ else
+ op := A_MOVD;
+ OS_M64, OS_64, OS_S64:
+ if UseAVX then
+ op := A_VMOVQ
+ else
+ op := A_MOVQ;
+ OS_MF128:
+ { Use XMM transfer of packed singles }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 16 then
+ op := A_VMOVAPS
+ else
+ op := A_VMOVUPS
+ end else
+ begin
+ if GetRefAlignment(tmpref) = 16 then
+ op := A_MOVAPS
+ else
+ op := A_MOVUPS
+ end;
+ OS_MD128:
+ { Use XMM transfer of packed doubles }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 16 then
+ op := A_VMOVAPD
+ else
+ op := A_VMOVUPD
+ end else
+ begin
+ if GetRefAlignment(tmpref) = 16 then
+ op := A_MOVAPD
+ else
+ op := A_MOVUPD
+ end;
+ OS_M128, OS_MS128:
+ { Use XMM integer transfer }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 16 then
+ op := A_VMOVDQA
+ else
+ op := A_VMOVDQU
+ end else
+ begin
+ if GetRefAlignment(tmpref) = 16 then
+ op := A_MOVDQA
+ else
+ op := A_MOVDQU
+ end;
+ OS_MF256:
+ { Use YMM transfer of packed singles }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 32 then
+ op := A_VMOVAPS
+ else
+ op := A_VMOVUPS
+ end else
+ { SSE doesn't support 256-bit vectors }
+ InternalError(2018012940);
+ OS_MD256:
+ { Use YMM transfer of packed doubles }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 32 then
+ op := A_VMOVAPD
+ else
+ op := A_VMOVUPD
+ end else
+ { SSE doesn't support 256-bit vectors }
+ InternalError(2018012941);
+ OS_M256, OS_MS256:
+ { Use YMM integer transfer }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 32 then
+ op := A_VMOVDQA
+ else
+ op := A_VMOVDQU
+ end else
+ { SSE doesn't support 256-bit vectors }
+ InternalError(2018012942);
+ OS_MF512:
+ { Use ZMM transfer of packed singles }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 64 then
+ op := A_VMOVAPS
+ else
+ op := A_VMOVUPS
+ end else
+ { SSE doesn't support 512-bit vectors }
+ InternalError(2018012943);
+ OS_MD512:
+ { Use ZMM transfer of packed doubles }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 64 then
+ op := A_VMOVAPD
+ else
+ op := A_VMOVUPD
+ end else
+ { SSE doesn't support 512-bit vectors }
+ InternalError(2018012944);
+ OS_M512, OS_MS512:
+ { Use ZMM integer transfer }
+ if UseAVX then
+ begin
+ if GetRefAlignment(tmpref) = 64 then
+ op := A_VMOVDQA
+ else
+ op := A_VMOVDQU
+ end else
+ { SSE doesn't support 512-bit vectors }
+ InternalError(2018012945);
+ else
+ { No valid transfer command available }
+ internalerror(2017121411);
+ end;
+ list.concat(taicpu.op_reg_ref(op,S_NO,reg,tmpref));
end
else if shufflescalar(shuffle) then
begin