7 жил өмнө · 31f78ea2b6
--- a/.gitattributes
+++ b/.gitattributes
@@ -11999,6 +11999,9 @@ tests/test/cg/ttryfin4.pp svneol=native#text/plain
 
				 tests/test/cg/ttryfin5.pp svneol=native#text/plain
			
 
				 tests/test/cg/tumin.pp svneol=native#text/plain
			
 
				 tests/test/cg/tvec.pp svneol=native#text/plain
			
 
				+tests/test/cg/tvectorcall1.pp svneol=native#text/pascal
			
 
				+tests/test/cg/tvectorcall2.pp svneol=native#text/pascal
			
 
				+tests/test/cg/tvectorcall3.pp svneol=native#text/pascal
			
 
				 tests/test/cg/uandorxorassign.pp svneol=native#text/plain
			
 
				 tests/test/cg/unegnotassign.pp svneol=native#text/plain
			
 
				 tests/test/cg/uprintf3.pp svneol=native#text/plain
			
--- a/compiler/cgbase.pas
+++ b/compiler/cgbase.pas
@@ -164,14 +164,18 @@ interface
 
				        { OS_NO is also used memory references with large data that can
			
 
				          not be loaded in a register directly }
			
 
				        TCgSize = (OS_NO,
			
 
				-                 { integer registers }
			
 
				-                  OS_8,OS_16,OS_32,OS_64,OS_128,OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,
			
 
				-                 { single,double,extended,comp,float128 }
			
 
				-                  OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
			
 
				+                  OS_8,   OS_16,   OS_32,   OS_64,   OS_128,
			
 
				+                  OS_S8,  OS_S16,  OS_S32,  OS_S64,  OS_S128,
			
 
				+                 { single, double, extended, comp, float128 }
			
 
				+                  OS_F32, OS_F64,  OS_F80,  OS_C64,  OS_F128,
			
 
				                  { multi-media sizes: split in byte, word, dword, ... }
			
 
				                  { entities, then the signed counterparts             }
			
 
				-                  OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,  
			
 
				-                  OS_MS8,OS_MS16,OS_MS32,OS_MS64,OS_MS128,OS_MS256 );  
			
 
				+                  OS_M8,  OS_M16,  OS_M32,  OS_M64,  OS_M128,  OS_M256,  OS_M512,
			
 
				+                  OS_MS8, OS_MS16, OS_MS32, OS_MS64, OS_MS128, OS_MS256, OS_MS512,
			
 
				+                 { multi-media sizes: single-precision floating-point }
			
 
				+                  OS_MF32, OS_MF128, OS_MF256, OS_MF512,
			
 
				+                 { multi-media sizes: double-precision floating-point }
			
 
				+                  OS_MD64, OS_MD128, OS_MD256, OS_MD512);
			
 
				 
			
 
				       { Register types }
			
 
				       TRegisterType = (
			
@@ -205,15 +209,16 @@ interface
 
				         { For Intel X86 AVX-Register }
			
 
				         R_SUBMMX,     { = 12; 128 BITS }
			
 
				         R_SUBMMY,     { = 13; 256 BITS }
			
 
				+        R_SUBMMZ,     { = 14; 512 BITS }
			
 
				         { Subregisters for the flags register (x86) }
			
 
				-        R_SUBFLAGCARRY,     { = 14; Carry flag }
			
 
				-        R_SUBFLAGPARITY,    { = 15; Parity flag }
			
 
				-        R_SUBFLAGAUXILIARY, { = 16; Auxiliary flag }
			
 
				-        R_SUBFLAGZERO,      { = 17; Zero flag }
			
 
				-        R_SUBFLAGSIGN,      { = 18; Sign flag }
			
 
				-        R_SUBFLAGOVERFLOW,  { = 19; Overflow flag }
			
 
				-        R_SUBFLAGINTERRUPT, { = 20; Interrupt enable flag }
			
 
				-        R_SUBFLAGDIRECTION  { = 21; Direction flag }
			
 
				+        R_SUBFLAGCARRY,     { = 15; Carry flag }
			
 
				+        R_SUBFLAGPARITY,    { = 16; Parity flag }
			
 
				+        R_SUBFLAGAUXILIARY, { = 17; Auxiliary flag }
			
 
				+        R_SUBFLAGZERO,      { = 18; Zero flag }
			
 
				+        R_SUBFLAGSIGN,      { = 19; Sign flag }
			
 
				+        R_SUBFLAGOVERFLOW,  { = 20; Overflow flag }
			
 
				+        R_SUBFLAGINTERRUPT, { = 21; Interrupt enable flag }
			
 
				+        R_SUBFLAGDIRECTION  { = 22; Direction flag }
			
 
				       );
			
 
				       TSubRegisterSet = set of TSubRegister;
			
 
				 
			
@@ -307,12 +312,19 @@ interface
 
				        NR_INVALID    = tregister($fffffffff);
			
 
				 
			
 
				        tcgsize2size : Array[tcgsize] of integer =
			
 
				+        (0,
			
 
				          { integer values }
			
 
				-        (0,1,2,4,8,16,1,2,4,8,16,
			
 
				+         1,  2,  4,  8, 16,
			
 
				+         1,  2,  4,  8, 16,
			
 
				          { floating point values }
			
 
				-         4,8,10,8,16,
			
 
				+         4,  8, 10,  8, 16,
			
 
				          { multimedia values }
			
 
				-         1,2,4,8,16,32,1,2,4,8,16,32); 
			
 
				+         1,  2,  4,  8, 16, 32, 64,
			
 
				+         1,  2,  4,  8, 16, 32, 64,
			
 
				+         { single-precision multimedia values }
			
 
				+         4, 16, 32, 64,
			
 
				+         { double-precision multimedia values }
			
 
				+         8, 16, 32, 64);
			
 
				 
			
 
				        tfloat2tcgsize: array[tfloattype] of tcgsize =
			
 
				          (OS_F32,OS_F64,OS_F80,OS_F80,OS_C64,OS_C64,OS_F128);
			
@@ -348,16 +360,25 @@ interface
 
				        { Table to convert tcgsize variables to the correspondending
			
 
				          unsigned types }
			
 
				        tcgsize2unsigned : array[tcgsize] of tcgsize = (OS_NO,
			
 
				-          OS_8,OS_16,OS_32,OS_64,OS_128,OS_8,OS_16,OS_32,OS_64,OS_128,
			
 
				-          OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
			
 
				-          OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,OS_M8,OS_M16,OS_M32,
			
 
				-          OS_M64,OS_M128,OS_M256);
			
 
				+         OS_8,    OS_16,   OS_32,   OS_64,   OS_128,
			
 
				+         OS_8,    OS_16,   OS_32,   OS_64,   OS_128,
			
 
				+
			
 
				+         OS_F32,  OS_F64,  OS_F80,  OS_C64,  OS_F128,
			
 
				+         OS_M8,   OS_M16,  OS_M32,  OS_M64,  OS_M128, OS_M256, OS_M512,
			
 
				+         OS_M8,   OS_M16,  OS_M32,  OS_M64,  OS_M128, OS_M256, OS_M512,
			
 
				+         OS_MF32, OS_MF128,OS_MF256,OS_MF512,
			
 
				+         OS_MD64, OS_MD128,OS_MD256,OS_MD512);
			
 
				+
			
 
				 
			
 
				        tcgsize2signed : array[tcgsize] of tcgsize = (OS_NO,
			
 
				-          OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,OS_S8,OS_S16,OS_S32,OS_S64,OS_S128,
			
 
				-          OS_F32,OS_F64,OS_F80,OS_C64,OS_F128,
			
 
				-          OS_M8,OS_M16,OS_M32,OS_M64,OS_M128,OS_M256,OS_M8,OS_M16,OS_M32,
			
 
				-          OS_M64,OS_M128,OS_M256);
			
 
				+         OS_S8,   OS_S16,  OS_S32,  OS_S64,  OS_S128,
			
 
				+         OS_S8,   OS_S16,  OS_S32,  OS_S64,  OS_S128,
			
 
				+
			
 
				+         OS_F32,  OS_F64,  OS_F80,  OS_C64,  OS_F128,
			
 
				+         OS_MS8,  OS_MS16, OS_MS32, OS_MS64, OS_MS128,OS_MS256,OS_MS512,
			
 
				+         OS_MS8,  OS_MS16, OS_MS32, OS_MS64, OS_MS128,OS_MS256,OS_MS512,
			
 
				+         OS_MF32, OS_MF128,OS_MF256,OS_MF512,
			
 
				+         OS_MD64, OS_MD128,OS_MD256,OS_MD512);
			
 
				 
			
 
				 
			
 
				        tcgloc2str : array[TCGLoc] of string[12] = (
			
@@ -404,6 +425,8 @@ interface
 
				     }
			
 
				     function int_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
			
 
				     function int_float_cgsize(const a: tcgint): tcgsize;
			
 
				+    function float_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
			
 
				+    function double_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
			
 
				 
			
 
				     function tcgsize2str(cgsize: tcgsize):string;
			
 
				 
			
@@ -660,6 +683,8 @@ implementation
 
				             result:=result+'mx';
			
 
				           R_SUBMMY:
			
 
				             result:=result+'my';
			
 
				+          R_SUBMMZ:
			
 
				+            result:=result+'mz';
			
 
				           else
			
 
				             internalerror(200308252);
			
 
				         end;
			
@@ -701,6 +726,39 @@ implementation
 
				       end;
			
 
				 
			
 
				 
			
 
				+    function float_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
			
 
				+      begin { Returns the single-precision multimedia size (OS_MFxx) whose tcgsize2size entry equals a }
			
 
				+        case a of
			
 
				+          4:
			
 
				+            result := OS_MF32;
			
 
				+          16:
			
 
				+            result := OS_MF128;
			
 
				+          32:
			
 
				+            result := OS_MF256;
			
 
				+          64:
			
 
				+            result := OS_MF512;
			
 
				+          else { no OS_MFxx size of this width: return whatever int_cgsize gives for a }
			
 
				+            result := int_cgsize(a);
			
 
				+        end;
			
 
				+      end;
			
 
				+
			
 
				+    function double_array_cgsize(const a: tcgint): tcgsize;{$ifdef USEINLINE}inline;{$endif}
			
 
				+      begin { Returns the double-precision multimedia size (OS_MDxx) whose tcgsize2size entry equals a }
			
 
				+        case a of
			
 
				+          8:
			
 
				+            result := OS_MD64;
			
 
				+          16:
			
 
				+            result := OS_MD128;
			
 
				+          32:
			
 
				+            result := OS_MD256;
			
 
				+          64:
			
 
				+            result := OS_MD512;
			
 
				+          else { no OS_MDxx size of this width: return whatever int_cgsize gives for a }
			
 
				+            result := int_cgsize(a);
			
 
				+        end;
			
 
				+      end;
			
 
				+
			
 
				+
			
 
				     function tcgsize2str(cgsize: tcgsize):string;
			
 
				       begin
			
 
				         Str(cgsize, Result);
			
--- a/compiler/defutil.pas
+++ b/compiler/defutil.pas
@@ -1338,7 +1338,24 @@ implementation
 
				           arraydef :
			
 
				             begin
			
 
				               if is_dynamic_array(def) or not is_special_array(def) then
			
 
				-                result := int_cgsize(def.size)
			
 
				+                begin
			
 
				+                  if (cs_support_vectors in current_settings.globalswitches) and is_vector(def) and ((TArrayDef(def).elementdef.typ = floatdef) and not (cs_fp_emulation in current_settings.moduleswitches)) then
			
 
				+                    begin
			
 
				+                      { Determine, based on the element's floating-point type and the
			
 
				+                        size of the array, whether it can be made into a vector }
			
 
				+                      case TFloatDef(TArrayDef(def).elementdef).floattype of { def is the array def; the float type lives on its element def }
			
 
				+                        s32real:
			
 
				+                          result := float_array_cgsize(def.size);
			
 
				+                        s64real:
			
 
				+                          result := double_array_cgsize(def.size);
			
 
				+                        else
			
 
				+                          { If not, fall back }
			
 
				+                          result := int_cgsize(def.size);
			
 
				+                      end;
			
 
				+                    end
			
 
				+                  else
			
 
				+                    result := int_cgsize(def.size);
			
 
				+                end
			
 
				               else
			
 
				                 result := OS_NO;
			
 
				             end;
			
@@ -1379,25 +1396,53 @@ implementation
 
				         case def.typ of
			
 
				           arraydef:
			
 
				             begin
			
 
				-              if tarraydef(def).elementdef.typ in [orddef,floatdef] then
			
 
				-                begin
			
 
				-                  { this is not correct, OS_MX normally mean that the vector
			
 
				-                    contains elements of size X. However, vectors themselves
			
 
				-                    can also have different sizes (e.g. a vector of 2 singles on
			
 
				-                    SSE) and the total size is currently more important }
			
 
				-                  case def.size of
			
 
				-                    1: result:=OS_M8;
			
 
				-                    2: result:=OS_M16;
			
 
				-                    4: result:=OS_M32;
			
 
				-                    8: result:=OS_M64;
			
 
				-                    16: result:=OS_M128;
			
 
				-                    32: result:=OS_M256;
			
 
				-                    else
			
 
				-                      internalerror(2013060103);
			
 
				+              case tarraydef(def).elementdef.typ of
			
 
				+                orddef:
			
 
				+                  begin
			
 
				+                    { this is not correct, OS_MX normally mean that the vector
			
 
				+                      contains elements of size X. However, vectors themselves
			
 
				+                      can also have different sizes (e.g. a vector of 2 singles on
			
 
				+                      SSE) and the total size is currently more important }
			
 
				+                    case def.size of
			
 
				+                      1: result:=OS_M8;
			
 
				+                      2: result:=OS_M16;
			
 
				+                      4: result:=OS_M32;
			
 
				+                      8: result:=OS_M64;
			
 
				+                      16: result:=OS_M128;
			
 
				+                      32: result:=OS_M256;
			
 
				+                      64: result:=OS_M512;
			
 
				+                      else
			
 
				+                        internalerror(2013060103);
			
 
				+                    end;
			
 
				                   end;
			
 
				-                end
			
 
				-              else
			
 
				-                result:=def_cgsize(def);
			
 
				+                floatdef:
			
 
				+                  begin
			
 
				+                    case TFloatDef(tarraydef(def).elementdef).floattype of
			
 
				+                      s32real:
			
 
				+                        case def.size of
			
 
				+                          4:  result:=OS_MF32;
			
 
				+                          16: result:=OS_MF128;
			
 
				+                          32: result:=OS_MF256;
			
 
				+                          64: result:=OS_MF512;
			
 
				+                          else
			
 
				+                            internalerror(2017121400);
			
 
				+                        end;
			
 
				+                      s64real:
			
 
				+                        case def.size of
			
 
				+                          8:  result:=OS_MD64;
			
 
				+                          16: result:=OS_MD128;
			
 
				+                          32: result:=OS_MD256;
			
 
				+                          64: result:=OS_MD512;
			
 
				+                          else
			
 
				+                            internalerror(2017121401);
			
 
				+                        end;
			
 
				+                      else
			
 
				+                        internalerror(2017121402);
			
 
				+                    end;
			
 
				+                  end;
			
 
				+                else
			
 
				+                  result:=def_cgsize(def);
			
 
				+              end;
			
 
				             end
			
 
				           else
			
 
				             result:=def_cgsize(def);
			
--- a/compiler/globals.pas
+++ b/compiler/globals.pas
@@ -1112,7 +1112,8 @@ implementation
 
				          'SYSV_ABI_DEFAULT',
			
 
				          'SYSV_ABI_CDECL',
			
 
				          'MS_ABI_DEFAULT',
			
 
				-         'MS_ABI_CDECL'
			
 
				+         'MS_ABI_CDECL',
			
 
				+         'VECTORCALL'
			
 
				         );
			
 
				       var
			
 
				         t  : tproccalloption;
			
--- a/compiler/globtype.pas
+++ b/compiler/globtype.pas
@@ -539,7 +539,9 @@ interface
 
				          pocall_sysv_abi_cdecl,
			
 
				          { for x86-64: forces Microsoft ABI (Pascal resp. C) }
			
 
				          pocall_ms_abi_default,
			
 
				-         pocall_ms_abi_cdecl
			
 
				+         pocall_ms_abi_cdecl,
			
 
				+         { for x86-64: Microsoft's "vectorcall" ABI }
			
 
				+         pocall_vectorcall
			
 
				        );
			
 
				        tproccalloptions = set of tproccalloption;
			
 
				 
			
@@ -560,9 +562,10 @@ interface
 
				            'Interrupt',
			
 
				            'HardFloat',
			
 
				            'SysV_ABI_Default',
			
 
				-           'MS_ABI_CDecl',
			
 
				+           'MS_ABI_CDecl', { TODO: Is this correct? This entry presumably belongs to pocall_sysv_abi_cdecl, so shouldn't it be SysV_ABI_CDecl? }
			
 
				            'MS_ABI_Default',
			
 
				-           'MS_ABI_CDecl'
			
 
				+           'MS_ABI_CDecl',
			
 
				+           'VectorCall'
			
 
				          );
			
 
				 
			
 
				        { Default calling convention }
			
--- a/compiler/hlcg2ll.pas
+++ b/compiler/hlcg2ll.pas
@@ -1538,6 +1538,8 @@ implementation
 
				               result:=OS_F32;
			
 
				             OS_64:
			
 
				               result:=OS_F64;
			
 
				+            OS_128:
			
 
				+              result:=OS_M128;
			
 
				           end;
			
 
				         end;
			
 
				     end;
			
--- a/compiler/i386/cpubase.inc
+++ b/compiler/i386/cpubase.inc
@@ -35,7 +35,8 @@
 
				         S_NEAR,S_FAR,S_SHORT,
			
 
				         S_T,
			
 
				         S_XMM,
			
 
				-        S_YMM
			
 
				+        S_YMM,
			
 
				+        S_ZMM
			
 
				       );
			
 
				 
			
 
				       TOpSizes = set of topsize;
			
--- a/compiler/i8086/cpubase.inc
+++ b/compiler/i8086/cpubase.inc
@@ -35,7 +35,8 @@
 
				         S_NEAR,S_FAR,S_SHORT,
			
 
				         S_T,
			
 
				         S_XMM,
			
 
				-        S_YMM
			
 
				+        S_YMM,
			
 
				+        S_ZMM
			
 
				       );
			
 
				 
			
 
				       TOpSizes = set of topsize;
			
--- a/compiler/ncgld.pas
+++ b/compiler/ncgld.pas
@@ -682,6 +682,7 @@ implementation
 
				 
			
 
				     procedure tcgassignmentnode.pass_generate_code;
			
 
				       var
			
 
				+         shuffle : pmmshuffle;
			
 
				          hlabel : tasmlabel;
			
 
				          href : treference;
			
 
				          releaseright : boolean;
			
@@ -968,22 +969,21 @@ implementation
 
				               LOC_MMREGISTER,
			
 
				               LOC_CMMREGISTER:
			
 
				                 begin
			
 
				-                  if left.resultdef.typ=arraydef then
			
 
				-                    begin
			
 
				-                    end
			
 
				+                  if (is_vector(left.resultdef)) then
			
 
				+                    shuffle := nil
			
 
				                   else
			
 
				-                    begin
			
 
				-                      case left.location.loc of
			
 
				-                        LOC_CMMREGISTER,
			
 
				-                        LOC_MMREGISTER:
			
 
				-                          hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.register,mms_movescalar);
			
 
				-                        LOC_REFERENCE,
			
 
				-                        LOC_CREFERENCE:
			
 
				-                          hlcg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.reference,mms_movescalar);
			
 
				-                        else
			
 
				-                          internalerror(2009112601);
			
 
				-                      end;
			
 
				-                    end;
			
 
				+                    shuffle := mms_movescalar;
			
 
				+
			
 
				+                  case left.location.loc of
			
 
				+                    LOC_CMMREGISTER,
			
 
				+                    LOC_MMREGISTER:
			
 
				+                      hlcg.a_loadmm_reg_reg(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.register, shuffle);
			
 
				+                    LOC_REFERENCE,
			
 
				+                    LOC_CREFERENCE:
			
 
				+                      hlcg.a_loadmm_reg_ref(current_asmdata.CurrAsmList,right.resultdef,left.resultdef,right.location.register,left.location.reference, shuffle);
			
 
				+                    else
			
 
				+                      internalerror(2009112601);
			
 
				+                  end;
			
 
				                 end;
			
 
				               LOC_REGISTER,
			
 
				               LOC_CREGISTER :
			
--- a/compiler/ncgrtti.pas
+++ b/compiler/ncgrtti.pas
@@ -335,7 +335,8 @@ implementation
 
				          { pocall_sysv_abi_default } 14,
			
 
				          { pocall_sysv_abi_cdecl }   15,
			
 
				          { pocall_ms_abi_default }   16,
			
 
				-         { pocall_ms_abi_cdecl }     17
			
 
				+         { pocall_ms_abi_cdecl }     17,
			
 
				+         { pocall_vectorcall }       18
			
 
				         );
			
 
				       begin
			
 
				         tcb.emit_ord_const(ProcCallOptionToCallConv[def.proccalloption],u8inttype);
			
--- a/compiler/pdecsub.pas
+++ b/compiler/pdecsub.pas
@@ -2382,7 +2382,7 @@ type
 
				    end;
			
 
				 const
			
 
				   {Should contain the number of procedure directives we support.}
			
 
				-  num_proc_directives=50;
			
 
				+  num_proc_directives=51;
			
 
				   proc_direcdata:array[1..num_proc_directives] of proc_dir_rec=
			
 
				    (
			
 
				     (
			
@@ -2849,6 +2849,15 @@ const
 
				       mutexclpocall : [];
			
 
				       mutexclpotype : [potype_constructor,potype_destructor,potype_class_constructor,potype_class_destructor];
			
 
				       mutexclpo     : [po_interrupt]
			
 
				+    ),(
			
 
				+      idtok:_VECTORCALL;
			
 
				+      pd_flags : [pd_interface,pd_implemen,pd_body,pd_procvar];
			
 
				+      handler  : nil;
			
 
				+      pocall   : pocall_vectorcall;
			
 
				+      pooption : [];
			
 
				+      mutexclpocall : [];
			
 
				+      mutexclpotype : [potype_constructor,potype_destructor,potype_class_constructor,potype_class_destructor];
			
 
				+      mutexclpo     : [po_interrupt]
			
 
				     )
			
 
				    );
			
 
				 
			
--- a/compiler/symtype.pas
+++ b/compiler/symtype.pas
@@ -82,6 +82,7 @@ interface
 
				          function  alignment:shortint;virtual;abstract;
			
 
				          { alignment when this type appears in a record/class/... }
			
 
				          function  structalignment:shortint;virtual;
			
 
				+         function  aggregatealignment:shortint;virtual;
			
 
				          function  getvardef:longint;virtual;abstract;
			
 
				          function  getparentdef:tdef;virtual;
			
 
				          function  getsymtable(t:tgetsymtable):TSymtable;virtual;
			
@@ -379,6 +380,14 @@ implementation
 
				         result:=alignment;
			
 
				       end;
			
 
				 
			
 
				+    function tdef.aggregatealignment: shortint;
			
 
				+      begin { Largest structalignment among this def and the defs reached by following Owner.defowner upwards }
			
 
				+        if Assigned(Owner) and Assigned(Owner.defowner) and (Owner.defowner is TDef) and (Owner.defowner <> Self) then
			
 
				+          Result := max(structalignment, TDef(Owner.defowner).aggregatealignment) { recurse into the def that owns our symtable }
			
 
				+        else { no distinct owning TDef: only this def's own structalignment counts }
			
 
				+          Result := structalignment;
			
 
				+      end;
			
 
				+
			
 
				 
			
 
				     procedure tdef.ChangeOwner(st:TSymtable);
			
 
				       begin
			
--- a/compiler/tokens.pas
+++ b/compiler/tokens.pas
@@ -289,6 +289,7 @@ type
 
				     _OPENSTRING,
			
 
				     _RIGHTSHIFT,
			
 
				     _SPECIALIZE,
			
 
				+    _VECTORCALL,
			
 
				     _CONSTRUCTOR,
			
 
				     _GREATERTHAN,
			
 
				     _INTERNCONST,
			
@@ -628,6 +629,7 @@ const
 
				       (str:'OPENSTRING'    ;special:false;keyword:[m_none];op:NOTOKEN),
			
 
				       (str:'RIGHTSHIFT'    ;special:false;keyword:[m_none];op:NOTOKEN), { delphi operator name }
			
 
				       (str:'SPECIALIZE'    ;special:false;keyword:[m_none];op:NOTOKEN),
			
 
				+      (str:'VECTORCALL'    ;special:false;keyword:[m_none];op:NOTOKEN),
			
 
				       (str:'CONSTRUCTOR'   ;special:false;keyword:alllanguagemodes-[m_iso,m_extpas];op:NOTOKEN),
			
 
				       (str:'GREATERTHAN'   ;special:false;keyword:[m_none];op:NOTOKEN), { delphi operator name }
			
 
				       (str:'INTERNCONST'   ;special:false;keyword:[m_none];op:NOTOKEN),
			
--- a/compiler/x86/aasmcpu.pas
+++ b/compiler/x86/aasmcpu.pas
@@ -52,6 +52,7 @@ interface
 
				       OT_BITS64    = $00000008;  { x86_64 and FPU }
			
 
				       OT_BITS128   = $10000000;  { 16 byte SSE }
			
 
				       OT_BITS256   = $20000000;  { 32 byte AVX }
			
 
				+      OT_BITS512   = $40000000;  { 64 byte AVX512 }
			
 
				       OT_BITS80    = $00000010;  { FPU only  }
			
 
				       OT_FAR       = $00000020;  { this means 16:16 or 16:32, like in CALL/JMP }
			
 
				       OT_NEAR      = $00000040;
			
@@ -612,7 +613,8 @@ implementation
 
				           OT_NEAR,OT_FAR,OT_SHORT,
			
 
				           OT_NONE,
			
 
				           OT_BITS128,
			
 
				-          OT_BITS256
			
 
				+          OT_BITS256,
			
 
				+          OT_BITS512
			
 
				          ),
			
 
				          (OT_NONE,
			
 
				           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,OT_BITS8,OT_BITS16,OT_BITS32,
			
@@ -622,7 +624,8 @@ implementation
 
				           OT_NEAR,OT_FAR,OT_SHORT,
			
 
				           OT_NONE,
			
 
				           OT_BITS128,
			
 
				-          OT_BITS256
			
 
				+          OT_BITS256,
			
 
				+          OT_BITS512
			
 
				          ),
			
 
				          (OT_NONE,
			
 
				           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,OT_NONE,OT_NONE,OT_NONE,
			
@@ -632,7 +635,8 @@ implementation
 
				           OT_NEAR,OT_FAR,OT_SHORT,
			
 
				           OT_NONE,
			
 
				           OT_BITS128,
			
 
				-          OT_BITS256
			
 
				+          OT_BITS256,
			
 
				+          OT_BITS512
			
 
				          )
			
 
				        );
			
 
				 
			
@@ -650,7 +654,8 @@ implementation
 
				           OT_NEAR,OT_FAR,OT_SHORT,
			
 
				           OT_NONE,
			
 
				           OT_BITS128,
			
 
				-          OT_BITS256
			
 
				+          OT_BITS256,
			
 
				+          OT_BITS512
			
 
				          ),
			
 
				          (OT_NONE,
			
 
				           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,
			
@@ -660,7 +665,8 @@ implementation
 
				           OT_NEAR,OT_FAR,OT_SHORT,
			
 
				           OT_NONE,
			
 
				           OT_BITS128,
			
 
				-          OT_BITS256
			
 
				+          OT_BITS256,
			
 
				+          OT_BITS512
			
 
				          ),
			
 
				          (OT_NONE,
			
 
				           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,
			
@@ -670,7 +676,8 @@ implementation
 
				           OT_NEAR,OT_FAR,OT_SHORT,
			
 
				           OT_NONE,
			
 
				           OT_BITS128,
			
 
				-          OT_BITS256
			
 
				+          OT_BITS256,
			
 
				+          OT_BITS512
			
 
				          )
			
 
				       );
			
 
				 
			
@@ -688,7 +695,8 @@ implementation
 
				           OT_NEAR,OT_FAR,OT_SHORT,
			
 
				           OT_NONE,
			
 
				           OT_BITS128,
			
 
				-          OT_BITS256
			
 
				+          OT_BITS256,
			
 
				+          OT_BITS512
			
 
				          ),
			
 
				          (OT_NONE,
			
 
				           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_BITS8,OT_BITS8,OT_BITS16,
			
@@ -698,7 +706,8 @@ implementation
 
				           OT_NEAR,OT_FAR,OT_SHORT,
			
 
				           OT_NONE,
			
 
				           OT_BITS128,
			
 
				-          OT_BITS256
			
 
				+          OT_BITS256,
			
 
				+          OT_BITS512
			
 
				          ),
			
 
				          (OT_NONE,
			
 
				           OT_BITS8,OT_BITS16,OT_BITS32,OT_BITS64,OT_NONE,OT_NONE,OT_NONE,
			
@@ -708,7 +717,8 @@ implementation
 
				           OT_NEAR,OT_FAR,OT_SHORT,
			
 
				           OT_NONE,
			
 
				           OT_BITS128,
			
 
				-          OT_BITS256
			
 
				+          OT_BITS256,
			
 
				+          OT_BITS512
			
 
				          )
			
 
				       );
			
 
				 
			
--- a/compiler/x86/cgx86.pas
+++ b/compiler/x86/cgx86.pas
@@ -158,20 +158,26 @@ unit cgx86;
 
				       TCGSize2OpSize: Array[tcgsize] of topsize =
			
 
				         (S_NO,S_B,S_W,S_L,S_Q,S_XMM,S_B,S_W,S_L,S_Q,S_XMM,
			
 
				          S_FS,S_FL,S_FX,S_IQ,S_FXX,
			
 
				-         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
			
 
				-         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
			
 
				+         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
			
 
				+         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
			
 
				+         S_NO,S_XMM,S_YMM,S_ZMM,
			
 
				+         S_NO,S_XMM,S_YMM,S_ZMM);
			
 
				 {$elseif defined(i386)}
			
 
				       TCGSize2OpSize: Array[tcgsize] of topsize =
			
 
				         (S_NO,S_B,S_W,S_L,S_L,S_T,S_B,S_W,S_L,S_L,S_L,
			
 
				          S_FS,S_FL,S_FX,S_IQ,S_FXX,
			
 
				-         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
			
 
				-         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
			
 
				+         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
			
 
				+         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
			
 
				+         S_NO,S_XMM,S_YMM,S_ZMM,
			
 
				+         S_NO,S_XMM,S_YMM,S_ZMM);
			
 
				 {$elseif defined(i8086)}
			
 
				       TCGSize2OpSize: Array[tcgsize] of topsize =
			
 
				         (S_NO,S_B,S_W,S_W,S_W,S_T,S_B,S_W,S_W,S_W,S_W,
			
 
				          S_FS,S_FL,S_FX,S_IQ,S_FXX,
			
 
				-         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,
			
 
				-         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM);
			
 
				+         S_NO,S_NO,S_NO,S_MD,S_XMM,S_YMM,S_ZMM,
			
 
				+         S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM,
			
 
				+         S_NO,S_XMM,S_YMM,S_ZMM,
			
 
				+         S_NO,S_XMM,S_YMM,S_ZMM);
			
 
				 {$endif}
			
 
				 
			
 
				 {$ifndef NOTARGETWIN}
			
@@ -185,6 +191,9 @@ unit cgx86;
 
				     { returns true, if the compiler should use leave instead of mov/pop }
			
 
				     function UseLeave: boolean;
			
 
				 
			
 
				+    { Gets the byte alignment of a reference }
			
 
				+    function GetRefAlignment(ref: treference): Byte;
			
 
				+
			
 
				   implementation
			
 
				 
			
 
				     uses
			
@@ -225,6 +234,22 @@ unit cgx86;
 
				 {$endif}
			
 
				       end;
			
 
				 
			
 
				+    function GetRefAlignment(ref: treference): Byte; {$IFDEF USEINLINE}inline;{$ENDIF}
			
 
				+      begin { Returns ref.alignment, except on x86_64 for an unindexed RSP/RBP-based reference at a multiple-of-16 offset, which yields 16 }
			
 
				+{$ifdef x86_64}
			
 
				+        { The stack pointer and base pointer will be aligned to 16-byte boundaries if the machine code is well-behaved }
			
 
				+        if (ref.base = NR_RSP) or (ref.base = NR_RBP) then
			
 
				+          begin
			
 
				+            if (ref.index = NR_NO) and ((ref.offset mod 16) = 0) then
			
 
				+              Result := 16 { NOTE(review): returned even when ref.alignment is larger than 16 - confirm this is intended }
			
 
				+            else
			
 
				+              Result := ref.alignment;
			
 
				+          end
			
 
				+        else
			
 
				+{$endif x86_64}
			
 
				+          Result := ref.alignment; { the only statement left in the body when x86_64 is not defined }
			
 
				+      end;
			
 
				+
			
 
				     const
			
 
				       TOpCG2AsmOp: Array[topcg] of TAsmOp = (A_NONE,A_MOV,A_ADD,A_AND,A_DIV,
			
 
				                             A_IDIV,A_IMUL,A_MUL,A_NEG,A_NOT,A_OR,
			
@@ -268,8 +293,19 @@ unit cgx86;
 
				             result:=rg[R_MMREGISTER].getregister(list,R_SUBMMS);
			
 
				           OS_M64:
			
 
				             result:=rg[R_MMREGISTER].getregister(list,R_SUBQ);
			
 
				-          OS_M128:
			
 
				-            result:=rg[R_MMREGISTER].getregister(list,R_SUBMMWHOLE);
			
 
				+          OS_M128,
			
 
				+          OS_F128,
			
 
				+          OS_MF128,
			
 
				+          OS_MD128:
			
 
				+            result:=rg[R_MMREGISTER].getregister(list,R_SUBMMX); { R_SUBMMWHOLE seems a bit dangerous and ambiguous, so changed to R_SUBMMX. [Kit] }
			
 
				+          OS_M256,
			
 
				+          OS_MF256,
			
 
				+          OS_MD256:
			
 
				+            result:=rg[R_MMREGISTER].getregister(list,R_SUBMMY);
			
 
				+          OS_M512,
			
 
				+          OS_MF512,
			
 
				+          OS_MD512:
			
 
				+            result:=rg[R_MMREGISTER].getregister(list,R_SUBMMZ);
			
 
				           else
			
 
				             internalerror(200506041);
			
 
				         end;
			
@@ -1260,13 +1296,13 @@ unit cgx86;
 
				           (A_CVTSD2SS,A_MOVSD,A_NONE,A_NONE,A_NONE),
			
 
				           (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
			
 
				           (A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
			
 
				-          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
			
 
				+          (A_NONE,A_NONE,A_NONE,A_NONE,A_MOVAPS));
			
 
				         convertopavx : array[OS_F32..OS_F128,OS_F32..OS_F128] of tasmop = (
			
 
				           (A_VMOVSS,A_VCVTSS2SD,A_NONE,A_NONE,A_NONE),
			
 
				           (A_VCVTSD2SS,A_VMOVSD,A_NONE,A_NONE,A_NONE),
			
 
				           (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE),
			
 
				           (A_NONE,A_NONE,A_NONE,A_MOVQ,A_NONE),
			
 
				-          (A_NONE,A_NONE,A_NONE,A_NONE,A_NONE));
			
 
				+          (A_NONE,A_NONE,A_NONE,A_NONE,A_VMOVAPS));
			
 
				       begin
			
 
				         { we can have OS_F32/OS_F64 (record in function result/LOC_MMREGISTER) to
			
 
				           OS_32/OS_64 (record in memory/LOC_REFERENCE) }
			
@@ -1288,13 +1324,33 @@ unit cgx86;
 
				           end
			
 
				         { we can have OS_M64 (record in function result/LOC_MMREGISTER) to
			
 
				           OS_64 (record in memory/LOC_REFERENCE) }
			
 
				-        else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) and
			
 
				-                (fromsize=OS_M64) then
			
 
				+        else if (tcgsize2size[fromsize]=tcgsize2size[tosize]) then
			
 
				           begin
			
 
				-            if UseAVX then
			
 
				-              result:=A_VMOVQ
			
 
				-            else
			
 
				-              result:=A_MOVQ;
			
 
				+            case fromsize of
			
 
				+              OS_M64:
			
 
				+                { we can have OS_M64 (record in function result/LOC_MMREGISTER) to
			
 
				+                  OS_64 (record in memory/LOC_REFERENCE) }
			
 
				+                if UseAVX then
			
 
				+                  result:=A_VMOVQ
			
 
				+                else
			
 
				+                  result:=A_MOVQ;
			
 
				+              OS_M128:
			
 
				+                { 128-bit aligned vector }
			
 
				+                if UseAVX then
			
 
				+                  result:=A_VMOVAPS
			
 
				+                else
			
 
				+                  result:=A_MOVAPS;
			
 
				+              OS_M256,
			
 
				+              OS_M512:
			
 
				+                { 256-bit or 512-bit aligned vector }
			
 
				+                if UseAVX then
			
 
				+                  result:=A_VMOVAPS
			
 
				+                else
			
 
				+                  { SSE does not support 256-bit or 512-bit vectors }
			
 
				+                  InternalError(2018012930);
			
 
				+              else
			
 
				+                InternalError(2018012920);
			
 
				+            end;
			
 
				           end
			
 
				         else
			
 
				           internalerror(2010060104);
			
@@ -1313,12 +1369,14 @@ unit cgx86;
 
				             if fromsize=tosize then
			
 
				               { needs correct size in case of spilling }
			
 
				               case fromsize of
			
 
				-                OS_F32:
			
 
				+                OS_F32,
			
 
				+                OS_MF128:
			
 
				                   if UseAVX then
			
 
				                     instr:=taicpu.op_reg_reg(A_VMOVAPS,S_NO,reg1,reg2)
			
 
				                   else
			
 
				                     instr:=taicpu.op_reg_reg(A_MOVAPS,S_NO,reg1,reg2);
			
 
				-                OS_F64:
			
 
				+                OS_F64,
			
 
				+                OS_MD128:
			
 
				                   if UseAVX then
			
 
				                     instr:=taicpu.op_reg_reg(A_VMOVAPD,S_NO,reg1,reg2)
			
 
				                   else
			
@@ -1328,6 +1386,32 @@ unit cgx86;
 
				                     instr:=taicpu.op_reg_reg(A_VMOVQ,S_NO,reg1,reg2)
			
 
				                   else
			
 
				                     instr:=taicpu.op_reg_reg(A_MOVQ,S_NO,reg1,reg2);
			
 
				+                OS_M128, OS_MS128:
			
 
				+                  if UseAVX then
			
 
				+                    instr:=taicpu.op_reg_reg(A_VMOVDQA,S_NO,reg1,reg2)
			
 
				+                  else
			
 
				+                    instr:=taicpu.op_reg_reg(A_MOVDQA,S_NO,reg1,reg2);
			
 
				+                OS_MF256,
			
 
				+                OS_MF512:
			
 
				+                  if UseAVX then
			
 
				+                    instr:=taicpu.op_reg_reg(A_VMOVAPS,S_NO,reg1,reg2)
			
 
				+                  else
			
 
				+                    { SSE doesn't support 256-bit or 512-bit vectors }
			
 
				+                    InternalError(2018012931);
			
 
				+                OS_MD256,
			
 
				+                OS_MD512:
			
 
				+                  if UseAVX then
			
 
				+                    instr:=taicpu.op_reg_reg(A_VMOVAPD,S_NO,reg1,reg2)
			
 
				+                  else
			
 
				+                    { SSE doesn't support 256-bit or 512-bit vectors }
			
 
				+                    InternalError(2018012932);
			
 
				+                OS_M256, OS_MS256,
			
 
				+                OS_M512, OS_MS512:
			
 
				+                  if UseAVX then
			
 
				+                    instr:=taicpu.op_reg_reg(A_VMOVDQA,S_NO,reg1,reg2)
			
 
				+                  else
			
 
				+                    { SSE doesn't support 256-bit or 512-bit vectors }
			
 
				+                    InternalError(2018012933);
			
 
				                 else
			
 
				                   internalerror(2006091201);
			
 
				               end
			
@@ -1385,15 +1469,152 @@ unit cgx86;
 
				          make_simple_ref(list,tmpref);
			
 
				          if shuffle=nil then
			
 
				            begin
			
 
				-             if fromsize=OS_M64 then
			
 
				-               list.concat(taicpu.op_ref_reg(A_MOVQ,S_NO,tmpref,reg))
			
 
				-             else
			
 
				-{$ifdef x86_64}
			
 
				-               { x86-64 has always properly aligned data }
			
 
				-               list.concat(taicpu.op_ref_reg(A_MOVDQA,S_NO,tmpref,reg));
			
 
				-{$else x86_64}
			
 
				-               list.concat(taicpu.op_ref_reg(A_MOVDQU,S_NO,tmpref,reg));
			
 
				-{$endif x86_64}
			
 
				+             case fromsize of
			
 
				+               OS_F32:
			
 
				+                 if UseAVX then
			
 
				+                   op := A_VMOVSS
			
 
				+                 else
			
 
				+                   op := A_MOVSS;
			
 
				+               OS_F64:
			
 
				+                 if UseAVX then
			
 
				+                   op := A_VMOVSD
			
 
				+                 else
			
 
				+                   op := A_MOVSD;
			
 
				+               OS_M32, OS_32, OS_S32:
			
 
				+                 if UseAVX then
			
 
				+                   op := A_VMOVD
			
 
				+                 else
			
 
				+                   op := A_MOVD;
			
 
				+               OS_M64, OS_64, OS_S64:
			
 
				+                 if UseAVX then
			
 
				+                   op := A_VMOVQ
			
 
				+                 else
			
 
				+                   op := A_MOVQ;
			
 
				+               OS_MF128:
			
 
				+                 { Use XMM transfer of packed singles }
			
 
				+                 if UseAVX then
			
 
				+                   begin
			
 
				+                     if GetRefAlignment(tmpref) = 16 then
			
 
				+                       op := A_VMOVAPS
			
 
				+                     else
			
 
				+                       op := A_VMOVUPS
			
 
				+                   end
			
 
				+                 else
			
 
				+                   begin
			
 
				+                     if GetRefAlignment(tmpref) = 16 then
			
 
				+                       op := A_MOVAPS
			
 
				+                     else
			
 
				+                       op := A_MOVUPS
			
 
				+                   end;
			
 
				+               OS_MD128:
			
 
				+                 { Use XMM transfer of packed doubles }
			
 
				+                 if UseAVX then
			
 
				+                   begin
			
 
				+                     if GetRefAlignment(tmpref) = 16 then
			
 
				+                       op := A_VMOVAPD
			
 
				+                     else
			
 
				+                       op := A_VMOVUPD
			
 
				+                   end
			
 
				+                 else
			
 
				+                   begin
			
 
				+                     if GetRefAlignment(tmpref) = 16 then
			
 
				+                       op := A_MOVAPD
			
 
				+                     else
			
 
				+                       op := A_MOVUPD
			
 
				+                   end;
			
 
				+               OS_M128, OS_MS128:
			
 
				+                 { Use XMM integer transfer }
			
 
				+                 if UseAVX then
			
 
				+                   begin
			
 
				+                     if GetRefAlignment(tmpref) = 16 then
			
 
				+                       op := A_VMOVDQA
			
 
				+                     else
			
 
				+                       op := A_VMOVDQU
			
 
				+                   end
			
 
				+                 else
			
 
				+                   begin
			
 
				+                     if GetRefAlignment(tmpref) = 16 then
			
 
				+                       op := A_MOVDQA
			
 
				+                     else
			
 
				+                       op := A_MOVDQU
			
 
				+                   end;
			
 
				+               OS_MF256:
			
 
				+                 { Use YMM transfer of packed singles }
			
 
				+                 if UseAVX then
			
 
				+                   begin
			
 
				+                     if GetRefAlignment(tmpref) = 32 then
			
 
				+                       op := A_VMOVAPS
			
 
				+                     else
			
 
				+                       op := A_VMOVUPS
			
 
				+                   end
			
 
				+                 else
			
 
				+                   { SSE doesn't support 256-bit vectors }
			
 
				+                   InternalError(2018012934);
			
 
				+               OS_MD256:
			
 
				+                 { Use YMM transfer of packed doubles }
			
 
				+                 if UseAVX then
			
 
				+                   begin
			
 
				+                     if GetRefAlignment(tmpref) = 32 then
			
 
				+                       op := A_VMOVAPD
			
 
				+                     else
			
 
				+                       op := A_VMOVUPD
			
 
				+                   end
			
 
				+                 else
			
 
				+                   { SSE doesn't support 256-bit vectors }
			
 
				+                   InternalError(2018012935);
			
 
				+               OS_M256, OS_MS256:
			
 
				+                 { Use YMM integer transfer }
			
 
				+                 if UseAVX then
			
 
				+                   begin
			
 
				+                     if GetRefAlignment(tmpref) = 32 then
			
 
				+                       op := A_VMOVDQA
			
 
				+                     else
			
 
				+                       op := A_VMOVDQU
			
 
				+                   end
			
 
				+                 else
			
 
				+                   { SSE doesn't support 256-bit vectors }
			
 
				+                   InternalError(2018012936);
			
 
				+               OS_MF512:
			
 
				+                 { Use ZMM transfer of packed singles }
			
 
				+                 if UseAVX then
			
 
				+                   begin
			
 
				+                     if GetRefAlignment(tmpref) = 64 then
			
 
				+                       op := A_VMOVAPS
			
 
				+                     else
			
 
				+                       op := A_VMOVUPS
			
 
				+                   end
			
 
				+                 else
			
 
				+                   { SSE doesn't support 512-bit vectors }
			
 
				+                   InternalError(2018012937);
			
 
				+               OS_MD512:
			
 
				+                 { Use ZMM transfer of packed doubles }
			
 
				+                 if UseAVX then
			
 
				+                   begin
			
 
				+                     if GetRefAlignment(tmpref) = 64 then
			
 
				+                       op := A_VMOVAPD
			
 
				+                     else
			
 
				+                       op := A_VMOVUPD
			
 
				+                   end
			
 
				+                 else
			
 
				+                   { SSE doesn't support 512-bit vectors }
			
 
				+                   InternalError(2018012938);
			
 
				+               OS_M512, OS_MS512:
			
 
				+                 { Use ZMM integer transfer }
			
 
				+                 if UseAVX then
			
 
				+                   begin
			
 
				+                     if GetRefAlignment(tmpref) = 64 then
			
 
				+                       op := A_VMOVDQA
			
 
				+                     else
			
 
				+                       op := A_VMOVDQU
			
 
				+                   end
			
 
				+                 else
			
 
				+                   { SSE doesn't support 512-bit vectors }
			
 
				+                   InternalError(2018012939);
			
 
				+               else
			
 
				+                 { No valid transfer command available }
			
 
				+                 internalerror(2017121410);
			
 
				+             end;
			
 
				+             list.concat(taicpu.op_ref_reg(op,S_NO,tmpref,reg));
			
 
				            end
			
 
				          else if shufflescalar(shuffle) then
			
 
				            begin
			
@@ -1415,20 +1636,149 @@ unit cgx86;
 
				          hreg : tregister;
			
 
				          tmpref  : treference;
			
 
				          op : tasmop;
			
 
				+
			
 
				        begin
			
 
				          tmpref:=ref;
			
 
				          make_simple_ref(list,tmpref);
			
 
				          if shuffle=nil then
			
 
				            begin
			
 
				-             if fromsize=OS_M64 then
			
 
				-               list.concat(taicpu.op_reg_ref(A_MOVQ,S_NO,reg,tmpref))
			
 
				-             else
			
 
				-{$ifdef x86_64}
			
 
				-               { x86-64 has always properly aligned data }
			
 
				-               list.concat(taicpu.op_reg_ref(A_MOVDQA,S_NO,reg,tmpref))
			
 
				-{$else x86_64}
			
 
				-               list.concat(taicpu.op_reg_ref(A_MOVDQU,S_NO,reg,tmpref))
			
 
				-{$endif x86_64}
			
 
				+             case fromsize of
			
 
				+               OS_F32:
			
 
				+                 if UseAVX then
			
 
				+                   op := A_VMOVSS
			
 
				+                 else
			
 
				+                   op := A_MOVSS;
			
 
				+               OS_F64:
			
 
				+                 if UseAVX then
			
 
				+                   op := A_VMOVSD
			
 
				+                 else
			
 
				+                   op := A_MOVSD;
			
 
				+               OS_M32, OS_32, OS_S32:
			
 
				+                 if UseAVX then
			
 
				+                   op := A_VMOVD
			
 
				+                 else
			
 
				+                   op := A_MOVD;
			
 
				+               OS_M64, OS_64, OS_S64:
			
 
				+                 if UseAVX then
			
 
				+                   op := A_VMOVQ
			
 
				+                 else
			
 
				+                   op := A_MOVQ;
			
 
				+               OS_MF128:
			
 
				+                 { Use XMM transfer of packed singles }
			
 
				+                 if UseAVX then
			
 
				+                 begin
			
 
				+                   if GetRefAlignment(tmpref) = 16 then
			
 
				+                     op := A_VMOVAPS
			
 
				+                   else
			
 
				+                     op := A_VMOVUPS
			
 
				+                 end else
			
 
				+                 begin
			
 
				+                   if GetRefAlignment(tmpref) = 16 then
			
 
				+                     op := A_MOVAPS
			
 
				+                   else
			
 
				+                     op := A_MOVUPS
			
 
				+                 end;
			
 
				+               OS_MD128:
			
 
				+                 { Use XMM transfer of packed doubles }
			
 
				+                 if UseAVX then
			
 
				+                 begin
			
 
				+                   if GetRefAlignment(tmpref) = 16 then
			
 
				+                     op := A_VMOVAPD
			
 
				+                   else
			
 
				+                     op := A_VMOVUPD
			
 
				+                 end else
			
 
				+                 begin
			
 
				+                   if GetRefAlignment(tmpref) = 16 then
			
 
				+                     op := A_MOVAPD
			
 
				+                   else
			
 
				+                     op := A_MOVUPD
			
 
				+                 end;
			
 
				+               OS_M128, OS_MS128:
			
 
				+                 { Use XMM integer transfer }
			
 
				+                 if UseAVX then
			
 
				+                 begin
			
 
				+                   if GetRefAlignment(tmpref) = 16 then
			
 
				+                     op := A_VMOVDQA
			
 
				+                   else
			
 
				+                     op := A_VMOVDQU
			
 
				+                 end else
			
 
				+                 begin
			
 
				+                   if GetRefAlignment(tmpref) = 16 then
			
 
				+                     op := A_MOVDQA
			
 
				+                   else
			
 
				+                     op := A_MOVDQU
			
 
				+                 end;
			
 
				+               OS_MF256:
			
 
				+                 { Use YMM transfer of packed singles }
			
 
				+                 if UseAVX then
			
 
				+                 begin
			
 
				+                   if GetRefAlignment(tmpref) = 32 then
			
 
				+                     op := A_VMOVAPS
			
 
				+                   else
			
 
				+                     op := A_VMOVUPS
			
 
				+                 end else
			
 
				+                   { SSE doesn't support 256-bit vectors }
			
 
				+                   InternalError(2018012940);
			
 
				+               OS_MD256:
			
 
				+                 { Use YMM transfer of packed doubles }
			
 
				+                 if UseAVX then
			
 
				+                 begin
			
 
				+                   if GetRefAlignment(tmpref) = 32 then
			
 
				+                     op := A_VMOVAPD
			
 
				+                   else
			
 
				+                     op := A_VMOVUPD
			
 
				+                 end else
			
 
				+                   { SSE doesn't support 256-bit vectors }
			
 
				+                   InternalError(2018012941);
			
 
				+               OS_M256, OS_MS256:
			
 
				+                 { Use YMM integer transfer }
			
 
				+                 if UseAVX then
			
 
				+                 begin
			
 
				+                   if GetRefAlignment(tmpref) = 32 then
			
 
				+                     op := A_VMOVDQA
			
 
				+                   else
			
 
				+                     op := A_VMOVDQU
			
 
				+                 end else
			
 
				+                   { SSE doesn't support 256-bit vectors }
			
 
				+                   InternalError(2018012942);
			
 
				+               OS_MF512:
			
 
				+                 { Use ZMM transfer of packed singles }
			
 
				+                 if UseAVX then
			
 
				+                 begin
			
 
				+                   if GetRefAlignment(tmpref) = 64 then
			
 
				+                     op := A_VMOVAPS
			
 
				+                   else
			
 
				+                     op := A_VMOVUPS
			
 
				+                 end else
			
 
				+                   { SSE doesn't support 512-bit vectors }
			
 
				+                   InternalError(2018012943);
			
 
				+               OS_MD512:
			
 
				+                 { Use ZMM transfer of packed doubles }
			
 
				+                 if UseAVX then
			
 
				+                 begin
			
 
				+                   if GetRefAlignment(tmpref) = 64 then
			
 
				+                     op := A_VMOVAPD
			
 
				+                   else
			
 
				+                     op := A_VMOVUPD
			
 
				+                 end else
			
 
				+                   { SSE doesn't support 512-bit vectors }
			
 
				+                   InternalError(2018012944);
			
 
				+               OS_M512, OS_MS512:
			
 
				+                 { Use ZMM integer transfer }
			
 
				+                 if UseAVX then
			
 
				+                 begin
			
 
				+                   if GetRefAlignment(tmpref) = 64 then
			
 
				+                     op := A_VMOVDQA
			
 
				+                   else
			
 
				+                     op := A_VMOVDQU
			
 
				+                 end else
			
 
				+                   { SSE doesn't support 512-bit vectors }
			
 
				+                   InternalError(2018012945);
			
 
				+               else
			
 
				+                 { No valid transfer command available }
			
 
				+                 internalerror(2017121411);
			
 
				+             end;
			
 
				+             list.concat(taicpu.op_reg_ref(op,S_NO,reg,tmpref));
			
 
				            end
			
 
				          else if shufflescalar(shuffle) then
			
 
				            begin
			
--- a/compiler/x86/cpubase.pas
+++ b/compiler/x86/cpubase.pas
@@ -419,10 +419,12 @@ implementation
 
				               else
			
 
				                 internalerror(2009071902);
			
 
				             end;
			
 
				-          OS_M128,OS_MS128:
			
 
				+          OS_M128,OS_MS128,OS_MF128,OS_MD128:
			
 
				             cgsize2subreg:=R_SUBMMX;
			
 
				-          OS_M256,OS_MS256:
			
 
				+          OS_M256,OS_MS256,OS_MF256,OS_MD256:
			
 
				             cgsize2subreg:=R_SUBMMY;
			
 
				+          OS_M512,OS_MS512,OS_MF512,OS_MD512:
			
 
				+            cgsize2subreg:=R_SUBMMZ;
			
 
				           OS_NO:
			
 
				             { error message should have been thrown already before, so avoid only
			
 
				               an internal error }
			
@@ -435,7 +437,7 @@ implementation
 
				 
			
 
				     function reg_cgsize(const reg: tregister): tcgsize;
			
 
				       const subreg2cgsize:array[Tsubregister] of Tcgsize =
			
 
				-            (OS_NO,OS_8,OS_8,OS_16,OS_32,OS_64,OS_NO,OS_NO,OS_NO,OS_F32,OS_F64,OS_NO,OS_M128,OS_M256,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO);
			
 
				+            (OS_NO,OS_8,OS_8,OS_16,OS_32,OS_64,OS_NO,OS_NO,OS_NO,OS_F32,OS_F64,OS_NO,OS_M128,OS_M256,OS_M512,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO,OS_NO);
			
 
				       begin
			
 
				         case getregtype(reg) of
			
 
				           R_INTREGISTER :
			
@@ -466,7 +468,7 @@ implementation
 
				     function reg2opsize(r:Tregister):topsize;
			
 
				       const
			
 
				         subreg2opsize : array[tsubregister] of topsize =
			
 
				-          (S_NO,S_B,S_B,S_W,S_L,S_Q,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO);
			
 
				+          (S_NO,S_B,S_B,S_W,S_L,S_Q,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO);
			
 
				       begin
			
 
				         reg2opsize:=S_L;
			
 
				         case getregtype(r) of
			
--- a/compiler/x86/itcpugas.pas
+++ b/compiler/x86/itcpugas.pas
@@ -52,27 +52,28 @@ interface
 
				        'd',
			
 
				        '','','',
			
 
				        't',
			
 
				-        'x',
			
 
				-        'y'
			
 
				+       'x',
			
 
				+       'y',
			
 
				+       'z'
			
 
				      );
			
 
				      { suffix-to-opsize conversion tables, used in asmreadrer }
			
 
				      { !! S_LQ excluded: movzlq does not exist, movslq is processed
			
 
				        as a separate instruction w/o suffix (aka movsxd), and there are
			
 
				        no more instructions needing it. }
			
 
				-     att_sizesuffixstr : array[0..13] of string[2] = (
			
 
				-       '','BW','BL','WL','BQ','WQ',{'LQ',}'B','W','L','S','Q','T','X','Y'
			
 
				+     att_sizesuffixstr : array[0..14] of string[2] = (
			
 
				+       '','BW','BL','WL','BQ','WQ',{'LQ',}'B','W','L','S','Q','T','X','Y','Z'
			
 
				      );
			
 
				-     att_sizesuffix : array[0..13] of topsize = (
			
 
				-       S_NO,S_BW,S_BL,S_WL,S_BQ,S_WQ,{S_LQ,}S_B,S_W,S_L,S_NO,S_Q,S_NO,S_NO,S_NO
			
 
				+     att_sizesuffix : array[0..14] of topsize = (
			
 
				+       S_NO,S_BW,S_BL,S_WL,S_BQ,S_WQ,{S_LQ,}S_B,S_W,S_L,S_NO,S_Q,S_NO,S_NO,S_NO,S_NO
			
 
				      );
			
 
				-     att_sizefpusuffix : array[0..13] of topsize = (
			
 
				-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO
			
 
				+     att_sizefpusuffix : array[0..14] of topsize = (
			
 
				+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO,S_NO
			
 
				      );
			
 
				-     att_sizefpuintsuffix : array[0..13] of topsize = (
			
 
				-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO
			
 
				+     att_sizefpuintsuffix : array[0..14] of topsize = (
			
 
				+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO,S_NO
			
 
				      );
			
 
				-     att_sizemmsuffix : array[0..13] of topsize = (
			
 
				-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM
			
 
				+     att_sizemmsuffix : array[0..14] of topsize = (
			
 
				+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,{S_NO,}S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM
			
 
				      );
			
 
				 {$else x86_64}
			
 
				      gas_opsize2str : array[topsize] of string[2] = ('',
			
@@ -82,24 +83,25 @@ interface
 
				        'd',
			
 
				        '','','',
			
 
				        't',
			
 
				-        'x',
			
 
				-        'y'
			
 
				+       'x',
			
 
				+       'y',
			
 
				+       'z'
			
 
				      );
			
 
				      { suffix-to-opsize conversion tables, used in asmreadrer }
			
 
				-     att_sizesuffixstr : array[0..11] of string[2] = (
			
 
				-       '','BW','BL','WL','B','W','L','S','Q','T','X','Y'
			
 
				+     att_sizesuffixstr : array[0..12] of string[2] = (
			
 
				+       '','BW','BL','WL','B','W','L','S','Q','T','X','Y','Z'
			
 
				      );
			
 
				-     att_sizesuffix : array[0..11] of topsize = (
			
 
				-       S_NO,S_BW,S_BL,S_WL,S_B,S_W,S_L,S_NO,S_NO,S_NO,S_NO,S_NO
			
 
				+     att_sizesuffix : array[0..12] of topsize = (
			
 
				+       S_NO,S_BW,S_BL,S_WL,S_B,S_W,S_L,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO
			
 
				      );
			
 
				-     att_sizefpusuffix : array[0..11] of topsize = (
			
 
				-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO
			
 
				+     att_sizefpusuffix : array[0..12] of topsize = (
			
 
				+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_FL,S_FS,S_NO,S_FX,S_NO,S_NO,S_NO
			
 
				      );
			
 
				-     att_sizefpuintsuffix : array[0..11] of topsize = (
			
 
				-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO
			
 
				+     att_sizefpuintsuffix : array[0..12] of topsize = (
			
 
				+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_IL,S_IS,S_IQ,S_NO,S_NO,S_NO,S_NO
			
 
				      );
			
 
				-     att_sizemmsuffix : array[0..11] of topsize = (
			
 
				-       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM
			
 
				+     att_sizemmsuffix : array[0..12] of topsize = (
			
 
				+       S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_NO,S_XMM,S_YMM,S_ZMM
			
 
				      );
			
 
				 
			
 
				 {$endif x86_64}
			
--- a/compiler/x86/rax86.pas
+++ b/compiler/x86/rax86.pas
@@ -343,7 +343,8 @@ const
 
				      0,0,0,
			
 
				      80,
			
 
				      128,
			
 
				-     256
			
 
				+     256,
			
 
				+     512
			
 
				     );
			
 
				 {$else}
			
 
				 topsize2memsize: array[topsize] of integer =
			
@@ -354,7 +355,8 @@ topsize2memsize: array[topsize] of integer =
 
				    0,0,0,
			
 
				    80,
			
 
				    128,
			
 
				-   256
			
 
				+   256,
			
 
				+   512
			
 
				   );
			
 
				 {$endif}
			
 
				 
			
--- a/compiler/x86_64/aoptcpu.pas
+++ b/compiler/x86_64/aoptcpu.pas
@@ -74,10 +74,14 @@ uses
 
				               A_MOVZX:
			
 
				                 Result:=OptPass1Movx(p);
			
 
				               A_VMOVAPS,
			
 
				-              A_VMOVAPD:
			
 
				+              A_VMOVAPD,
			
 
				+              A_VMOVUPS,
			
 
				+              A_VMOVUPD:
			
 
				                 result:=OptPass1VMOVAP(p);
			
 
				               A_MOVAPD,
			
 
				-              A_MOVAPS:
			
 
				+              A_MOVAPS,
			
 
				+              A_MOVUPD,
			
 
				+              A_MOVUPS:
			
 
				                 result:=OptPass1MOVAP(p);
			
 
				               A_VDIVSD,
			
 
				               A_VDIVSS,
			
--- a/compiler/x86_64/cpubase.inc
+++ b/compiler/x86_64/cpubase.inc
@@ -35,7 +35,8 @@ type
 
				     S_NEAR,S_FAR,S_SHORT,
			
 
				     S_T,
			
 
				     S_XMM,
			
 
				-    S_YMM
			
 
				+    S_YMM,
			
 
				+    S_ZMM
			
 
				   );
			
 
				 
			
 
				   TOpSizes = set of topsize;
			
--- a/compiler/x86_64/cpuinfo.pas
+++ b/compiler/x86_64/cpuinfo.pas
@@ -108,7 +108,8 @@ Const
 
				      pocall_sysv_abi_default,
			
 
				      pocall_sysv_abi_cdecl,
			
 
				      pocall_ms_abi_default,
			
 
				-     pocall_ms_abi_cdecl
			
 
				+     pocall_ms_abi_cdecl,
			
 
				+     pocall_vectorcall
			
 
				    ];
			
 
				 
			
 
				    cputypestr : array[tcputype] of string[10] = ('',
			
--- a/compiler/x86_64/cpupara.pas
+++ b/compiler/x86_64/cpupara.pas
--- a/compiler/x86_64/cpupi.pas
+++ b/compiler/x86_64/cpupi.pas
@@ -173,7 +173,7 @@ implementation
 
				         result:=
			
 
				            ((target_info.system=system_x86_64_win64) and
			
 
				             not(proccall in [pocall_sysv_abi_default,pocall_sysv_abi_cdecl])) or
			
 
				-           (proccall in [pocall_ms_abi_default,pocall_ms_abi_cdecl]);
			
 
				+            (proccall in [pocall_ms_abi_default,pocall_ms_abi_cdecl,pocall_vectorcall]);
			
 
				       end;
			
 
				 
			
 
				 
			
--- a/tests/test/cg/tvectorcall1.pp
+++ b/tests/test/cg/tvectorcall1.pp
@@ -0,0 +1,869 @@
 
				+{ %CPU=x86_64 }
			
 
				+program vectorcall_hva_test1;
			
 
				+
			
 
				+{$IFNDEF CPUX86_64}
			
 
				+  {$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
			
 
				+{$ENDIF}
			
 
				+
			
 
				+{$ASMMODE Intel}
			
 
				+{$PUSH}
			
 
				+{$CODEALIGN RECORDMIN=16}
			
 
				+{$PACKRECORDS C}
			
 
				+type
			
 
				+  TM128 = record
			
 
				+    case Byte of
			
 
				+      0: (M128_F32: array[0..3] of Single);
			
 
				+      1: (M128_F64: array[0..1] of Double);
			
 
				+  end;
			
 
				+{$POP}
			
 
				+
			
 
				+{ HFA test: field style. }
			
 
				+
			
 
				+{ NOTE: if the record falls on a 16-byte boundary, the 4-component entries will
			
 
				+  be turned into vectors rather than HFAs. }
			
 
				+
			
 
				+  THFA1_SF = packed record
			
 
				+    F1: Single;
			
 
				+  end;
			
 
				+
			
 
				+{$IFDEF WIN64}
			
 
				+  THFA2_SF = packed record
			
 
				+    F1, F2: Single;
			
 
				+  end;
			
 
				+
			
 
				+  THFA3_SF = packed record
			
 
				+    F1, F2, F3: Single;
			
 
				+  end;
			
 
				+
			
 
				+  THFA4_SF = packed record
			
 
				+    F1, F2, F3, F4: Single;
			
 
				+  end;
			
 
				+{$ENDIF}
			
 
				+
			
 
				+  THFA1_DF = packed record
			
 
				+    F1: Double;
			
 
				+  end;
			
 
				+
			
 
				+{$IFDEF WIN64}
			
 
				+  THFA2_DF = packed record
			
 
				+    F1, F2: Double;
			
 
				+  end;
			
 
				+
			
 
				+  THFA3_DF = packed record
			
 
				+    F1, F2, F3: Double;
			
 
				+  end;
			
 
				+
			
 
				+  THFA4_DF = packed record
			
 
				+    F1, F2, F3, F4: Double;
			
 
				+  end;
			
 
				+{$ENDIF}
			
 
				+
			
 
				+{ HFA test - array style }
			
 
				+
			
 
				+{ NOTE: if the record falls on a 16-byte boundary, the 4-component entries will
			
 
				+  be turned into vectors rather than HFAs. }
			
 
				+
			
 
				+  THFA1_SA = packed record
			
 
				+    F: array[0..0] of Single;
			
 
				+  end;
			
 
				+
			
 
				+{$IFDEF WIN64}
			
 
				+  THFA2_SA = packed record
			
 
				+    F: array[0..1] of Single;
			
 
				+  end;
			
 
				+
			
 
				+  THFA3_SA = packed record
			
 
				+    F: array[0..2] of Single;
			
 
				+  end;
			
 
				+
			
 
				+  THFA4_SA = packed record
			
 
				+    F: array[0..3] of Single;
			
 
				+  end;
			
 
				+{$ENDIF}
			
 
				+
			
 
				+  THFA1_DA = packed record
			
 
				+    F: array[0..0] of Double;
			
 
				+  end;
			
 
				+
			
 
				+{$IFDEF WIN64}
			
 
				+  THFA2_DA = packed record
			
 
				+    F: array[0..1] of Double;
			
 
				+  end;
			
 
				+
			
 
				+  THFA3_DA = packed record
			
 
				+    F: array[0..2] of Double;
			
 
				+  end;
			
 
				+
			
 
				+  THFA4_DA = packed record
			
 
				+    F: array[0..3] of Double;
			
 
				+  end;
			
 
				+{$ENDIF}
			
 
				+
			
 
				+{ Single-type vector }
			
 
				+
			
 
				+function HorizontalAddSingle(V: TM128): Single; vectorcall;
			
 
				+begin
			
 
				+  HorizontalAddSingle := V.M128_F32[0] + V.M128_F32[1] + V.M128_F32[2] + V.M128_F32[3];
			
 
				+end;
			
 
				+
			
 
				+function HorizontalAddSingle_ASM(V: TM128): Single; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  HADDPS XMM0, XMM0
			
 
				+  HADDPS XMM0, XMM0
			
 
				+end;
			
 
				+
			
 
				+{ Double-type vector }
			
 
				+
			
 
				+function HorizontalAddDouble(V: TM128): Double; vectorcall;
			
 
				+begin
			
 
				+  HorizontalAddDouble := V.M128_F64[0] + V.M128_F64[1];
			
 
				+end;
			
 
				+
			
 
				+function HorizontalAddDouble_ASM(V: TM128): Double; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  HADDPD XMM0, XMM0
			
 
				+end;
			
 
				+
			
 
				+{ 1-element aggregate }
			
 
				+
			
 
				+function AddSingles1F(HFA: THFA1_SF): Single; vectorcall;
			
 
				+begin
			
 
				+  AddSingles1F := HFA.F1;
			
 
				+end;
			
 
				+
			
 
				+function AddSingles1F_ASM(HFA: THFA1_SF): Single; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  { Do absolutely nothing! }
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles1F(HFA: THFA1_DF): Double; vectorcall;
			
 
				+begin
			
 
				+  AddDoubles1F := HFA.F1;
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles1F_ASM(HFA: THFA1_DF): Double; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  { Do absolutely nothing! }
			
 
				+end;
			
 
				+
			
 
				+function AddSingles1A(HFA: THFA1_SA): Single; vectorcall;
			
 
				+begin
			
 
				+  AddSingles1A := HFA.F[0];
			
 
				+end;
			
 
				+
			
 
				+function AddSingles1A_ASM(HFA: THFA1_SA): Single; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  { Do absolutely nothing! }
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles1A(HFA: THFA1_DA): Double; vectorcall;
			
 
				+begin
			
 
				+  AddDoubles1A := HFA.F[0];
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles1A_ASM(HFA: THFA1_DA): Double; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  { Do absolutely nothing! }
			
 
				+end;
			
 
				+
			
 
				+{$IFDEF WIN64}
			
 
				+{ 2-element aggregate }
			
 
				+
			
 
				+function AddSingles2F(HFA: THFA2_SF): Single; vectorcall;
			
 
				+begin
			
 
				+  AddSingles2F := HFA.F1 + HFA.F2;
			
 
				+end;
			
 
				+
			
 
				+function AddSingles2F_ASM(HFA: THFA2_SF): Single; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  ADDSS XMM0, XMM1
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles2F(HFA: THFA2_DF): Double; vectorcall;
			
 
				+begin
			
 
				+  AddDoubles2F := HFA.F1 + HFA.F2;
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles2F_ASM(HFA: THFA2_DF): Double; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  ADDSD XMM0, XMM1
			
 
				+end;
			
 
				+
			
 
				+function AddSingles2A(HFA: THFA2_SA): Single; vectorcall;
			
 
				+begin
			
 
				+  AddSingles2A := HFA.F[0] + HFA.F[1];
			
 
				+end;
			
 
				+
			
 
				+function AddSingles2A_ASM(HFA: THFA2_SA): Single; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  ADDSS XMM0, XMM1
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles2A(HFA: THFA2_DA): Double; vectorcall;
			
 
				+begin
			
 
				+  AddDoubles2A := HFA.F[0] + HFA.F[1];
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles2A_ASM(HFA: THFA2_DA): Double; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  ADDSD XMM0, XMM1
			
 
				+end;
			
 
				+
			
 
				+{ 3-element aggregate }
			
 
				+
			
 
				+function AddSingles3F(HFA: THFA3_SF): Single; vectorcall;
			
 
				+begin
			
 
				+  AddSingles3F := HFA.F1 + HFA.F2 + HFA.F3;
			
 
				+end;
			
 
				+
			
 
				+function AddSingles3F_ASM(HFA: THFA3_SF): Single; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  ADDSS XMM0, XMM1
			
 
				+  ADDSS XMM0, XMM2
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles3F(HFA: THFA3_DF): Double; vectorcall;
			
 
				+begin
			
 
				+  AddDoubles3F := HFA.F1 + HFA.F2 + HFA.F3;
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles3F_ASM(HFA: THFA3_DF): Double; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  ADDSD XMM0, XMM1
			
 
				+  ADDSD XMM0, XMM2
			
 
				+end;
			
 
				+
			
 
				+function AddSingles3A(HFA: THFA3_SA): Single; vectorcall;
			
 
				+begin
			
 
				+  AddSingles3A := HFA.F[0] + HFA.F[1] + HFA.F[2];
			
 
				+end;
			
 
				+
			
 
				+function AddSingles3A_ASM(HFA: THFA3_SA): Single; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  ADDSS XMM0, XMM1
			
 
				+  ADDSS XMM0, XMM2
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles3A(HFA: THFA3_DA): Double; vectorcall;
			
 
				+begin
			
 
				+  AddDoubles3A := HFA.F[0] + HFA.F[1] + HFA.F[2];
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles3A_ASM(HFA: THFA3_DA): Double; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  ADDSD XMM0, XMM1
			
 
				+  ADDSD XMM0, XMM2
			
 
				+end;
			
 
				+
			
 
				+{ 4-element aggregate }
			
 
				+
			
 
				+function AddSingles4F(HFA: THFA4_SF): Single; vectorcall;
			
 
				+begin
			
 
				+  AddSingles4F := HFA.F1 + HFA.F2 + HFA.F3 + HFA.F4;
			
 
				+end;
			
 
				+
			
 
				+function AddSingles4F_ASM(HFA: THFA4_SF): Single; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  ADDSS XMM0, XMM1
			
 
				+  ADDSS XMM0, XMM2
			
 
				+  ADDSS XMM0, XMM3
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles4F(HFA: THFA4_DF): Double; vectorcall;
			
 
				+begin
			
 
				+  AddDoubles4F := HFA.F1 + HFA.F2 + HFA.F3 + HFA.F4;
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles4F_ASM(HFA: THFA4_DF): Double; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  ADDSD XMM0, XMM1
			
 
				+  ADDSD XMM0, XMM2
			
 
				+  ADDSD XMM0, XMM3
			
 
				+end;
			
 
				+
			
 
				+function AddSingles4A(HFA: THFA4_SA): Single; vectorcall;
			
 
				+begin
			
 
				+  AddSingles4A := HFA.F[0] + HFA.F[1] + HFA.F[2] + HFA.F[3];
			
 
				+end;
			
 
				+
			
 
				+function AddSingles4A_ASM(HFA: THFA4_SA): Single; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  ADDSS XMM0, XMM1
			
 
				+  ADDSS XMM0, XMM2
			
 
				+  ADDSS XMM0, XMM3
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles4A(HFA: THFA4_DA): Double; vectorcall;
			
 
				+begin
			
 
				+  AddDoubles4A := HFA.F[0] + HFA.F[1] + HFA.F[2] + HFA.F[3];
			
 
				+end;
			
 
				+
			
 
				+function AddDoubles4A_ASM(HFA: THFA4_DA): Double; vectorcall; assembler; nostackframe;
			
 
				+asm
			
 
				+  ADDSD XMM0, XMM1
			
 
				+  ADDSD XMM0, XMM2
			
 
				+  ADDSD XMM0, XMM3
			
 
				+end;
			
 
				+{$ENDIF}
			
 
				+
			
 
				+var
			
 
				+  HVA: TM128;
			
 
				+  HFA1_SF: THFA1_SF;
			
 
				+  HFA1_DF: THFA1_DF;
			
 
				+  HFA1_SA: THFA1_SA;
			
 
				+  HFA1_DA: THFA1_DA;
			
 
				+{$IFDEF WIN64}
			
 
				+  HFA2_SF: THFA2_SF;
			
 
				+  HFA2_DF: THFA2_DF;
			
 
				+  HFA2_SA: THFA2_SA;
			
 
				+  HFA2_DA: THFA2_DA;
			
 
				+  HFA3_SF: THFA3_SF;
			
 
				+  HFA3_DF: THFA3_DF;
			
 
				+  HFA3_SA: THFA3_SA;
			
 
				+  HFA3_DA: THFA3_DA;
			
 
				+  HFA4_SF: THFA4_SF;
			
 
				+  HFA4_DF: THFA4_DF;
			
 
				+  HFA4_SA: THFA4_SA;
			
 
				+  HFA4_DA: THFA4_DA;
			
 
				+{$ENDIF}
			
 
				+  TestPointer: PtrUInt;
			
 
				+  I, J: Integer;
			
 
				+  ResS, ResSA: Single;
			
 
				+  ResD, ResDA: Double;
			
 
				+  Addresses: array[0..3] of Pointer;
			
 
				+  FieldAddresses: array[0..3, 0..3] of Pointer;
			
 
				+const
			
 
				+  AddressNames1: array[0..3] of ShortString = ('HFA1_SF', 'HFA1_SA', 'HFA1_DF', 'HFA1_DA'); { same index order as Addresses[]: SF, SA, DF, DA }
			
 
				+{$IFDEF WIN64}
			
 
				+  AddressNames2: array[0..3] of ShortString = ('HFA2_SF', 'HFA2_SA', 'HFA2_DF', 'HFA2_DA');
			
 
				+  AddressNames3: array[0..3] of ShortString = ('HFA3_SF', 'HFA3_SA', 'HFA3_DF', 'HFA3_DA');
			
 
				+  AddressNames4: array[0..3] of ShortString = ('HFA4_SF', 'HFA4_SA', 'HFA4_DF', 'HFA4_DA');
			
 
				+{$ENDIF}
			
 
				+  FieldAddressNames: array[0..3] of ShortString = ('F1', 'F2', 'F3', 'F4');
			
 
				+
			
 
				+  ExpS1: Single = 5.0;
			
 
				+{$IFDEF WIN64}
			
 
				+  ExpS2: Single = -5.0;
			
 
				+  ExpS3: Single = 10.0;
			
 
				+{$ENDIF}
			
 
				+  ExpS4: Single = -10.0;
			
 
				+  ExpD1: Double = 5.0;
			
 
				+  ExpD2: Double = -5.0;
			
 
				+{$IFDEF WIN64}
			
 
				+  ExpD3: Double = 10.0;
			
 
				+  ExpD4: Double = -10.0;
			
 
				+{$ENDIF}
			
 
				+begin
			
 
				+
			
 
				+  if (PtrUInt(@HVA) and $F) <> 0 then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: HVA is not correctly aligned.');
			
 
				+    Halt(1);
			
 
				+  end;
			
 
				+
			
 
				+  { array of singles }
			
 
				+  WriteLn('- horizontal add (4 singles)');
			
 
				+  HVA.M128_F32[0] := 5.0;
			
 
				+  HVA.M128_F32[1] := -10.0;
			
 
				+  HVA.M128_F32[2] := 15.0;
			
 
				+  HVA.M128_F32[3] := -20.0;
			
 
				+  ResS := HorizontalAddSingle(HVA);
			
 
				+  ResSA := HorizontalAddSingle_ASM(HVA);
			
 
				+  if (ResS <> ResSA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: HorizontalAddSingle(HVA) has the vector in the wrong register.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResS <> ExpS4 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: HorizontalAddSingle(HVA) returned ', ResS, ' instead of ', ExpS4);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  { array of doubles }
			
 
				+  WriteLn('- horizontal add (2 doubles)');
			
 
				+  HVA.M128_F64[0] := 5.0;
			
 
				+  HVA.M128_F64[1] := -10.0;
			
 
				+  ResD := HorizontalAddDouble(HVA);
			
 
				+  ResDA := HorizontalAddDouble_ASM(HVA);
			
 
				+  if (ResD <> ResDA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: HorizontalAddDouble(HVA) has the vector in the wrong register.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResD <> ExpD2 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: HorizontalAddDouble(HVA) returned ', ResD, ' instead of ', ExpD2);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  { 1-field aggregates }
			
 
				+  WriteLn('- 1-field aggregates');
			
 
				+
			
 
				+  Addresses[0] := @HFA1_SF;
			
 
				+  Addresses[1] := @HFA1_SA;
			
 
				+  Addresses[2] := @HFA1_DF;
			
 
				+  Addresses[3] := @HFA1_DA;
			
 
				+  FieldAddresses[0][0] := @(HFA1_SF.F1);
			
 
				+  FieldAddresses[1][0] := @(HFA1_SA.F[0]);
			
 
				+  FieldAddresses[2][0] := @(HFA1_DF.F1);
			
 
				+  FieldAddresses[3][0] := @(HFA1_DA.F[0]);
			
 
				+
			
 
				+  { Check alignment: every 1-field record (Single and Double) must start at its only field }
			
 
				+  for I := 0 to 3 do
			
 
				+  begin
			
 
				+    TestPointer := PtrUInt(Addresses[I]);
			
 
				+    if Pointer(TestPointer) <> FieldAddresses[I][0] then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: ', AddressNames1[I], ' is not correctly packed; field F1 is not in the expected place.');
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA1_SF.F1 := 5.0;
			
 
				+  ResS := AddSingles1F(HFA1_SF);
			
 
				+  ResSA := AddSingles1F_ASM(HFA1_SF);
			
 
				+  if (ResS <> ResSA) then { Pascal and assembler versions must agree }
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddSingles1F(HFA1_SF) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResS <> ExpS1 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddSingles1F(HFA1_SF) returned ', ResS, ' instead of ', ExpS1);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA1_DF.F1 := 5.0;
			
 
				+  ResD := AddDoubles1F(HFA1_DF);
			
 
				+  ResDA := AddDoubles1F_ASM(HFA1_DF);
			
 
				+  if (ResD <> ResDA) then { Pascal and assembler versions must agree }
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddDoubles1F(HFA1_DF) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResD <> ExpD1 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddDoubles1F(HFA1_DF) returned ', ResD, ' instead of ', ExpD1);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA1_SA.F[0] := 5.0;
			
 
				+  ResS := AddSingles1A(HFA1_SA);
			
 
				+  ResSA := AddSingles1A_ASM(HFA1_SA);
			
 
				+  if (ResS <> ResSA) then { Pascal and assembler versions must agree }
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddSingles1A(HFA1_SA) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResS <> ExpS1 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddSingles1A(HFA1_SA) returned ', ResS, ' instead of ', ExpS1);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA1_DA.F[0] := 5.0;
			
 
				+  ResD := AddDoubles1A(HFA1_DA);
			
 
				+  ResDA := AddDoubles1A_ASM(HFA1_DA);
			
 
				+  if (ResD <> ResDA) then { Pascal and assembler versions must agree }
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddDoubles1A(HFA1_DA) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResD <> ExpD1 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddDoubles1A(HFA1_DA) returned ', ResD, ' instead of ', ExpD1);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+{$IFDEF WIN64}
			
 
				+  { 2-field aggregates }
			
 
				+  WriteLn('- 2-field aggregates');
			
 
				+
			
 
				+  Addresses[0] := @HFA2_SF;
			
 
				+  Addresses[1] := @HFA2_SA;
			
 
				+  FieldAddresses[0][0] := @(HFA2_SF.F1);
			
 
				+  FieldAddresses[0][1] := @(HFA2_SF.F2);
			
 
				+  FieldAddresses[1][0] := @(HFA2_SA.F[0]);
			
 
				+  FieldAddresses[1][1] := @(HFA2_SA.F[1]);
			
 
				+
			
 
				+  { Check alignment of Singles }
			
 
				+  for I := 0 to 1 do
			
 
				+  begin
			
 
				+    TestPointer := PtrUInt(Addresses[I]);
			
 
				+    for J := 0 to 1 do
			
 
				+    begin
			
 
				+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
			
 
				+      begin
			
 
				+        WriteLn('FAIL: ', AddressNames2[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
			
 
				+        Halt(1);
			
 
				+      end;
			
 
				+
			
 
				+      Inc(TestPointer, $4);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  Addresses[2] := @HFA2_DF;
			
 
				+  Addresses[3] := @HFA2_DA;
			
 
				+  FieldAddresses[2][0] := @(HFA2_DF.F1);
			
 
				+  FieldAddresses[2][1] := @(HFA2_DF.F2);
			
 
				+  FieldAddresses[3][0] := @(HFA2_DA.F[0]);
			
 
				+  FieldAddresses[3][1] := @(HFA2_DA.F[1]);
			
 
				+
			
 
				+  { Check alignment of Doubles }
			
 
				+  for I := 2 to 3 do
			
 
				+  begin
			
 
				+    TestPointer := PtrUInt(Addresses[I]);
			
 
				+    for J := 0 to 1 do
			
 
				+    begin
			
 
				+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
			
 
				+      begin
			
 
				+        WriteLn('FAIL: ', AddressNames2[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
			
 
				+        Halt(1);
			
 
				+      end;
			
 
				+
			
 
				+      Inc(TestPointer, $8);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA2_SF.F1 := 5.0;
			
 
				+  HFA2_SF.F2 := -10.0;
			
 
				+  ResS := AddSingles2F(HFA2_SF);
			
 
				+  ResSA := AddSingles2F_ASM(HFA2_SF);
			
 
				+  if (ResS <> ResSA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddSingles2F(HFA2_SF) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResS <> ExpS2 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddSingles2F(HFA2_SF) returned ', ResS, ' instead of ', ExpS2);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA2_DF.F1 := 5.0;
			
 
				+  HFA2_DF.F2 := -10.0;
			
 
				+  ResD := AddDoubles2F(HFA2_DF);
			
 
				+  ResDA := AddDoubles2F_ASM(HFA2_DF);
			
 
				+  if (ResD <> ResDA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddDoubles2F(HFA2_DF) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResD <> ExpD2 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddDoubles2F(HFA2_DF) returned ', ResD, ' instead of ', ExpD2);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA2_SA.F[0] := 5.0;
			
 
				+  HFA2_SA.F[1] := -10.0;
			
 
				+  ResS := AddSingles2A(HFA2_SA);
			
 
				+  ResSA := AddSingles2A_ASM(HFA2_SA);
			
 
				+  if (ResS <> ResSA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddSingles2A(HFA2_SA) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResS <> ExpS2 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddSingles2A(HFA2_SA) returned ', ResS, ' instead of ', ExpS2);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA2_DA.F[0] := 5.0;
			
 
				+  HFA2_DA.F[1] := -10.0;
			
 
				+  ResD := AddDoubles2A(HFA2_DA);
			
 
				+  ResDA := AddDoubles2A_ASM(HFA2_DA);
			
 
				+  if (ResD <> ResDA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddDoubles2A(HFA2_DA) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResD <> ExpD2 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddDoubles2A(HFA2_DA) returned ', ResD, ' instead of ', ExpD2);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  { 3-field aggregates }
			
 
				+  WriteLn('- 3-field aggregates');
			
 
				+
			
 
				+  Addresses[0] := @HFA3_SF;
			
 
				+  Addresses[1] := @HFA3_SA;
			
 
				+  FieldAddresses[0][0] := @(HFA3_SF.F1);
			
 
				+  FieldAddresses[0][1] := @(HFA3_SF.F2);
			
 
				+  FieldAddresses[0][2] := @(HFA3_SF.F3);
			
 
				+  FieldAddresses[1][0] := @(HFA3_SA.F[0]);
			
 
				+  FieldAddresses[1][1] := @(HFA3_SA.F[1]);
			
 
				+  FieldAddresses[1][2] := @(HFA3_SA.F[2]);
			
 
				+
			
 
				+  { Check alignment of Singles }
			
 
				+  for I := 0 to 1 do
			
 
				+  begin
			
 
				+    TestPointer := PtrUInt(Addresses[I]);
			
 
				+    for J := 0 to 2 do
			
 
				+    begin
			
 
				+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
			
 
				+      begin
			
 
				+        WriteLn('FAIL: ', AddressNames3[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
			
 
				+        Halt(1);
			
 
				+      end;
			
 
				+
			
 
				+      Inc(TestPointer, $4);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  Addresses[2] := @HFA3_DF;
			
 
				+  Addresses[3] := @HFA3_DA;
			
 
				+  FieldAddresses[2][0] := @(HFA3_DF.F1);
			
 
				+  FieldAddresses[2][1] := @(HFA3_DF.F2);
			
 
				+  FieldAddresses[2][2] := @(HFA3_DF.F3);
			
 
				+  FieldAddresses[3][0] := @(HFA3_DA.F[0]);
			
 
				+  FieldAddresses[3][1] := @(HFA3_DA.F[1]);
			
 
				+  FieldAddresses[3][2] := @(HFA3_DA.F[2]);
			
 
				+
			
 
				+  { Check alignment of Doubles }
			
 
				+  for I := 2 to 3 do
			
 
				+  begin
			
 
				+    TestPointer := PtrUInt(Addresses[I]);
			
 
				+    for J := 0 to 2 do
			
 
				+    begin
			
 
				+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
			
 
				+      begin
			
 
				+        WriteLn('FAIL: ', AddressNames3[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
			
 
				+        Halt(1);
			
 
				+      end;
			
 
				+
			
 
				+      Inc(TestPointer, $8);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA3_SF.F1 := 5.0;
			
 
				+  HFA3_SF.F2 := -10.0;
			
 
				+  HFA3_SF.F3 := 15.0;
			
 
				+  ResS := AddSingles3F(HFA3_SF);
			
 
				+  ResSA := AddSingles3F_ASM(HFA3_SF);
			
 
				+  if (ResS <> ResSA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddSingles3F(HFA3_SF) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResS <> ExpS3 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddSingles3F(HFA3_SF) returned ', ResS, ' instead of ', ExpS3);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA3_DF.F1 := 5.0;
			
 
				+  HFA3_DF.F2 := -10.0;
			
 
				+  HFA3_DF.F3 := 15.0;
			
 
				+  ResD := AddDoubles3F(HFA3_DF);
			
 
				+  ResDA := AddDoubles3F_ASM(HFA3_DF);
			
 
				+  if (ResD <> ResDA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddDoubles3F(HFA3_DF) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResD <> ExpD3 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddDoubles3F(HFA3_DF) returned ', ResD, ' instead of ', ExpD3);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA3_SA.F[0] := 5.0;
			
 
				+  HFA3_SA.F[1] := -10.0;
			
 
				+  HFA3_SA.F[2] := 15.0;
			
 
				+  ResS := AddSingles3A(HFA3_SA);
			
 
				+  ResSA := AddSingles3A_ASM(HFA3_SA);
			
 
				+  if (ResS <> ResSA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddSingles3A(HFA3_SA) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResS <> ExpS3 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddSingles3A(HFA3_SA) returned ', ResS, ' instead of ', ExpS3);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA3_DA.F[0] := 5.0;
			
 
				+  HFA3_DA.F[1] := -10.0;
			
 
				+  HFA3_DA.F[2] := 15.0;
			
 
				+  ResD := AddDoubles3A(HFA3_DA);
			
 
				+  ResDA := AddDoubles3A_ASM(HFA3_DA);
			
 
				+  if (ResD <> ResDA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddDoubles3A(HFA3_DA) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResD <> ExpD3 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddDoubles3A(HFA3_DA) returned ', ResD, ' instead of ', ExpD3);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  { 4-field aggregates }
			
 
				+  WriteLn('- 4-field aggregates');
			
 
				+
			
 
				+  Addresses[0] := @HFA4_SF;
			
 
				+  Addresses[1] := @HFA4_SA;
			
 
				+  FieldAddresses[0][0] := @(HFA4_SF.F1);
			
 
				+  FieldAddresses[0][1] := @(HFA4_SF.F2);
			
 
				+  FieldAddresses[0][2] := @(HFA4_SF.F3);
			
 
				+  FieldAddresses[0][3] := @(HFA4_SF.F4);
			
 
				+  FieldAddresses[1][0] := @(HFA4_SA.F[0]);
			
 
				+  FieldAddresses[1][1] := @(HFA4_SA.F[1]);
			
 
				+  FieldAddresses[1][2] := @(HFA4_SA.F[2]);
			
 
				+  FieldAddresses[1][3] := @(HFA4_SA.F[3]);
			
 
				+
			
 
				+  { Check alignment of Singles }
			
 
				+  for I := 0 to 1 do
			
 
				+  begin
			
 
				+    TestPointer := PtrUInt(Addresses[I]);
			
 
				+    for J := 0 to 3 do
			
 
				+    begin
			
 
				+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
			
 
				+      begin
			
 
				+        WriteLn('FAIL: ', AddressNames4[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
			
 
				+        Halt(1);
			
 
				+      end;
			
 
				+
			
 
				+      Inc(TestPointer, $4);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  Addresses[2] := @HFA4_DF;
			
 
				+  Addresses[3] := @HFA4_DA;
			
 
				+  FieldAddresses[2][0] := @(HFA4_DF.F1);
			
 
				+  FieldAddresses[2][1] := @(HFA4_DF.F2);
			
 
				+  FieldAddresses[2][2] := @(HFA4_DF.F3);
			
 
				+  FieldAddresses[2][3] := @(HFA4_DF.F4);
			
 
				+  FieldAddresses[3][0] := @(HFA4_DA.F[0]);
			
 
				+  FieldAddresses[3][1] := @(HFA4_DA.F[1]);
			
 
				+  FieldAddresses[3][2] := @(HFA4_DA.F[2]);
			
 
				+  FieldAddresses[3][3] := @(HFA4_DA.F[3]);
			
 
				+
			
 
				+  { Check alignment of Doubles }
			
 
				+  for I := 2 to 3 do
			
 
				+  begin
			
 
				+    TestPointer := PtrUInt(Addresses[I]);
			
 
				+    for J := 0 to 3 do
			
 
				+    begin
			
 
				+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
			
 
				+      begin
			
 
				+        WriteLn('FAIL: ', AddressNames4[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
			
 
				+        Halt(1);
			
 
				+      end;
			
 
				+
			
 
				+      Inc(TestPointer, $8);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA4_SF.F1 := 5.0;
			
 
				+  HFA4_SF.F2 := -10.0;
			
 
				+  HFA4_SF.F3 := 15.0;
			
 
				+  HFA4_SF.F4 := -20.0;
			
 
				+  ResS := AddSingles4F(HFA4_SF);
			
 
				+  ResSA := AddSingles4F_ASM(HFA4_SF);
			
 
				+  if (ResS <> ResSA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddSingles4F(HFA4_SF) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResS <> ExpS4 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddSingles4F(HFA4_SF) returned ', ResS, ' instead of ', ExpS4);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA4_DF.F1 := 5.0;
			
 
				+  HFA4_DF.F2 := -10.0;
			
 
				+  HFA4_DF.F3 := 15.0;
			
 
				+  HFA4_DF.F4 := -20.0;
			
 
				+  ResD := AddDoubles4F(HFA4_DF);
			
 
				+  ResDA := AddDoubles4F_ASM(HFA4_DF);
			
 
				+  if (ResD <> ResDA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddDoubles4F(HFA4_DF) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResD <> ExpD4 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddDoubles4F(HFA4_DF) returned ', ResD, ' instead of ', ExpD4);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA4_SA.F[0] := 5.0;
			
 
				+  HFA4_SA.F[1] := -10.0;
			
 
				+  HFA4_SA.F[2] := 15.0;
			
 
				+  HFA4_SA.F[3] := -20.0;
			
 
				+  ResS := AddSingles4A(HFA4_SA);
			
 
				+  ResSA := AddSingles4A_ASM(HFA4_SA);
			
 
				+  if (ResS <> ResSA) then
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddSingles4A(HFA4_SA) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResS <> ExpS4 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddSingles4A(HFA4_SA) returned ', ResS, ' instead of ', ExpS4);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+
			
 
				+  HFA4_DA.F[0] := 5.0;
			
 
				+  HFA4_DA.F[1] := -10.0;
			
 
				+  HFA4_DA.F[2] := 15.0;
			
 
				+  HFA4_DA.F[3] := -20.0;
			
 
				+  ResD := AddDoubles4A(HFA4_DA);
			
 
				+  ResDA := AddDoubles4A_ASM(HFA4_DA);
			
 
				+  if (ResD <> ResDA) then { Pascal and assembler versions must agree }
			
 
				+  begin
			
 
				+    WriteLn('FAIL: AddDoubles4A(HFA4_DA) is not passing the aggregate correctly.');
			
 
				+    Halt(1);
			
 
				+  end else
			
 
				+  begin
			
 
				+    if ResD <> ExpD4 then
			
 
				+    begin
			
 
				+      WriteLn('FAIL: AddDoubles4A(HFA4_DA) returned ', ResD, ' instead of ', ExpD4);
			
 
				+      Halt(1);
			
 
				+    end;
			
 
				+  end;
			
 
				+{$ENDIF}
			
 
				+  WriteLn('ok');
			
 
				+end.
			
 
				+
			
--- a/tests/test/cg/tvectorcall2.pp
+++ b/tests/test/cg/tvectorcall2.pp
@@ -0,0 +1,162 @@
 
				+{ %CPU=x86_64 }
			
 
				+program vectorcall_hva_test2;
			
 
				+
			
 
				+{$IFNDEF CPUX86_64}
			
 
				+  {$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
			
 
				+{$ENDIF}
			
 
				+
			
 
				+{$push}
				+{ Request a 16-byte minimum record alignment and C-style field packing for
				+  the declaration below; the $pop after the record is meant to restore the
				+  previous settings. }
				+{$CODEALIGN RECORDMIN=16}
				+{$PACKRECORDS C}
				+type
				+  { 128-bit value that can be viewed either as four Singles or as two
				+    Doubles (both variants overlay the same storage). }
				+  TM128 = record
				+    case Byte of
				+      0: (M128_F32: array[0..3] of Single);
				+      1: (M128_F64: array[0..1] of Double);
				+  end;
				+{$pop}
			
 
				+
			
 
				+{ HVA test }
				+  { Aggregate made of four TM128 members.  It is passed by value to HVATest
				+    and is also that function's result type. }
				+  THVA = record
				+    V1, V2, V3, V4: TM128;
				+  end;
			
 
				+
			
 
				+{ Lane-wise addition of two TM128 values: each of the four Single lanes of
				+  the result Z is the sum of the corresponding lanes of X and Y.  Only the
				+  M128_F32 view is used. }
				+operator +(X, Y: TM128)Z: TM128; vectorcall;
				+  var
				+    I: Integer;
				+  begin
				+    { One scalar addition per Single lane. }
				+    for I := 0 to 3 do
				+      Z.M128_F32[I] := X.M128_F32[I] + Y.M128_F32[I];
				+  end;
			
 
				+
			
 
				+{ Lane-wise subtraction of two TM128 values: each of the four Single lanes
				+  of the result Z is the lane of X minus the corresponding lane of Y.  Only
				+  the M128_F32 view is used. }
				+operator -(X, Y: TM128)Z: TM128; vectorcall;
				+  var
				+    I: Integer;
				+  begin
				+    { One scalar subtraction per Single lane. }
				+    for I := 0 to 3 do
				+      Z.M128_F32[I] := X.M128_F32[I] - Y.M128_F32[I];
				+  end;
			
 
				+
			
 
				+{ Combines the four members of InputHVA with A1..A4 using the overloaded
				+  TM128 operators and returns the result as a THVA.
				+    Op = 1    : every member is InputHVA.Vn + An
				+    Op = 2    : every member is InputHVA.Vn - An
				+    otherwise : V1 and V3 are added, V2 and V4 are subtracted
				+
				+  Expected argument placement, as stated by the original author: }
				+{ - InputHVA goes on the stack because there are not enough free XMM registers to contain the entire argument
				+  - A4 does NOT go on the stack and goes into an XMM register.
				+}
				+function HVATest(A1, A2, A3: TM128; InputHVA: THVA; A4: TM128; Op: Integer): THVA; vectorcall;
				+  begin
				+    { FIXME: There is an internal stack misalignment for A4, necessitating the
				+      use of (V)MOVDQU instead of (V)MOVDQA in the compiled code. }
				+    case Op of
				+      1:
				+        { All members added. }
				+        begin
				+          HVATest.V1 := InputHVA.V1 + A1;
				+          HVATest.V2 := InputHVA.V2 + A2;
				+          HVATest.V3 := InputHVA.V3 + A3;
				+          HVATest.V4 := InputHVA.V4 + A4;
				+        end;
				+      2:
				+        { All members subtracted. }
				+        begin
				+          HVATest.V1 := InputHVA.V1 - A1;
				+          HVATest.V2 := InputHVA.V2 - A2;
				+          HVATest.V3 := InputHVA.V3 - A3;
				+          HVATest.V4 := InputHVA.V4 - A4;
				+        end;
				+      else
				+        { Any other Op: alternate add / subtract / add / subtract. }
				+        begin
				+          HVATest.V1 := InputHVA.V1 + A1;
				+          HVATest.V2 := InputHVA.V2 - A2;
				+          HVATest.V3 := InputHVA.V3 + A3;
				+          HVATest.V4 := InputHVA.V4 - A4;
				+        end;
				+    end;
				+  end;
			
 
				+
			
 
				+var
				+  B1, B2, B3, B4: TM128; HVA, AddRes, SubRes, MixRes, AddExp, SubExp, MixExp: THVA;
				+
				+{ Compares the four Single lanes of Got against Want.  On the first
				+  mismatch it prints a FAILURE line naming the lane and halts the program
				+  with exit code 1.  Name only labels the message, e.g. 'AddRes.V1'. }
				+procedure CheckVector(const Name: string; const Got, Want: TM128);
				+  var
				+    Lane: Integer;
				+  begin
				+    for Lane := 0 to 3 do
				+      if Got.M128_F32[Lane] <> Want.M128_F32[Lane] then
				+        begin
				+          WriteLn('FAILURE on ', Name, '.M128_F32[', Lane, ']');
				+          Halt(1);
				+        end;
				+  end;
				+
				+{ Verifies every member (V1..V4) of Got against Want.  The previous check
				+  only looked at V1, so a wrongly passed A2, A3 or A4 could not fail the
				+  test. }
				+procedure CheckHVA(const Name: string; const Got, Want: THVA);
				+  begin
				+    CheckVector(Name + '.V1', Got.V1, Want.V1);
				+    CheckVector(Name + '.V2', Got.V2, Want.V2);
				+    CheckVector(Name + '.V3', Got.V3, Want.V3);
				+    CheckVector(Name + '.V4', Got.V4, Want.V4);
				+  end;
				+
				+begin
				+  { Register-sized operands A1..A4. }
				+  B1.M128_F32[0] := 1.0;        B1.M128_F32[1] := 2.0;        B1.M128_F32[2] := 3.0;        B1.M128_F32[3] := 4.0;
				+  B2.M128_F32[0] := 5.0;        B2.M128_F32[1] := 6.0;        B2.M128_F32[2] := 7.0;        B2.M128_F32[3] := 8.0;
				+  B3.M128_F32[0] := 9.0;        B3.M128_F32[1] := 10.0;       B3.M128_F32[2] := 11.0;       B3.M128_F32[3] := 12.0;
				+  B4.M128_F32[0] := 13.0;       B4.M128_F32[1] := 14.0;       B4.M128_F32[2] := 15.0;       B4.M128_F32[3] := 16.0;
				+
				+  { The aggregate argument. }
				+  HVA.V1.M128_F32[0] := 10.0;   HVA.V1.M128_F32[1] := 20.0;   HVA.V1.M128_F32[2] := 30.0;   HVA.V1.M128_F32[3] := 40.0;
				+  HVA.V2.M128_F32[0] := 50.0;   HVA.V2.M128_F32[1] := 60.0;   HVA.V2.M128_F32[2] := 70.0;   HVA.V2.M128_F32[3] := 80.0;
				+  HVA.V3.M128_F32[0] := 90.0;   HVA.V3.M128_F32[1] := 100.0;  HVA.V3.M128_F32[2] := 110.0;  HVA.V3.M128_F32[3] := 120.0;
				+  HVA.V4.M128_F32[0] := 130.0;  HVA.V4.M128_F32[1] := 140.0;  HVA.V4.M128_F32[2] := 150.0;  HVA.V4.M128_F32[3] := 160.0;
				+
				+  { Expected result for Op = 1 (HVA.Vn + Bn). }
				+  AddExp.V1.M128_F32[0] := 11.0;   AddExp.V1.M128_F32[1] := 22.0;   AddExp.V1.M128_F32[2] := 33.0;   AddExp.V1.M128_F32[3] := 44.0;
				+  AddExp.V2.M128_F32[0] := 55.0;   AddExp.V2.M128_F32[1] := 66.0;   AddExp.V2.M128_F32[2] := 77.0;   AddExp.V2.M128_F32[3] := 88.0;
				+  AddExp.V3.M128_F32[0] := 99.0;   AddExp.V3.M128_F32[1] := 110.0;  AddExp.V3.M128_F32[2] := 121.0;  AddExp.V3.M128_F32[3] := 132.0;
				+  AddExp.V4.M128_F32[0] := 143.0;  AddExp.V4.M128_F32[1] := 154.0;  AddExp.V4.M128_F32[2] := 165.0;  AddExp.V4.M128_F32[3] := 176.0;
				+
				+  { Expected result for Op = 2 (HVA.Vn - Bn). }
				+  SubExp.V1.M128_F32[0] := 9.0;    SubExp.V1.M128_F32[1] := 18.0;   SubExp.V1.M128_F32[2] := 27.0;   SubExp.V1.M128_F32[3] := 36.0;
				+  SubExp.V2.M128_F32[0] := 45.0;   SubExp.V2.M128_F32[1] := 54.0;   SubExp.V2.M128_F32[2] := 63.0;   SubExp.V2.M128_F32[3] := 72.0;
				+  SubExp.V3.M128_F32[0] := 81.0;   SubExp.V3.M128_F32[1] := 90.0;   SubExp.V3.M128_F32[2] := 99.0;   SubExp.V3.M128_F32[3] := 108.0;
				+  SubExp.V4.M128_F32[0] := 117.0;  SubExp.V4.M128_F32[1] := 126.0;  SubExp.V4.M128_F32[2] := 135.0;  SubExp.V4.M128_F32[3] := 144.0;
				+
				+  { Expected result for any other Op (add, subtract, add, subtract). }
				+  MixExp.V1.M128_F32[0] := 11.0;   MixExp.V1.M128_F32[1] := 22.0;   MixExp.V1.M128_F32[2] := 33.0;   MixExp.V1.M128_F32[3] := 44.0;
				+  MixExp.V2.M128_F32[0] := 45.0;   MixExp.V2.M128_F32[1] := 54.0;   MixExp.V2.M128_F32[2] := 63.0;   MixExp.V2.M128_F32[3] := 72.0;
				+  MixExp.V3.M128_F32[0] := 99.0;   MixExp.V3.M128_F32[1] := 110.0;  MixExp.V3.M128_F32[2] := 121.0;  MixExp.V3.M128_F32[3] := 132.0;
				+  MixExp.V4.M128_F32[0] := 117.0;  MixExp.V4.M128_F32[1] := 126.0;  MixExp.V4.M128_F32[2] := 135.0;  MixExp.V4.M128_F32[3] := 144.0;
				+
				+  { Dump the inputs for diagnosis. }
				+  WriteLn('    B1: ', B1.M128_F32[0], ',', B1.M128_F32[1], ',', B1.M128_F32[2], ',', B1.M128_F32[3]);
				+  WriteLn('    B2: ', B2.M128_F32[0], ',', B2.M128_F32[1], ',', B2.M128_F32[2], ',', B2.M128_F32[3]);
				+  WriteLn('    B3: ', B3.M128_F32[0], ',', B3.M128_F32[1], ',', B3.M128_F32[2], ',', B3.M128_F32[3]);
				+  WriteLn('    B4: ', B4.M128_F32[0], ',', B4.M128_F32[1], ',', B4.M128_F32[2], ',', B4.M128_F32[3]);
				+  WriteLn('HVA.V1: ', HVA.V1.M128_F32[0], ',', HVA.V1.M128_F32[1], ',', HVA.V1.M128_F32[2], ',', HVA.V1.M128_F32[3]);
				+  WriteLn('HVA.V2: ', HVA.V2.M128_F32[0], ',', HVA.V2.M128_F32[1], ',', HVA.V2.M128_F32[2], ',', HVA.V2.M128_F32[3]);
				+  WriteLn('HVA.V3: ', HVA.V3.M128_F32[0], ',', HVA.V3.M128_F32[1], ',', HVA.V3.M128_F32[2], ',', HVA.V3.M128_F32[3]);
				+  WriteLn('HVA.V4: ', HVA.V4.M128_F32[0], ',', HVA.V4.M128_F32[1], ',', HVA.V4.M128_F32[2], ',', HVA.V4.M128_F32[3]);
				+
				+  { Exercise all three Op branches of HVATest. }
				+  AddRes := HVATest(B1, B2, B3, HVA, B4, 1);
				+  SubRes := HVATest(B1, B2, B3, HVA, B4, 2);
				+  MixRes := HVATest(B1, B2, B3, HVA, B4, 0);
				+
				+  { Dump actual and expected results side by side. }
				+  WriteLn('----');
				+  WriteLn('AddRes.V1: ', AddRes.V1.M128_F32[0], ',', AddRes.V1.M128_F32[1], ',', AddRes.V1.M128_F32[2], ',', AddRes.V1.M128_F32[3]);
				+  WriteLn('AddRes.V2: ', AddRes.V2.M128_F32[0], ',', AddRes.V2.M128_F32[1], ',', AddRes.V2.M128_F32[2], ',', AddRes.V2.M128_F32[3]);
				+  WriteLn('AddRes.V3: ', AddRes.V3.M128_F32[0], ',', AddRes.V3.M128_F32[1], ',', AddRes.V3.M128_F32[2], ',', AddRes.V3.M128_F32[3]);
				+  WriteLn('AddRes.V4: ', AddRes.V4.M128_F32[0], ',', AddRes.V4.M128_F32[1], ',', AddRes.V4.M128_F32[2], ',', AddRes.V4.M128_F32[3]);
				+  WriteLn();
				+  WriteLn('AddExp.V1: ', AddExp.V1.M128_F32[0], ',', AddExp.V1.M128_F32[1], ',', AddExp.V1.M128_F32[2], ',', AddExp.V1.M128_F32[3]);
				+  WriteLn('AddExp.V2: ', AddExp.V2.M128_F32[0], ',', AddExp.V2.M128_F32[1], ',', AddExp.V2.M128_F32[2], ',', AddExp.V2.M128_F32[3]);
				+  WriteLn('AddExp.V3: ', AddExp.V3.M128_F32[0], ',', AddExp.V3.M128_F32[1], ',', AddExp.V3.M128_F32[2], ',', AddExp.V3.M128_F32[3]);
				+  WriteLn('AddExp.V4: ', AddExp.V4.M128_F32[0], ',', AddExp.V4.M128_F32[1], ',', AddExp.V4.M128_F32[2], ',', AddExp.V4.M128_F32[3]);
				+  WriteLn('----');
				+  WriteLn('SubRes.V1: ', SubRes.V1.M128_F32[0], ',', SubRes.V1.M128_F32[1], ',', SubRes.V1.M128_F32[2], ',', SubRes.V1.M128_F32[3]);
				+  WriteLn('SubRes.V2: ', SubRes.V2.M128_F32[0], ',', SubRes.V2.M128_F32[1], ',', SubRes.V2.M128_F32[2], ',', SubRes.V2.M128_F32[3]);
				+  WriteLn('SubRes.V3: ', SubRes.V3.M128_F32[0], ',', SubRes.V3.M128_F32[1], ',', SubRes.V3.M128_F32[2], ',', SubRes.V3.M128_F32[3]);
				+  WriteLn('SubRes.V4: ', SubRes.V4.M128_F32[0], ',', SubRes.V4.M128_F32[1], ',', SubRes.V4.M128_F32[2], ',', SubRes.V4.M128_F32[3]);
				+  WriteLn();
				+  WriteLn('SubExp.V1: ', SubExp.V1.M128_F32[0], ',', SubExp.V1.M128_F32[1], ',', SubExp.V1.M128_F32[2], ',', SubExp.V1.M128_F32[3]);
				+  WriteLn('SubExp.V2: ', SubExp.V2.M128_F32[0], ',', SubExp.V2.M128_F32[1], ',', SubExp.V2.M128_F32[2], ',', SubExp.V2.M128_F32[3]);
				+  WriteLn('SubExp.V3: ', SubExp.V3.M128_F32[0], ',', SubExp.V3.M128_F32[1], ',', SubExp.V3.M128_F32[2], ',', SubExp.V3.M128_F32[3]);
				+  WriteLn('SubExp.V4: ', SubExp.V4.M128_F32[0], ',', SubExp.V4.M128_F32[1], ',', SubExp.V4.M128_F32[2], ',', SubExp.V4.M128_F32[3]);
				+  WriteLn('----');
				+  WriteLn('MixRes.V1: ', MixRes.V1.M128_F32[0], ',', MixRes.V1.M128_F32[1], ',', MixRes.V1.M128_F32[2], ',', MixRes.V1.M128_F32[3]);
				+  WriteLn('MixRes.V2: ', MixRes.V2.M128_F32[0], ',', MixRes.V2.M128_F32[1], ',', MixRes.V2.M128_F32[2], ',', MixRes.V2.M128_F32[3]);
				+  WriteLn('MixRes.V3: ', MixRes.V3.M128_F32[0], ',', MixRes.V3.M128_F32[1], ',', MixRes.V3.M128_F32[2], ',', MixRes.V3.M128_F32[3]);
				+  WriteLn('MixRes.V4: ', MixRes.V4.M128_F32[0], ',', MixRes.V4.M128_F32[1], ',', MixRes.V4.M128_F32[2], ',', MixRes.V4.M128_F32[3]);
				+  WriteLn();
				+  WriteLn('MixExp.V1: ', MixExp.V1.M128_F32[0], ',', MixExp.V1.M128_F32[1], ',', MixExp.V1.M128_F32[2], ',', MixExp.V1.M128_F32[3]);
				+  WriteLn('MixExp.V2: ', MixExp.V2.M128_F32[0], ',', MixExp.V2.M128_F32[1], ',', MixExp.V2.M128_F32[2], ',', MixExp.V2.M128_F32[3]);
				+  WriteLn('MixExp.V3: ', MixExp.V3.M128_F32[0], ',', MixExp.V3.M128_F32[1], ',', MixExp.V3.M128_F32[2], ',', MixExp.V3.M128_F32[3]);
				+  WriteLn('MixExp.V4: ', MixExp.V4.M128_F32[0], ',', MixExp.V4.M128_F32[1], ',', MixExp.V4.M128_F32[2], ',', MixExp.V4.M128_F32[3]);
				+
				+  { Verify every member of every result, not just V1. }
				+  CheckHVA('AddRes', AddRes, AddExp);
				+  CheckHVA('SubRes', SubRes, SubExp);
				+  CheckHVA('MixRes', MixRes, MixExp);
				+
				+  WriteLn('ok');
				+end.
			
--- a/tests/test/cg/tvectorcall3.pp
+++ b/tests/test/cg/tvectorcall3.pp
@@ -0,0 +1,158 @@
 
				+{ %CPU=x86_64 } 
			
 
				+program vectorcall_stack_test;
			
 
				+
			
 
				+{$IFNDEF CPUX86_64}
			
 
				+  {$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
			
 
				+{$ENDIF}
			
 
				+
			
 
				+{ This program can be compiled on Linux, and all the vectorcall
			
 
				+  routines should work the same, including the assembler routine.
			
 
				+  'vectorcall' should be ignored by the compiler on this platform. }
			
 
				+
			
 
				+{$push}
				+{ Request a 16-byte minimum record alignment and C-style packing for TM128. }
				+{$CODEALIGN RECORDMIN=16}
				+{$PACKRECORDS C}
				+type
				+  { 128-bit value that can be viewed either as four Singles or as two
				+    Doubles (both variants overlay the same storage). }
				+  TM128 = record
				+    case Byte of
				+      0: (M128_F32: array[0..3] of Single);
				+      1: (M128_F64: array[0..1] of Double);
				+  end;
				+
				+{ Request a 32-byte minimum record alignment and C-style packing for TM256;
				+  the $pop after the record is meant to restore the settings saved by the
				+  $push above. }
				+{$CODEALIGN RECORDMIN=32}
				+{$PACKRECORDS C}
				+type
				+  { 256-bit value that can be viewed as eight Singles, four Doubles or two
				+    TM128 halves. }
				+  TM256 = record
				+    case Byte of
				+      0: (M256_F32: array[0..7] of Single);
				+      1: (M256_F64: array[0..3] of Double);
				+      2: (M256_M128: array[0..1] of TM128);
				+  end;
				+{$pop}
			
 
				+
			
 
				+  { Four-component Single vector.  The M128 variant and the named
				+    components X, Y, Z, W overlay the same storage, so X..W correspond to
				+    M128.M128_F32[0..3]. }
				+  TVector4f = packed record
				+    case Byte of
				+      0: (M128: TM128);
				+      1: (X, Y, Z, W: Single);
				+  end;
			
 
				+
			
 
				+  { Pair of four-component Single vectors.  The same storage can be viewed
				+    as one TM256, as an array of two TVector4f, or as eight named Singles
				+    (X1..W1 for the first vector, X2..W2 for the second). }
				+  TVectorPair4f = packed record
				+    case Byte of
				+      0: (M256: TM256);
				+      1: (V: array[0..1] of TVector4f);
				+      2: (X1, Y1, Z1, W1, X2, Y2, Z2, W2: Single);
				+  end;
			
 
				+
			
 
				+{ Returns TP multiplied by 1.5.  Serves as a scalar (non-vector) routine
				+  that carries the vectorcall directive. }
				+function TestFloat(TP: Single): Single; vectorcall; { vectorcall should have no effect on how this function behaves }
				+begin
				+  TestFloat := TP * 1.5;
				+end;
			
 
				+
			
 
				+{ Component-wise sum of two four-component vectors: the result's X, Y, Z
				+  and W are the sums of the matching components of V1 and V2. }
				+function AddVectors(V1, V2: TVector4f): TVector4f; vectorcall;
				+begin
				+  AddVectors.X := V1.X + V2.X;
				+  AddVectors.Y := V1.Y + V2.Y;
				+  AddVectors.Z := V1.Z + V2.Z;
				+  AddVectors.W := V1.W + V2.W;
				+end;
			
 
				+
			
 
				+{$ASMMODE Intel}
				+{ Hand-written counterpart of AddVectors(TVector4f).  The single ADDPS adds
				+  the four packed Singles in XMM1 to those in XMM0 and leaves the sum in
				+  XMM0.
				+  NOTE(review): this relies on V1 arriving in XMM0, V2 in XMM1 and the
				+  result being returned in XMM0; the register assignment is not shown here
				+  - confirm against the calling convention of each target. }
				+function AddVectorsAsm(V1, V2: TVector4f): TVector4f; vectorcall; assembler; nostackframe; inline; { The inline is for a future test }
				+asm
				+  ADDPS XMM0, XMM1
				+end;
			
 
				+
			
 
				+{ Component-wise sum of two vector pairs: for each of the two TVector4f
				+  members, the result's X, Y, Z and W are the sums of the matching
				+  components of V1 and V2.  Overload of AddVectors for TVectorPair4f. }
				+{ Note: V1, V2 and the result will go on the stack until FPC fully supports 256-bit vectors }
				+function AddVectors(V1, V2: TVectorPair4f): TVectorPair4f; vectorcall;
				+var
				+  C: Integer;
				+begin
				+  { C selects the first (0) or second (1) vector of the pair. }
				+  for C := 0 to 1 do
				+  begin
				+    AddVectors.V[C].X := V1.V[C].X + V2.V[C].X;
				+    AddVectors.V[C].Y := V1.V[C].Y + V2.V[C].Y;
				+    AddVectors.V[C].Z := V1.V[C].Z + V2.V[C].Z;
				+    AddVectors.V[C].W := V1.V[C].W + V2.V[C].W;
				+  end;
				+end;
			
 
				+
			
 
				+var
				+  Vecs: array[0..1] of TVector4f; Res, ResAsm, Exp: TVector4f;
				+  Pairs: array[0..1] of TVectorPair4f; ResPair, ExpPair: TVectorPair4f;
				+  I: Integer;
				+begin
				+  { Zero both input vectors.  FillDWord takes (var x; count; value); the
				+    previous call passed count = 0 and value = 8, which filled nothing. }
				+  FillDWord(Vecs, SizeOf(Vecs) div SizeOf(DWord), 0);
				+  Vecs[0].X := TestFloat(2.0);
				+  Vecs[0].Y := 1.0;
				+  Vecs[0].Z := -4.0;
				+  Vecs[0].W := 1.0;
				+
				+  Vecs[1].X := 0.0;
				+  Vecs[1].Y := -2.0;
				+  Vecs[1].Z := TestFloat(4.0);
				+  Vecs[1].W := 0.0;
				+
				+  { Expected Vecs[0] + Vecs[1]; TestFloat(2.0) = 3.0 and TestFloat(4.0) = 6.0. }
				+  Exp.X := 3.0;
				+  Exp.Y := -1.0;
				+  Exp.Z := 2.0;
				+  Exp.W := 1.0;
				+
				+  Pairs[0].V[0].X := 1.0;     Pairs[0].V[1].X := 5.0;
				+  Pairs[0].V[0].Y := 2.0;     Pairs[0].V[1].Y := 6.0;
				+  Pairs[0].V[0].Z := 3.0;     Pairs[0].V[1].Z := 7.0;
				+  Pairs[0].V[0].W := 4.0;     Pairs[0].V[1].W := 8.0;
				+
				+  Pairs[1].V[0].X := 9.0;     Pairs[1].V[1].X := 13.0;
				+  Pairs[1].V[0].Y := 10.0;    Pairs[1].V[1].Y := 14.0;
				+  Pairs[1].V[0].Z := 11.0;    Pairs[1].V[1].Z := 15.0;
				+  Pairs[1].V[0].W := 12.0;    Pairs[1].V[1].W := 16.0;
				+
				+  { Expected Pairs[0] + Pairs[1]. }
				+  ExpPair.V[0].X := 10.0;     ExpPair.V[1].X := 18.0;
				+  ExpPair.V[0].Y := 12.0;     ExpPair.V[1].Y := 20.0;
				+  ExpPair.V[0].Z := 14.0;     ExpPair.V[1].Z := 22.0;
				+  ExpPair.V[0].W := 16.0;     ExpPair.V[1].W := 24.0;
				+
				+  WriteLn('Vecs[0]  = (', Vecs[0].X, ', ', Vecs[0].Y, ', ', Vecs[0].Z, ', ', Vecs[0].W, ')');
				+  WriteLn('Vecs[1]  = (', Vecs[1].X, ', ', Vecs[1].Y, ', ', Vecs[1].Z, ', ', Vecs[1].W, ')');
				+
				+  { Same inputs through the Pascal and the assembler implementation. }
				+  Res := AddVectors(Vecs[0], Vecs[1]);
				+  ResAsm := AddVectorsAsm(Vecs[0], Vecs[1]);
				+
				+  WriteLn('Result   = (', Res.X, ', ', Res.Y, ', ', Res.Z, ', ', Res.W, ')');
				+  WriteLn('ResAsm   = (', ResAsm.X, ', ', ResAsm.Y, ', ', ResAsm.Z, ', ', ResAsm.W, ')');
				+  WriteLn('Expected = (', Exp.X, ', ', Exp.Y, ', ', Exp.Z, ', ', Exp.W, ')');
				+
				+  WriteLn('Pairs[0] = (', Pairs[0].V[0].X, ', ', Pairs[0].V[0].Y, ', ', Pairs[0].V[0].Z, ', ', Pairs[0].V[0].W, ', ', Pairs[0].V[1].X, ', ', Pairs[0].V[1].Y, ', ', Pairs[0].V[1].Z, ', ', Pairs[0].V[1].W, ')');
				+  WriteLn('Pairs[1] = (', Pairs[1].V[0].X, ', ', Pairs[1].V[0].Y, ', ', Pairs[1].V[0].Z, ', ', Pairs[1].V[0].W, ', ', Pairs[1].V[1].X, ', ', Pairs[1].V[1].Y, ', ', Pairs[1].V[1].Z, ', ', Pairs[1].V[1].W, ')');
				+
				+  ResPair := AddVectors(Pairs[0], Pairs[1]);
				+
				+  WriteLn('ResPair  = (', ResPair.V[0].X, ', ', ResPair.V[0].Y, ', ', ResPair.V[0].Z, ', ', ResPair.V[0].W, ', ', ResPair.V[1].X, ', ', ResPair.V[1].Y, ', ', ResPair.V[1].Z, ', ', ResPair.V[1].W, ')');
				+  WriteLn('Expected = (', ExpPair.V[0].X, ', ', ExpPair.V[0].Y, ', ', ExpPair.V[0].Z, ', ', ExpPair.V[0].W, ', ', ExpPair.V[1].X, ', ', ExpPair.V[1].Y, ', ', ExpPair.V[1].Z, ', ', ExpPair.V[1].W, ')');
				+
				+  { Verify the four lanes of both 128-bit results. }
				+  for I := 0 to 3 do
				+  begin
				+    if Res.M128.M128_F32[I] <> Exp.M128.M128_F32[I] then
				+    begin
				+      WriteLn('FAILURE on Res.M128.M128_F32[', I, ']');
				+      Halt(1);
				+    end;
				+
				+    if ResAsm.M128.M128_F32[I] <> Exp.M128.M128_F32[I] then
				+    begin
				+      WriteLn('FAILURE on ResAsm.M128.M128_F32[', I, ']');
				+      Halt(1);
				+    end;
				+  end;
				+
				+  { Verify the eight lanes of the 256-bit pair result. }
				+  for I := 0 to 7 do
				+  begin
				+    if ResPair.M256.M256_F32[I] <> ExpPair.M256.M256_F32[I] then
				+    begin
				+      WriteLn('FAILURE on ResPair.M256.M256_F32[', I, ']');
				+      Halt(1);
				+    end;
				+  end;
				+
				+  WriteLn('ok');
				+end.