Răsfoiți Sursa

+ more tests

git-svn-id: trunk@38207 -
florian 7 ani în urmă
părinte
comite
96ffb8abc9
4 a modificat fișierele cu 1916 adăugiri și 0 ștergeri
  1. 3 0
      .gitattributes
  2. 869 0
      tests/test/cg/tvectorcall1a.pp
  3. 886 0
      tests/test/cg/tvectorcall1b.pp
  4. 158 0
      tests/test/cg/tvectorcall3a.pp

+ 3 - 0
.gitattributes

@@ -12000,8 +12000,11 @@ tests/test/cg/ttryfin5.pp svneol=native#text/plain
 tests/test/cg/tumin.pp svneol=native#text/plain
 tests/test/cg/tvec.pp svneol=native#text/plain
 tests/test/cg/tvectorcall1.pp svneol=native#text/pascal
+tests/test/cg/tvectorcall1a.pp svneol=native#text/pascal
+tests/test/cg/tvectorcall1b.pp svneol=native#text/pascal
 tests/test/cg/tvectorcall2.pp svneol=native#text/pascal
 tests/test/cg/tvectorcall3.pp svneol=native#text/pascal
+tests/test/cg/tvectorcall3a.pp svneol=native#text/pascal
 tests/test/cg/uandorxorassign.pp svneol=native#text/plain
 tests/test/cg/unegnotassign.pp svneol=native#text/plain
 tests/test/cg/uprintf3.pp svneol=native#text/plain

+ 869 - 0
tests/test/cg/tvectorcall1a.pp

@@ -0,0 +1,869 @@
+{ %CPU=x86_64 }
+program vectorcall_hva_test1;
+
+{$IFNDEF CPUX86_64}
+  {$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
+{$ENDIF}
+
+{$ASMMODE Intel}
+{$PUSH}
+{$CODEALIGN RECORDMIN=16}
+{$PACKRECORDS C}
+type
+  TM128 = record
+    case Byte of
+      0: (M128_F32: array[0..3] of Single);
+      1: (M128_F64: array[0..1] of Double);
+  end;
+{$POP}
+
+{ HFA test: field style. }
+
+{ NOTE: if the record falls on a 16-byte boundary, the 4-component entries will
+  be turned into vectors rather than HFAs. }
+
+  THFA1_SF = record
+    F1: Single;
+  end;
+
+{$IFDEF WIN64}
+  THFA2_SF = record
+    F1, F2: Single;
+  end;
+
+  THFA3_SF = record
+    F1, F2, F3: Single;
+  end;
+
+  THFA4_SF = record
+    F1, F2, F3, F4: Single;
+  end;
+{$ENDIF}
+
+  THFA1_DF = record
+    F1: Double;
+  end;
+
+{$IFDEF WIN64}
+  THFA2_DF = record
+    F1, F2: Double;
+  end;
+
+  THFA3_DF = record
+    F1, F2, F3: Double;
+  end;
+
+  THFA4_DF = record
+    F1, F2, F3, F4: Double;
+  end;
+{$ENDIF}
+
+{ HFA test - array style }
+
+{ NOTE: if the record falls on a 16-byte boundary, the 4-component entries will
+  be turned into vectors rather than HFAs. }
+
+  THFA1_SA = record
+    F: array[0..0] of Single;
+  end;
+
+{$IFDEF WIN64}
+  THFA2_SA = record
+    F: array[0..1] of Single;
+  end;
+
+  THFA3_SA = record
+    F: array[0..2] of Single;
+  end;
+
+  THFA4_SA = record
+    F: array[0..3] of Single;
+  end;
+{$ENDIF}
+
+  THFA1_DA = record
+    F: array[0..0] of Double;
+  end;
+
+{$IFDEF WIN64}
+  THFA2_DA = record
+    F: array[0..1] of Double;
+  end;
+
+  THFA3_DA = record
+    F: array[0..2] of Double;
+  end;
+
+  THFA4_DA = record
+    F: array[0..3] of Double;
+  end;
+{$ENDIF}
+
+{ Single-type vector }
+
+{ Pascal version: returns the sum of the four Single lanes of V.  The main
+  program compares this result with HorizontalAddSingle_ASM. }
+function HorizontalAddSingle(V: TM128): Single; vectorcall;
+begin
+  HorizontalAddSingle := V.M128_F32[0] + V.M128_F32[1] + V.M128_F32[2] + V.M128_F32[3];
+end;
+
+{ Assembler version: two HADDPS passes over XMM0 leave the sum of its four
+  Single lanes in every lane.  V is never referenced by name; presumably the
+  routine relies on vectorcall passing V in XMM0 and on the result being read
+  from XMM0 - confirm against the calling convention. }
+function HorizontalAddSingle_ASM(V: TM128): Single; vectorcall; assembler; nostackframe;
+asm
+  HADDPS XMM0, XMM0
+  HADDPS XMM0, XMM0
+end;
+
+{ Double-type vector }
+
+{ Pascal version: returns the sum of the two Double lanes of V.  The main
+  program compares this result with HorizontalAddDouble_ASM. }
+function HorizontalAddDouble(V: TM128): Double; vectorcall;
+begin
+  HorizontalAddDouble := V.M128_F64[0] + V.M128_F64[1];
+end;
+
+{ Assembler version: a single HADDPD leaves the sum of the two Double lanes of
+  XMM0 in both lanes.  V is never referenced by name; presumably the routine
+  relies on vectorcall passing V in XMM0 - confirm. }
+function HorizontalAddDouble_ASM(V: TM128): Double; vectorcall; assembler; nostackframe;
+asm
+  HADDPD XMM0, XMM0
+end;
+
+{ 1-element aggregate }
+
+{ Pascal version: returns the only field (F1) of the one-Single record. }
+function AddSingles1F(HFA: THFA1_SF): Single; vectorcall;
+begin
+  AddSingles1F := HFA.F1;
+end;
+
+{ Assembler version with an empty body: register contents are returned exactly
+  as the caller left them.  Presumably relies on the field arriving in, and the
+  result being read from, the same register (XMM0) - confirm. }
+function AddSingles1F_ASM(HFA: THFA1_SF): Single; vectorcall; assembler; nostackframe;
+asm
+  { Do absolutely nothing! }
+end;
+
+{ Pascal version: returns the only field (F1) of the one-Double record. }
+function AddDoubles1F(HFA: THFA1_DF): Double; vectorcall;
+begin
+  AddDoubles1F := HFA.F1;
+end;
+
+{ Assembler version with an empty body: register contents are returned exactly
+  as the caller left them.  Presumably relies on the field arriving in, and the
+  result being read from, the same register (XMM0) - confirm. }
+function AddDoubles1F_ASM(HFA: THFA1_DF): Double; vectorcall; assembler; nostackframe;
+asm
+  { Do absolutely nothing! }
+end;
+
+{ Pascal version: returns element 0 of the one-element Single array record. }
+function AddSingles1A(HFA: THFA1_SA): Single; vectorcall;
+begin
+  AddSingles1A := HFA.F[0];
+end;
+
+{ Assembler version with an empty body: register contents are returned exactly
+  as the caller left them.  Presumably relies on the element arriving in, and
+  the result being read from, the same register (XMM0) - confirm. }
+function AddSingles1A_ASM(HFA: THFA1_SA): Single; vectorcall; assembler; nostackframe;
+asm
+  { Do absolutely nothing! }
+end;
+
+{ Pascal version: returns element 0 of the one-element Double array record. }
+function AddDoubles1A(HFA: THFA1_DA): Double; vectorcall;
+begin
+  AddDoubles1A := HFA.F[0];
+end;
+
+{ Assembler version with an empty body: register contents are returned exactly
+  as the caller left them.  Presumably relies on the element arriving in, and
+  the result being read from, the same register (XMM0) - confirm. }
+function AddDoubles1A_ASM(HFA: THFA1_DA): Double; vectorcall; assembler; nostackframe;
+asm
+  { Do absolutely nothing! }
+end;
+
+{$IFDEF WIN64}
+{ 2-element aggregate }
+
+{ Pascal version: returns F1 + F2 of the two-Single record. }
+function AddSingles2F(HFA: THFA2_SF): Single; vectorcall;
+begin
+  AddSingles2F := HFA.F1 + HFA.F2;
+end;
+
+{ Assembler version: adds the low Single of XMM1 into XMM0.  HFA is never
+  referenced by name; presumably relies on vectorcall passing F1 and F2 in
+  XMM0 and XMM1 - confirm. }
+function AddSingles2F_ASM(HFA: THFA2_SF): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+end;
+
+{ Pascal version: returns F1 + F2 of the two-Double record. }
+function AddDoubles2F(HFA: THFA2_DF): Double; vectorcall;
+begin
+  AddDoubles2F := HFA.F1 + HFA.F2;
+end;
+
+{ Assembler version: adds the low Double of XMM1 into XMM0.  HFA is never
+  referenced by name; presumably relies on vectorcall passing F1 and F2 in
+  XMM0 and XMM1 - confirm. }
+function AddDoubles2F_ASM(HFA: THFA2_DF): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+end;
+
+{ Pascal version: returns F[0] + F[1] of the two-element Single array record. }
+function AddSingles2A(HFA: THFA2_SA): Single; vectorcall;
+begin
+  AddSingles2A := HFA.F[0] + HFA.F[1];
+end;
+
+{ Assembler version: adds the low Single of XMM1 into XMM0.  HFA is never
+  referenced by name; presumably relies on vectorcall passing F[0] and F[1] in
+  XMM0 and XMM1 - confirm. }
+function AddSingles2A_ASM(HFA: THFA2_SA): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+end;
+
+{ Pascal version: returns F[0] + F[1] of the two-element Double array record. }
+function AddDoubles2A(HFA: THFA2_DA): Double; vectorcall;
+begin
+  AddDoubles2A := HFA.F[0] + HFA.F[1];
+end;
+
+{ Assembler version: adds the low Double of XMM1 into XMM0.  HFA is never
+  referenced by name; presumably relies on vectorcall passing F[0] and F[1] in
+  XMM0 and XMM1 - confirm. }
+function AddDoubles2A_ASM(HFA: THFA2_DA): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+end;
+
+{ 3-element aggregate }
+
+{ Pascal version: returns F1 + F2 + F3 of the three-Single record. }
+function AddSingles3F(HFA: THFA3_SF): Single; vectorcall;
+begin
+  AddSingles3F := HFA.F1 + HFA.F2 + HFA.F3;
+end;
+
+{ Assembler version: adds the low Singles of XMM1 and XMM2 into XMM0, in that
+  order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F1..F3 in XMM0..XMM2 - confirm. }
+function AddSingles3F_ASM(HFA: THFA3_SF): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+  ADDSS XMM0, XMM2
+end;
+
+{ Pascal version: returns F1 + F2 + F3 of the three-Double record. }
+function AddDoubles3F(HFA: THFA3_DF): Double; vectorcall;
+begin
+  AddDoubles3F := HFA.F1 + HFA.F2 + HFA.F3;
+end;
+
+{ Assembler version: adds the low Doubles of XMM1 and XMM2 into XMM0, in that
+  order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F1..F3 in XMM0..XMM2 - confirm. }
+function AddDoubles3F_ASM(HFA: THFA3_DF): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+  ADDSD XMM0, XMM2
+end;
+
+{ Pascal version: returns F[0] + F[1] + F[2] of the three-element Single array
+  record. }
+function AddSingles3A(HFA: THFA3_SA): Single; vectorcall;
+begin
+  AddSingles3A := HFA.F[0] + HFA.F[1] + HFA.F[2];
+end;
+
+{ Assembler version: adds the low Singles of XMM1 and XMM2 into XMM0, in that
+  order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F[0]..F[2] in XMM0..XMM2 - confirm. }
+function AddSingles3A_ASM(HFA: THFA3_SA): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+  ADDSS XMM0, XMM2
+end;
+
+{ Pascal version: returns F[0] + F[1] + F[2] of the three-element Double array
+  record. }
+function AddDoubles3A(HFA: THFA3_DA): Double; vectorcall;
+begin
+  AddDoubles3A := HFA.F[0] + HFA.F[1] + HFA.F[2];
+end;
+
+{ Assembler version: adds the low Doubles of XMM1 and XMM2 into XMM0, in that
+  order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F[0]..F[2] in XMM0..XMM2 - confirm. }
+function AddDoubles3A_ASM(HFA: THFA3_DA): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+  ADDSD XMM0, XMM2
+end;
+
+{ 4-element aggregate }
+
+{ Pascal version: returns F1 + F2 + F3 + F4 of the four-Single record. }
+function AddSingles4F(HFA: THFA4_SF): Single; vectorcall;
+begin
+  AddSingles4F := HFA.F1 + HFA.F2 + HFA.F3 + HFA.F4;
+end;
+
+{ Assembler version: adds the low Singles of XMM1, XMM2 and XMM3 into XMM0, in
+  that order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F1..F4 in XMM0..XMM3 - confirm. }
+function AddSingles4F_ASM(HFA: THFA4_SF): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+  ADDSS XMM0, XMM2
+  ADDSS XMM0, XMM3
+end;
+
+{ Pascal version: returns F1 + F2 + F3 + F4 of the four-Double record. }
+function AddDoubles4F(HFA: THFA4_DF): Double; vectorcall;
+begin
+  AddDoubles4F := HFA.F1 + HFA.F2 + HFA.F3 + HFA.F4;
+end;
+
+{ Assembler version: adds the low Doubles of XMM1, XMM2 and XMM3 into XMM0, in
+  that order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F1..F4 in XMM0..XMM3 - confirm. }
+function AddDoubles4F_ASM(HFA: THFA4_DF): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+  ADDSD XMM0, XMM2
+  ADDSD XMM0, XMM3
+end;
+
+{ Pascal version: returns F[0] + F[1] + F[2] + F[3] of the four-element Single
+  array record. }
+function AddSingles4A(HFA: THFA4_SA): Single; vectorcall;
+begin
+  AddSingles4A := HFA.F[0] + HFA.F[1] + HFA.F[2] + HFA.F[3];
+end;
+
+{ Assembler version: adds the low Singles of XMM1, XMM2 and XMM3 into XMM0, in
+  that order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F[0]..F[3] in XMM0..XMM3 - confirm. }
+function AddSingles4A_ASM(HFA: THFA4_SA): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+  ADDSS XMM0, XMM2
+  ADDSS XMM0, XMM3
+end;
+
+{ Pascal version: returns F[0] + F[1] + F[2] + F[3] of the four-element Double
+  array record. }
+function AddDoubles4A(HFA: THFA4_DA): Double; vectorcall;
+begin
+  AddDoubles4A := HFA.F[0] + HFA.F[1] + HFA.F[2] + HFA.F[3];
+end;
+
+{ Assembler version: adds the low Doubles of XMM1, XMM2 and XMM3 into XMM0, in
+  that order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F[0]..F[3] in XMM0..XMM3 - confirm. }
+function AddDoubles4A_ASM(HFA: THFA4_DA): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+  ADDSD XMM0, XMM2
+  ADDSD XMM0, XMM3
+end;
+{$ENDIF}
+
+{ Global state used by the main program. }
+var
+  { 128-bit vector argument; the main program checks that it is 16-byte aligned. }
+  HVA: TM128;
+  { One record instance per aggregate type under test. }
+  HFA1_SF: THFA1_SF;
+  HFA1_DF: THFA1_DF;
+  HFA1_SA: THFA1_SA;
+  HFA1_DA: THFA1_DA;
+{$IFDEF WIN64}
+  HFA2_SF: THFA2_SF;
+  HFA2_DF: THFA2_DF;
+  HFA2_SA: THFA2_SA;
+  HFA2_DA: THFA2_DA;
+  HFA3_SF: THFA3_SF;
+  HFA3_DF: THFA3_DF;
+  HFA3_SA: THFA3_SA;
+  HFA3_DA: THFA3_DA;
+  HFA4_SF: THFA4_SF;
+  HFA4_DF: THFA4_DF;
+  HFA4_SA: THFA4_SA;
+  HFA4_DA: THFA4_DA;
+{$ENDIF}
+  { Running expected field address while walking a record's fields. }
+  TestPointer: PtrUInt;
+  I, J: Integer;
+  { Results of the Pascal (ResS/ResD) and assembler (ResSA/ResDA) versions. }
+  ResS, ResSA: Single;
+  ResD, ResDA: Double;
+  { Record addresses and per-record field addresses for the layout checks. }
+  Addresses: array[0..3] of Pointer;
+  FieldAddresses: array[0..3, 0..3] of Pointer;
+const
+  { Display names of the records under test, used in failure messages.  The
+    order must match the way the main program fills Addresses and
+    FieldAddresses: indices 0..1 are the Single records (field style, then
+    array style) and indices 2..3 are the Double records. }
+  AddressNames1: array[0..3] of ShortString = ('HFA1_SF', 'HFA1_SA', 'HFA1_DF', 'HFA1_DA');
+{$IFDEF WIN64}
+  AddressNames2: array[0..3] of ShortString = ('HFA2_SF', 'HFA2_SA', 'HFA2_DF', 'HFA2_DA');
+  AddressNames3: array[0..3] of ShortString = ('HFA3_SF', 'HFA3_SA', 'HFA3_DF', 'HFA3_DA');
+  AddressNames4: array[0..3] of ShortString = ('HFA4_SF', 'HFA4_SA', 'HFA4_DF', 'HFA4_DA');
+{$ENDIF}
+  { Field labels for failure messages; array-style records reuse them for
+    elements F[0]..F[3]. }
+  FieldAddressNames: array[0..3] of ShortString = ('F1', 'F2', 'F3', 'F4');
+
+  { Expected sums of the first 1..4 test values 5, -10, 15, -20.  ExpS4 and
+    ExpD2 stay outside the WIN64 guard because the horizontal-add checks use
+    them on every target. }
+  ExpS1: Single = 5.0;
+{$IFDEF WIN64}
+  ExpS2: Single = -5.0;
+  ExpS3: Single = 10.0;
+{$ENDIF}
+  ExpS4: Single = -10.0;
+  ExpD1: Double = 5.0;
+  ExpD2: Double = -5.0;
+{$IFDEF WIN64}
+  ExpD3: Double = 10.0;
+  ExpD4: Double = -10.0;
+{$ENDIF}
+begin
+
+  if (PtrUInt(@HVA) and $F) <> 0 then
+  begin
+    WriteLn('FAIL: HVA is not correctly aligned.');
+    Halt(1);
+  end;
+
+  { array of singles }
+  WriteLn('- horizontal add (4 singles)');
+  HVA.M128_F32[0] := 5.0;
+  HVA.M128_F32[1] := -10.0;
+  HVA.M128_F32[2] := 15.0;
+  HVA.M128_F32[3] := -20.0;
+  ResS := HorizontalAddSingle(HVA);
+  ResSA := HorizontalAddSingle_ASM(HVA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: HorizontalAddSingle(HVA) has the vector in the wrong register.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS4 then
+    begin
+      WriteLn('FAIL: HorizontalAddSingle(HVA) returned ', ResS, ' instead of ', ExpS4);
+      Halt(1);
+    end;
+  end;
+
+  { array of doubles }
+  WriteLn('- horizontal add (2 doubles)');
+  HVA.M128_F64[0] := 5.0;
+  HVA.M128_F64[1] := -10.0;
+  ResD := HorizontalAddDouble(HVA);
+  ResDA := HorizontalAddDouble_ASM(HVA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: HorizontalAddDouble(HVA) has the vector in the wrong register.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD2 then
+    begin
+      WriteLn('FAIL: HorizontalAddDouble(HVA) returned ', ResD, ' instead of ', ExpD2);
+      Halt(1);
+    end;
+  end;
+
+  { 1-field aggregates }
+  WriteLn('- 1-field aggregates');
+
+  Addresses[0] := @HFA1_SF;
+  Addresses[1] := @HFA1_SA;
+  Addresses[2] := @HFA1_DF;
+  Addresses[3] := @HFA1_DA;
+  FieldAddresses[0][0] := @(HFA1_SF.F1);
+  FieldAddresses[1][0] := @(HFA1_SA.F[0]);
+  FieldAddresses[2][0] := @(HFA1_DF.F1);
+  FieldAddresses[3][0] := @(HFA1_DA.F[0]);
+
+  { Check alignment: the single field must sit at the start of the record }
+  for I := 0 to 1 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    if Pointer(TestPointer) <> FieldAddresses[I][0] then
+    begin
+      WriteLn('FAIL: ', AddressNames1[I], ' is not correctly packed; field F1 is not in the expected place.');
+      Halt(1);
+    end;
+  end;
+
+  { The loop variable I is undefined once the loop above has finished, so the
+    messages below name the record explicitly instead of indexing
+    AddressNames1 with it. }
+
+  { Each check compares the Pascal and assembler versions first, then compares
+    the Pascal result with the expected sum. }
+  HFA1_SF.F1 := 5.0;
+  ResS := AddSingles1F(HFA1_SF);
+  ResSA := AddSingles1F_ASM(HFA1_SF);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles1F(HFA1_SF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS1 then
+    begin
+      WriteLn('FAIL: AddSingles1F(HFA1_SF) returned ', ResS, ' instead of ', ExpS1);
+      Halt(1);
+    end;
+  end;
+
+  HFA1_DF.F1 := 5.0;
+  ResD := AddDoubles1F(HFA1_DF);
+  ResDA := AddDoubles1F_ASM(HFA1_DF);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles1F(HFA1_DF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD1 then
+    begin
+      WriteLn('FAIL: AddDoubles1F(HFA1_DF) returned ', ResD, ' instead of ', ExpD1);
+      Halt(1);
+    end;
+  end;
+
+  HFA1_SA.F[0] := 5.0;
+  ResS := AddSingles1A(HFA1_SA);
+  ResSA := AddSingles1A_ASM(HFA1_SA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles1A(HFA1_SA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS1 then
+    begin
+      WriteLn('FAIL: AddSingles1A(HFA1_SA) returned ', ResS, ' instead of ', ExpS1);
+      Halt(1);
+    end;
+  end;
+
+  HFA1_DA.F[0] := 5.0;
+  ResD := AddDoubles1A(HFA1_DA);
+  ResDA := AddDoubles1A_ASM(HFA1_DA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles1A(HFA1_DA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD1 then
+    begin
+      WriteLn('FAIL: AddDoubles1A(HFA1_DA) returned ', ResD, ' instead of ', ExpD1);
+      Halt(1);
+    end;
+  end;
+
+{$IFDEF WIN64}
+  { 2-field aggregates }
+  WriteLn('- 2-field aggregates');
+
+  Addresses[0] := @HFA2_SF;
+  Addresses[1] := @HFA2_SA;
+  FieldAddresses[0][0] := @(HFA2_SF.F1);
+  FieldAddresses[0][1] := @(HFA2_SF.F2);
+  FieldAddresses[1][0] := @(HFA2_SA.F[0]);
+  FieldAddresses[1][1] := @(HFA2_SA.F[1]);
+
+  { Check alignment of Singles }
+  for I := 0 to 1 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 1 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames2[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $4);
+    end;
+  end;
+
+  Addresses[2] := @HFA2_DF;
+  Addresses[3] := @HFA2_DA;
+  FieldAddresses[2][0] := @(HFA2_DF.F1);
+  FieldAddresses[2][1] := @(HFA2_DF.F2);
+  FieldAddresses[3][0] := @(HFA2_DA.F[0]);
+  FieldAddresses[3][1] := @(HFA2_DA.F[1]);
+
+  { Check alignment of Doubles }
+  for I := 2 to 3 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 1 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames2[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $8);
+    end;
+  end;
+
+  HFA2_SF.F1 := 5.0;
+  HFA2_SF.F2 := -10.0;
+  ResS := AddSingles2F(HFA2_SF);
+  ResSA := AddSingles2F_ASM(HFA2_SF);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles2F(HFA2_SF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS2 then
+    begin
+      WriteLn('FAIL: AddSingles2F(HFA2_SF) returned ', ResS, ' instead of ', ExpS2);
+      Halt(1);
+    end;
+  end;
+
+  HFA2_DF.F1 := 5.0;
+  HFA2_DF.F2 := -10.0;
+  ResD := AddDoubles2F(HFA2_DF);
+  ResDA := AddDoubles2F_ASM(HFA2_DF);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles2F(HFA2_DF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD2 then
+    begin
+      WriteLn('FAIL: AddDoubles2F(HFA2_DF) returned ', ResD, ' instead of ', ExpD2);
+      Halt(1);
+    end;
+  end;
+
+  HFA2_SA.F[0] := 5.0;
+  HFA2_SA.F[1] := -10.0;
+  ResS := AddSingles2A(HFA2_SA);
+  ResSA := AddSingles2A_ASM(HFA2_SA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles2A(HFA2_SA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS2 then
+    begin
+      WriteLn('FAIL: AddSingles2A(HFA2_SA) returned ', ResS, ' instead of ', ExpS2);
+      Halt(1);
+    end;
+  end;
+
+  HFA2_DA.F[0] := 5.0;
+  HFA2_DA.F[1] := -10.0;
+  ResD := AddDoubles2A(HFA2_DA);
+  ResDA := AddDoubles2A_ASM(HFA2_DA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles2A(HFA2_DA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD2 then
+    begin
+      WriteLn('FAIL: AddDoubles2A(HFA2_DA) returned ', ResD, ' instead of ', ExpD2);
+      Halt(1);
+    end;
+  end;
+
+  { 3-field aggregates }
+  WriteLn('- 3-field aggregates');
+
+  Addresses[0] := @HFA3_SF;
+  Addresses[1] := @HFA3_SA;
+  FieldAddresses[0][0] := @(HFA3_SF.F1);
+  FieldAddresses[0][1] := @(HFA3_SF.F2);
+  FieldAddresses[0][2] := @(HFA3_SF.F3);
+  FieldAddresses[1][0] := @(HFA3_SA.F[0]);
+  FieldAddresses[1][1] := @(HFA3_SA.F[1]);
+  FieldAddresses[1][2] := @(HFA3_SA.F[2]);
+
+  { Check alignment of Singles }
+  for I := 0 to 1 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 2 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames3[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $4);
+    end;
+  end;
+
+  Addresses[2] := @HFA3_DF;
+  Addresses[3] := @HFA3_DA;
+  FieldAddresses[2][0] := @(HFA3_DF.F1);
+  FieldAddresses[2][1] := @(HFA3_DF.F2);
+  FieldAddresses[2][2] := @(HFA3_DF.F3);
+  FieldAddresses[3][0] := @(HFA3_DA.F[0]);
+  FieldAddresses[3][1] := @(HFA3_DA.F[1]);
+  FieldAddresses[3][2] := @(HFA3_DA.F[2]);
+
+  { Check alignment of Doubles }
+  for I := 2 to 3 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 2 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames3[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $8);
+    end;
+  end;
+
+  HFA3_SF.F1 := 5.0;
+  HFA3_SF.F2 := -10.0;
+  HFA3_SF.F3 := 15.0;
+  ResS := AddSingles3F(HFA3_SF);
+  ResSA := AddSingles3F_ASM(HFA3_SF);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles3F(HFA3_SF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS3 then
+    begin
+      WriteLn('FAIL: AddSingles3F(HFA3_SF) returned ', ResS, ' instead of ', ExpS3);
+      Halt(1);
+    end;
+  end;
+
+  HFA3_DF.F1 := 5.0;
+  HFA3_DF.F2 := -10.0;
+  HFA3_DF.F3 := 15.0;
+  ResD := AddDoubles3F(HFA3_DF);
+  ResDA := AddDoubles3F_ASM(HFA3_DF);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles3F(HFA3_DF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD3 then
+    begin
+      WriteLn('FAIL: AddDoubles3F(HFA3_DF) returned ', ResD, ' instead of ', ExpD3);
+      Halt(1);
+    end;
+  end;
+
+  HFA3_SA.F[0] := 5.0;
+  HFA3_SA.F[1] := -10.0;
+  HFA3_SA.F[2] := 15.0;
+  ResS := AddSingles3A(HFA3_SA);
+  ResSA := AddSingles3A_ASM(HFA3_SA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles3A(HFA3_SA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS3 then
+    begin
+      WriteLn('FAIL: AddSingles3A(HFA3_SA) returned ', ResS, ' instead of ', ExpS3);
+      Halt(1);
+    end;
+  end;
+
+  HFA3_DA.F[0] := 5.0;
+  HFA3_DA.F[1] := -10.0;
+  HFA3_DA.F[2] := 15.0;
+  ResD := AddDoubles3A(HFA3_DA);
+  ResDA := AddDoubles3A_ASM(HFA3_DA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles3A(HFA3_DA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD3 then
+    begin
+      WriteLn('FAIL: AddDoubles3A(HFA3_DA) returned ', ResD, ' instead of ', ExpD3);
+      Halt(1);
+    end;
+  end;
+
+  { 4-field aggregates }
+  WriteLn('- 4-field aggregates');
+
+  Addresses[0] := @HFA4_SF;
+  Addresses[1] := @HFA4_SA;
+  FieldAddresses[0][0] := @(HFA4_SF.F1);
+  FieldAddresses[0][1] := @(HFA4_SF.F2);
+  FieldAddresses[0][2] := @(HFA4_SF.F3);
+  FieldAddresses[0][3] := @(HFA4_SF.F4);
+  FieldAddresses[1][0] := @(HFA4_SA.F[0]);
+  FieldAddresses[1][1] := @(HFA4_SA.F[1]);
+  FieldAddresses[1][2] := @(HFA4_SA.F[2]);
+  FieldAddresses[1][3] := @(HFA4_SA.F[3]);
+
+  { Check alignment of Singles }
+  for I := 0 to 1 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 3 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames4[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $4);
+    end;
+  end;
+
+  Addresses[2] := @HFA4_DF;
+  Addresses[3] := @HFA4_DA;
+  FieldAddresses[2][0] := @(HFA4_DF.F1);
+  FieldAddresses[2][1] := @(HFA4_DF.F2);
+  FieldAddresses[2][2] := @(HFA4_DF.F3);
+  FieldAddresses[2][3] := @(HFA4_DF.F4);
+  FieldAddresses[3][0] := @(HFA4_DA.F[0]);
+  FieldAddresses[3][1] := @(HFA4_DA.F[1]);
+  FieldAddresses[3][2] := @(HFA4_DA.F[2]);
+  FieldAddresses[3][3] := @(HFA4_DA.F[3]);
+
+  { Check alignment of Doubles }
+  for I := 2 to 3 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 3 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames4[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $8);
+    end;
+  end;
+
+  HFA4_SF.F1 := 5.0;
+  HFA4_SF.F2 := -10.0;
+  HFA4_SF.F3 := 15.0;
+  HFA4_SF.F4 := -20.0;
+  ResS := AddSingles4F(HFA4_SF);
+  ResSA := AddSingles4F_ASM(HFA4_SF);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles4F(HFA4_SF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS4 then
+    begin
+      WriteLn('FAIL: AddSingles4F(HFA4_SF) returned ', ResS, ' instead of ', ExpS4);
+      Halt(1);
+    end;
+  end;
+
+  HFA4_DF.F1 := 5.0;
+  HFA4_DF.F2 := -10.0;
+  HFA4_DF.F3 := 15.0;
+  HFA4_DF.F4 := -20.0;
+  ResD := AddDoubles4F(HFA4_DF);
+  ResDA := AddDoubles4F_ASM(HFA4_DF);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles4F(HFA4_DF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD4 then
+    begin
+      WriteLn('FAIL: AddDoubles4F(HFA4_DF) returned ', ResD, ' instead of ', ExpD4);
+      Halt(1);
+    end;
+  end;
+
+  HFA4_SA.F[0] := 5.0;
+  HFA4_SA.F[1] := -10.0;
+  HFA4_SA.F[2] := 15.0;
+  HFA4_SA.F[3] := -20.0;
+  ResS := AddSingles4A(HFA4_SA);
+  ResSA := AddSingles4A_ASM(HFA4_SA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles4A(HFA4_SA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS4 then
+    begin
+      WriteLn('FAIL: AddSingles4A(HFA4_SA) returned ', ResS, ' instead of ', ExpS4);
+      Halt(1);
+    end;
+  end;
+
+  { Array-style record of four Doubles: compare the Pascal and assembler
+    versions, then compare the Pascal result with the expected sum.  The
+    messages name HFA4_DA, the record that is actually passed. }
+  HFA4_DA.F[0] := 5.0;
+  HFA4_DA.F[1] := -10.0;
+  HFA4_DA.F[2] := 15.0;
+  HFA4_DA.F[3] := -20.0;
+  ResD := AddDoubles4A(HFA4_DA);
+  ResDA := AddDoubles4A_ASM(HFA4_DA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles4A(HFA4_DA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD4 then
+    begin
+      WriteLn('FAIL: AddDoubles4A(HFA4_DA) returned ', ResD, ' instead of ', ExpD4);
+      Halt(1);
+    end;
+  end;
+{$ENDIF}
+  WriteLn('ok');
+end.
+

+ 886 - 0
tests/test/cg/tvectorcall1b.pp

@@ -0,0 +1,886 @@
+{ %CPU=x86_64 }
+program vectorcall_hva_test1;
+
+{$IFNDEF CPUX86_64}
+  {$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
+{$ENDIF}
+
+{$ASMMODE Intel}
+{$PUSH}
+{$CODEALIGN RECORDMIN=16}
+{$PACKRECORDS C}
+type
+  TM128 = record
+    case Byte of
+      0: (M128_F32: array[0..3] of Single);
+      1: (M128_F64: array[0..1] of Double);
+  end;
+{$POP}
+
+{ HFA test: field style. }
+
+{ NOTE: if the record falls on a 16-byte boundary, the 4-component entries will
+  be turned into vectors rather than HFAs. }
+
+  THFA1_SF = packed record
+    F1: Single;
+  end;
+
+{$IFDEF WIN64}
+  THFA2_SF = packed record
+    F1, F2: Single;
+  end;
+
+  THFA3_SF = packed record
+    F1, F2, F3: Single;
+  end;
+
+  THFA4_SF = packed record
+    F1, F2, F3, F4: Single;
+  end;
+{$ENDIF}
+
+  THFA1_DF = packed record
+    F1: Double;
+  end;
+
+{$IFDEF WIN64}
+  THFA2_DF = packed record
+    F1, F2: Double;
+  end;
+
+  THFA3_DF = packed record
+    F1, F2, F3: Double;
+  end;
+
+  THFA4_DF = packed record
+    F1, F2, F3, F4: Double;
+  end;
+{$ENDIF}
+
+{ HFA test - array style }
+
+{ NOTE: if the record falls on a 16-byte boundary, the 4-component entries will
+  be turned into vectors rather than HFAs. }
+
+  THFA1_SA = packed record
+    F: array[0..0] of Single;
+  end;
+
+{$IFDEF WIN64}
+  THFA2_SA = packed record
+    F: array[0..1] of Single;
+  end;
+
+  THFA3_SA = packed record
+    F: array[0..2] of Single;
+  end;
+
+  THFA4_SA = packed record
+    F: array[0..3] of Single;
+  end;
+{$ENDIF}
+
+  THFA1_DA = packed record
+    F: array[0..0] of Double;
+  end;
+
+{$IFDEF WIN64}
+  THFA2_DA = packed record
+    F: array[0..1] of Double;
+  end;
+
+  THFA3_DA = packed record
+    F: array[0..2] of Double;
+  end;
+
+  THFA4_DA = packed record
+    F: array[0..3] of Double;
+  end;
+{$ENDIF}
+
+{ Single-type vector }
+
+{ Pascal version: returns the sum of the four Single lanes of V.  The main
+  program compares this result with HorizontalAddSingle_ASM. }
+function HorizontalAddSingle(V: TM128): Single; vectorcall;
+begin
+  HorizontalAddSingle := V.M128_F32[0] + V.M128_F32[1] + V.M128_F32[2] + V.M128_F32[3];
+end;
+
+{ Assembler version: two HADDPS passes over XMM0 leave the sum of its four
+  Single lanes in every lane.  V is never referenced by name; presumably the
+  routine relies on vectorcall passing V in XMM0 and on the result being read
+  from XMM0 - confirm against the calling convention. }
+function HorizontalAddSingle_ASM(V: TM128): Single; vectorcall; assembler; nostackframe;
+asm
+  HADDPS XMM0, XMM0
+  HADDPS XMM0, XMM0
+end;
+
+{ Double-type vector }
+
+{ Pascal version: returns the sum of the two Double lanes of V.  The main
+  program compares this result with HorizontalAddDouble_ASM. }
+function HorizontalAddDouble(V: TM128): Double; vectorcall;
+begin
+  HorizontalAddDouble := V.M128_F64[0] + V.M128_F64[1];
+end;
+
+{ Assembler version: a single HADDPD leaves the sum of the two Double lanes of
+  XMM0 in both lanes.  V is never referenced by name; presumably the routine
+  relies on vectorcall passing V in XMM0 - confirm. }
+function HorizontalAddDouble_ASM(V: TM128): Double; vectorcall; assembler; nostackframe;
+asm
+  HADDPD XMM0, XMM0
+end;
+
+{ 1-element aggregate }
+
+{ Pascal version: returns the only field (F1) of the packed one-Single record. }
+function AddSingles1F(HFA: THFA1_SF): Single; vectorcall;
+begin
+  AddSingles1F := HFA.F1;
+end;
+
+{ Assembler version with an empty body: register contents are returned exactly
+  as the caller left them.  Presumably relies on the field arriving in, and the
+  result being read from, the same register (XMM0) - confirm. }
+function AddSingles1F_ASM(HFA: THFA1_SF): Single; vectorcall; assembler; nostackframe;
+asm
+  { Do absolutely nothing! }
+end;
+
+{ Pascal version: returns the only field (F1) of the packed one-Double record. }
+function AddDoubles1F(HFA: THFA1_DF): Double; vectorcall;
+begin
+  AddDoubles1F := HFA.F1;
+end;
+
+{ Assembler version with an empty body: register contents are returned exactly
+  as the caller left them.  Presumably relies on the field arriving in, and the
+  result being read from, the same register (XMM0) - confirm. }
+function AddDoubles1F_ASM(HFA: THFA1_DF): Double; vectorcall; assembler; nostackframe;
+asm
+  { Do absolutely nothing! }
+end;
+
+{ Pascal version: returns element 0 of the packed one-element Single array
+  record. }
+function AddSingles1A(HFA: THFA1_SA): Single; vectorcall;
+begin
+  AddSingles1A := HFA.F[0];
+end;
+
+{ Assembler version with an empty body: register contents are returned exactly
+  as the caller left them.  Presumably relies on the element arriving in, and
+  the result being read from, the same register (XMM0) - confirm. }
+function AddSingles1A_ASM(HFA: THFA1_SA): Single; vectorcall; assembler; nostackframe;
+asm
+  { Do absolutely nothing! }
+end;
+
+{ Pascal version: returns element 0 of the packed one-element Double array
+  record. }
+function AddDoubles1A(HFA: THFA1_DA): Double; vectorcall;
+begin
+  AddDoubles1A := HFA.F[0];
+end;
+
+{ Assembler version with an empty body: register contents are returned exactly
+  as the caller left them.  Presumably relies on the element arriving in, and
+  the result being read from, the same register (XMM0) - confirm. }
+function AddDoubles1A_ASM(HFA: THFA1_DA): Double; vectorcall; assembler; nostackframe;
+asm
+  { Do absolutely nothing! }
+end;
+
+{$IFDEF WIN64}
+{ 2-element aggregate }
+
+{ Pascal version: returns F1 + F2 of the packed two-Single record. }
+function AddSingles2F(HFA: THFA2_SF): Single; vectorcall;
+begin
+  AddSingles2F := HFA.F1 + HFA.F2;
+end;
+
+{ Assembler version: adds the low Single of XMM1 into XMM0.  HFA is never
+  referenced by name; presumably relies on vectorcall passing F1 and F2 in
+  XMM0 and XMM1 - confirm. }
+function AddSingles2F_ASM(HFA: THFA2_SF): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+end;
+
+{ Pascal version: returns F1 + F2 of the packed two-Double record. }
+function AddDoubles2F(HFA: THFA2_DF): Double; vectorcall;
+begin
+  AddDoubles2F := HFA.F1 + HFA.F2;
+end;
+
+{ Assembler version: adds the low Double of XMM1 into XMM0.  HFA is never
+  referenced by name; presumably relies on vectorcall passing F1 and F2 in
+  XMM0 and XMM1 - confirm. }
+function AddDoubles2F_ASM(HFA: THFA2_DF): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+end;
+
+{ Pascal version: returns F[0] + F[1] of the packed two-element Single array
+  record. }
+function AddSingles2A(HFA: THFA2_SA): Single; vectorcall;
+begin
+  AddSingles2A := HFA.F[0] + HFA.F[1];
+end;
+
+{ Assembler version: adds the low Single of XMM1 into XMM0.  HFA is never
+  referenced by name; presumably relies on vectorcall passing F[0] and F[1] in
+  XMM0 and XMM1 - confirm. }
+function AddSingles2A_ASM(HFA: THFA2_SA): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+end;
+
+{ Pascal version: returns F[0] + F[1] of the packed two-element Double array
+  record. }
+function AddDoubles2A(HFA: THFA2_DA): Double; vectorcall;
+begin
+  AddDoubles2A := HFA.F[0] + HFA.F[1];
+end;
+
+{ Assembler version: adds the low Double of XMM1 into XMM0.  HFA is never
+  referenced by name; presumably relies on vectorcall passing F[0] and F[1] in
+  XMM0 and XMM1 - confirm. }
+function AddDoubles2A_ASM(HFA: THFA2_DA): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+end;
+
+{ 3-element aggregate }
+
+{ Pascal version: returns F1 + F2 + F3 of the packed three-Single record. }
+function AddSingles3F(HFA: THFA3_SF): Single; vectorcall;
+begin
+  AddSingles3F := HFA.F1 + HFA.F2 + HFA.F3;
+end;
+
+{ Assembler version: adds the low Singles of XMM1 and XMM2 into XMM0, in that
+  order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F1..F3 in XMM0..XMM2 - confirm. }
+function AddSingles3F_ASM(HFA: THFA3_SF): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+  ADDSS XMM0, XMM2
+end;
+
+{ Pascal version: returns F1 + F2 + F3 of the packed three-Double record. }
+function AddDoubles3F(HFA: THFA3_DF): Double; vectorcall;
+begin
+  AddDoubles3F := HFA.F1 + HFA.F2 + HFA.F3;
+end;
+
+{ Assembler version: adds the low Doubles of XMM1 and XMM2 into XMM0, in that
+  order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F1..F3 in XMM0..XMM2 - confirm. }
+function AddDoubles3F_ASM(HFA: THFA3_DF): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+  ADDSD XMM0, XMM2
+end;
+
+{ Pascal version: returns F[0] + F[1] + F[2] of the packed three-element Single
+  array record. }
+function AddSingles3A(HFA: THFA3_SA): Single; vectorcall;
+begin
+  AddSingles3A := HFA.F[0] + HFA.F[1] + HFA.F[2];
+end;
+
+{ Assembler version: adds the low Singles of XMM1 and XMM2 into XMM0, in that
+  order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F[0]..F[2] in XMM0..XMM2 - confirm. }
+function AddSingles3A_ASM(HFA: THFA3_SA): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+  ADDSS XMM0, XMM2
+end;
+
+{ Pascal version: returns F[0] + F[1] + F[2] of the packed three-element Double
+  array record. }
+function AddDoubles3A(HFA: THFA3_DA): Double; vectorcall;
+begin
+  AddDoubles3A := HFA.F[0] + HFA.F[1] + HFA.F[2];
+end;
+
+{ Assembler version: adds the low Doubles of XMM1 and XMM2 into XMM0, in that
+  order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F[0]..F[2] in XMM0..XMM2 - confirm. }
+function AddDoubles3A_ASM(HFA: THFA3_DA): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+  ADDSD XMM0, XMM2
+end;
+
+{ 4-element aggregate }
+
+{ Pascal version: returns F1 + F2 + F3 + F4 of the packed four-Single record. }
+function AddSingles4F(HFA: THFA4_SF): Single; vectorcall;
+begin
+  AddSingles4F := HFA.F1 + HFA.F2 + HFA.F3 + HFA.F4;
+end;
+
+{ Assembler version: adds the low Singles of XMM1, XMM2 and XMM3 into XMM0, in
+  that order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F1..F4 in XMM0..XMM3 - confirm. }
+function AddSingles4F_ASM(HFA: THFA4_SF): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+  ADDSS XMM0, XMM2
+  ADDSS XMM0, XMM3
+end;
+
+{ Pascal version: returns F1 + F2 + F3 + F4 of the packed four-Double record. }
+function AddDoubles4F(HFA: THFA4_DF): Double; vectorcall;
+begin
+  AddDoubles4F := HFA.F1 + HFA.F2 + HFA.F3 + HFA.F4;
+end;
+
+{ Assembler version: adds the low Doubles of XMM1, XMM2 and XMM3 into XMM0, in
+  that order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F1..F4 in XMM0..XMM3 - confirm. }
+function AddDoubles4F_ASM(HFA: THFA4_DF): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+  ADDSD XMM0, XMM2
+  ADDSD XMM0, XMM3
+end;
+
+{ Pascal version: returns F[0] + F[1] + F[2] + F[3] of the packed four-element
+  Single array record. }
+function AddSingles4A(HFA: THFA4_SA): Single; vectorcall;
+begin
+  AddSingles4A := HFA.F[0] + HFA.F[1] + HFA.F[2] + HFA.F[3];
+end;
+
+{ Assembler version: adds the low Singles of XMM1, XMM2 and XMM3 into XMM0, in
+  that order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F[0]..F[3] in XMM0..XMM3 - confirm. }
+function AddSingles4A_ASM(HFA: THFA4_SA): Single; vectorcall; assembler; nostackframe;
+asm
+  ADDSS XMM0, XMM1
+  ADDSS XMM0, XMM2
+  ADDSS XMM0, XMM3
+end;
+
+{ Pascal version: returns F[0] + F[1] + F[2] + F[3] of the packed four-element
+  Double array record. }
+function AddDoubles4A(HFA: THFA4_DA): Double; vectorcall;
+begin
+  AddDoubles4A := HFA.F[0] + HFA.F[1] + HFA.F[2] + HFA.F[3];
+end;
+
+{ Assembler version: adds the low Doubles of XMM1, XMM2 and XMM3 into XMM0, in
+  that order.  HFA is never referenced by name; presumably relies on vectorcall
+  passing F[0]..F[3] in XMM0..XMM3 - confirm. }
+function AddDoubles4A_ASM(HFA: THFA4_DA): Double; vectorcall; assembler; nostackframe;
+asm
+  ADDSD XMM0, XMM1
+  ADDSD XMM0, XMM2
+  ADDSD XMM0, XMM3
+end;
+{$ENDIF}
+
+{ Global state used by the main program.  A single byte variable (b1..b17) is
+  declared in front of every record; NOTE(review): presumably these push the
+  packed records off their natural alignment - confirm the intent. }
+var
+  b1 : byte;
+  { 128-bit vector argument; the main program checks that it is 16-byte aligned. }
+  HVA: TM128;
+  b2 : byte;
+  HFA1_SF: THFA1_SF;
+  b3 : byte;
+  HFA1_DF: THFA1_DF;
+  b4 : byte;
+  HFA1_SA: THFA1_SA;
+  b5 : byte;
+  HFA1_DA: THFA1_DA;
+{$IFDEF WIN64}
+  b6 : byte;
+  HFA2_SF: THFA2_SF;
+  b7 : byte;
+  HFA2_DF: THFA2_DF;
+  b8 : byte;
+  HFA2_SA: THFA2_SA;
+  b9 : byte;
+  HFA2_DA: THFA2_DA;
+  b10 : byte;
+  HFA3_SF: THFA3_SF;
+  b11 : byte;
+  HFA3_DF: THFA3_DF;
+  b12 : byte;
+  HFA3_SA: THFA3_SA;
+  b13 : byte;
+  HFA3_DA: THFA3_DA;
+  b14 : byte;
+  HFA4_SF: THFA4_SF;
+  b15 : byte;
+  HFA4_DF: THFA4_DF;
+  b16 : byte;
+  HFA4_SA: THFA4_SA;
+  b17 : byte;
+  HFA4_DA: THFA4_DA;
+{$ENDIF}
+  { Expected field address used by the layout check. }
+  TestPointer: PtrUInt;
+  I, J: Integer;
+  { Results of the Pascal (ResS/ResD) and assembler (ResSA/ResDA) versions. }
+  ResS, ResSA: Single;
+  ResD, ResDA: Double;
+  { Record addresses and per-record field addresses for the layout checks. }
+  Addresses: array[0..3] of Pointer;
+  FieldAddresses: array[0..3, 0..3] of Pointer;
+const
+  { Display names of the records under test, used in failure messages.  The
+    order must match the way the main program fills Addresses for the 1-field
+    records: field-style Single, array-style Single, field-style Double,
+    array-style Double.  NOTE(review): the 2- to 4-field tables are kept in the
+    same order on the assumption that the WIN64 part of the main program fills
+    Addresses the same way - confirm. }
+  AddressNames1: array[0..3] of ShortString = ('HFA1_SF', 'HFA1_SA', 'HFA1_DF', 'HFA1_DA');
+{$IFDEF WIN64}
+  AddressNames2: array[0..3] of ShortString = ('HFA2_SF', 'HFA2_SA', 'HFA2_DF', 'HFA2_DA');
+  AddressNames3: array[0..3] of ShortString = ('HFA3_SF', 'HFA3_SA', 'HFA3_DF', 'HFA3_DA');
+  AddressNames4: array[0..3] of ShortString = ('HFA4_SF', 'HFA4_SA', 'HFA4_DF', 'HFA4_DA');
+{$ENDIF}
+  { Field labels for failure messages. }
+  FieldAddressNames: array[0..3] of ShortString = ('F1', 'F2', 'F3', 'F4');
+
+  { Expected results.  ExpS4 and ExpD2 stay outside the WIN64 guard because the
+    horizontal-add checks use them on every target. }
+  ExpS1: Single = 5.0;
+{$IFDEF WIN64}
+  ExpS2: Single = -5.0;
+  ExpS3: Single = 10.0;
+{$ENDIF}
+  ExpS4: Single = -10.0;
+  ExpD1: Double = 5.0;
+  ExpD2: Double = -5.0;
+{$IFDEF WIN64}
+  ExpD3: Double = 10.0;
+  ExpD4: Double = -10.0;
+{$ENDIF}
+begin
+
+  if (PtrUInt(@HVA) and $F) <> 0 then
+  begin
+    WriteLn('FAIL: HVA is not correctly aligned.');
+    Halt(1);
+  end;
+
+  { array of singles }
+  WriteLn('- horizontal add (4 singles)');
+  HVA.M128_F32[0] := 5.0;
+  HVA.M128_F32[1] := -10.0;
+  HVA.M128_F32[2] := 15.0;
+  HVA.M128_F32[3] := -20.0;
+  ResS := HorizontalAddSingle(HVA);
+  ResSA := HorizontalAddSingle_ASM(HVA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: HorizontalAddSingle(HVA) has the vector in the wrong register.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS4 then
+    begin
+      WriteLn('FAIL: HorizontalAddSingle(HVA) returned ', ResS, ' instead of ', ExpS4);
+      Halt(1);
+    end;
+  end;
+
+  { array of doubles }
+  WriteLn('- horizontal add (2 doubles)');
+  HVA.M128_F64[0] := 5.0;
+  HVA.M128_F64[1] := -10.0;
+  ResD := HorizontalAddDouble(HVA);
+  ResDA := HorizontalAddDouble_ASM(HVA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: HorizontalAddDouble(HVA) has the vector in the wrong register.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD2 then
+    begin
+      WriteLn('FAIL: HorizontalAddDouble(HVA) returned ', ResD, ' instead of ', ExpD2);
+      Halt(1);
+    end;
+  end;
+
+  { 1-field aggregates }
+  WriteLn('- 1-field aggregates');
+
+  Addresses[0] := @HFA1_SF;
+  Addresses[1] := @HFA1_SA;
+  Addresses[2] := @HFA1_DF;
+  Addresses[3] := @HFA1_DA;
+  FieldAddresses[0][0] := @(HFA1_SF.F1);
+  FieldAddresses[1][0] := @(HFA1_SA.F[0]);
+  FieldAddresses[2][0] := @(HFA1_DF.F1);
+  FieldAddresses[3][0] := @(HFA1_DA.F[0]);
+
+  { Check alignment: the only field must start at the address of its record.
+    Just one field is compared, so the element size plays no role and all
+    four records (Single and Double variants) are checked in one loop. }
+  for I := 0 to 3 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    if Pointer(TestPointer) <> FieldAddresses[I][0] then
+    begin
+      WriteLn('FAIL: ', AddressNames1[I], ' is not correctly packed; field F1 is not in the expected place.');
+      Halt(1);
+    end;
+  end;
+
+  { The record names are spelled out in the messages below rather than taken
+    from AddressNames1[I]: the value of I is undefined once the for loop
+    above has finished, so indexing with it would name the wrong record. }
+
+  { Single, field style: compare the Pascal routine with its assembler twin,
+    then with the expected sum. }
+  HFA1_SF.F1 := 5.0;
+  ResS := AddSingles1F(HFA1_SF);
+  ResSA := AddSingles1F_ASM(HFA1_SF);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles1F(HFA1_SF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS1 then
+    begin
+      WriteLn('FAIL: AddSingles1F(HFA1_SF) returned ', ResS, ' instead of ', ExpS1);
+      Halt(1);
+    end;
+  end;
+
+  { Double, field style }
+  HFA1_DF.F1 := 5.0;
+  ResD := AddDoubles1F(HFA1_DF);
+  ResDA := AddDoubles1F_ASM(HFA1_DF);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles1F(HFA1_DF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD1 then
+    begin
+      WriteLn('FAIL: AddDoubles1F(HFA1_DF) returned ', ResD, ' instead of ', ExpD1);
+      Halt(1);
+    end;
+  end;
+
+  { Single, array style }
+  HFA1_SA.F[0] := 5.0;
+  ResS := AddSingles1A(HFA1_SA);
+  ResSA := AddSingles1A_ASM(HFA1_SA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles1A(HFA1_SA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS1 then
+    begin
+      WriteLn('FAIL: AddSingles1A(HFA1_SA) returned ', ResS, ' instead of ', ExpS1);
+      Halt(1);
+    end;
+  end;
+
+  { Double, array style }
+  HFA1_DA.F[0] := 5.0;
+  ResD := AddDoubles1A(HFA1_DA);
+  ResDA := AddDoubles1A_ASM(HFA1_DA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles1A(HFA1_DA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD1 then
+    begin
+      WriteLn('FAIL: AddDoubles1A(HFA1_DA) returned ', ResD, ' instead of ', ExpD1);
+      Halt(1);
+    end;
+  end;
+
+{$IFDEF WIN64}
+  { 2-field aggregates }
+  WriteLn('- 2-field aggregates');
+
+  Addresses[0] := @HFA2_SF;
+  Addresses[1] := @HFA2_SA;
+  FieldAddresses[0][0] := @(HFA2_SF.F1);
+  FieldAddresses[0][1] := @(HFA2_SF.F2);
+  FieldAddresses[1][0] := @(HFA2_SA.F[0]);
+  FieldAddresses[1][1] := @(HFA2_SA.F[1]);
+
+  { Check alignment of Singles }
+  for I := 0 to 1 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 1 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames2[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $4);
+    end;
+  end;
+
+  Addresses[2] := @HFA2_DF;
+  Addresses[3] := @HFA2_DA;
+  FieldAddresses[2][0] := @(HFA2_DF.F1);
+  FieldAddresses[2][1] := @(HFA2_DF.F2);
+  FieldAddresses[3][0] := @(HFA2_DA.F[0]);
+  FieldAddresses[3][1] := @(HFA2_DA.F[1]);
+
+  { Check alignment of Doubles }
+  for I := 2 to 3 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 1 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames2[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $8);
+    end;
+  end;
+
+  HFA2_SF.F1 := 5.0;
+  HFA2_SF.F2 := -10.0;
+  ResS := AddSingles2F(HFA2_SF);
+  ResSA := AddSingles2F_ASM(HFA2_SF);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles2F(HFA2_SF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS2 then
+    begin
+      WriteLn('FAIL: AddSingles2F(HFA2_SF) returned ', ResS, ' instead of ', ExpS2);
+      Halt(1);
+    end;
+  end;
+
+  HFA2_DF.F1 := 5.0;
+  HFA2_DF.F2 := -10.0;
+  ResD := AddDoubles2F(HFA2_DF);
+  ResDA := AddDoubles2F_ASM(HFA2_DF);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles2F(HFA2_DF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD2 then
+    begin
+      WriteLn('FAIL: AddDoubles2F(HFA2_DF) returned ', ResD, ' instead of ', ExpD2);
+      Halt(1);
+    end;
+  end;
+
+  HFA2_SA.F[0] := 5.0;
+  HFA2_SA.F[1] := -10.0;
+  ResS := AddSingles2A(HFA2_SA);
+  ResSA := AddSingles2A_ASM(HFA2_SA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles2A(HFA2_SA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS2 then
+    begin
+      WriteLn('FAIL: AddSingles2A(HFA2_SA) returned ', ResS, ' instead of ', ExpS2);
+      Halt(1);
+    end;
+  end;
+
+  HFA2_DA.F[0] := 5.0;
+  HFA2_DA.F[1] := -10.0;
+  ResD := AddDoubles2A(HFA2_DA);
+  ResDA := AddDoubles2A_ASM(HFA2_DA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles2A(HFA2_DA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD2 then
+    begin
+      WriteLn('FAIL: AddDoubles2A(HFA2_DA) returned ', ResD, ' instead of ', ExpD2);
+      Halt(1);
+    end;
+  end;
+
+  { 3-field aggregates }
+  WriteLn('- 3-field aggregates');
+
+  Addresses[0] := @HFA3_SF;
+  Addresses[1] := @HFA3_SA;
+  FieldAddresses[0][0] := @(HFA3_SF.F1);
+  FieldAddresses[0][1] := @(HFA3_SF.F2);
+  FieldAddresses[0][2] := @(HFA3_SF.F3);
+  FieldAddresses[1][0] := @(HFA3_SA.F[0]);
+  FieldAddresses[1][1] := @(HFA3_SA.F[1]);
+  FieldAddresses[1][2] := @(HFA3_SA.F[2]);
+
+  { Check alignment of Singles }
+  for I := 0 to 1 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 2 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames3[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $4);
+    end;
+  end;
+
+  Addresses[2] := @HFA3_DF;
+  Addresses[3] := @HFA3_DA;
+  FieldAddresses[2][0] := @(HFA3_DF.F1);
+  FieldAddresses[2][1] := @(HFA3_DF.F2);
+  FieldAddresses[2][2] := @(HFA3_DF.F3);
+  FieldAddresses[3][0] := @(HFA3_DA.F[0]);
+  FieldAddresses[3][1] := @(HFA3_DA.F[1]);
+  FieldAddresses[3][2] := @(HFA3_DA.F[2]);
+
+  { Check alignment of Doubles }
+  for I := 2 to 3 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 2 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames3[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $8);
+    end;
+  end;
+
+  HFA3_SF.F1 := 5.0;
+  HFA3_SF.F2 := -10.0;
+  HFA3_SF.F3 := 15.0;
+  ResS := AddSingles3F(HFA3_SF);
+  ResSA := AddSingles3F_ASM(HFA3_SF);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles3F(HFA3_SF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS3 then
+    begin
+      WriteLn('FAIL: AddSingles3F(HFA3_SF) returned ', ResS, ' instead of ', ExpS3);
+      Halt(1);
+    end;
+  end;
+
+  HFA3_DF.F1 := 5.0;
+  HFA3_DF.F2 := -10.0;
+  HFA3_DF.F3 := 15.0;
+  ResD := AddDoubles3F(HFA3_DF);
+  ResDA := AddDoubles3F_ASM(HFA3_DF);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles3F(HFA3_DF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD3 then
+    begin
+      WriteLn('FAIL: AddDoubles3F(HFA3_DF) returned ', ResD, ' instead of ', ExpD3);
+      Halt(1);
+    end;
+  end;
+
+  HFA3_SA.F[0] := 5.0;
+  HFA3_SA.F[1] := -10.0;
+  HFA3_SA.F[2] := 15.0;
+  ResS := AddSingles3A(HFA3_SA);
+  ResSA := AddSingles3A_ASM(HFA3_SA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles3A(HFA3_SA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS3 then
+    begin
+      WriteLn('FAIL: AddSingles3A(HFA3_SA) returned ', ResS, ' instead of ', ExpS3);
+      Halt(1);
+    end;
+  end;
+
+  HFA3_DA.F[0] := 5.0;
+  HFA3_DA.F[1] := -10.0;
+  HFA3_DA.F[2] := 15.0;
+  ResD := AddDoubles3A(HFA3_DA);
+  ResDA := AddDoubles3A_ASM(HFA3_DA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles3A(HFA3_DA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD3 then
+    begin
+      WriteLn('FAIL: AddDoubles3A(HFA3_DA) returned ', ResD, ' instead of ', ExpD3);
+      Halt(1);
+    end;
+  end;
+
+  { 4-field aggregates }
+  WriteLn('- 4-field aggregates');
+
+  Addresses[0] := @HFA4_SF;
+  Addresses[1] := @HFA4_SA;
+  FieldAddresses[0][0] := @(HFA4_SF.F1);
+  FieldAddresses[0][1] := @(HFA4_SF.F2);
+  FieldAddresses[0][2] := @(HFA4_SF.F3);
+  FieldAddresses[0][3] := @(HFA4_SF.F4);
+  FieldAddresses[1][0] := @(HFA4_SA.F[0]);
+  FieldAddresses[1][1] := @(HFA4_SA.F[1]);
+  FieldAddresses[1][2] := @(HFA4_SA.F[2]);
+  FieldAddresses[1][3] := @(HFA4_SA.F[3]);
+
+  { Check alignment of Singles }
+  for I := 0 to 1 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 3 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames4[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $4);
+    end;
+  end;
+
+  Addresses[2] := @HFA4_DF;
+  Addresses[3] := @HFA4_DA;
+  FieldAddresses[2][0] := @(HFA4_DF.F1);
+  FieldAddresses[2][1] := @(HFA4_DF.F2);
+  FieldAddresses[2][2] := @(HFA4_DF.F3);
+  FieldAddresses[2][3] := @(HFA4_DF.F4);
+  FieldAddresses[3][0] := @(HFA4_DA.F[0]);
+  FieldAddresses[3][1] := @(HFA4_DA.F[1]);
+  FieldAddresses[3][2] := @(HFA4_DA.F[2]);
+  FieldAddresses[3][3] := @(HFA4_DA.F[3]);
+
+  { Check alignment of Doubles }
+  for I := 2 to 3 do
+  begin
+    TestPointer := PtrUInt(Addresses[I]);
+    for J := 0 to 3 do
+    begin
+      if Pointer(TestPointer) <> FieldAddresses[I][J] then
+      begin
+        WriteLn('FAIL: ', AddressNames4[I], ' is not correctly packed; field ', FieldAddressNames[J], ' is not in the expected place.');
+        Halt(1);
+      end;
+
+      Inc(TestPointer, $8);
+    end;
+  end;
+
+  HFA4_SF.F1 := 5.0;
+  HFA4_SF.F2 := -10.0;
+  HFA4_SF.F3 := 15.0;
+  HFA4_SF.F4 := -20.0;
+  ResS := AddSingles4F(HFA4_SF);
+  ResSA := AddSingles4F_ASM(HFA4_SF);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles4F(HFA4_SF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS4 then
+    begin
+      WriteLn('FAIL: AddSingles4F(HFA4_SF) returned ', ResS, ' instead of ', ExpS4);
+      Halt(1);
+    end;
+  end;
+
+  HFA4_DF.F1 := 5.0;
+  HFA4_DF.F2 := -10.0;
+  HFA4_DF.F3 := 15.0;
+  HFA4_DF.F4 := -20.0;
+  ResD := AddDoubles4F(HFA4_DF);
+  ResDA := AddDoubles4F_ASM(HFA4_DF);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles4F(HFA4_DF) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD4 then
+    begin
+      WriteLn('FAIL: AddDoubles4F(HFA4_DF) returned ', ResD, ' instead of ', ExpD4);
+      Halt(1);
+    end;
+  end;
+
+  HFA4_SA.F[0] := 5.0;
+  HFA4_SA.F[1] := -10.0;
+  HFA4_SA.F[2] := 15.0;
+  HFA4_SA.F[3] := -20.0;
+  ResS := AddSingles4A(HFA4_SA);
+  ResSA := AddSingles4A_ASM(HFA4_SA);
+  if (ResS <> ResSA) then
+  begin
+    WriteLn('FAIL: AddSingles4A(HFA4_SA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResS <> ExpS4 then
+    begin
+      WriteLn('FAIL: AddSingles4A(HFA4_SA) returned ', ResS, ' instead of ', ExpS4);
+      Halt(1);
+    end;
+  end;
+
+  { Double, array style, 4 elements: compare the Pascal routine with its
+    assembler twin, then with the expected sum.  The messages name HFA4_DA,
+    the record that is actually passed. }
+  HFA4_DA.F[0] := 5.0;
+  HFA4_DA.F[1] := -10.0;
+  HFA4_DA.F[2] := 15.0;
+  HFA4_DA.F[3] := -20.0;
+  ResD := AddDoubles4A(HFA4_DA);
+  ResDA := AddDoubles4A_ASM(HFA4_DA);
+  if (ResD <> ResDA) then
+  begin
+    WriteLn('FAIL: AddDoubles4A(HFA4_DA) is not passing the aggregate correctly.');
+    Halt(1);
+  end else
+  begin
+    if ResD <> ExpD4 then
+    begin
+      WriteLn('FAIL: AddDoubles4A(HFA4_DA) returned ', ResD, ' instead of ', ExpD4);
+      Halt(1);
+    end;
+  end;
+{$ENDIF}
+  WriteLn('ok');
+end.
+

+ 158 - 0
tests/test/cg/tvectorcall3a.pp

@@ -0,0 +1,158 @@
+{ %CPU=x86_64 }
+program vectorcall_stack_test;
+
+{$IFNDEF CPUX86_64}
+  {$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
+{$ENDIF}
+
+{ This program can be compiled on Linux, and all the vectorcall
+  routines should work the same, including the assembler routine.
+  'vectorcall' should be ignored by the compiler on this platform. }
+
+{$push}
+{$CODEALIGN RECORDMIN=16}
+{$PACKRECORDS C}
+type
+  { 128 bits of data that can be viewed either as four Singles or as two
+    Doubles.  Declared under RECORDMIN=16; presumably this is what gives the
+    record the 16-byte alignment of an SSE operand - TODO confirm. }
+  TM128 = record
+    case Byte of
+      0: (M128_F32: array[0..3] of Single);
+      1: (M128_F64: array[0..1] of Double);
+  end;
+
+{$CODEALIGN RECORDMIN=32}
+{$PACKRECORDS C}
+type
+  { 256 bits of data that can be viewed as eight Singles, four Doubles or
+    two TM128 halves.  Declared under RECORDMIN=32. }
+  TM256 = record
+    case Byte of
+      0: (M256_F32: array[0..7] of Single);
+      1: (M256_F64: array[0..3] of Double);
+      2: (M256_M128: array[0..1] of TM128);
+  end;
+{$pop}
+
+  { The types below are declared after the pop directive, i.e. outside the
+    push/pop region that holds the alignment directives above. }
+
+  { Four-component vector: the same storage is reachable as one TM128 or
+    through the named components X, Y, Z and W. }
+  TVector4f = record
+    case Byte of
+      0: (M128: TM128);
+      1: (X, Y, Z, W: Single);
+  end;
+
+  { Pair of four-component vectors: the same storage is reachable as one
+    TM256, as two TVector4f values, or through eight named components. }
+  TVectorPair4f = record
+    case Byte of
+      0: (M256: TM256);
+      1: (V: array[0..1] of TVector4f);
+      2: (X1, Y1, Z1, W1, X2, Y2, Z2, W2: Single);
+  end;
+
+{ Returns TP multiplied by 1.5.  Used by the main program to produce input
+  values through a vectorcall routine that takes a plain scalar. }
+function TestFloat(TP: Single): Single; vectorcall; { vectorcall should have no effect on how this function behaves }
+begin
+  TestFloat := TP * 1.5;
+end;
+
+{ Returns the component-wise sum of V1 and V2, computed one named component
+  (X, Y, Z, W) at a time in plain Pascal. }
+function AddVectors(V1, V2: TVector4f): TVector4f; vectorcall;
+begin
+  AddVectors.X := V1.X + V2.X;
+  AddVectors.Y := V1.Y + V2.Y;
+  AddVectors.Z := V1.Z + V2.Z;
+  AddVectors.W := V1.W + V2.W;
+end;
+
+{$ASMMODE Intel}
+{ Assembler counterpart of AddVectors for TVector4f.  The whole body is one
+  ADDPS, which adds the four packed Singles in XMM1 to those in XMM0.
+  The routine has no stack frame and touches no other register, so it
+  presumably relies on V1 arriving in XMM0, V2 in XMM1 and the result being
+  returned in XMM0 - TODO confirm against the calling convention in use. }
+function AddVectorsAsm(V1, V2: TVector4f): TVector4f; vectorcall; assembler; nostackframe; inline; { The inline is for a future test }
+asm
+  ADDPS XMM0, XMM1
+end;
+
+{ Returns the component-wise sum of two vector pairs.  Overload of
+  AddVectors for TVectorPair4f; each of the two TVector4f halves is added
+  component by component. }
+{ Note: V1, V2 and the result will go on the stack until FPC fully supports 256-bit vectors }
+function AddVectors(V1, V2: TVectorPair4f): TVectorPair4f; vectorcall;
+var
+  C: Integer;
+begin
+  { C selects the half of the pair (0 or 1) }
+  for C := 0 to 1 do
+  begin
+    AddVectors.V[C].X := V1.V[C].X + V2.V[C].X;
+    AddVectors.V[C].Y := V1.V[C].Y + V2.V[C].Y;
+    AddVectors.V[C].Z := V1.V[C].Z + V2.V[C].Z;
+    AddVectors.V[C].W := V1.V[C].W + V2.V[C].W;
+  end;
+end;
+
+{ Main program: builds two 4-float vectors and two 8-float vector pairs,
+  adds them through the Pascal and assembler vectorcall routines, prints
+  inputs, results and expectations, and halts with exit code 1 on the first
+  component that differs from the expected value. }
+var
+  Vecs: array[0..1] of TVector4f; Res, ResAsm, Exp: TVector4f;
+  Pairs: array[0..1] of TVectorPair4f; ResPair, ExpPair: TVectorPair4f;
+  I: Integer;
+begin
+  { Zero both input vectors.  FillDWord takes (x, count, value), so the
+    count of DWords comes second and the fill value last. }
+  FillDWord(Vecs, SizeOf(Vecs) div SizeOf(DWord), 0);
+  Vecs[0].X := TestFloat(2.0);
+  Vecs[0].Y := 1.0;
+  Vecs[0].Z := -4.0;
+  Vecs[0].W := 1.0;
+
+  Vecs[1].X := 0.0;
+  Vecs[1].Y := -2.0;
+  Vecs[1].Z := TestFloat(4.0);
+  Vecs[1].W := 0.0;
+
+  { Expected Vecs[0] + Vecs[1]; TestFloat multiplies its argument by 1.5 }
+  Exp.X := 3.0;
+  Exp.Y := -1.0;
+  Exp.Z := 2.0;
+  Exp.W := 1.0;
+
+  Pairs[0].V[0].X := 1.0;     Pairs[0].V[1].X := 5.0;
+  Pairs[0].V[0].Y := 2.0;     Pairs[0].V[1].Y := 6.0;
+  Pairs[0].V[0].Z := 3.0;     Pairs[0].V[1].Z := 7.0;
+  Pairs[0].V[0].W := 4.0;     Pairs[0].V[1].W := 8.0;
+
+  Pairs[1].V[0].X := 9.0;     Pairs[1].V[1].X := 13.0;
+  Pairs[1].V[0].Y := 10.0;    Pairs[1].V[1].Y := 14.0;
+  Pairs[1].V[0].Z := 11.0;    Pairs[1].V[1].Z := 15.0;
+  Pairs[1].V[0].W := 12.0;    Pairs[1].V[1].W := 16.0;
+
+  { Expected Pairs[0] + Pairs[1] }
+  ExpPair.V[0].X := 10.0;     ExpPair.V[1].X := 18.0;
+  ExpPair.V[0].Y := 12.0;     ExpPair.V[1].Y := 20.0;
+  ExpPair.V[0].Z := 14.0;     ExpPair.V[1].Z := 22.0;
+  ExpPair.V[0].W := 16.0;     ExpPair.V[1].W := 24.0;
+
+  WriteLn('Vecs[0]  = (', Vecs[0].X, ', ', Vecs[0].Y, ', ', Vecs[0].Z, ', ', Vecs[0].W, ')');
+  WriteLn('Vecs[1]  = (', Vecs[1].X, ', ', Vecs[1].Y, ', ', Vecs[1].Z, ', ', Vecs[1].W, ')');
+
+  Res := AddVectors(Vecs[0], Vecs[1]);
+  ResAsm := AddVectorsAsm(Vecs[0], Vecs[1]);
+
+  WriteLn('Result   = (', Res.X, ', ', Res.Y, ', ', Res.Z, ', ', Res.W, ')');
+  WriteLn('ResAsm   = (', ResAsm.X, ', ', ResAsm.Y, ', ', ResAsm.Z, ', ', ResAsm.W, ')');
+  WriteLn('Expected = (', Exp.X, ', ', Exp.Y, ', ', Exp.Z, ', ', Exp.W, ')');
+
+  WriteLn('Pairs[0] = (', Pairs[0].V[0].X, ', ', Pairs[0].V[0].Y, ', ', Pairs[0].V[0].Z, ', ', Pairs[0].V[0].W, ', ', Pairs[0].V[1].X, ', ', Pairs[0].V[1].Y, ', ', Pairs[0].V[1].Z, ', ', Pairs[0].V[1].W, ')');
+  WriteLn('Pairs[1] = (', Pairs[1].V[0].X, ', ', Pairs[1].V[0].Y, ', ', Pairs[1].V[0].Z, ', ', Pairs[1].V[0].W, ', ', Pairs[1].V[1].X, ', ', Pairs[1].V[1].Y, ', ', Pairs[1].V[1].Z, ', ', Pairs[1].V[1].W, ')');
+
+  ResPair := AddVectors(Pairs[0], Pairs[1]);
+
+  WriteLn('ResPair  = (', ResPair.V[0].X, ', ', ResPair.V[0].Y, ', ', ResPair.V[0].Z, ', ', ResPair.V[0].W, ', ', ResPair.V[1].X, ', ', ResPair.V[1].Y, ', ', ResPair.V[1].Z, ', ', ResPair.V[1].W, ')');
+  WriteLn('Expected = (', ExpPair.V[0].X, ', ', ExpPair.V[0].Y, ', ', ExpPair.V[0].Z, ', ', ExpPair.V[0].W, ', ', ExpPair.V[1].X, ', ', ExpPair.V[1].Y, ', ', ExpPair.V[1].Z, ', ', ExpPair.V[1].W, ')');
+
+  { Verify the 4-float results through the array view of the same storage }
+  for I := 0 to 3 do
+  begin
+    if Res.M128.M128_F32[I] <> Exp.M128.M128_F32[I] then
+    begin
+      WriteLn('FAILURE on Res.M128.M128_F32[', I, ']');
+      Halt(1);
+    end;
+
+    if ResAsm.M128.M128_F32[I] <> Exp.M128.M128_F32[I] then
+    begin
+      WriteLn('FAILURE on ResAsm.M128.M128_F32[', I, ']');
+      Halt(1);
+    end;
+  end;
+
+  { Verify the 8-float pair result the same way }
+  for I := 0 to 7 do
+  begin
+    if ResPair.M256.M256_F32[I] <> ExpPair.M256.M256_F32[I] then
+    begin
+      WriteLn('FAILURE on ResPair.M256.M256_F32[', I, ']');
+      Halt(1);
+    end;
+  end;
+
+  WriteLn('ok');
+end.