{ tvectorcall2.pp }
  1. { %CPU=x86_64 }
  2. program vectorcall_hva_test2;
  3. {$IFNDEF CPUX86_64}
  4. {$FATAL This test program can only be compiled on Windows or Linux 64-bit with an Intel processor }
  5. {$ENDIF}
  {$push}
  {$CODEALIGN RECORDMIN=16}
  {$PACKRECORDS C}
  type
    { 128-bit vector value.  The same storage can be viewed either as four
      Singles (M128_F32) or as two Doubles (M128_F64).  It is declared while
      RECORDMIN=16 and C-style record packing are in effect. }
    TM128 = record
      case Byte of
        0: (M128_F32: array[0..3] of Single);
        1: (M128_F64: array[0..1] of Double);
    end;
  {$pop}

    { HVA test record: an aggregate made of four TM128 members. }
    THVA = record
      V1: TM128;
      V2: TM128;
      V3: TM128;
      V4: TM128;
    end;
  { Lane-wise addition of two TM128 values, using the single-precision view.
    For each of the four lanes, Z.M128_F32[k] := X.M128_F32[k] + Y.M128_F32[k].
    Declared with the vectorcall calling convention. }
  operator +(X, Y: TM128)Z: TM128; vectorcall;
  var
    Lane: Integer;
  begin
    for Lane := Low(Z.M128_F32) to High(Z.M128_F32) do
      Z.M128_F32[Lane] := X.M128_F32[Lane] + Y.M128_F32[Lane];
  end;
  { Lane-wise subtraction of two TM128 values, using the single-precision view.
    For each of the four lanes, Z.M128_F32[k] := X.M128_F32[k] - Y.M128_F32[k].
    Declared with the vectorcall calling convention. }
  operator -(X, Y: TM128)Z: TM128; vectorcall;
  var
    Lane: Integer;
  begin
    for Lane := Low(Z.M128_F32) to High(Z.M128_F32) do
      Z.M128_F32[Lane] := X.M128_F32[Lane] - Y.M128_F32[Lane];
  end;
  { Combines each member of InputHVA with the matching vector argument
    (V1 with A1, V2 with A2, V3 with A3, V4 with A4) and returns the results
    as a THVA.  The operation per member is selected by Op:
      Op = 1      : every member is InputHVA.Vn + An
      Op = 2      : every member is InputHVA.Vn - An
      otherwise   : V1 and V3 are added, V2 and V4 are subtracted

    Argument placement notes for this vectorcall signature:
    - InputHVA goes on the stack because there are not enough free XMM
      registers to contain the entire argument.
    - A4 does NOT go on the stack; it goes into an XMM register. }
  function HVATest(A1, A2, A3: TM128; InputHVA: THVA; A4: TM128; Op: Integer): THVA; vectorcall;
  begin
    { FIXME: There is an internal stack misalignment for A4, necessitating the
      use of (V)MOVDQU instead of (V)MOVDQA in the compiled code. }

    { V1 and V3 are subtracted only when Op = 2; every other Op adds them. }
    if Op = 2 then
    begin
      HVATest.V1 := InputHVA.V1 - A1;
      HVATest.V3 := InputHVA.V3 - A3;
    end
    else
    begin
      HVATest.V1 := InputHVA.V1 + A1;
      HVATest.V3 := InputHVA.V3 + A3;
    end;

    { V2 and V4 are added only when Op = 1; every other Op subtracts them. }
    if Op = 1 then
    begin
      HVATest.V2 := InputHVA.V2 + A2;
      HVATest.V4 := InputHVA.V4 + A4;
    end
    else
    begin
      HVATest.V2 := InputHVA.V2 - A2;
      HVATest.V4 := InputHVA.V4 - A4;
    end;
  end;
  var
    B1, B2, B3, B4: TM128;
    HVA, AddRes, SubRes, MixRes, AddExp, SubExp, MixExp: THVA;

  { Stores the four single-precision lanes F0..F3 into V.M128_F32[0..3]. }
  procedure FillM128(var V: TM128; F0, F1, F2, F3: Single);
  begin
    V.M128_F32[0] := F0;
    V.M128_F32[1] := F1;
    V.M128_F32[2] := F2;
    V.M128_F32[3] := F3;
  end;

  { Writes one line "<Name>: f0,f1,f2,f3" with the four Single lanes of V. }
  procedure PrintM128(const Name: string; const V: TM128);
  begin
    WriteLn(Name, ': ', V.M128_F32[0], ',', V.M128_F32[1], ',', V.M128_F32[2], ',', V.M128_F32[3]);
  end;

  { Writes the four members of H, one line each, labelled "<Name>.V1" .. "<Name>.V4". }
  procedure PrintHVA(const Name: string; const H: THVA);
  begin
    PrintM128(Name + '.V1', H.V1);
    PrintM128(Name + '.V2', H.V2);
    PrintM128(Name + '.V3', H.V3);
    PrintM128(Name + '.V4', H.V4);
  end;

  { Compares the four Single lanes of Actual against Expected.  On the first
    mismatch it reports "FAILURE on <Name>.M128_F32[<lane>]" and halts with
    exit code 1. }
  procedure CheckM128(const Name: string; const Actual, Expected: TM128);
  var
    Lane: Integer;
  begin
    for Lane := 0 to 3 do
      if Actual.M128_F32[Lane] <> Expected.M128_F32[Lane] then
      begin
        WriteLn('FAILURE on ', Name, '.M128_F32[', Lane, ']');
        Halt(1);
      end;
  end;

  { Checks all four members of an HVATest result, not just V1, so that a wrong
    value in V2, V3 or V4 also fails the test. }
  procedure CheckHVA(const Name: string; const Actual, Expected: THVA);
  begin
    CheckM128(Name + '.V1', Actual.V1, Expected.V1);
    CheckM128(Name + '.V2', Actual.V2, Expected.V2);
    CheckM128(Name + '.V3', Actual.V3, Expected.V3);
    CheckM128(Name + '.V4', Actual.V4, Expected.V4);
  end;

  begin
    { Vector arguments passed as A1..A4. }
    FillM128(B1, 1.0, 2.0, 3.0, 4.0);
    FillM128(B2, 5.0, 6.0, 7.0, 8.0);
    FillM128(B3, 9.0, 10.0, 11.0, 12.0);
    FillM128(B4, 13.0, 14.0, 15.0, 16.0);

    { Aggregate argument passed as InputHVA. }
    FillM128(HVA.V1, 10.0, 20.0, 30.0, 40.0);
    FillM128(HVA.V2, 50.0, 60.0, 70.0, 80.0);
    FillM128(HVA.V3, 90.0, 100.0, 110.0, 120.0);
    FillM128(HVA.V4, 130.0, 140.0, 150.0, 160.0);

    { Expected result for Op = 1: HVA.Vn + Bn. }
    FillM128(AddExp.V1, 11.0, 22.0, 33.0, 44.0);
    FillM128(AddExp.V2, 55.0, 66.0, 77.0, 88.0);
    FillM128(AddExp.V3, 99.0, 110.0, 121.0, 132.0);
    FillM128(AddExp.V4, 143.0, 154.0, 165.0, 176.0);

    { Expected result for Op = 2: HVA.Vn - Bn. }
    FillM128(SubExp.V1, 9.0, 18.0, 27.0, 36.0);
    FillM128(SubExp.V2, 45.0, 54.0, 63.0, 72.0);
    FillM128(SubExp.V3, 81.0, 90.0, 99.0, 108.0);
    FillM128(SubExp.V4, 117.0, 126.0, 135.0, 144.0);

    { Expected result for any other Op: add for V1/V3, subtract for V2/V4. }
    FillM128(MixExp.V1, 11.0, 22.0, 33.0, 44.0);
    FillM128(MixExp.V2, 45.0, 54.0, 63.0, 72.0);
    FillM128(MixExp.V3, 99.0, 110.0, 121.0, 132.0);
    FillM128(MixExp.V4, 117.0, 126.0, 135.0, 144.0);

    { Dump the inputs. }
    PrintM128(' B1', B1);
    PrintM128(' B2', B2);
    PrintM128(' B3', B3);
    PrintM128(' B4', B4);
    PrintHVA('HVA', HVA);

    { The calls under test. }
    AddRes := HVATest(B1, B2, B3, HVA, B4, 1);
    SubRes := HVATest(B1, B2, B3, HVA, B4, 2);
    MixRes := HVATest(B1, B2, B3, HVA, B4, 0);

    { Dump each result next to its expected value. }
    WriteLn('----');
    PrintHVA('AddRes', AddRes);
    WriteLn();
    PrintHVA('AddExp', AddExp);
    WriteLn('----');
    PrintHVA('SubRes', SubRes);
    WriteLn();
    PrintHVA('SubExp', SubExp);
    WriteLn('----');
    PrintHVA('MixRes', MixRes);
    WriteLn();
    PrintHVA('MixExp', MixExp);

    { Verify every lane of every member; any mismatch halts with exit code 1. }
    CheckHVA('AddRes', AddRes, AddExp);
    CheckHVA('SubRes', SubRes, SubExp);
    CheckHVA('MixRes', MixRes, MixExp);

    WriteLn('ok');
  end.