Singular_Value_Decomposition_Main_Kernel_Body.hpp

  1. //#####################################################################
  2. // Copyright (c) 2010-2011, Eftychios Sifakis.
  3. //
  4. // Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
  5. // * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  6. // * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
  7. // other materials provided with the distribution.
  8. //
  9. // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
  10. // BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
  11. // SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  12. // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  13. // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
  14. // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  15. //#####################################################################
  16. #ifdef __INTEL_COMPILER
  17. #pragma warning( disable : 592 )
  18. #endif
  19. #pragma clang diagnostic push
  20. #pragma clang diagnostic ignored "-Wuninitialized"
  21. // #define USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION
  22. // #define PERFORM_STRICT_QUATERNION_RENORMALIZATION
  23. { // Begin block : Scope of qV (if not maintained)
  24. #ifndef COMPUTE_V_AS_QUATERNION
  25. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvs;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvs;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvs;)
  26. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvx;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvx;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvx;)
  27. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvy;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvy;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvy;)
  28. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvz;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvz;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvz;)
  29. #endif
  30. { // Begin block : Symmetric eigenanalysis
  31. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss11;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs11;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs11;)
  32. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss21;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs21;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs21;)
  33. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss31;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs31;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs31;)
  34. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss22;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs22;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs22;)
  35. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss32;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs32;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs32;)
  36. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss33;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs33;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs33;)
  37. ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vone;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vone;)
  38. ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_xor_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_xor_ps(Vqvvx,Vqvvx);)
  39. ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_xor_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_xor_ps(Vqvvy,Vqvvy);)
  40. ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_xor_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_xor_ps(Vqvvz,Vqvvz);)
  41. //###########################################################
  42. // Compute normal equations matrix
  43. //###########################################################
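// The block below forms the normal-equations matrix S = A^T*A. S is symmetric, so only the
// lower triangle (s11, s21, s31, s22, s32, s33) is computed; s_ij is the dot product of columns
// i and j of A. The eigenvectors of S are the right singular vectors V of A, and its
// eigenvalues are the squared singular values.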
  44. ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Sa11.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_mul_ps(Va11,Va11);) ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_mul_ps(Va11,Va11);)
  45. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa21.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va21,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va21,Va21);)
  46. ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Stmp1.f+Ss11.f;) ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_add_ps(Vtmp1,Vs11);) ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_add_ps(Vtmp1,Vs11);)
  47. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa31.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va31,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va31,Va31);)
  48. ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Stmp1.f+Ss11.f;) ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_add_ps(Vtmp1,Vs11);) ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_add_ps(Vtmp1,Vs11);)
  49. ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Sa12.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_mul_ps(Va12,Va11);) ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_mul_ps(Va12,Va11);)
  50. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa22.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va22,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va22,Va21);)
  51. ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Stmp1.f+Ss21.f;) ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_add_ps(Vtmp1,Vs21);) ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_add_ps(Vtmp1,Vs21);)
  52. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa32.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va32,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va32,Va31);)
  53. ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Stmp1.f+Ss21.f;) ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_add_ps(Vtmp1,Vs21);) ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_add_ps(Vtmp1,Vs21);)
  54. ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Sa13.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_mul_ps(Va13,Va11);) ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_mul_ps(Va13,Va11);)
  55. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va21);)
  56. ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Stmp1.f+Ss31.f;) ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_add_ps(Vtmp1,Vs31);) ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_add_ps(Vtmp1,Vs31);)
  57. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va31);)
  58. ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Stmp1.f+Ss31.f;) ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_add_ps(Vtmp1,Vs31);) ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_add_ps(Vtmp1,Vs31);)
  59. ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Sa12.f*Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_mul_ps(Va12,Va12);) ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_mul_ps(Va12,Va12);)
  60. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa22.f*Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va22,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va22,Va22);)
  61. ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Stmp1.f+Ss22.f;) ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_add_ps(Vtmp1,Vs22);) ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_add_ps(Vtmp1,Vs22);)
  62. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa32.f*Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va32,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va32,Va32);)
  63. ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Stmp1.f+Ss22.f;) ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_add_ps(Vtmp1,Vs22);) ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_add_ps(Vtmp1,Vs22);)
  64. ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Sa13.f*Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_mul_ps(Va13,Va12);) ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_mul_ps(Va13,Va12);)
  65. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va22);)
  66. ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Stmp1.f+Ss32.f;) ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_add_ps(Vtmp1,Vs32);) ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_add_ps(Vtmp1,Vs32);)
  67. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va32);)
  68. ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Stmp1.f+Ss32.f;) ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_add_ps(Vtmp1,Vs32);) ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_add_ps(Vtmp1,Vs32);)
  69. ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Sa13.f*Sa13.f;) ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_mul_ps(Va13,Va13);) ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_mul_ps(Va13,Va13);)
  70. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa23.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va23);)
  71. ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Stmp1.f+Ss33.f;) ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_add_ps(Vtmp1,Vs33);) ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_add_ps(Vtmp1,Vs33);)
  72. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa33.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va33);)
  73. ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Stmp1.f+Ss33.f;) ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_add_ps(Vtmp1,Vs33);) ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_add_ps(Vtmp1,Vs33);)
  74. //###########################################################
  75. // Solve symmetric eigenproblem using Jacobi iteration
  76. //###########################################################
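// Each sweep below performs three Jacobi conjugations, one per off-diagonal entry of S.
// The #define re-mappings cyclically permute the roles of the S entries and of the quaternion
// components, so the same included kernel (approximately) annihilates s21, then s32, then s31.
// The accumulated rotation is maintained as the quaternion qV = (qvs, qvvx, qvvy, qvvz); the
// fixed count of four sweeps keeps the kernel free of data-dependent branching.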
  77. for(int sweep=1;sweep<=4;sweep++){
  78. // First Jacobi conjugation
  79. #define SS11 Ss11
  80. #define SS21 Ss21
  81. #define SS31 Ss31
  82. #define SS22 Ss22
  83. #define SS32 Ss32
  84. #define SS33 Ss33
  85. #define SQVVX Sqvvx
  86. #define SQVVY Sqvvy
  87. #define SQVVZ Sqvvz
  88. #define STMP1 Stmp1
  89. #define STMP2 Stmp2
  90. #define STMP3 Stmp3
  91. #define VS11 Vs11
  92. #define VS21 Vs21
  93. #define VS31 Vs31
  94. #define VS22 Vs22
  95. #define VS32 Vs32
  96. #define VS33 Vs33
  97. #define VQVVX Vqvvx
  98. #define VQVVY Vqvvy
  99. #define VQVVZ Vqvvz
  100. #define VTMP1 Vtmp1
  101. #define VTMP2 Vtmp2
  102. #define VTMP3 Vtmp3
  103. #include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp"
  104. #undef SS11
  105. #undef SS21
  106. #undef SS31
  107. #undef SS22
  108. #undef SS32
  109. #undef SS33
  110. #undef SQVVX
  111. #undef SQVVY
  112. #undef SQVVZ
  113. #undef STMP1
  114. #undef STMP2
  115. #undef STMP3
  116. #undef VS11
  117. #undef VS21
  118. #undef VS31
  119. #undef VS22
  120. #undef VS32
  121. #undef VS33
  122. #undef VQVVX
  123. #undef VQVVY
  124. #undef VQVVZ
  125. #undef VTMP1
  126. #undef VTMP2
  127. #undef VTMP3
  128. // Second Jacobi conjugation
  129. #define SS11 Ss22
  130. #define SS21 Ss32
  131. #define SS31 Ss21
  132. #define SS22 Ss33
  133. #define SS32 Ss31
  134. #define SS33 Ss11
  135. #define SQVVX Sqvvy
  136. #define SQVVY Sqvvz
  137. #define SQVVZ Sqvvx
  138. #define STMP1 Stmp2
  139. #define STMP2 Stmp3
  140. #define STMP3 Stmp1
  141. #define VS11 Vs22
  142. #define VS21 Vs32
  143. #define VS31 Vs21
  144. #define VS22 Vs33
  145. #define VS32 Vs31
  146. #define VS33 Vs11
  147. #define VQVVX Vqvvy
  148. #define VQVVY Vqvvz
  149. #define VQVVZ Vqvvx
  150. #define VTMP1 Vtmp2
  151. #define VTMP2 Vtmp3
  152. #define VTMP3 Vtmp1
  153. #include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp"
  154. #undef SS11
  155. #undef SS21
  156. #undef SS31
  157. #undef SS22
  158. #undef SS32
  159. #undef SS33
  160. #undef SQVVX
  161. #undef SQVVY
  162. #undef SQVVZ
  163. #undef STMP1
  164. #undef STMP2
  165. #undef STMP3
  166. #undef VS11
  167. #undef VS21
  168. #undef VS31
  169. #undef VS22
  170. #undef VS32
  171. #undef VS33
  172. #undef VQVVX
  173. #undef VQVVY
  174. #undef VQVVZ
  175. #undef VTMP1
  176. #undef VTMP2
  177. #undef VTMP3
  178. // Third Jacobi conjugation
  179. #define SS11 Ss33
  180. #define SS21 Ss31
  181. #define SS31 Ss32
  182. #define SS22 Ss11
  183. #define SS32 Ss21
  184. #define SS33 Ss22
  185. #define SQVVX Sqvvz
  186. #define SQVVY Sqvvx
  187. #define SQVVZ Sqvvy
  188. #define STMP1 Stmp3
  189. #define STMP2 Stmp1
  190. #define STMP3 Stmp2
  191. #define VS11 Vs33
  192. #define VS21 Vs31
  193. #define VS31 Vs32
  194. #define VS22 Vs11
  195. #define VS32 Vs21
  196. #define VS33 Vs22
  197. #define VQVVX Vqvvz
  198. #define VQVVY Vqvvx
  199. #define VQVVZ Vqvvy
  200. #define VTMP1 Vtmp3
  201. #define VTMP2 Vtmp1
  202. #define VTMP3 Vtmp2
  203. #include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp"
  204. #undef SS11
  205. #undef SS21
  206. #undef SS31
  207. #undef SS22
  208. #undef SS32
  209. #undef SS33
  210. #undef SQVVX
  211. #undef SQVVY
  212. #undef SQVVZ
  213. #undef STMP1
  214. #undef STMP2
  215. #undef STMP3
  216. #undef VS11
  217. #undef VS21
  218. #undef VS31
  219. #undef VS22
  220. #undef VS32
  221. #undef VS33
  222. #undef VQVVX
  223. #undef VQVVY
  224. #undef VQVVZ
  225. #undef VTMP1
  226. #undef VTMP2
  227. #undef VTMP3
  228. }
  229. #ifdef PRINT_DEBUGGING_OUTPUT
  230. #ifdef USE_SCALAR_IMPLEMENTATION
  231. std::cout<<"Scalar S ="<<std::endl;
  232. std::cout<<std::setw(12)<<Ss11.f<<std::endl;
  233. std::cout<<std::setw(12)<<Ss21.f<<" "<<std::setw(12)<<Ss22.f<<std::endl;
  234. std::cout<<std::setw(12)<<Ss31.f<<" "<<std::setw(12)<<Ss32.f<<" "<<std::setw(12)<<Ss33.f<<std::endl;
  235. #endif
  236. #ifdef USE_SSE_IMPLEMENTATION
  237. _mm_storeu_ps(buf,Vs11);S11=buf[0];
  238. _mm_storeu_ps(buf,Vs21);S21=buf[0];
  239. _mm_storeu_ps(buf,Vs31);S31=buf[0];
  240. _mm_storeu_ps(buf,Vs22);S22=buf[0];
  241. _mm_storeu_ps(buf,Vs32);S32=buf[0];
  242. _mm_storeu_ps(buf,Vs33);S33=buf[0];
  243. std::cout<<"Vector S ="<<std::endl;
  244. std::cout<<std::setw(12)<<S11<<std::endl;
  245. std::cout<<std::setw(12)<<S21<<" "<<std::setw(12)<<S22<<std::endl;
  246. std::cout<<std::setw(12)<<S31<<" "<<std::setw(12)<<S32<<" "<<std::setw(12)<<S33<<std::endl;
  247. #endif
  248. #ifdef USE_AVX_IMPLEMENTATION
  249. _mm256_storeu_ps(buf,Vs11);S11=buf[0];
  250. _mm256_storeu_ps(buf,Vs21);S21=buf[0];
  251. _mm256_storeu_ps(buf,Vs31);S31=buf[0];
  252. _mm256_storeu_ps(buf,Vs22);S22=buf[0];
  253. _mm256_storeu_ps(buf,Vs32);S32=buf[0];
  254. _mm256_storeu_ps(buf,Vs33);S33=buf[0];
  255. std::cout<<"Vector S ="<<std::endl;
  256. std::cout<<std::setw(12)<<S11<<std::endl;
  257. std::cout<<std::setw(12)<<S21<<" "<<std::setw(12)<<S22<<std::endl;
  258. std::cout<<std::setw(12)<<S31<<" "<<std::setw(12)<<S32<<" "<<std::setw(12)<<S33<<std::endl;
  259. #endif
  260. #endif
  261. } // End block : Symmetric eigenanalysis
  262. //###########################################################
  263. // Normalize quaternion for matrix V
  264. //###########################################################
  265. #if !defined(USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION) || defined(PERFORM_STRICT_QUATERNION_RENORMALIZATION)
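// Compute the squared norm s of qV, take the hardware reciprocal-square-root estimate, and
// refine it with one Newton-Raphson step, x <- x*(3 - s*x^2)/2 (written below as
// 1.5*x - 0.5*s*x^3); the quaternion is then scaled by the refined 1/sqrt(s) to renormalize it.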
  266. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvs.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvs,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvs,Vqvs);)
  267. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvx,Vqvvx);)
  268. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
  269. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvy.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvy,Vqvvy);)
  270. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
  271. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvz.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvz,Vqvvz);)
  272. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
  273. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=rsqrt(Stmp2.f);) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_rsqrt_ps(Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_rsqrt_ps(Vtmp2);)
  274. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp1.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp1,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp1,Vone_half);)
  275. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp4);)
  276. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp3);)
  277. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp2.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp2,Vtmp3);)
  278. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);)
  279. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_sub_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_sub_ps(Vtmp1,Vtmp3);)
  280. ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp1);)
  281. ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vqvvx,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vqvvx,Vtmp1);)
  282. ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vqvvy,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vqvvy,Vtmp1);)
  283. ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vqvvz,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vqvvz,Vtmp1);)
  284. #ifdef PRINT_DEBUGGING_OUTPUT
  285. #ifdef USE_SCALAR_IMPLEMENTATION
  286. std::cout<<"Scalar qV ="<<std::endl;
  287. std::cout<<std::setw(12)<<Sqvs.f<<" "<<std::setw(12)<<Sqvvx.f<<" "<<std::setw(12)<<Sqvvy.f<<" "<<std::setw(12)<<Sqvvz.f<<std::endl;
  288. #endif
  289. #ifdef USE_SSE_IMPLEMENTATION
  290. _mm_storeu_ps(buf,Vqvs);QVS=buf[0];
  291. _mm_storeu_ps(buf,Vqvvx);QVVX=buf[0];
  292. _mm_storeu_ps(buf,Vqvvy);QVVY=buf[0];
  293. _mm_storeu_ps(buf,Vqvvz);QVVZ=buf[0];
  294. std::cout<<"Vector qV ="<<std::endl;
  295. std::cout<<std::setw(12)<<QVS<<" "<<std::setw(12)<<QVVX<<" "<<std::setw(12)<<QVVY<<" "<<std::setw(12)<<QVVZ<<std::endl;
  296. #endif
  297. #ifdef USE_AVX_IMPLEMENTATION
  298. _mm256_storeu_ps(buf,Vqvs);QVS=buf[0];
  299. _mm256_storeu_ps(buf,Vqvvx);QVVX=buf[0];
  300. _mm256_storeu_ps(buf,Vqvvy);QVVY=buf[0];
  301. _mm256_storeu_ps(buf,Vqvvz);QVVZ=buf[0];
  302. std::cout<<"Vector qV ="<<std::endl;
  303. std::cout<<std::setw(12)<<QVS<<" "<<std::setw(12)<<QVVX<<" "<<std::setw(12)<<QVVY<<" "<<std::setw(12)<<QVVZ<<std::endl;
  304. #endif
  305. #endif
  306. #endif
  307. { // Begin block : Conjugation with V
  308. //###########################################################
  309. // Transform quaternion to matrix V
  310. //###########################################################
  311. #ifndef COMPUTE_V_AS_MATRIX
  312. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv11;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv11;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv11;)
  313. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv21;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv21;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv21;)
  314. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv31;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv31;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv31;)
  315. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv12;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv12;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv12;)
  316. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv22;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv22;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv22;)
  317. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv32;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv32;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv32;)
  318. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv13;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv13;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv13;)
  319. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv23;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv23;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv23;)
  320. ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv33;) ENABLE_SSE_IMPLEMENTATION(__m128 Vv33;) ENABLE_AVX_IMPLEMENTATION(__m256 Vv33;)
  321. #endif
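// Standard conversion of the (unit) quaternion qV = (s,x,y,z) into the rotation matrix V:
//   V = [ s^2+x^2-y^2-z^2   2(xy-sz)          2(xz+sy)
//         2(xy+sz)          s^2-x^2+y^2-z^2   2(yz-sx)
//         2(xz-sy)          2(yz+sx)          s^2-x^2-y^2+z^2 ]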
  322. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvx,Vqvvx);)
  323. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvvy.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvvy,Vqvvy);)
  324. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sqvvz.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vqvvz,Vqvvz);)
  325. ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sqvs.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_mul_ps(Vqvs,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_mul_ps(Vqvs,Vqvs);)
  326. ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv11.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_sub_ps(Vv11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_sub_ps(Vv11,Vtmp1);)
  327. ENABLE_SCALAR_IMPLEMENTATION(Sv33.f=Sv22.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_sub_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_sub_ps(Vv22,Vtmp2);)
  328. ENABLE_SCALAR_IMPLEMENTATION(Sv33.f=Sv33.f+Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_add_ps(Vv33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_add_ps(Vv33,Vtmp3);)
  329. ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv22.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_add_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_add_ps(Vv22,Vtmp2);)
  330. ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv22.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_sub_ps(Vv22,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_sub_ps(Vv22,Vtmp3);)
  331. ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_add_ps(Vv11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_add_ps(Vv11,Vtmp1);)
  332. ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_sub_ps(Vv11,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_sub_ps(Vv11,Vtmp2);)
  333. ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_sub_ps(Vv11,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_sub_ps(Vv11,Vtmp3);)
  334. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f+Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vqvvx,Vqvvx);)
  335. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvvy.f+Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vqvvy,Vqvvy);)
  336. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sqvvz.f+Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vqvvz,Vqvvz);)
  337. ENABLE_SCALAR_IMPLEMENTATION(Sv32.f=Sqvs.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_mul_ps(Vqvs,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_mul_ps(Vqvs,Vtmp1);)
  338. ENABLE_SCALAR_IMPLEMENTATION(Sv13.f=Sqvs.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_mul_ps(Vqvs,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_mul_ps(Vqvs,Vtmp2);)
  339. ENABLE_SCALAR_IMPLEMENTATION(Sv21.f=Sqvs.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_mul_ps(Vqvs,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_mul_ps(Vqvs,Vtmp3);)
  340. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvy.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvy,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvy,Vtmp1);)
  341. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvvz.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvvz,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvvz,Vtmp2);)
  342. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sqvvx.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vqvvx,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vqvvx,Vtmp3);)
  343. ENABLE_SCALAR_IMPLEMENTATION(Sv12.f=Stmp1.f-Sv21.f;) ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_sub_ps(Vtmp1,Vv21);) ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_sub_ps(Vtmp1,Vv21);)
  344. ENABLE_SCALAR_IMPLEMENTATION(Sv23.f=Stmp2.f-Sv32.f;) ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_sub_ps(Vtmp2,Vv32);) ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_sub_ps(Vtmp2,Vv32);)
  345. ENABLE_SCALAR_IMPLEMENTATION(Sv31.f=Stmp3.f-Sv13.f;) ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_sub_ps(Vtmp3,Vv13);) ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_sub_ps(Vtmp3,Vv13);)
  346. ENABLE_SCALAR_IMPLEMENTATION(Sv21.f=Stmp1.f+Sv21.f;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_add_ps(Vtmp1,Vv21);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_add_ps(Vtmp1,Vv21);)
  347. ENABLE_SCALAR_IMPLEMENTATION(Sv32.f=Stmp2.f+Sv32.f;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_add_ps(Vtmp2,Vv32);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_add_ps(Vtmp2,Vv32);)
  348. ENABLE_SCALAR_IMPLEMENTATION(Sv13.f=Stmp3.f+Sv13.f;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_add_ps(Vtmp3,Vv13);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_add_ps(Vtmp3,Vv13);)
  349. #ifdef COMPUTE_V_AS_MATRIX
  350. #ifdef PRINT_DEBUGGING_OUTPUT
  351. #ifdef USE_SCALAR_IMPLEMENTATION
  352. std::cout<<"Scalar V ="<<std::endl;
  353. std::cout<<std::setw(12)<<Sv11.f<<" "<<std::setw(12)<<Sv12.f<<" "<<std::setw(12)<<Sv13.f<<std::endl;
  354. std::cout<<std::setw(12)<<Sv21.f<<" "<<std::setw(12)<<Sv22.f<<" "<<std::setw(12)<<Sv23.f<<std::endl;
  355. std::cout<<std::setw(12)<<Sv31.f<<" "<<std::setw(12)<<Sv32.f<<" "<<std::setw(12)<<Sv33.f<<std::endl;
  356. #endif
  357. #ifdef USE_SSE_IMPLEMENTATION
  358. _mm_storeu_ps(buf,Vv11);V11=buf[0];
  359. _mm_storeu_ps(buf,Vv21);V21=buf[0];
  360. _mm_storeu_ps(buf,Vv31);V31=buf[0];
  361. _mm_storeu_ps(buf,Vv12);V12=buf[0];
  362. _mm_storeu_ps(buf,Vv22);V22=buf[0];
  363. _mm_storeu_ps(buf,Vv32);V32=buf[0];
  364. _mm_storeu_ps(buf,Vv13);V13=buf[0];
  365. _mm_storeu_ps(buf,Vv23);V23=buf[0];
  366. _mm_storeu_ps(buf,Vv33);V33=buf[0];
  367. std::cout<<"Vector V ="<<std::endl;
  368. std::cout<<std::setw(12)<<V11<<" "<<std::setw(12)<<V12<<" "<<std::setw(12)<<V13<<std::endl;
  369. std::cout<<std::setw(12)<<V21<<" "<<std::setw(12)<<V22<<" "<<std::setw(12)<<V23<<std::endl;
  370. std::cout<<std::setw(12)<<V31<<" "<<std::setw(12)<<V32<<" "<<std::setw(12)<<V33<<std::endl;
  371. #endif
  372. #ifdef USE_AVX_IMPLEMENTATION
  373. _mm256_storeu_ps(buf,Vv11);V11=buf[0];
  374. _mm256_storeu_ps(buf,Vv21);V21=buf[0];
  375. _mm256_storeu_ps(buf,Vv31);V31=buf[0];
  376. _mm256_storeu_ps(buf,Vv12);V12=buf[0];
  377. _mm256_storeu_ps(buf,Vv22);V22=buf[0];
  378. _mm256_storeu_ps(buf,Vv32);V32=buf[0];
  379. _mm256_storeu_ps(buf,Vv13);V13=buf[0];
  380. _mm256_storeu_ps(buf,Vv23);V23=buf[0];
  381. _mm256_storeu_ps(buf,Vv33);V33=buf[0];
  382. std::cout<<"Vector V ="<<std::endl;
  383. std::cout<<std::setw(12)<<V11<<" "<<std::setw(12)<<V12<<" "<<std::setw(12)<<V13<<std::endl;
  384. std::cout<<std::setw(12)<<V21<<" "<<std::setw(12)<<V22<<" "<<std::setw(12)<<V23<<std::endl;
  385. std::cout<<std::setw(12)<<V31<<" "<<std::setw(12)<<V32<<" "<<std::setw(12)<<V33<<std::endl;
  386. #endif
  387. #endif
  388. #endif
  389. //###########################################################
  390. // Multiply (from the right) with V
  391. //###########################################################
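// Form A <- A*V one row at a time. Because each row of A is overwritten in place, the old
// a_i2 and a_i3 are first saved in tmp2/tmp3 (a_i1 is consumed before it is overwritten);
// e.g. the new a12 = a11*v12 + a12*v22 + a13*v32.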
  392. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=Va12;) ENABLE_AVX_IMPLEMENTATION(Vtmp2=Va12;)
  393. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa13.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=Va13;) ENABLE_AVX_IMPLEMENTATION(Vtmp3=Va13;)
  394. ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sv12.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_mul_ps(Vv12,Va11);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_mul_ps(Vv12,Va11);)
  395. ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sv13.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_mul_ps(Vv13,Va11);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_mul_ps(Vv13,Va11);)
  396. ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sv11.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_mul_ps(Vv11,Va11);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_mul_ps(Vv11,Va11);)
  397. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv21.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv21,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv21,Vtmp2);)
  398. ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sa11.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_add_ps(Va11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_add_ps(Va11,Vtmp1);)
  399. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv31.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv31,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv31,Vtmp3);)
  400. ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sa11.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_add_ps(Va11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_add_ps(Va11,Vtmp1);)
  401. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv22.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv22,Vtmp2);)
  402. ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sa12.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_add_ps(Va12,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_add_ps(Va12,Vtmp1);)
  403. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv32.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv32,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv32,Vtmp3);)
  404. ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sa12.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_add_ps(Va12,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_add_ps(Va12,Vtmp1);)
  405. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv23.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv23,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv23,Vtmp2);)
  406. ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sa13.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_add_ps(Va13,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_add_ps(Va13,Vtmp1);)
  407. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv33,Vtmp3);)
  408. ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sa13.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_add_ps(Va13,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_add_ps(Va13,Vtmp1);)
  409. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=Va22;) ENABLE_AVX_IMPLEMENTATION(Vtmp2=Va22;)
  410. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa23.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=Va23;) ENABLE_AVX_IMPLEMENTATION(Vtmp3=Va23;)
  411. ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sv12.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_mul_ps(Vv12,Va21);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_mul_ps(Vv12,Va21);)
  412. ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sv13.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_mul_ps(Vv13,Va21);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_mul_ps(Vv13,Va21);)
  413. ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sv11.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_mul_ps(Vv11,Va21);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_mul_ps(Vv11,Va21);)
  414. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv21.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv21,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv21,Vtmp2);)
  415. ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sa21.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_add_ps(Va21,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_add_ps(Va21,Vtmp1);)
  416. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv31.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv31,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv31,Vtmp3);)
  417. ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sa21.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_add_ps(Va21,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_add_ps(Va21,Vtmp1);)
  418. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv22.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv22,Vtmp2);)
  419. ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sa22.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_add_ps(Va22,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_add_ps(Va22,Vtmp1);)
  420. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv32.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv32,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv32,Vtmp3);)
  421. ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sa22.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_add_ps(Va22,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_add_ps(Va22,Vtmp1);)
  422. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv23.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv23,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv23,Vtmp2);)
  423. ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sa23.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_add_ps(Va23,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_add_ps(Va23,Vtmp1);)
  424. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv33,Vtmp3);)
  425. ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sa23.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_add_ps(Va23,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_add_ps(Va23,Vtmp1);)
  426. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=Va32;) ENABLE_AVX_IMPLEMENTATION(Vtmp2=Va32;)
  427. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa33.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=Va33;) ENABLE_AVX_IMPLEMENTATION(Vtmp3=Va33;)
  428. ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sv12.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_mul_ps(Vv12,Va31);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_mul_ps(Vv12,Va31);)
  429. ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sv13.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_mul_ps(Vv13,Va31);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_mul_ps(Vv13,Va31);)
  430. ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sv11.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_mul_ps(Vv11,Va31);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_mul_ps(Vv11,Va31);)
  431. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv21.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv21,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv21,Vtmp2);)
  432. ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sa31.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_add_ps(Va31,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_add_ps(Va31,Vtmp1);)
  433. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv31.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv31,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv31,Vtmp3);)
  434. ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sa31.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_add_ps(Va31,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_add_ps(Va31,Vtmp1);)
  435. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv22.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv22,Vtmp2);)
  436. ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sa32.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_add_ps(Va32,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_add_ps(Va32,Vtmp1);)
  437. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv32.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv32,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv32,Vtmp3);)
  438. ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sa32.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_add_ps(Va32,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_add_ps(Va32,Vtmp1);)
  439. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv23.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv23,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv23,Vtmp2);)
  440. ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sa33.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_add_ps(Va33,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_add_ps(Va33,Vtmp1);)
  441. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv33,Vtmp3);)
  442. ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sa33.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_add_ps(Va33,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_add_ps(Va33,Vtmp1);)
  443. #ifdef PRINT_DEBUGGING_OUTPUT
  444. #ifdef USE_SCALAR_IMPLEMENTATION
  445. std::cout<<"Scalar A (after multiplying with V) ="<<std::endl;
  446. std::cout<<std::setw(12)<<Sa11.f<<" "<<std::setw(12)<<Sa12.f<<" "<<std::setw(12)<<Sa13.f<<std::endl;
  447. std::cout<<std::setw(12)<<Sa21.f<<" "<<std::setw(12)<<Sa22.f<<" "<<std::setw(12)<<Sa23.f<<std::endl;
  448. std::cout<<std::setw(12)<<Sa31.f<<" "<<std::setw(12)<<Sa32.f<<" "<<std::setw(12)<<Sa33.f<<std::endl;
  449. #endif
  450. #ifdef USE_SSE_IMPLEMENTATION
  451. _mm_storeu_ps(buf,Va11);A11=buf[0];
  452. _mm_storeu_ps(buf,Va21);A21=buf[0];
  453. _mm_storeu_ps(buf,Va31);A31=buf[0];
  454. _mm_storeu_ps(buf,Va12);A12=buf[0];
  455. _mm_storeu_ps(buf,Va22);A22=buf[0];
  456. _mm_storeu_ps(buf,Va32);A32=buf[0];
  457. _mm_storeu_ps(buf,Va13);A13=buf[0];
  458. _mm_storeu_ps(buf,Va23);A23=buf[0];
  459. _mm_storeu_ps(buf,Va33);A33=buf[0];
  460. std::cout<<"Vector A (after multiplying with V) ="<<std::endl;
  461. std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl;
  462. std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl;
  463. std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl;
  464. #endif
  465. #ifdef USE_AVX_IMPLEMENTATION
  466. _mm256_storeu_ps(buf,Va11);A11=buf[0];
  467. _mm256_storeu_ps(buf,Va21);A21=buf[0];
  468. _mm256_storeu_ps(buf,Va31);A31=buf[0];
  469. _mm256_storeu_ps(buf,Va12);A12=buf[0];
  470. _mm256_storeu_ps(buf,Va22);A22=buf[0];
  471. _mm256_storeu_ps(buf,Va32);A32=buf[0];
  472. _mm256_storeu_ps(buf,Va13);A13=buf[0];
  473. _mm256_storeu_ps(buf,Va23);A23=buf[0];
  474. _mm256_storeu_ps(buf,Va33);A33=buf[0];
  475. std::cout<<"Vector A (after multiplying with V) ="<<std::endl;
  476. std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl;
  477. std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl;
  478. std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl;
  479. #endif
  480. #endif
  481. } // End block : Conjugation with V
  482. } // End block : Scope of qV (if not maintained)
  483. //###########################################################
  484. // Permute columns such that the singular values are sorted
  485. //###########################################################
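// tmp1, tmp2, tmp3 below are the squared 2-norms of columns 1, 2, 3 of the current A = U*Sigma,
// i.e. the squared singular values; the branch-free swaps that follow reorder the columns of A
// (and, when maintained, of V or of the quaternion qV) so that these values appear in
// decreasing order.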
  486. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa11.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va11,Va11);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va11,Va11);)
  487. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa21.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va21,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va21,Va21);)
  488. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);)
  489. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa31.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va31,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va31,Va31);)
  490. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);)
  491. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa12.f*Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Va12,Va12);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Va12,Va12);)
  492. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa22.f*Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va22,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va22,Va22);)
  493. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp2.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp2,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp2,Vtmp4);)
  494. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa32.f*Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va32,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va32,Va32);)
  495. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp2.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp2,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp2,Vtmp4);)
  496. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa13.f*Sa13.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Va13,Va13);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Va13,Va13);)
  497. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa23.f*Sa23.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va23,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va23,Va23);)
  498. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp3.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp3,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp3,Vtmp4);)
  499. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa33.f*Sa33.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va33,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va33,Va33);)
  500. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp3.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp3,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp3,Vtmp4);)
  501. // Swap columns 1-2 if necessary
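// Branch-free conditional swap: the comparison yields an all-ones or all-zeros mask, and
// tmp5 = (a XOR b) AND mask, followed by a ^= tmp5 and b ^= tmp5, exchanges a and b exactly
// when the mask is set.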
  502. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.ui=(Stmp1.f<Stmp2.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_cmplt_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmp_ps(Vtmp1,Vtmp2, _CMP_LT_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmplt_ps(Vtmp1,Vtmp2);)
  503. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa11.ui^Sa12.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va11,Va12);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va11,Va12);)
  504. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  505. ENABLE_SCALAR_IMPLEMENTATION(Sa11.ui=Sa11.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_xor_ps(Va11,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_xor_ps(Va11,Vtmp5);)
  506. ENABLE_SCALAR_IMPLEMENTATION(Sa12.ui=Sa12.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_xor_ps(Va12,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_xor_ps(Va12,Vtmp5);)
  507. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa21.ui^Sa22.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va21,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va21,Va22);)
  508. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  509. ENABLE_SCALAR_IMPLEMENTATION(Sa21.ui=Sa21.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_xor_ps(Va21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_xor_ps(Va21,Vtmp5);)
  510. ENABLE_SCALAR_IMPLEMENTATION(Sa22.ui=Sa22.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_xor_ps(Va22,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_xor_ps(Va22,Vtmp5);)
  511. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa31.ui^Sa32.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va31,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va31,Va32);)
  512. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  513. ENABLE_SCALAR_IMPLEMENTATION(Sa31.ui=Sa31.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_xor_ps(Va31,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_xor_ps(Va31,Vtmp5);)
  514. ENABLE_SCALAR_IMPLEMENTATION(Sa32.ui=Sa32.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_xor_ps(Va32,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_xor_ps(Va32,Vtmp5);)
  515. #ifdef COMPUTE_V_AS_MATRIX
  516. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv11.ui^Sv12.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv11,Vv12);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv11,Vv12);)
  517. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  518. ENABLE_SCALAR_IMPLEMENTATION(Sv11.ui=Sv11.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_xor_ps(Vv11,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_xor_ps(Vv11,Vtmp5);)
  519. ENABLE_SCALAR_IMPLEMENTATION(Sv12.ui=Sv12.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_xor_ps(Vv12,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_xor_ps(Vv12,Vtmp5);)
  520. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv21.ui^Sv22.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv21,Vv22);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv21,Vv22);)
  521. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  522. ENABLE_SCALAR_IMPLEMENTATION(Sv21.ui=Sv21.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_xor_ps(Vv21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_xor_ps(Vv21,Vtmp5);)
  523. ENABLE_SCALAR_IMPLEMENTATION(Sv22.ui=Sv22.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_xor_ps(Vv22,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_xor_ps(Vv22,Vtmp5);)
  524. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv31.ui^Sv32.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv31,Vv32);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv31,Vv32);)
  525. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  526. ENABLE_SCALAR_IMPLEMENTATION(Sv31.ui=Sv31.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_xor_ps(Vv31,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_xor_ps(Vv31,Vtmp5);)
  527. ENABLE_SCALAR_IMPLEMENTATION(Sv32.ui=Sv32.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_xor_ps(Vv32,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_xor_ps(Vv32,Vtmp5);)
  528. #endif
  529. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp1.ui^Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp1,Vtmp2);)
  530. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  531. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=Stmp1.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_xor_ps(Vtmp1,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_xor_ps(Vtmp1,Vtmp5);)
  532. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Stmp2.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_xor_ps(Vtmp2,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_xor_ps(Vtmp2,Vtmp5);)
  533. // If columns 1-2 have been swapped, negate 2nd column of A and V so that V is still a rotation
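// The bit pattern of -2.0 is masked by the swap mask, so tmp4 = 1 + tmp5 becomes -1 if the
// columns were swapped and +1 otherwise; multiplying the new second column of A (and V) by
// tmp4 flips its sign only in the swapped case, preserving det(V) = +1.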
  534. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=-2.;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_set1_ps(-2.);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_set1_ps(-2.);)
  535. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  536. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=Vone;) ENABLE_AVX_IMPLEMENTATION(Vtmp4=Vone;)
  537. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vtmp5);)
  538. ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sa12.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_mul_ps(Va12,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_mul_ps(Va12,Vtmp4);)
  539. ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sa22.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_mul_ps(Va22,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_mul_ps(Va22,Vtmp4);)
  540. ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sa32.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_mul_ps(Va32,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_mul_ps(Va32,Vtmp4);)
  541. #ifdef COMPUTE_V_AS_MATRIX
  542. ENABLE_SCALAR_IMPLEMENTATION(Sv12.f=Sv12.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_mul_ps(Vv12,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_mul_ps(Vv12,Vtmp4);)
  543. ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv22.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_mul_ps(Vv22,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_mul_ps(Vv22,Vtmp4);)
  544. ENABLE_SCALAR_IMPLEMENTATION(Sv32.f=Sv32.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_mul_ps(Vv32,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_mul_ps(Vv32,Vtmp4);)
  545. #endif
  546. // If columns 1-2 have been swapped, also update quaternion representation of V (the quaternion may become un-normalized after this)
  547. #ifdef COMPUTE_V_AS_QUATERNION
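// Swapping columns 1 and 2 and negating the new second column amounts to post-multiplying V by
// a 90-degree rotation about the z-axis; in quaternion form this is qV <- qV*(1+k), which scales
// |qV| by sqrt(2), hence the note above about the quaternion becoming un-normalized. The lines
// below apply it in branch-free form: tmp4 is first remapped from +/-1 to -1/0, so that the
// no-swap case reduces to the identity update.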
  548. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp4,Vone_half);)
  549. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vone_half);)
  550. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvz);)
  551. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvs);)
  552. ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp4);)
  553. ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f-Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_sub_ps(Vqvvz,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_sub_ps(Vqvvz,Vqvs);)
  554. ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vtmp5;)
  555. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvx);)
  556. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvvy);)
  557. ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vqvvy,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vqvvy,Vtmp4);)
  558. ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f-Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_sub_ps(Vqvvx,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_sub_ps(Vqvvx,Vqvvy);)
  559. ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvvy=Vtmp5;)
  560. #endif
  561. // Swap columns 1-3 if necessary
  562. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.ui=(Stmp1.f<Stmp3.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_cmplt_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmp_ps(Vtmp1,Vtmp3, _CMP_LT_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmplt_ps(Vtmp1,Vtmp3);)
  563. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa11.ui^Sa13.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va11,Va13);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va11,Va13);)
  564. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  565. ENABLE_SCALAR_IMPLEMENTATION(Sa11.ui=Sa11.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_xor_ps(Va11,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_xor_ps(Va11,Vtmp5);)
  566. ENABLE_SCALAR_IMPLEMENTATION(Sa13.ui=Sa13.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_xor_ps(Va13,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_xor_ps(Va13,Vtmp5);)
  567. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa21.ui^Sa23.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va21,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va21,Va23);)
  568. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  569. ENABLE_SCALAR_IMPLEMENTATION(Sa21.ui=Sa21.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_xor_ps(Va21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_xor_ps(Va21,Vtmp5);)
  570. ENABLE_SCALAR_IMPLEMENTATION(Sa23.ui=Sa23.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_xor_ps(Va23,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_xor_ps(Va23,Vtmp5);)
  571. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa31.ui^Sa33.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va31,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va31,Va33);)
  572. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  573. ENABLE_SCALAR_IMPLEMENTATION(Sa31.ui=Sa31.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_xor_ps(Va31,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_xor_ps(Va31,Vtmp5);)
  574. ENABLE_SCALAR_IMPLEMENTATION(Sa33.ui=Sa33.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_xor_ps(Va33,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_xor_ps(Va33,Vtmp5);)
  575. #ifdef COMPUTE_V_AS_MATRIX
  576. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv11.ui^Sv13.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv11,Vv13);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv11,Vv13);)
  577. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  578. ENABLE_SCALAR_IMPLEMENTATION(Sv11.ui=Sv11.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_xor_ps(Vv11,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_xor_ps(Vv11,Vtmp5);)
  579. ENABLE_SCALAR_IMPLEMENTATION(Sv13.ui=Sv13.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_xor_ps(Vv13,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_xor_ps(Vv13,Vtmp5);)
  580. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv21.ui^Sv23.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv21,Vv23);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv21,Vv23);)
  581. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  582. ENABLE_SCALAR_IMPLEMENTATION(Sv21.ui=Sv21.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_xor_ps(Vv21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_xor_ps(Vv21,Vtmp5);)
  583. ENABLE_SCALAR_IMPLEMENTATION(Sv23.ui=Sv23.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_xor_ps(Vv23,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_xor_ps(Vv23,Vtmp5);)
  584. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv31.ui^Sv33.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv31,Vv33);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv31,Vv33);)
  585. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  586. ENABLE_SCALAR_IMPLEMENTATION(Sv31.ui=Sv31.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_xor_ps(Vv31,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_xor_ps(Vv31,Vtmp5);)
  587. ENABLE_SCALAR_IMPLEMENTATION(Sv33.ui=Sv33.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_xor_ps(Vv33,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_xor_ps(Vv33,Vtmp5);)
  588. #endif
  589. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp1.ui^Stmp3.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp1,Vtmp3);)
  590. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  591. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=Stmp1.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_xor_ps(Vtmp1,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_xor_ps(Vtmp1,Vtmp5);)
  592. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.ui=Stmp3.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_xor_ps(Vtmp3,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_xor_ps(Vtmp3,Vtmp5);)
593. // If columns 1 and 3 have been swapped, negate 1st column of A and V so that V is still a rotation
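// The swap mask is reused to build a +/-1 multiplier without branching:
//   tmp5 = bits(-2.0f) & mask   ->  -2.0f if the swap happened, +0.0f otherwise
//   tmp4 = 1.0f + tmp5          ->  -1.0f if the swap happened, +1.0f otherwise
// Multiplying the column by tmp4 therefore negates it exactly when the columns
// were exchanged, which keeps det(V) = +1.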
  594. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=-2.;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_set1_ps(-2.);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_set1_ps(-2.);)
  595. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  596. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=Vone;) ENABLE_AVX_IMPLEMENTATION(Vtmp4=Vone;)
  597. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vtmp5);)
  598. ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sa11.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_mul_ps(Va11,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_mul_ps(Va11,Vtmp4);)
  599. ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sa21.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_mul_ps(Va21,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_mul_ps(Va21,Vtmp4);)
  600. ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sa31.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_mul_ps(Va31,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_mul_ps(Va31,Vtmp4);)
  601. #ifdef COMPUTE_V_AS_MATRIX
  602. ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_mul_ps(Vv11,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_mul_ps(Vv11,Vtmp4);)
  603. ENABLE_SCALAR_IMPLEMENTATION(Sv21.f=Sv21.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_mul_ps(Vv21,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_mul_ps(Vv21,Vtmp4);)
  604. ENABLE_SCALAR_IMPLEMENTATION(Sv31.f=Sv31.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_mul_ps(Vv31,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_mul_ps(Vv31,Vtmp4);)
  605. #endif
606. // If columns 1 and 3 have been swapped, also update quaternion representation of V (the quaternion may become un-normalized after this)
  607. #ifdef COMPUTE_V_AS_QUATERNION
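// Exchanging columns 1 and 3 and negating the new first column amounts to a
// quarter-turn of V about its y axis. Up to the deferred 1/sqrt(2) factor, the
// lines below compose V's quaternion with (1,0,1,0), with the "did we swap"
// condition folded into Stmp4 (0 -> leave q unchanged, -1 -> apply the turn).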
  608. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp4,Vone_half);)
  609. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vone_half);)
  610. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvy);)
  611. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvs);)
  612. ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp4);)
  613. ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f-Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_sub_ps(Vqvvy,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_sub_ps(Vqvvy,Vqvs);)
  614. ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vtmp5;)
  615. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvz);)
  616. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvvx);)
  617. ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vqvvx,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vqvvx,Vtmp4);)
  618. ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f-Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_sub_ps(Vqvvz,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_sub_ps(Vqvvz,Vqvvx);)
  619. ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvvx=Vtmp5;)
  620. #endif
621. // Swap columns 2 and 3 if necessary
  622. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.ui=(Stmp2.f<Stmp3.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_cmplt_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmp_ps(Vtmp2,Vtmp3, _CMP_LT_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmplt_ps(Vtmp2,Vtmp3);)
  623. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa12.ui^Sa13.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va12,Va13);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va12,Va13);)
  624. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  625. ENABLE_SCALAR_IMPLEMENTATION(Sa12.ui=Sa12.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_xor_ps(Va12,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_xor_ps(Va12,Vtmp5);)
  626. ENABLE_SCALAR_IMPLEMENTATION(Sa13.ui=Sa13.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_xor_ps(Va13,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_xor_ps(Va13,Vtmp5);)
  627. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa22.ui^Sa23.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va22,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va22,Va23);)
  628. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  629. ENABLE_SCALAR_IMPLEMENTATION(Sa22.ui=Sa22.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_xor_ps(Va22,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_xor_ps(Va22,Vtmp5);)
  630. ENABLE_SCALAR_IMPLEMENTATION(Sa23.ui=Sa23.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_xor_ps(Va23,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_xor_ps(Va23,Vtmp5);)
  631. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa32.ui^Sa33.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va32,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va32,Va33);)
  632. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  633. ENABLE_SCALAR_IMPLEMENTATION(Sa32.ui=Sa32.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_xor_ps(Va32,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_xor_ps(Va32,Vtmp5);)
  634. ENABLE_SCALAR_IMPLEMENTATION(Sa33.ui=Sa33.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_xor_ps(Va33,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_xor_ps(Va33,Vtmp5);)
  635. #ifdef COMPUTE_V_AS_MATRIX
  636. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv12.ui^Sv13.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv12,Vv13);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv12,Vv13);)
  637. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  638. ENABLE_SCALAR_IMPLEMENTATION(Sv12.ui=Sv12.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_xor_ps(Vv12,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_xor_ps(Vv12,Vtmp5);)
  639. ENABLE_SCALAR_IMPLEMENTATION(Sv13.ui=Sv13.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_xor_ps(Vv13,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_xor_ps(Vv13,Vtmp5);)
  640. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv22.ui^Sv23.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv22,Vv23);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv22,Vv23);)
  641. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  642. ENABLE_SCALAR_IMPLEMENTATION(Sv22.ui=Sv22.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_xor_ps(Vv22,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_xor_ps(Vv22,Vtmp5);)
  643. ENABLE_SCALAR_IMPLEMENTATION(Sv23.ui=Sv23.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_xor_ps(Vv23,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_xor_ps(Vv23,Vtmp5);)
  644. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv32.ui^Sv33.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv32,Vv33);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv32,Vv33);)
  645. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  646. ENABLE_SCALAR_IMPLEMENTATION(Sv32.ui=Sv32.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_xor_ps(Vv32,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_xor_ps(Vv32,Vtmp5);)
  647. ENABLE_SCALAR_IMPLEMENTATION(Sv33.ui=Sv33.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_xor_ps(Vv33,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_xor_ps(Vv33,Vtmp5);)
  648. #endif
  649. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp2.ui^Stmp3.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp2,Vtmp3);)
  650. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  651. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Stmp2.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_xor_ps(Vtmp2,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_xor_ps(Vtmp2,Vtmp5);)
  652. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.ui=Stmp3.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_xor_ps(Vtmp3,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_xor_ps(Vtmp3,Vtmp5);)
653. // If columns 2 and 3 have been swapped, negate 3rd column of A and V so that V is still a rotation
  654. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=-2.;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_set1_ps(-2.);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_set1_ps(-2.);)
  655. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
  656. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=Vone;) ENABLE_AVX_IMPLEMENTATION(Vtmp4=Vone;)
  657. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vtmp5);)
  658. ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sa13.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_mul_ps(Va13,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_mul_ps(Va13,Vtmp4);)
  659. ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sa23.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_mul_ps(Va23,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_mul_ps(Va23,Vtmp4);)
  660. ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sa33.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_mul_ps(Va33,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_mul_ps(Va33,Vtmp4);)
  661. #ifdef COMPUTE_V_AS_MATRIX
  662. ENABLE_SCALAR_IMPLEMENTATION(Sv13.f=Sv13.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_mul_ps(Vv13,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_mul_ps(Vv13,Vtmp4);)
  663. ENABLE_SCALAR_IMPLEMENTATION(Sv23.f=Sv23.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_mul_ps(Vv23,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_mul_ps(Vv23,Vtmp4);)
  664. ENABLE_SCALAR_IMPLEMENTATION(Sv33.f=Sv33.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_mul_ps(Vv33,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_mul_ps(Vv33,Vtmp4);)
  665. #endif
666. // If columns 2 and 3 have been swapped, also update quaternion representation of V (the quaternion may become un-normalized after this)
  667. #ifdef COMPUTE_V_AS_QUATERNION
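// Analogous to the 1-3 case above: exchanging columns 2 and 3 and negating the
// new third column is a quarter-turn about the x axis, applied below as an
// unnormalized composition of the quaternion with (1,1,0,0), again gated by Stmp4.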
  668. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp4,Vone_half);)
  669. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vone_half);)
  670. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvx);)
  671. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvs);)
  672. ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp4);)
  673. ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f-Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_sub_ps(Vqvvx,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_sub_ps(Vqvvx,Vqvs);)
  674. ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vtmp5;)
  675. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvy);)
  676. ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvvz);)
  677. ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vqvvz,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vqvvz,Vtmp4);)
  678. ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f-Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_sub_ps(Vqvvy,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_sub_ps(Vqvvy,Vqvvz);)
  679. ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvvz=Vtmp5;)
  680. #endif
  681. #ifdef COMPUTE_V_AS_MATRIX
  682. #ifdef PRINT_DEBUGGING_OUTPUT
  683. #ifdef USE_SCALAR_IMPLEMENTATION
  684. std::cout<<"Scalar V ="<<std::endl;
  685. std::cout<<std::setw(12)<<Sv11.f<<" "<<std::setw(12)<<Sv12.f<<" "<<std::setw(12)<<Sv13.f<<std::endl;
  686. std::cout<<std::setw(12)<<Sv21.f<<" "<<std::setw(12)<<Sv22.f<<" "<<std::setw(12)<<Sv23.f<<std::endl;
  687. std::cout<<std::setw(12)<<Sv31.f<<" "<<std::setw(12)<<Sv32.f<<" "<<std::setw(12)<<Sv33.f<<std::endl;
  688. #endif
  689. #ifdef USE_SSE_IMPLEMENTATION
  690. _mm_storeu_ps(buf,Vv11);V11=buf[0];
  691. _mm_storeu_ps(buf,Vv21);V21=buf[0];
  692. _mm_storeu_ps(buf,Vv31);V31=buf[0];
  693. _mm_storeu_ps(buf,Vv12);V12=buf[0];
  694. _mm_storeu_ps(buf,Vv22);V22=buf[0];
  695. _mm_storeu_ps(buf,Vv32);V32=buf[0];
  696. _mm_storeu_ps(buf,Vv13);V13=buf[0];
  697. _mm_storeu_ps(buf,Vv23);V23=buf[0];
  698. _mm_storeu_ps(buf,Vv33);V33=buf[0];
  699. std::cout<<"Vector V ="<<std::endl;
  700. std::cout<<std::setw(12)<<V11<<" "<<std::setw(12)<<V12<<" "<<std::setw(12)<<V13<<std::endl;
  701. std::cout<<std::setw(12)<<V21<<" "<<std::setw(12)<<V22<<" "<<std::setw(12)<<V23<<std::endl;
  702. std::cout<<std::setw(12)<<V31<<" "<<std::setw(12)<<V32<<" "<<std::setw(12)<<V33<<std::endl;
  703. #endif
  704. #ifdef USE_AVX_IMPLEMENTATION
  705. _mm256_storeu_ps(buf,Vv11);V11=buf[0];
  706. _mm256_storeu_ps(buf,Vv21);V21=buf[0];
  707. _mm256_storeu_ps(buf,Vv31);V31=buf[0];
  708. _mm256_storeu_ps(buf,Vv12);V12=buf[0];
  709. _mm256_storeu_ps(buf,Vv22);V22=buf[0];
  710. _mm256_storeu_ps(buf,Vv32);V32=buf[0];
  711. _mm256_storeu_ps(buf,Vv13);V13=buf[0];
  712. _mm256_storeu_ps(buf,Vv23);V23=buf[0];
  713. _mm256_storeu_ps(buf,Vv33);V33=buf[0];
  714. std::cout<<"Vector V ="<<std::endl;
  715. std::cout<<std::setw(12)<<V11<<" "<<std::setw(12)<<V12<<" "<<std::setw(12)<<V13<<std::endl;
  716. std::cout<<std::setw(12)<<V21<<" "<<std::setw(12)<<V22<<" "<<std::setw(12)<<V23<<std::endl;
  717. std::cout<<std::setw(12)<<V31<<" "<<std::setw(12)<<V32<<" "<<std::setw(12)<<V33<<std::endl;
  718. #endif
  719. #endif
  720. #endif
  721. #ifdef PRINT_DEBUGGING_OUTPUT
  722. #ifdef USE_SCALAR_IMPLEMENTATION
  723. std::cout<<"Scalar A (after multiplying with V) ="<<std::endl;
  724. std::cout<<std::setw(12)<<Sa11.f<<" "<<std::setw(12)<<Sa12.f<<" "<<std::setw(12)<<Sa13.f<<std::endl;
  725. std::cout<<std::setw(12)<<Sa21.f<<" "<<std::setw(12)<<Sa22.f<<" "<<std::setw(12)<<Sa23.f<<std::endl;
  726. std::cout<<std::setw(12)<<Sa31.f<<" "<<std::setw(12)<<Sa32.f<<" "<<std::setw(12)<<Sa33.f<<std::endl;
  727. #endif
  728. #ifdef USE_SSE_IMPLEMENTATION
  729. _mm_storeu_ps(buf,Va11);A11=buf[0];
  730. _mm_storeu_ps(buf,Va21);A21=buf[0];
  731. _mm_storeu_ps(buf,Va31);A31=buf[0];
  732. _mm_storeu_ps(buf,Va12);A12=buf[0];
  733. _mm_storeu_ps(buf,Va22);A22=buf[0];
  734. _mm_storeu_ps(buf,Va32);A32=buf[0];
  735. _mm_storeu_ps(buf,Va13);A13=buf[0];
  736. _mm_storeu_ps(buf,Va23);A23=buf[0];
  737. _mm_storeu_ps(buf,Va33);A33=buf[0];
  738. std::cout<<"Vector A (after multiplying with V) ="<<std::endl;
  739. std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl;
  740. std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl;
  741. std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl;
  742. #endif
  743. #ifdef USE_AVX_IMPLEMENTATION
  744. _mm256_storeu_ps(buf,Va11);A11=buf[0];
  745. _mm256_storeu_ps(buf,Va21);A21=buf[0];
  746. _mm256_storeu_ps(buf,Va31);A31=buf[0];
  747. _mm256_storeu_ps(buf,Va12);A12=buf[0];
  748. _mm256_storeu_ps(buf,Va22);A22=buf[0];
  749. _mm256_storeu_ps(buf,Va32);A32=buf[0];
  750. _mm256_storeu_ps(buf,Va13);A13=buf[0];
  751. _mm256_storeu_ps(buf,Va23);A23=buf[0];
  752. _mm256_storeu_ps(buf,Va33);A33=buf[0];
  753. std::cout<<"Vector A (after multiplying with V) ="<<std::endl;
  754. std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl;
  755. std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl;
  756. std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl;
  757. #endif
  758. #endif
  759. //###########################################################
  760. // Re-normalize quaternion for matrix V
  761. //###########################################################
  762. #ifdef COMPUTE_V_AS_QUATERNION
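// Each conditional quarter-turn above skipped its 1/sqrt(2) normalization, so every
// swap that was actually taken scales |q| by sqrt(2). The lines below recompute
// |q|^2 and rescale the quaternion by its approximate reciprocal square root.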
  763. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvs.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvs,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvs,Vqvs);)
  764. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvx,Vqvvx);)
  765. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
  766. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvy.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvy,Vqvvy);)
  767. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
  768. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvz.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvz,Vqvvz);)
  769. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
  770. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=rsqrt(Stmp2.f);) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_rsqrt_ps(Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_rsqrt_ps(Vtmp2);)
  771. #ifdef PERFORM_STRICT_QUATERNION_RENORMALIZATION
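// rsqrt / _mm_rsqrt_ps / _mm256_rsqrt_ps only provide roughly 12 bits of relative
// accuracy, so the strict path refines the estimate with one Newton-Raphson step
// for 1/sqrt(x):  y <- y*(3 - x*y*y)/2, computed below as 1.5*y - 0.5*x*y^3 with
// Stmp3/Stmp4 as scratch.
#if 0   // illustrative scalar equivalent only, never compiled
inline float refine_rsqrt(const float x,const float y)     // y ~ 1/sqrt(x)
{   return 1.5f*y-.5f*x*y*y*y;  }                          // one Newton iteration
#endif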
  772. ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp1.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp1,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp1,Vone_half);)
  773. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp4);)
  774. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp3);)
  775. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp2.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp2,Vtmp3);)
  776. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);)
  777. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_sub_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_sub_ps(Vtmp1,Vtmp3);)
  778. #endif
  779. ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp1);)
  780. ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vqvvx,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vqvvx,Vtmp1);)
  781. ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vqvvy,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vqvvy,Vtmp1);)
  782. ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vqvvz,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vqvvz,Vtmp1);)
  783. #ifdef PRINT_DEBUGGING_OUTPUT
  784. #ifdef USE_SCALAR_IMPLEMENTATION
  785. std::cout<<"Scalar qV ="<<std::endl;
  786. std::cout<<std::setw(12)<<Sqvs.f<<" "<<std::setw(12)<<Sqvvx.f<<" "<<std::setw(12)<<Sqvvy.f<<" "<<std::setw(12)<<Sqvvz.f<<std::endl;
  787. #endif
  788. #ifdef USE_SSE_IMPLEMENTATION
  789. _mm_storeu_ps(buf,Vqvs);QVS=buf[0];
  790. _mm_storeu_ps(buf,Vqvvx);QVVX=buf[0];
  791. _mm_storeu_ps(buf,Vqvvy);QVVY=buf[0];
  792. _mm_storeu_ps(buf,Vqvvz);QVVZ=buf[0];
  793. std::cout<<"Vector qV ="<<std::endl;
  794. std::cout<<std::setw(12)<<QVS<<" "<<std::setw(12)<<QVVX<<" "<<std::setw(12)<<QVVY<<" "<<std::setw(12)<<QVVZ<<std::endl;
  795. #endif
  796. #ifdef USE_AVX_IMPLEMENTATION
  797. _mm256_storeu_ps(buf,Vqvs);QVS=buf[0];
  798. _mm256_storeu_ps(buf,Vqvvx);QVVX=buf[0];
  799. _mm256_storeu_ps(buf,Vqvvy);QVVY=buf[0];
  800. _mm256_storeu_ps(buf,Vqvvz);QVVZ=buf[0];
  801. std::cout<<"Vector qV ="<<std::endl;
  802. std::cout<<std::setw(12)<<QVS<<" "<<std::setw(12)<<QVVX<<" "<<std::setw(12)<<QVVY<<" "<<std::setw(12)<<QVVZ<<std::endl;
  803. #endif
  804. #endif
  805. #endif
  806. //###########################################################
  807. // Construct QR factorization of A*V (=U*D) using Givens rotations
  808. //###########################################################
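// At this point A holds the product A*V, whose columns are mutually orthogonal with
// lengths equal to the (sorted) singular values. Three Givens rotations, acting in
// the (1,2), (1,3) and (2,3) planes, annihilate a21, a31 and a32 in turn; the
// rotations accumulate into U (and/or its quaternion) while A is left holding, up
// to round-off, the diagonal factor D, completing A = U*D*V^T.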
  809. #ifdef COMPUTE_U_AS_MATRIX
  810. ENABLE_SCALAR_IMPLEMENTATION(Su11.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vu11=Vone;) ENABLE_AVX_IMPLEMENTATION(Vu11=Vone;)
  811. ENABLE_SCALAR_IMPLEMENTATION(Su21.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu21=_mm_xor_ps(Vu21,Vu21);) ENABLE_AVX_IMPLEMENTATION(Vu21=_mm256_xor_ps(Vu21,Vu21);)
  812. ENABLE_SCALAR_IMPLEMENTATION(Su31.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu31=_mm_xor_ps(Vu31,Vu31);) ENABLE_AVX_IMPLEMENTATION(Vu31=_mm256_xor_ps(Vu31,Vu31);)
  813. ENABLE_SCALAR_IMPLEMENTATION(Su12.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu12=_mm_xor_ps(Vu12,Vu12);) ENABLE_AVX_IMPLEMENTATION(Vu12=_mm256_xor_ps(Vu12,Vu12);)
  814. ENABLE_SCALAR_IMPLEMENTATION(Su22.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vu22=Vone;) ENABLE_AVX_IMPLEMENTATION(Vu22=Vone;)
  815. ENABLE_SCALAR_IMPLEMENTATION(Su32.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu32=_mm_xor_ps(Vu32,Vu32);) ENABLE_AVX_IMPLEMENTATION(Vu32=_mm256_xor_ps(Vu32,Vu32);)
  816. ENABLE_SCALAR_IMPLEMENTATION(Su13.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu13=_mm_xor_ps(Vu13,Vu13);) ENABLE_AVX_IMPLEMENTATION(Vu13=_mm256_xor_ps(Vu13,Vu13);)
  817. ENABLE_SCALAR_IMPLEMENTATION(Su23.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu23=_mm_xor_ps(Vu23,Vu23);) ENABLE_AVX_IMPLEMENTATION(Vu23=_mm256_xor_ps(Vu23,Vu23);)
  818. ENABLE_SCALAR_IMPLEMENTATION(Su33.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vu33=Vone;) ENABLE_AVX_IMPLEMENTATION(Vu33=Vone;)
  819. #endif
  820. #ifdef COMPUTE_U_AS_QUATERNION
  821. ENABLE_SCALAR_IMPLEMENTATION(Squs.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vqus=Vone;) ENABLE_AVX_IMPLEMENTATION(Vqus=Vone;)
  822. ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_xor_ps(Vquvx,Vquvx);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_xor_ps(Vquvx,Vquvx);)
  823. ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_xor_ps(Vquvy,Vquvy);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_xor_ps(Vquvy,Vquvy);)
  824. ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_xor_ps(Vquvz,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_xor_ps(Vquvz,Vquvz);)
  825. #endif
  826. // First Givens rotation
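// The Givens kernel included below is generic: these #defines bind its pivot pair
// (SAPIVOT, the diagonal entry kept, and SANPIVOT, the subdiagonal entry to be
// annihilated), the two rows of A it modifies and the two columns of U it updates
// to the concrete registers for this plane, with the V*-prefixed names being the
// SSE/AVX counterparts. Each inclusion is bracketed by matching #undefs so the
// next rotation can rebind them.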
  827. #define SAPIVOT Sa11
  828. #define SANPIVOT Sa21
  829. #define SA11 Sa11
  830. #define SA21 Sa21
  831. #define SA12 Sa12
  832. #define SA22 Sa22
  833. #define SA13 Sa13
  834. #define SA23 Sa23
  835. #define SU11 Su11
  836. #define SU12 Su12
  837. #define SU21 Su21
  838. #define SU22 Su22
  839. #define SU31 Su31
  840. #define SU32 Su32
  841. #define VAPIVOT Va11
  842. #define VANPIVOT Va21
  843. #define VA11 Va11
  844. #define VA21 Va21
  845. #define VA12 Va12
  846. #define VA22 Va22
  847. #define VA13 Va13
  848. #define VA23 Va23
  849. #define VU11 Vu11
  850. #define VU12 Vu12
  851. #define VU21 Vu21
  852. #define VU22 Vu22
  853. #define VU31 Vu31
  854. #define VU32 Vu32
  855. #include "Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp"
  856. #undef SAPIVOT
  857. #undef SANPIVOT
  858. #undef SA11
  859. #undef SA21
  860. #undef SA12
  861. #undef SA22
  862. #undef SA13
  863. #undef SA23
  864. #undef SU11
  865. #undef SU12
  866. #undef SU21
  867. #undef SU22
  868. #undef SU31
  869. #undef SU32
  870. #undef VAPIVOT
  871. #undef VANPIVOT
  872. #undef VA11
  873. #undef VA21
  874. #undef VA12
  875. #undef VA22
  876. #undef VA13
  877. #undef VA23
  878. #undef VU11
  879. #undef VU12
  880. #undef VU21
  881. #undef VU22
  882. #undef VU31
  883. #undef VU32
  884. // Update quaternion representation of U
  885. #ifdef COMPUTE_U_AS_QUATERNION
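// U was initialized to the identity, so after this first rotation, which acts in
// the (1,2) plane (a rotation about the z axis), its quaternion is simply
// (ch, 0, 0, sh).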
  886. ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vqus=Vch;) ENABLE_AVX_IMPLEMENTATION(Vqus=Vch;)
  887. ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_xor_ps(Vquvx,Vquvx);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_xor_ps(Vquvx,Vquvx);)
  888. ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_xor_ps(Vquvy,Vquvy);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_xor_ps(Vquvy,Vquvy);)
  889. ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vquvz=Vsh;) ENABLE_AVX_IMPLEMENTATION(Vquvz=Vsh;)
  890. #endif
  891. // Second Givens rotation
  892. #define SAPIVOT Sa11
  893. #define SANPIVOT Sa31
  894. #define SA11 Sa11
  895. #define SA21 Sa31
  896. #define SA12 Sa12
  897. #define SA22 Sa32
  898. #define SA13 Sa13
  899. #define SA23 Sa33
  900. #define SU11 Su11
  901. #define SU12 Su13
  902. #define SU21 Su21
  903. #define SU22 Su23
  904. #define SU31 Su31
  905. #define SU32 Su33
  906. #define VAPIVOT Va11
  907. #define VANPIVOT Va31
  908. #define VA11 Va11
  909. #define VA21 Va31
  910. #define VA12 Va12
  911. #define VA22 Va32
  912. #define VA13 Va13
  913. #define VA23 Va33
  914. #define VU11 Vu11
  915. #define VU12 Vu13
  916. #define VU21 Vu21
  917. #define VU22 Vu23
  918. #define VU31 Vu31
  919. #define VU32 Vu33
  920. #include "Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp"
  921. #undef SAPIVOT
  922. #undef SANPIVOT
  923. #undef SA11
  924. #undef SA21
  925. #undef SA12
  926. #undef SA22
  927. #undef SA13
  928. #undef SA23
  929. #undef SU11
  930. #undef SU12
  931. #undef SU21
  932. #undef SU22
  933. #undef SU31
  934. #undef SU32
  935. #undef VAPIVOT
  936. #undef VANPIVOT
  937. #undef VA11
  938. #undef VA21
  939. #undef VA12
  940. #undef VA22
  941. #undef VA13
  942. #undef VA23
  943. #undef VU11
  944. #undef VU12
  945. #undef VU21
  946. #undef VU22
  947. #undef VU31
  948. #undef VU32
  949. // Update quaternion representation of U
  950. #ifdef COMPUTE_U_AS_QUATERNION
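// The second rotation acts in the (1,3) plane, i.e. about the y axis (with the
// opposite sign under this convention). Because q_U still has zero x and y
// components, composing it with (ch, 0, -sh, 0) reduces to the short update below.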
  951. ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=Ssh.f*Squvz.f;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_mul_ps(Vsh,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_mul_ps(Vsh,Vquvz);)
  952. ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Ssh.f*Squs.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vsh,Vqus);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vsh,Vqus);)
  953. ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=Squvy.f-Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_sub_ps(Vquvy,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_sub_ps(Vquvy,Vsh);)
  954. ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Sch.f*Squs.f;) ENABLE_SSE_IMPLEMENTATION(Vqus=_mm_mul_ps(Vch,Vqus);) ENABLE_AVX_IMPLEMENTATION(Vqus=_mm256_mul_ps(Vch,Vqus);)
  955. ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Sch.f*Squvz.f;) ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_mul_ps(Vch,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_mul_ps(Vch,Vquvz);)
  956. #endif
  957. // Third Givens rotation
  958. #define SAPIVOT Sa22
  959. #define SANPIVOT Sa32
  960. #define SA11 Sa21
  961. #define SA21 Sa31
  962. #define SA12 Sa22
  963. #define SA22 Sa32
  964. #define SA13 Sa23
  965. #define SA23 Sa33
  966. #define SU11 Su12
  967. #define SU12 Su13
  968. #define SU21 Su22
  969. #define SU22 Su23
  970. #define SU31 Su32
  971. #define SU32 Su33
  972. #define VAPIVOT Va22
  973. #define VANPIVOT Va32
  974. #define VA11 Va21
  975. #define VA21 Va31
  976. #define VA12 Va22
  977. #define VA22 Va32
  978. #define VA13 Va23
  979. #define VA23 Va33
  980. #define VU11 Vu12
  981. #define VU12 Vu13
  982. #define VU21 Vu22
  983. #define VU22 Vu23
  984. #define VU31 Vu32
  985. #define VU32 Vu33
  986. #include "Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp"
  987. #undef SAPIVOT
  988. #undef SANPIVOT
  989. #undef SA11
  990. #undef SA21
  991. #undef SA12
  992. #undef SA22
  993. #undef SA13
  994. #undef SA23
  995. #undef SU11
  996. #undef SU12
  997. #undef SU21
  998. #undef SU22
  999. #undef SU31
  1000. #undef SU32
  1001. #undef VAPIVOT
  1002. #undef VANPIVOT
  1003. #undef VA11
  1004. #undef VA21
  1005. #undef VA12
  1006. #undef VA22
  1007. #undef VA13
  1008. #undef VA23
  1009. #undef VU11
  1010. #undef VU12
  1011. #undef VU21
  1012. #undef VU22
  1013. #undef VU31
  1014. #undef VU32
  1015. // Update quaternion representation of U
  1016. #ifdef COMPUTE_U_AS_QUATERNION
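// The third rotation acts in the (2,3) plane, i.e. about the x axis. q_U is now
// fully general, so the lines below spell out the complete Hamilton product
// q_U <- q_U * (ch, sh, 0, 0).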
  1017. ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Squvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vquvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vquvx);)
  1018. ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ssh.f*Squvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vsh,Vquvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vsh,Vquvy);)
  1019. ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Ssh.f*Squvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vsh,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vsh,Vquvz);)
  1020. ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Ssh.f*Squs.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vsh,Vqus);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vsh,Vqus);)
  1021. ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Sch.f*Squs.f;) ENABLE_SSE_IMPLEMENTATION(Vqus=_mm_mul_ps(Vch,Vqus);) ENABLE_AVX_IMPLEMENTATION(Vqus=_mm256_mul_ps(Vch,Vqus);)
  1022. ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=Sch.f*Squvx.f;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_mul_ps(Vch,Vquvx);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_mul_ps(Vch,Vquvx);)
  1023. ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=Sch.f*Squvy.f;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_mul_ps(Vch,Vquvy);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_mul_ps(Vch,Vquvy);)
  1024. ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Sch.f*Squvz.f;) ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_mul_ps(Vch,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_mul_ps(Vch,Vquvz);)
  1025. ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=Squvx.f+Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_add_ps(Vquvx,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_add_ps(Vquvx,Vsh);)
  1026. ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Squs.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqus=_mm_sub_ps(Vqus,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqus=_mm256_sub_ps(Vqus,Vtmp1);)
  1027. ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=Squvy.f+Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_add_ps(Vquvy,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_add_ps(Vquvy,Vtmp3);)
  1028. ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Squvz.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_sub_ps(Vquvz,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_sub_ps(Vquvz,Vtmp2);)
  1029. #endif
  1030. #ifdef COMPUTE_U_AS_MATRIX
  1031. #ifdef PRINT_DEBUGGING_OUTPUT
  1032. #ifdef USE_SCALAR_IMPLEMENTATION
  1033. std::cout<<"Scalar U ="<<std::endl;
  1034. std::cout<<std::setw(12)<<Su11.f<<" "<<std::setw(12)<<Su12.f<<" "<<std::setw(12)<<Su13.f<<std::endl;
  1035. std::cout<<std::setw(12)<<Su21.f<<" "<<std::setw(12)<<Su22.f<<" "<<std::setw(12)<<Su23.f<<std::endl;
  1036. std::cout<<std::setw(12)<<Su31.f<<" "<<std::setw(12)<<Su32.f<<" "<<std::setw(12)<<Su33.f<<std::endl;
  1037. #endif
  1038. #ifdef USE_SSE_IMPLEMENTATION
  1039. _mm_storeu_ps(buf,Vu11);U11=buf[0];
  1040. _mm_storeu_ps(buf,Vu21);U21=buf[0];
  1041. _mm_storeu_ps(buf,Vu31);U31=buf[0];
  1042. _mm_storeu_ps(buf,Vu12);U12=buf[0];
  1043. _mm_storeu_ps(buf,Vu22);U22=buf[0];
  1044. _mm_storeu_ps(buf,Vu32);U32=buf[0];
  1045. _mm_storeu_ps(buf,Vu13);U13=buf[0];
  1046. _mm_storeu_ps(buf,Vu23);U23=buf[0];
  1047. _mm_storeu_ps(buf,Vu33);U33=buf[0];
  1048. std::cout<<"Vector U ="<<std::endl;
  1049. std::cout<<std::setw(12)<<U11<<" "<<std::setw(12)<<U12<<" "<<std::setw(12)<<U13<<std::endl;
  1050. std::cout<<std::setw(12)<<U21<<" "<<std::setw(12)<<U22<<" "<<std::setw(12)<<U23<<std::endl;
  1051. std::cout<<std::setw(12)<<U31<<" "<<std::setw(12)<<U32<<" "<<std::setw(12)<<U33<<std::endl;
  1052. #endif
  1053. #ifdef USE_AVX_IMPLEMENTATION
  1054. _mm256_storeu_ps(buf,Vu11);U11=buf[0];
  1055. _mm256_storeu_ps(buf,Vu21);U21=buf[0];
  1056. _mm256_storeu_ps(buf,Vu31);U31=buf[0];
  1057. _mm256_storeu_ps(buf,Vu12);U12=buf[0];
  1058. _mm256_storeu_ps(buf,Vu22);U22=buf[0];
  1059. _mm256_storeu_ps(buf,Vu32);U32=buf[0];
  1060. _mm256_storeu_ps(buf,Vu13);U13=buf[0];
  1061. _mm256_storeu_ps(buf,Vu23);U23=buf[0];
  1062. _mm256_storeu_ps(buf,Vu33);U33=buf[0];
  1063. std::cout<<"Vector U ="<<std::endl;
  1064. std::cout<<std::setw(12)<<U11<<" "<<std::setw(12)<<U12<<" "<<std::setw(12)<<U13<<std::endl;
  1065. std::cout<<std::setw(12)<<U21<<" "<<std::setw(12)<<U22<<" "<<std::setw(12)<<U23<<std::endl;
  1066. std::cout<<std::setw(12)<<U31<<" "<<std::setw(12)<<U32<<" "<<std::setw(12)<<U33<<std::endl;
  1067. #endif
  1068. #endif
  1069. #endif
  1070. #ifdef PRINT_DEBUGGING_OUTPUT
  1071. #ifdef USE_SCALAR_IMPLEMENTATION
  1072. std::cout<<"Scalar A (after multiplying with U-transpose and V) ="<<std::endl;
  1073. std::cout<<std::setw(12)<<Sa11.f<<" "<<std::setw(12)<<Sa12.f<<" "<<std::setw(12)<<Sa13.f<<std::endl;
  1074. std::cout<<std::setw(12)<<Sa21.f<<" "<<std::setw(12)<<Sa22.f<<" "<<std::setw(12)<<Sa23.f<<std::endl;
  1075. std::cout<<std::setw(12)<<Sa31.f<<" "<<std::setw(12)<<Sa32.f<<" "<<std::setw(12)<<Sa33.f<<std::endl;
  1076. #endif
  1077. #ifdef USE_SSE_IMPLEMENTATION
  1078. _mm_storeu_ps(buf,Va11);A11=buf[0];
  1079. _mm_storeu_ps(buf,Va21);A21=buf[0];
  1080. _mm_storeu_ps(buf,Va31);A31=buf[0];
  1081. _mm_storeu_ps(buf,Va12);A12=buf[0];
  1082. _mm_storeu_ps(buf,Va22);A22=buf[0];
  1083. _mm_storeu_ps(buf,Va32);A32=buf[0];
  1084. _mm_storeu_ps(buf,Va13);A13=buf[0];
  1085. _mm_storeu_ps(buf,Va23);A23=buf[0];
  1086. _mm_storeu_ps(buf,Va33);A33=buf[0];
  1087. std::cout<<"Vector A (after multiplying with U-transpose and V) ="<<std::endl;
  1088. std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl;
  1089. std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl;
  1090. std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl;
  1091. #endif
  1092. #ifdef USE_AVX_IMPLEMENTATION
  1093. _mm256_storeu_ps(buf,Va11);A11=buf[0];
  1094. _mm256_storeu_ps(buf,Va21);A21=buf[0];
  1095. _mm256_storeu_ps(buf,Va31);A31=buf[0];
  1096. _mm256_storeu_ps(buf,Va12);A12=buf[0];
  1097. _mm256_storeu_ps(buf,Va22);A22=buf[0];
  1098. _mm256_storeu_ps(buf,Va32);A32=buf[0];
  1099. _mm256_storeu_ps(buf,Va13);A13=buf[0];
  1100. _mm256_storeu_ps(buf,Va23);A23=buf[0];
  1101. _mm256_storeu_ps(buf,Va33);A33=buf[0];
  1102. std::cout<<"Vector A (after multiplying with U-transpose and V) ="<<std::endl;
  1103. std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl;
  1104. std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl;
  1105. std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl;
  1106. #endif
  1107. #endif
  1108. #ifdef COMPUTE_U_AS_QUATERNION
  1109. #ifdef PRINT_DEBUGGING_OUTPUT
  1110. #ifdef USE_SCALAR_IMPLEMENTATION
  1111. std::cout<<"Scalar qU ="<<std::endl;
  1112. std::cout<<std::setw(12)<<Squs.f<<" "<<std::setw(12)<<Squvx.f<<" "<<std::setw(12)<<Squvy.f<<" "<<std::setw(12)<<Squvz.f<<std::endl;
  1113. #endif
  1114. #ifdef USE_SSE_IMPLEMENTATION
  1115. _mm_storeu_ps(buf,Vqus);QUS=buf[0];
  1116. _mm_storeu_ps(buf,Vquvx);QUVX=buf[0];
  1117. _mm_storeu_ps(buf,Vquvy);QUVY=buf[0];
  1118. _mm_storeu_ps(buf,Vquvz);QUVZ=buf[0];
  1119. std::cout<<"Vector qU ="<<std::endl;
  1120. std::cout<<std::setw(12)<<QUS<<" "<<std::setw(12)<<QUVX<<" "<<std::setw(12)<<QUVY<<" "<<std::setw(12)<<QUVZ<<std::endl;
  1121. #endif
  1122. #ifdef USE_AVX_IMPLEMENTATION
  1123. _mm256_storeu_ps(buf,Vqus);QUS=buf[0];
  1124. _mm256_storeu_ps(buf,Vquvx);QUVX=buf[0];
  1125. _mm256_storeu_ps(buf,Vquvy);QUVY=buf[0];
  1126. _mm256_storeu_ps(buf,Vquvz);QUVZ=buf[0];
  1127. std::cout<<"Vector qU ="<<std::endl;
  1128. std::cout<<std::setw(12)<<QUS<<" "<<std::setw(12)<<QUVX<<" "<<std::setw(12)<<QUVY<<" "<<std::setw(12)<<QUVZ<<std::endl;
  1129. #endif
  1130. #endif
  1131. #endif
  1132. #pragma clang diagnostic pop
  1133. #ifdef __INTEL_COMPILER
  1134. #pragma warning( default : 592 )
  1135. #endif