btVector3.cpp

  1. /*
  2. Copyright (c) 2011 Apple Inc.
  3. http://continuousphysics.com/Bullet/
  4. This software is provided 'as-is', without any express or implied warranty.
  5. In no event will the authors be held liable for any damages arising from the use of this software.
  6. Permission is granted to anyone to use this software for any purpose,
  7. including commercial applications, and to alter it and redistribute it freely,
  8. subject to the following restrictions:
  9. 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  10. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  11. 3. This notice may not be removed or altered from any source distribution.
  12. This source version has been altered.
  13. */
  14. // Modified by Yao Wei Tjong for Urho3D
  15. #if defined (_WIN32) || defined (__i386__)
  16. #define BT_USE_SSE_IN_API
  17. #endif
  18. #include "btVector3.h"
  19. #if defined BT_USE_SIMD_VECTOR3
  20. #if DEBUG
  21. #include <string.h>//for memset
  22. #endif
  23. #ifdef __APPLE__
  24. #include <stdint.h>
  25. typedef float float4 __attribute__ ((vector_size(16)));
  26. #else
  27. #define float4 __m128
  28. #endif
  29. //typedef uint32_t uint4 __attribute__ ((vector_size(16)));
  30. #if defined BT_USE_SSE //|| defined _WIN32 // Urho3D - just use BT_USE_SSE as the main switch; Urho3D allows SSE to be disabled
  31. #define LOG2_ARRAY_SIZE 6
  32. #define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
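// STACK_ARRAY_COUNT is 1 << 6 = 64 float4 slots (1 KB on the stack); each slot buffers four dot products,
// so the tiled main loops below consume STACK_ARRAY_COUNT*4 = 256 vertices per tile before rescanning.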
  33. #include <emmintrin.h>
  34. long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
  35. long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
  36. {
  37. const float4 *vertices = (const float4*) vv;
  38. static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
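// indexTable maps a non-zero _mm_movemask_ps() result to the position of its lowest set bit, i.e. the first
// SSE lane whose dot product equals the running maximum; entry 0 is never used because the rescan loops
// only stop once at least one lane has matched.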
  39. float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY );
  40. float4 vvec = _mm_loadu_ps( vec );
  41. float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz
  42. float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy
  43. long maxIndex = -1L;
  44. size_t segment = 0;
  45. float4 stack_array[ STACK_ARRAY_COUNT ];
  46. #if DEBUG
  47. //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
  48. #endif
  49. size_t index;
  50. float4 max;
  51. // Faster loop without cleanup code for full tiles
  52. for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
  53. {
  54. max = dotMax;
  55. for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
  56. { // do four dot products at a time. Carefully avoid touching the w element.
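// The movelh/movehl pair splits each xyzw vertex into its xy and z? halves; the xy halves are multiplied by
// vLo = (vx,vy,vx,vy), then the 0x88/0xdd shuffles gather the x*vx terms, the y*vy terms and the raw z
// components, so x + y + z*vHi produces four dot products in a single float4 while w is discarded.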
  57. float4 v0 = vertices[0];
  58. float4 v1 = vertices[1];
  59. float4 v2 = vertices[2];
  60. float4 v3 = vertices[3]; vertices += 4;
  61. float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  62. float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  63. float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  64. float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  65. lo0 = lo0*vLo;
  66. lo1 = lo1*vLo;
  67. float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
  68. float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
  69. float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  70. z = z*vHi;
  71. x = x+y;
  72. x = x+z;
  73. stack_array[index] = x;
  74. max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
  75. v0 = vertices[0];
  76. v1 = vertices[1];
  77. v2 = vertices[2];
  78. v3 = vertices[3]; vertices += 4;
  79. lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  80. hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  81. lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  82. hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  83. lo0 = lo0*vLo;
  84. lo1 = lo1*vLo;
  85. z = _mm_shuffle_ps(hi0, hi1, 0x88);
  86. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  87. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  88. z = z*vHi;
  89. x = x+y;
  90. x = x+z;
  91. stack_array[index+1] = x;
  92. max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
  93. v0 = vertices[0];
  94. v1 = vertices[1];
  95. v2 = vertices[2];
  96. v3 = vertices[3]; vertices += 4;
  97. lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  98. hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  99. lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  100. hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  101. lo0 = lo0*vLo;
  102. lo1 = lo1*vLo;
  103. z = _mm_shuffle_ps(hi0, hi1, 0x88);
  104. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  105. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  106. z = z*vHi;
  107. x = x+y;
  108. x = x+z;
  109. stack_array[index+2] = x;
  110. max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
  111. v0 = vertices[0];
  112. v1 = vertices[1];
  113. v2 = vertices[2];
  114. v3 = vertices[3]; vertices += 4;
  115. lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  116. hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  117. lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  118. hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  119. lo0 = lo0*vLo;
  120. lo1 = lo1*vLo;
  121. z = _mm_shuffle_ps(hi0, hi1, 0x88);
  122. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  123. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  124. z = z*vHi;
  125. x = x+y;
  126. x = x+z;
  127. stack_array[index+3] = x;
  128. max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
  129. // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
  130. }
  131. // If we found a new max
  132. if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
  133. {
  134. // copy the new max across all lanes of our max accumulator
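// Shuffle 0x4e swaps the two 64-bit halves and 0xb1 swaps adjacent lanes, so two shuffle/max pairs
// broadcast the horizontal maximum of the four lanes into every lane.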
  135. max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
  136. max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
  137. dotMax = max;
  138. // find first occurrence of that max
  139. size_t test;
  140. for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4
  141. {}
  142. // record where it is.
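// maxIndex = 4*index (four dot products per stack_array slot) + segment (vertices consumed by earlier
// tiles) + indexTable[test] (the matching lane within the slot).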
  143. maxIndex = 4*index + segment + indexTable[test];
  144. }
  145. }
  146. // account for work we've already done
  147. count -= segment;
  148. // Deal with the last < STACK_ARRAY_COUNT vectors
  149. max = dotMax;
  150. index = 0;
  151. if( btUnlikely( count > 16) )
  152. {
  153. for( ; index + 4 <= count / 4; index+=4 )
  154. { // do four dot products at a time. Carefully avoid touching the w element.
  155. float4 v0 = vertices[0];
  156. float4 v1 = vertices[1];
  157. float4 v2 = vertices[2];
  158. float4 v3 = vertices[3]; vertices += 4;
  159. float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  160. float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  161. float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  162. float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  163. lo0 = lo0*vLo;
  164. lo1 = lo1*vLo;
  165. float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
  166. float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
  167. float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  168. z = z*vHi;
  169. x = x+y;
  170. x = x+z;
  171. stack_array[index] = x;
  172. max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
  173. v0 = vertices[0];
  174. v1 = vertices[1];
  175. v2 = vertices[2];
  176. v3 = vertices[3]; vertices += 4;
  177. lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  178. hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  179. lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  180. hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  181. lo0 = lo0*vLo;
  182. lo1 = lo1*vLo;
  183. z = _mm_shuffle_ps(hi0, hi1, 0x88);
  184. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  185. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  186. z = z*vHi;
  187. x = x+y;
  188. x = x+z;
  189. stack_array[index+1] = x;
  190. max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
  191. v0 = vertices[0];
  192. v1 = vertices[1];
  193. v2 = vertices[2];
  194. v3 = vertices[3]; vertices += 4;
  195. lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  196. hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  197. lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  198. hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  199. lo0 = lo0*vLo;
  200. lo1 = lo1*vLo;
  201. z = _mm_shuffle_ps(hi0, hi1, 0x88);
  202. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  203. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  204. z = z*vHi;
  205. x = x+y;
  206. x = x+z;
  207. stack_array[index+2] = x;
  208. max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
  209. v0 = vertices[0];
  210. v1 = vertices[1];
  211. v2 = vertices[2];
  212. v3 = vertices[3]; vertices += 4;
  213. lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  214. hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  215. lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  216. hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  217. lo0 = lo0*vLo;
  218. lo1 = lo1*vLo;
  219. z = _mm_shuffle_ps(hi0, hi1, 0x88);
  220. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  221. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  222. z = z*vHi;
  223. x = x+y;
  224. x = x+z;
  225. stack_array[index+3] = x;
  226. max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
  227. // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
  228. }
  229. }
  230. size_t localCount = (count & -4L) - 4*index;
  231. if( localCount )
  232. {
  233. #ifdef __APPLE__
  234. float4 t0, t1, t2, t3, t4;
  235. float4 * sap = &stack_array[index + localCount / 4];
  236. vertices += localCount; // counter the offset
  237. size_t byteIndex = -(localCount) * sizeof(float);
  238. //AT&T Code style assembly
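// byteIndex counts up from a negative offset to zero so the add $16 / jnz pair closes the loop with no
// separate compare. Each iteration stores one float4 of results (16 bytes, byteIndex used directly for the
// sap store) while reading four vertices (64 bytes, hence the ,4 scale on the vertices addressing);
// vertices and sap were pre-advanced above so the first iteration lands on the original positions.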
  239. asm volatile
  240. ( ".align 4 \n\
  241. 0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
  242. movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
  243. movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
  244. movaps %[t0], %[max] // vertices[0] \n\
  245. movlhps %[t1], %[max] // x0y0x1y1 \n\
  246. movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
  247. movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
  248. mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
  249. movhlps %[t0], %[t1] // z0w0z1w1 \n\
  250. movaps %[t3], %[t0] // vertices[2] \n\
  251. movlhps %[t4], %[t0] // x2y2x3y3 \n\
  252. mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
  253. movhlps %[t3], %[t4] // z2w2z3w3 \n\
  254. shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
  255. mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
  256. movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
  257. shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
  258. shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
  259. addps %[t3], %[max] // x + y \n\
  260. addps %[t1], %[max] // x + y + z \n\
  261. movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
  262. maxps %[t2], %[max] // record max, restore max \n\
  263. add $16, %[byteIndex] // advance loop counter\n\
  264. jnz 0b \n\
  265. "
  266. : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
  267. : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
  268. : "memory", "cc"
  269. );
  270. index += localCount/4;
  271. #else
  272. {
  273. for( unsigned int i=0; i<localCount/4; i++,index++)
  274. { // do four dot products at a time. Carefully avoid touching the w element.
  275. float4 v0 = vertices[0];
  276. float4 v1 = vertices[1];
  277. float4 v2 = vertices[2];
  278. float4 v3 = vertices[3];
  279. vertices += 4;
  280. float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  281. float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  282. float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  283. float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  284. lo0 = lo0*vLo;
  285. lo1 = lo1*vLo;
  286. float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
  287. float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
  288. float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  289. z = z*vHi;
  290. x = x+y;
  291. x = x+z;
  292. stack_array[index] = x;
  293. max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
  294. }
  295. }
  296. #endif //__APPLE__
  297. }
  298. // process the last few points
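// The remaining 1-3 vertices are padded out to a full 4-wide computation (the last vertex's dot product is
// duplicated into the spare lanes), so the final max/rescan logic below stays uniform.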
  299. if( count & 3 )
  300. {
  301. float4 v0, v1, v2, x, y, z;
  302. switch( count & 3 )
  303. {
  304. case 3:
  305. {
  306. v0 = vertices[0];
  307. v1 = vertices[1];
  308. v2 = vertices[2];
  309. // Calculate 3 dot products, transpose, duplicate v2
  310. float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
  311. float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo
  312. lo0 = lo0*vLo;
  313. z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2
  314. z = z*vHi;
  315. float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
  316. lo1 = lo1*vLo;
  317. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  318. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  319. }
  320. break;
  321. case 2:
  322. {
  323. v0 = vertices[0];
  324. v1 = vertices[1];
  325. float4 xy = _mm_movelh_ps(v0, v1);
  326. z = _mm_movehl_ps(v1, v0);
  327. xy = xy*vLo;
  328. z = _mm_shuffle_ps( z, z, 0xa8);
  329. x = _mm_shuffle_ps( xy, xy, 0xa8);
  330. y = _mm_shuffle_ps( xy, xy, 0xfd);
  331. z = z*vHi;
  332. }
  333. break;
  334. case 1:
  335. {
  336. float4 xy = vertices[0];
  337. z = _mm_shuffle_ps( xy, xy, 0xaa);
  338. xy = xy*vLo;
  339. z = z*vHi;
  340. x = _mm_shuffle_ps(xy, xy, 0);
  341. y = _mm_shuffle_ps(xy, xy, 0x55);
  342. }
  343. break;
  344. }
  345. x = x+y;
  346. x = x+z;
  347. stack_array[index] = x;
  348. max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
  349. index++;
  350. }
  351. // if we found a new max.
  352. if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
  353. { // we found a new max. Search for it
  354. // find max across the max vector, place in all elements of max -- big latency hit here
  355. max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
  356. max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
  357. // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
  358. // this where it actually makes a difference is handled in the early out at the top of the function,
  359. // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
  360. // complexity, and removed it.
  361. dotMax = max;
  362. // scan for the first occurrence of max in the array
  363. size_t test;
  364. for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4
  365. {}
  366. maxIndex = 4*index + segment + indexTable[test];
  367. }
  368. _mm_store_ss( dotResult, dotMax);
  369. return maxIndex;
  370. }
  371. long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
  372. long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
  373. {
  374. const float4 *vertices = (const float4*) vv;
  375. static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
  376. float4 dotmin = btAssign128( BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY );
  377. float4 vvec = _mm_loadu_ps( vec );
  378. float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz
  379. float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy
  380. long minIndex = -1L;
  381. size_t segment = 0;
  382. float4 stack_array[ STACK_ARRAY_COUNT ];
  383. #if DEBUG
  384. //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
  385. #endif
  386. size_t index;
  387. float4 min;
  388. // Faster loop without cleanup code for full tiles
  389. for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
  390. {
  391. min = dotmin;
  392. for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
  393. { // do four dot products at a time. Carefully avoid touching the w element.
  394. float4 v0 = vertices[0];
  395. float4 v1 = vertices[1];
  396. float4 v2 = vertices[2];
  397. float4 v3 = vertices[3]; vertices += 4;
  398. float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  399. float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  400. float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  401. float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  402. lo0 = lo0*vLo;
  403. lo1 = lo1*vLo;
  404. float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
  405. float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
  406. float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  407. z = z*vHi;
  408. x = x+y;
  409. x = x+z;
  410. stack_array[index] = x;
  411. min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
  412. v0 = vertices[0];
  413. v1 = vertices[1];
  414. v2 = vertices[2];
  415. v3 = vertices[3]; vertices += 4;
  416. lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  417. hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  418. lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  419. hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  420. lo0 = lo0*vLo;
  421. lo1 = lo1*vLo;
  422. z = _mm_shuffle_ps(hi0, hi1, 0x88);
  423. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  424. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  425. z = z*vHi;
  426. x = x+y;
  427. x = x+z;
  428. stack_array[index+1] = x;
  429. min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
  430. v0 = vertices[0];
  431. v1 = vertices[1];
  432. v2 = vertices[2];
  433. v3 = vertices[3]; vertices += 4;
  434. lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  435. hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  436. lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  437. hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  438. lo0 = lo0*vLo;
  439. lo1 = lo1*vLo;
  440. z = _mm_shuffle_ps(hi0, hi1, 0x88);
  441. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  442. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  443. z = z*vHi;
  444. x = x+y;
  445. x = x+z;
  446. stack_array[index+2] = x;
  447. min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
  448. v0 = vertices[0];
  449. v1 = vertices[1];
  450. v2 = vertices[2];
  451. v3 = vertices[3]; vertices += 4;
  452. lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  453. hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  454. lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  455. hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  456. lo0 = lo0*vLo;
  457. lo1 = lo1*vLo;
  458. z = _mm_shuffle_ps(hi0, hi1, 0x88);
  459. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  460. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  461. z = z*vHi;
  462. x = x+y;
  463. x = x+z;
  464. stack_array[index+3] = x;
  465. min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
  466. // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
  467. }
  468. // If we found a new min
  469. if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
  470. {
  471. // copy the new min across all lanes of our min accumulator
  472. min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
  473. min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
  474. dotmin = min;
  475. // find first occurrence of that min
  476. size_t test;
  477. for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
  478. {}
  479. // record where it is.
  480. minIndex = 4*index + segment + indexTable[test];
  481. }
  482. }
  483. // account for work we've already done
  484. count -= segment;
  485. // Deal with the last < STACK_ARRAY_COUNT vectors
  486. min = dotmin;
  487. index = 0;
  488. if(btUnlikely( count > 16) )
  489. {
  490. for( ; index + 4 <= count / 4; index+=4 )
  491. { // do four dot products at a time. Carefully avoid touching the w element.
  492. float4 v0 = vertices[0];
  493. float4 v1 = vertices[1];
  494. float4 v2 = vertices[2];
  495. float4 v3 = vertices[3]; vertices += 4;
  496. float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  497. float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  498. float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  499. float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  500. lo0 = lo0*vLo;
  501. lo1 = lo1*vLo;
  502. float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
  503. float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
  504. float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  505. z = z*vHi;
  506. x = x+y;
  507. x = x+z;
  508. stack_array[index] = x;
  509. min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
  510. v0 = vertices[0];
  511. v1 = vertices[1];
  512. v2 = vertices[2];
  513. v3 = vertices[3]; vertices += 4;
  514. lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  515. hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  516. lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  517. hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  518. lo0 = lo0*vLo;
  519. lo1 = lo1*vLo;
  520. z = _mm_shuffle_ps(hi0, hi1, 0x88);
  521. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  522. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  523. z = z*vHi;
  524. x = x+y;
  525. x = x+z;
  526. stack_array[index+1] = x;
  527. min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
  528. v0 = vertices[0];
  529. v1 = vertices[1];
  530. v2 = vertices[2];
  531. v3 = vertices[3]; vertices += 4;
  532. lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  533. hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  534. lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  535. hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  536. lo0 = lo0*vLo;
  537. lo1 = lo1*vLo;
  538. z = _mm_shuffle_ps(hi0, hi1, 0x88);
  539. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  540. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  541. z = z*vHi;
  542. x = x+y;
  543. x = x+z;
  544. stack_array[index+2] = x;
  545. min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
  546. v0 = vertices[0];
  547. v1 = vertices[1];
  548. v2 = vertices[2];
  549. v3 = vertices[3]; vertices += 4;
  550. lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  551. hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  552. lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  553. hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  554. lo0 = lo0*vLo;
  555. lo1 = lo1*vLo;
  556. z = _mm_shuffle_ps(hi0, hi1, 0x88);
  557. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  558. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  559. z = z*vHi;
  560. x = x+y;
  561. x = x+z;
  562. stack_array[index+3] = x;
  563. min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
  564. // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
  565. }
  566. }
  567. size_t localCount = (count & -4L) - 4*index;
  568. if( localCount )
  569. {
  570. #ifdef __APPLE__
  571. vertices += localCount; // counter the offset
  572. float4 t0, t1, t2, t3, t4;
  573. size_t byteIndex = -(localCount) * sizeof(float);
  574. float4 * sap = &stack_array[index + localCount / 4];
  575. asm volatile
  576. ( ".align 4 \n\
  577. 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
  578. movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
  579. movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
  580. movaps %[t0], %[min] // vertices[0] \n\
  581. movlhps %[t1], %[min] // x0y0x1y1 \n\
  582. movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
  583. movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
  584. mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
  585. movhlps %[t0], %[t1] // z0w0z1w1 \n\
  586. movaps %[t3], %[t0] // vertices[2] \n\
  587. movlhps %[t4], %[t0] // x2y2x3y3 \n\
  588. movhlps %[t3], %[t4] // z2w2z3w3 \n\
  589. mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
  590. shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
  591. mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
  592. movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
  593. shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
  594. shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
  595. addps %[t3], %[min] // x + y \n\
  596. addps %[t1], %[min] // x + y + z \n\
  597. movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
  598. minps %[t2], %[min] // record min, restore min \n\
  599. add $16, %[byteIndex] // advance loop counter\n\
  600. jnz 0b \n\
  601. "
  602. : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
  603. : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
  604. : "memory", "cc"
  605. );
  606. index += localCount/4;
  607. #else
  608. {
  609. for( unsigned int i=0; i<localCount/4; i++,index++)
  610. { // do four dot products at a time. Carefully avoid touching the w element.
  611. float4 v0 = vertices[0];
  612. float4 v1 = vertices[1];
  613. float4 v2 = vertices[2];
  614. float4 v3 = vertices[3];
  615. vertices += 4;
  616. float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
  617. float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
  618. float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
  619. float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
  620. lo0 = lo0*vLo;
  621. lo1 = lo1*vLo;
  622. float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
  623. float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
  624. float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  625. z = z*vHi;
  626. x = x+y;
  627. x = x+z;
  628. stack_array[index] = x;
  629. min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
  630. }
  631. }
  632. #endif
  633. }
  634. // process the last few points
  635. if( count & 3 )
  636. {
  637. float4 v0, v1, v2, x, y, z;
  638. switch( count & 3 )
  639. {
  640. case 3:
  641. {
  642. v0 = vertices[0];
  643. v1 = vertices[1];
  644. v2 = vertices[2];
  645. // Calculate 3 dot products, transpose, duplicate v2
  646. float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
  647. float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo
  648. lo0 = lo0*vLo;
  649. z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2
  650. z = z*vHi;
  651. float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
  652. lo1 = lo1*vLo;
  653. x = _mm_shuffle_ps(lo0, lo1, 0x88);
  654. y = _mm_shuffle_ps(lo0, lo1, 0xdd);
  655. }
  656. break;
  657. case 2:
  658. {
  659. v0 = vertices[0];
  660. v1 = vertices[1];
  661. float4 xy = _mm_movelh_ps(v0, v1);
  662. z = _mm_movehl_ps(v1, v0);
  663. xy = xy*vLo;
  664. z = _mm_shuffle_ps( z, z, 0xa8);
  665. x = _mm_shuffle_ps( xy, xy, 0xa8);
  666. y = _mm_shuffle_ps( xy, xy, 0xfd);
  667. z = z*vHi;
  668. }
  669. break;
  670. case 1:
  671. {
  672. float4 xy = vertices[0];
  673. z = _mm_shuffle_ps( xy, xy, 0xaa);
  674. xy = xy*vLo;
  675. z = z*vHi;
  676. x = _mm_shuffle_ps(xy, xy, 0);
  677. y = _mm_shuffle_ps(xy, xy, 0x55);
  678. }
  679. break;
  680. }
  681. x = x+y;
  682. x = x+z;
  683. stack_array[index] = x;
  684. min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
  685. index++;
  686. }
  687. // if we found a new min.
  688. if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
  689. { // we found a new min. Search for it
  690. // find min across the min vector, place in all elements of min -- big latency hit here
  691. min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
  692. min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
  693. // It is slightly faster to do this part in scalar code when count < 8. However, the common case for
  694. // this where it actually makes a difference is handled in the early out at the top of the function,
  695. // so it is less than a 1% difference here. I opted for improved code size, fewer branches and reduced
  696. // complexity, and removed it.
  697. dotmin = min;
  698. // scan for the first occurrence of min in the array
  699. size_t test;
  700. for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
  701. {}
  702. minIndex = 4*index + segment + indexTable[test];
  703. }
  704. _mm_store_ss( dotResult, dotmin);
  705. return minIndex;
  706. }
  707. #elif defined BT_USE_NEON
  708. #define ARM_NEON_GCC_COMPATIBILITY 1
  709. #include <arm_neon.h>
  710. #include <sys/types.h>
  711. // Urho3D - enable NEON on generic ARM
  712. #ifdef __APPLE__
  713. #include <sys/sysctl.h> //for sysctlbyname
  714. #endif //__APPLE__
  715. static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
  716. static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
  717. static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
  718. static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
  719. static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
  720. static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
  721. long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
  722. long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
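// _maxdot_large/_mindot_large initially point at the *_sel stubs; on first use the stub probes the CPU once,
// rewrites the function pointer to the v0 or v1 implementation, and forwards the call, so subsequent calls
// dispatch directly to the chosen version.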
  723. static inline uint32_t btGetCpuCapabilities( void )
  724. {
  725. static uint32_t capabilities = 0;
  726. static bool testedCapabilities = false;
  727. if( 0 == testedCapabilities)
  728. {
  729. // Urho3D - enable NEON on generic ARM
  730. #ifdef __APPLE__
  731. uint32_t hasFeature = 0;
  732. size_t featureSize = sizeof( hasFeature );
  733. int err = sysctlbyname( "hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0 );
  734. if( 0 == err && hasFeature)
  735. capabilities |= 0x2000;
  736. #endif //__APPLE__
  737. testedCapabilities = true;
  738. }
  739. return capabilities;
  740. }
  741. static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
  742. {
  743. if( btGetCpuCapabilities() & 0x2000 )
  744. _maxdot_large = _maxdot_large_v1;
  745. else
  746. _maxdot_large = _maxdot_large_v0;
  747. return _maxdot_large(vv, vec, count, dotResult);
  748. }
  749. static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
  750. {
  751. if( btGetCpuCapabilities() & 0x2000 )
  752. _mindot_large = _mindot_large_v1;
  753. else
  754. _mindot_large = _mindot_large_v0;
  755. return _mindot_large(vv, vec, count, dotResult);
  756. }
  757. // Urho3D - enable NEON on generic ARM
  758. #if defined __arm__ && __APPLE__
  759. # define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
  760. #else
  761. //support 64bit (and generic) arm
  762. # define vld1q_f32_aligned_postincrement( _ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
  763. #endif
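// On 32-bit Apple ARM the macro issues an inline-asm vld1.f32 with a :128 alignment hint and post-increment
// write-back; the generic path simply loads a float32x4_t and advances the pointer by 16 bytes.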
  764. long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
  765. {
  766. unsigned long i = 0;
  767. float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
  768. float32x2_t vLo = vget_low_f32(vvec);
  769. float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
  770. float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
  771. float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
  772. uint32x2_t indexLo = (uint32x2_t) {0, 1};
  773. uint32x2_t indexHi = (uint32x2_t) {2, 3};
  774. uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
  775. uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
  776. const uint32x2_t four = (uint32x2_t) {4,4};
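// v0 keeps four interleaved running maxima (vertex indices 0-3 modulo 4) in two 2-wide registers, together
// with the index that produced each one; vbsl on a vcgt mask updates value and index lanes in lock-step,
// and the four partial winners are merged into a single result after the loops.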
  777. for( ; i+8 <= count; i+= 8 )
  778. {
  779. float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
  780. float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
  781. float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
  782. float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
  783. float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
  784. float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
  785. float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
  786. float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
  787. float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
  788. float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
  789. float32x2_t zLo = vmul_f32( z0.val[0], vHi);
  790. float32x2_t zHi = vmul_f32( z1.val[0], vHi);
  791. float32x2_t rLo = vpadd_f32( xy0, xy1);
  792. float32x2_t rHi = vpadd_f32( xy2, xy3);
  793. rLo = vadd_f32(rLo, zLo);
  794. rHi = vadd_f32(rHi, zHi);
  795. uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
  796. uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
  797. dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
  798. dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
  799. iLo = vbsl_u32(maskLo, indexLo, iLo);
  800. iHi = vbsl_u32(maskHi, indexHi, iHi);
  801. indexLo = vadd_u32(indexLo, four);
  802. indexHi = vadd_u32(indexHi, four);
  803. v0 = vld1q_f32_aligned_postincrement( vv );
  804. v1 = vld1q_f32_aligned_postincrement( vv );
  805. v2 = vld1q_f32_aligned_postincrement( vv );
  806. v3 = vld1q_f32_aligned_postincrement( vv );
  807. xy0 = vmul_f32( vget_low_f32(v0), vLo);
  808. xy1 = vmul_f32( vget_low_f32(v1), vLo);
  809. xy2 = vmul_f32( vget_low_f32(v2), vLo);
  810. xy3 = vmul_f32( vget_low_f32(v3), vLo);
  811. z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
  812. z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
  813. zLo = vmul_f32( z0.val[0], vHi);
  814. zHi = vmul_f32( z1.val[0], vHi);
  815. rLo = vpadd_f32( xy0, xy1);
  816. rHi = vpadd_f32( xy2, xy3);
  817. rLo = vadd_f32(rLo, zLo);
  818. rHi = vadd_f32(rHi, zHi);
  819. maskLo = vcgt_f32( rLo, dotMaxLo );
  820. maskHi = vcgt_f32( rHi, dotMaxHi );
  821. dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
  822. dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
  823. iLo = vbsl_u32(maskLo, indexLo, iLo);
  824. iHi = vbsl_u32(maskHi, indexHi, iHi);
  825. indexLo = vadd_u32(indexLo, four);
  826. indexHi = vadd_u32(indexHi, four);
  827. }
  828. for( ; i+4 <= count; i+= 4 )
  829. {
  830. float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
  831. float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
  832. float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
  833. float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
  834. float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
  835. float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
  836. float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
  837. float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
  838. float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
  839. float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
  840. float32x2_t zLo = vmul_f32( z0.val[0], vHi);
  841. float32x2_t zHi = vmul_f32( z1.val[0], vHi);
  842. float32x2_t rLo = vpadd_f32( xy0, xy1);
  843. float32x2_t rHi = vpadd_f32( xy2, xy3);
  844. rLo = vadd_f32(rLo, zLo);
  845. rHi = vadd_f32(rHi, zHi);
  846. uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
  847. uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
  848. dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
  849. dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
  850. iLo = vbsl_u32(maskLo, indexLo, iLo);
  851. iHi = vbsl_u32(maskHi, indexHi, iHi);
  852. indexLo = vadd_u32(indexLo, four);
  853. indexHi = vadd_u32(indexHi, four);
  854. }
  855. switch( count & 3 )
  856. {
  857. case 3:
  858. {
  859. float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
  860. float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
  861. float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
  862. float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
  863. float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
  864. float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
  865. float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
  866. float32x2_t zLo = vmul_f32( z0.val[0], vHi);
  867. float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
  868. float32x2_t rLo = vpadd_f32( xy0, xy1);
  869. float32x2_t rHi = vpadd_f32( xy2, xy2);
  870. rLo = vadd_f32(rLo, zLo);
  871. rHi = vadd_f32(rHi, zHi);
  872. uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
  873. uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
  874. dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
  875. dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
  876. iLo = vbsl_u32(maskLo, indexLo, iLo);
  877. iHi = vbsl_u32(maskHi, indexHi, iHi);
  878. }
  879. break;
  880. case 2:
  881. {
  882. float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
  883. float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
  884. float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
  885. float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
  886. float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
  887. float32x2_t zLo = vmul_f32( z0.val[0], vHi);
  888. float32x2_t rLo = vpadd_f32( xy0, xy1);
  889. rLo = vadd_f32(rLo, zLo);
  890. uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
  891. dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
  892. iLo = vbsl_u32(maskLo, indexLo, iLo);
  893. }
  894. break;
  895. case 1:
  896. {
  897. float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
  898. float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
  899. float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
  900. float32x2_t zLo = vmul_f32( z0, vHi);
  901. float32x2_t rLo = vpadd_f32( xy0, xy0);
  902. rLo = vadd_f32(rLo, zLo);
  903. uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
  904. dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
  905. iLo = vbsl_u32(maskLo, indexLo, iLo);
  906. }
  907. break;
  908. default:
  909. break;
  910. }
  911. // select best answer between hi and lo results
  912. uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
  913. dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
  914. iLo = vbsl_u32(mask, iHi, iLo);
  915. // select best answer between even and odd results
  916. dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
  917. iHi = vdup_lane_u32(iLo, 1);
  918. mask = vcgt_f32( dotMaxHi, dotMaxLo );
  919. dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
  920. iLo = vbsl_u32(mask, iHi, iLo);
  921. *dotResult = vget_lane_f32( dotMaxLo, 0);
  922. return vget_lane_u32(iLo, 0);
  923. }
  924. long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
  925. {
  926. float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
  927. float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
  928. float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
  929. const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
  930. uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
  931. uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
  932. float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
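// v1 keeps a single 4-wide running maximum alongside a 4-wide index register: local_index holds the indices
// of the current block of four vertices and advances by four per block, while vbslq keeps the value and
// index lanes in sync.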
  933. unsigned long i = 0;
  934. for( ; i + 8 <= count; i += 8 )
  935. {
  936. float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
  937. float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
  938. float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
  939. float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
  940. // the next two lines should resolve to a single vswp d, d
  941. float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
  942. float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
  943. // the next two lines should resolve to a single vswp d, d
  944. float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
  945. float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
  946. xy0 = vmulq_f32(xy0, vLo);
  947. xy1 = vmulq_f32(xy1, vLo);
  948. float32x4x2_t zb = vuzpq_f32( z0, z1);
  949. float32x4_t z = vmulq_f32( zb.val[0], vHi);
  950. float32x4x2_t xy = vuzpq_f32( xy0, xy1);
  951. float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
  952. x = vaddq_f32(x, z);
  953. uint32x4_t mask = vcgtq_f32(x, maxDot);
  954. maxDot = vbslq_f32( mask, x, maxDot);
  955. index = vbslq_u32(mask, local_index, index);
  956. local_index = vaddq_u32(local_index, four);
  957. v0 = vld1q_f32_aligned_postincrement( vv );
  958. v1 = vld1q_f32_aligned_postincrement( vv );
  959. v2 = vld1q_f32_aligned_postincrement( vv );
  960. v3 = vld1q_f32_aligned_postincrement( vv );
  961. // the next two lines should resolve to a single vswp d, d
  962. xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
  963. xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
  964. // the next two lines should resolve to a single vswp d, d
  965. z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
  966. z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
  967. xy0 = vmulq_f32(xy0, vLo);
  968. xy1 = vmulq_f32(xy1, vLo);
  969. zb = vuzpq_f32( z0, z1);
  970. z = vmulq_f32( zb.val[0], vHi);
  971. xy = vuzpq_f32( xy0, xy1);
  972. x = vaddq_f32(xy.val[0], xy.val[1]);
  973. x = vaddq_f32(x, z);
  974. mask = vcgtq_f32(x, maxDot);
  975. maxDot = vbslq_f32( mask, x, maxDot);
  976. index = vbslq_u32(mask, local_index, index);
  977. local_index = vaddq_u32(local_index, four);
  978. }
  979. for( ; i + 4 <= count; i += 4 )
  980. {
  981. float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
  982. float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
  983. float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
  984. float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
  985. // the next two lines should resolve to a single vswp d, d
  986. float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
  987. float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
  988. // the next two lines should resolve to a single vswp d, d
  989. float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
  990. float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
  991. xy0 = vmulq_f32(xy0, vLo);
  992. xy1 = vmulq_f32(xy1, vLo);
  993. float32x4x2_t zb = vuzpq_f32( z0, z1);
  994. float32x4_t z = vmulq_f32( zb.val[0], vHi);
  995. float32x4x2_t xy = vuzpq_f32( xy0, xy1);
  996. float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
  997. x = vaddq_f32(x, z);
  998. uint32x4_t mask = vcgtq_f32(x, maxDot);
  999. maxDot = vbslq_f32( mask, x, maxDot);
  1000. index = vbslq_u32(mask, local_index, index);
  1001. local_index = vaddq_u32(local_index, four);
  1002. }
  1003. switch (count & 3) {
  1004. case 3:
  1005. {
  1006. float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
  1007. float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
  1008. float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
  1009. // the next two lines should resolve to a single vswp d, d
  1010. float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
  1011. float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
  1012. // the next two lines should resolve to a single vswp d, d
  1013. float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
  1014. float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
  1015. xy0 = vmulq_f32(xy0, vLo);
  1016. xy1 = vmulq_f32(xy1, vLo);
  1017. float32x4x2_t zb = vuzpq_f32( z0, z1);
  1018. float32x4_t z = vmulq_f32( zb.val[0], vHi);
  1019. float32x4x2_t xy = vuzpq_f32( xy0, xy1);
  1020. float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
  1021. x = vaddq_f32(x, z);
  1022. uint32x4_t mask = vcgtq_f32(x, maxDot);
  1023. maxDot = vbslq_f32( mask, x, maxDot);
  1024. index = vbslq_u32(mask, local_index, index);
  1025. local_index = vaddq_u32(local_index, four);
  1026. }
  1027. break;
  1028. case 2:
  1029. {
  1030. float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
  1031. float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
  1032. // the next two lines should resolve to a single vswp d, d
  1033. float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
  1034. // the next two lines should resolve to a single vswp d, d
  1035. float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
  1036. xy0 = vmulq_f32(xy0, vLo);
  1037. float32x4x2_t zb = vuzpq_f32( z0, z0);
  1038. float32x4_t z = vmulq_f32( zb.val[0], vHi);
  1039. float32x4x2_t xy = vuzpq_f32( xy0, xy0);
  1040. float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
  1041. x = vaddq_f32(x, z);
  1042. uint32x4_t mask = vcgtq_f32(x, maxDot);
  1043. maxDot = vbslq_f32( mask, x, maxDot);
  1044. index = vbslq_u32(mask, local_index, index);
  1045. local_index = vaddq_u32(local_index, four);
  1046. }
  1047. break;
  1048. case 1:
  1049. {
  1050. float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
  1051. // the next two lines should resolve to a single vswp d, d
  1052. float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
  1053. // the next two lines should resolve to a single vswp d, d
  1054. float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
  1055. xy0 = vmulq_f32(xy0, vLo);
  1056. z = vmulq_f32( z, vHi);
  1057. float32x4x2_t xy = vuzpq_f32( xy0, xy0);
  1058. float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
  1059. x = vaddq_f32(x, z);
  1060. uint32x4_t mask = vcgtq_f32(x, maxDot);
  1061. maxDot = vbslq_f32( mask, x, maxDot);
  1062. index = vbslq_u32(mask, local_index, index);
  1063. local_index = vaddq_u32(local_index, four);
  1064. }
  1065. break;
  1066. default:
  1067. break;
  1068. }
  1069. // select best answer between hi and lo results
  1070. uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
  1071. float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
  1072. uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
  1073. // select best answer between even and odd results
  1074. float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
  1075. uint32x2_t indexHi = vdup_lane_u32(index2, 1);
  1076. mask = vcgt_f32( maxDotO, maxDot2 );
  1077. maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
  1078. index2 = vbsl_u32(mask, indexHi, index2);
  1079. *dotResult = vget_lane_f32( maxDot2, 0);
  1080. return vget_lane_u32(index2, 0);
  1081. }
  1082. long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
  1083. {
  1084. unsigned long i = 0;
  1085. float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
  1086. float32x2_t vLo = vget_low_f32(vvec);
  1087. float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
  1088. float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
  1089. float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
  1090. uint32x2_t indexLo = (uint32x2_t) {0, 1};
  1091. uint32x2_t indexHi = (uint32x2_t) {2, 3};
  1092. uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
  1093. uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
  1094. const uint32x2_t four = (uint32x2_t) {4,4};
  1095. for( ; i+8 <= count; i+= 8 )
  1096. {
  1097. float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
  1098. float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
  1099. float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
  1100. float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
  1101. float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
  1102. float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
  1103. float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
  1104. float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
  1105. float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
  1106. float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
  1107. float32x2_t zLo = vmul_f32( z0.val[0], vHi);
  1108. float32x2_t zHi = vmul_f32( z1.val[0], vHi);
  1109. float32x2_t rLo = vpadd_f32( xy0, xy1);
  1110. float32x2_t rHi = vpadd_f32( xy2, xy3);
  1111. rLo = vadd_f32(rLo, zLo);
  1112. rHi = vadd_f32(rHi, zHi);
  1113. uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
  1114. uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
  1115. dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
  1116. dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
  1117. iLo = vbsl_u32(maskLo, indexLo, iLo);
  1118. iHi = vbsl_u32(maskHi, indexHi, iHi);
  1119. indexLo = vadd_u32(indexLo, four);
  1120. indexHi = vadd_u32(indexHi, four);
  1121. v0 = vld1q_f32_aligned_postincrement( vv );
  1122. v1 = vld1q_f32_aligned_postincrement( vv );
  1123. v2 = vld1q_f32_aligned_postincrement( vv );
  1124. v3 = vld1q_f32_aligned_postincrement( vv );
  1125. xy0 = vmul_f32( vget_low_f32(v0), vLo);
  1126. xy1 = vmul_f32( vget_low_f32(v1), vLo);
  1127. xy2 = vmul_f32( vget_low_f32(v2), vLo);
  1128. xy3 = vmul_f32( vget_low_f32(v3), vLo);
  1129. z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
  1130. z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
  1131. zLo = vmul_f32( z0.val[0], vHi);
  1132. zHi = vmul_f32( z1.val[0], vHi);
  1133. rLo = vpadd_f32( xy0, xy1);
  1134. rHi = vpadd_f32( xy2, xy3);
  1135. rLo = vadd_f32(rLo, zLo);
  1136. rHi = vadd_f32(rHi, zHi);
  1137. maskLo = vclt_f32( rLo, dotMinLo );
  1138. maskHi = vclt_f32( rHi, dotMinHi );
  1139. dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
  1140. dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
  1141. iLo = vbsl_u32(maskLo, indexLo, iLo);
  1142. iHi = vbsl_u32(maskHi, indexHi, iHi);
  1143. indexLo = vadd_u32(indexLo, four);
  1144. indexHi = vadd_u32(indexHi, four);
  1145. }
    // process one remaining full group of 4 vertices, if any
    for( ; i+4 <= count; i+= 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
        float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
        float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
        float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);

        float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
        float32x2_t zLo = vmul_f32( z0.val[0], vHi);
        float32x2_t zHi = vmul_f32( z1.val[0], vHi);

        float32x2_t rLo = vpadd_f32( xy0, xy1);
        float32x2_t rHi = vpadd_f32( xy2, xy3);
        rLo = vadd_f32(rLo, zLo);
        rHi = vadd_f32(rHi, zHi);

        uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
        uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
        dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
        dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
        iLo = vbsl_u32(maskLo, indexLo, iLo);
        iHi = vbsl_u32(maskHi, indexHi, iHi);
        indexLo = vadd_u32(indexLo, four);
        indexHi = vadd_u32(indexHi, four);
    }
    // handle the final 1-3 vertices
    switch( count & 3 )
    {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
            float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);
            float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            float32x2_t rHi = vpadd_f32( xy2, xy2);
            rLo = vadd_f32(rLo, zLo);
            rHi = vadd_f32(rHi, zHi);

            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
            uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
            dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
            iHi = vbsl_u32(maskHi, indexHi, iHi);
        }
        break;

        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);

            float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x2_t zLo = vmul_f32( z0.val[0], vHi);

            float32x2_t rLo = vpadd_f32( xy0, xy1);
            rLo = vadd_f32(rLo, zLo);

            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;

        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
            float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
            float32x2_t zLo = vmul_f32( z0, vHi);
            float32x2_t rLo = vpadd_f32( xy0, xy0);
            rLo = vadd_f32(rLo, zLo);
            uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
            dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
            iLo = vbsl_u32(maskLo, indexLo, iLo);
        }
        break;

        default:
            break;
    }
    // select best answer between hi and lo results
    uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    // select best answer between even and odd results
    dotMinHi = vdup_lane_f32(dotMinLo, 1);
    iHi = vdup_lane_u32(iLo, 1);
    mask = vclt_f32( dotMinHi, dotMinLo );
    dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
    iLo = vbsl_u32(mask, iHi, iLo);

    *dotResult = vget_lane_f32( dotMinLo, 0);
    return vget_lane_u32(iLo, 0);
}

long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
    // splat the reference vector: vLo = (x, y, x, y), vHi = (z, z, z, z)
    float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
    float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
    const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
    uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
    uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
    float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };
    unsigned long i = 0;
    for( ; i + 8 <= count; i += 8 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        // the next two lines should resolve to a single vswp d, d
        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
        // the next two lines should resolve to a single vswp d, d
        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32( mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);

        v0 = vld1q_f32_aligned_postincrement( vv );
        v1 = vld1q_f32_aligned_postincrement( vv );
        v2 = vld1q_f32_aligned_postincrement( vv );
        v3 = vld1q_f32_aligned_postincrement( vv );

        // the next two lines should resolve to a single vswp d, d
        xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
        // the next two lines should resolve to a single vswp d, d
        z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        zb = vuzpq_f32( z0, z1);
        z = vmulq_f32( zb.val[0], vHi);
        xy = vuzpq_f32( xy0, xy1);
        x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32( mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }
    for( ; i + 4 <= count; i += 4 )
    {
        float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
        float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );

        // the next two lines should resolve to a single vswp d, d
        float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
        float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
        // the next two lines should resolve to a single vswp d, d
        float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
        float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));

        xy0 = vmulq_f32(xy0, vLo);
        xy1 = vmulq_f32(xy1, vLo);

        float32x4x2_t zb = vuzpq_f32( z0, z1);
        float32x4_t z = vmulq_f32( zb.val[0], vHi);
        float32x4x2_t xy = vuzpq_f32( xy0, xy1);
        float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
        x = vaddq_f32(x, z);

        uint32x4_t mask = vcltq_f32(x, minDot);
        minDot = vbslq_f32( mask, x, minDot);
        index = vbslq_u32(mask, local_index, index);
        local_index = vaddq_u32(local_index, four);
    }
    // handle the final 1-3 vertices
    switch (count & 3) {
        case 3:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );

            // the next two lines should resolve to a single vswp d, d
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
            float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
            // the next two lines should resolve to a single vswp d, d
            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
            float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));

            xy0 = vmulq_f32(xy0, vLo);
            xy1 = vmulq_f32(xy1, vLo);

            float32x4x2_t zb = vuzpq_f32( z0, z1);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy1);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32( mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        case 2:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
            float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );

            // should resolve to a single vswp d, d
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
            // should resolve to a single vswp d, d
            float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));

            xy0 = vmulq_f32(xy0, vLo);
            float32x4x2_t zb = vuzpq_f32( z0, z0);
            float32x4_t z = vmulq_f32( zb.val[0], vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32( mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        case 1:
        {
            float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );

            // only one vertex left: duplicate its xy and z lanes across the registers
            float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
            float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);

            xy0 = vmulq_f32(xy0, vLo);
            z = vmulq_f32( z, vHi);
            float32x4x2_t xy = vuzpq_f32( xy0, xy0);
            float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
            x = vaddq_f32(x, z);

            uint32x4_t mask = vcltq_f32(x, minDot);
            minDot = vbslq_f32( mask, x, minDot);
            index = vbslq_u32(mask, local_index, index);
            local_index = vaddq_u32(local_index, four);
        }
        break;

        default:
            break;
    }
    // select best answer between hi and lo results
    uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
    float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
    uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));

    // select best answer between even and odd results
    float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
    uint32x2_t indexHi = vdup_lane_u32(index2, 1);
    mask = vclt_f32( minDotO, minDot2 );
    minDot2 = vbsl_f32(mask, minDotO, minDot2);
    index2 = vbsl_u32(mask, indexHi, index2);

    *dotResult = vget_lane_f32( minDot2, 0);
    return vget_lane_u32(index2, 0);
}
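
// Illustrative sketch (not part of the original Bullet source): a plain scalar
// version of what the _mindot_large kernels above compute, assuming the same
// input layout of 16-byte-aligned, 4-float vertices whose fourth component is
// ignored. The function name is hypothetical and the block is compiled out; it
// is kept only as a readable reference for the NEON code.
#if 0
static long _mindot_large_scalar_reference( const float *vv, const float *vec, unsigned long count, float *dotResult )
{
    long bestIndex = -1;
    float bestDot = BT_INFINITY;
    for( unsigned long i = 0; i < count; ++i, vv += 4 )
    {
        // dot product against vec, using x, y and z only (the fourth float is padding)
        float dot = vv[0] * vec[0] + vv[1] * vec[1] + vv[2] * vec[2];
        if( dot < bestDot )
        {
            bestDot = dot;
            bestIndex = (long) i;
        }
    }
    *dotResult = bestDot;
    return bestIndex;
}
#endif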
#else
    #error Unhandled __APPLE__ arch
#endif

#endif  /* __APPLE__ */