- /*
- Copyright (c) 2011 Apple Inc.
- http://continuousphysics.com/Bullet/
-
- This software is provided 'as-is', without any express or implied warranty.
- In no event will the authors be held liable for any damages arising from the use of this software.
- Permission is granted to anyone to use this software for any purpose,
- including commercial applications, and to alter it and redistribute it freely,
- subject to the following restrictions:
-
- 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
- 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
- 3. This notice may not be removed or altered from any source distribution.
-
- This source version has been altered.
- */
- // Modified by Yao Wei Tjong for Urho3D
- #if defined (_WIN32) || defined (__i386__)
- #define BT_USE_SSE_IN_API
- #endif
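- // Note: defining BT_USE_SSE_IN_API before including btVector3.h exposes the header's
- // SSE-specialized inline paths to this translation unit.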
- #include "btVector3.h"
- #if defined BT_USE_SIMD_VECTOR3
- #if DEBUG
- #include <string.h>//for memset
- #endif
- #ifdef __APPLE__
- #include <stdint.h>
- typedef float float4 __attribute__ ((vector_size(16)));
- #else
- #define float4 __m128
- #endif
- //typedef uint32_t uint4 __attribute__ ((vector_size(16)));
- #if defined BT_USE_SSE //|| defined _WIN32 // Urho3D - just use BT_USE_SSE as the main switch; Urho3D allows SSE to be disabled
- #define LOG2_ARRAY_SIZE 6
- #define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
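- // 1UL << 6 = 64 float4 entries, i.e. a 1 KiB on-stack scratch buffer for partial results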
- #include <emmintrin.h>
- long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
- long _maxdot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
- {
- const float4 *vertices = (const float4*) vv;
- static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
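- // indexTable maps a 4-bit _mm_movemask_ps result to the index of its lowest set
- // lane; entry 0 is (unsigned char)-1 and is never read, because the scans below
- // only index it with a non-zero mask.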
- float4 dotMax = btAssign128( -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY );
- float4 vvec = _mm_loadu_ps( vec );
- float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz
- float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy
-
- long maxIndex = -1L;
-
- size_t segment = 0;
- float4 stack_array[ STACK_ARRAY_COUNT ];
-
- #if DEBUG
- //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
- #endif
-
- size_t index;
- float4 max;
- // Faster loop without cleanup code for full tiles
- for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
- {
- max = dotMax;
-
- for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3]; vertices += 4;
-
- float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+1] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+2] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+3] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
- }
-
- // If we found a new max
- if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
- {
- // copy the new max across all lanes of our max accumulator
- max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
- max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
-
- dotMax = max;
-
- // find first occurrence of that max
- size_t test;
- for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4
- {}
- // record where it is.
- maxIndex = 4*index + segment + indexTable[test];
- }
- }
-
- // account for work we've already done
- count -= segment;
-
- // Deal with the last < STACK_ARRAY_COUNT vectors
- max = dotMax;
- index = 0;
-
-
- if( btUnlikely( count > 16) )
- {
- for( ; index + 4 <= count / 4; index+=4 )
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3]; vertices += 4;
-
- float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+1] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+2] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+3] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
-
- // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
- }
- }
-
- size_t localCount = (count & -4L) - 4*index;
- if( localCount )
- {
- #ifdef __APPLE__
- float4 t0, t1, t2, t3, t4;
- float4 * sap = &stack_array[index + localCount / 4];
- vertices += localCount; // counter the offset
- size_t byteIndex = -(localCount) * sizeof(float);
- //AT&T Code style assembly
- asm volatile
- ( ".align 4 \n\
- 0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
- movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
- movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
- movaps %[t0], %[max] // vertices[0] \n\
- movlhps %[t1], %[max] // x0y0x1y1 \n\
- movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
- movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
- mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
- movhlps %[t0], %[t1] // z0w0z1w1 \n\
- movaps %[t3], %[t0] // vertices[2] \n\
- movlhps %[t4], %[t0] // x2y2x3y3 \n\
- mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
- movhlps %[t3], %[t4] // z2w2z3w3 \n\
- shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
- mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
- movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
- shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
- shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
- addps %[t3], %[max] // x + y \n\
- addps %[t1], %[max] // x + y + z \n\
- movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
- maxps %[t2], %[max] // record max, restore max \n\
- add $16, %[byteIndex] // advance loop counter\n\
- jnz 0b \n\
- "
- : [max] "+x" (max), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
- : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
- : "memory", "cc"
- );
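- // Loop-control note: byteIndex starts at -localCount*sizeof(float) and counts up
- // to zero, so the add/jnz pair is the only loop overhead; vertices and sap were
- // advanced past the data above, so base-plus-negative-offset addressing still
- // lands on the right elements until byteIndex reaches zero.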
- index += localCount/4;
- #else
- {
- for( unsigned int i=0; i<localCount/4; i++,index++)
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3];
- vertices += 4;
-
- float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
- }
- }
- #endif //__APPLE__
- }
- // process the last few points
- if( count & 3 )
- {
- float4 v0, v1, v2, x, y, z;
- switch( count & 3 )
- {
- case 3:
- {
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
-
- // Calculate 3 dot products, transpose, duplicate v2
- float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
- float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo
- lo0 = lo0*vLo;
- z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2
- z = z*vHi;
- float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
- lo1 = lo1*vLo;
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- }
- break;
- case 2:
- {
- v0 = vertices[0];
- v1 = vertices[1];
- float4 xy = _mm_movelh_ps(v0, v1);
- z = _mm_movehl_ps(v1, v0);
- xy = xy*vLo;
- z = _mm_shuffle_ps( z, z, 0xa8);
- x = _mm_shuffle_ps( xy, xy, 0xa8);
- y = _mm_shuffle_ps( xy, xy, 0xfd);
- z = z*vHi;
- }
- break;
- case 1:
- {
- float4 xy = vertices[0];
- z = _mm_shuffle_ps( xy, xy, 0xaa);
- xy = xy*vLo;
- z = z*vHi;
- x = _mm_shuffle_ps(xy, xy, 0);
- y = _mm_shuffle_ps(xy, xy, 0x55);
- }
- break;
- }
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- max = _mm_max_ps( x, max ); // control the order here so that max is never NaN even if x is nan
- index++;
- }
-
- // if we found a new max.
- if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(max, dotMax)))
- { // we found a new max. Search for it
- // find max across the max vector, place in all elements of max -- big latency hit here
- max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0x4e));
- max = _mm_max_ps(max, (float4) _mm_shuffle_ps( max, max, 0xb1));
-
- // It is slightly faster to do this part in scalar code when count < 8. However, the common case where
- // that actually makes a difference is handled by the early out at the top of the function, so it is
- // less than a 1% difference here. I opted for improved code size, fewer branches and reduced
- // complexity, and removed the scalar path.
-
- dotMax = max;
-
- // scan for the first occurrence of max in the array
- size_t test;
- for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], max))); index++ ) // local_count must be a multiple of 4
- {}
- maxIndex = 4*index + segment + indexTable[test];
- }
-
- _mm_store_ss( dotResult, dotMax);
- return maxIndex;
- }
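- // For reference, a minimal scalar sketch of the contract implemented above
- // (hypothetical helper, not compiled into this file): each vertex is stored as
- // four floats (x, y, z, pad), only xyz contribute to the dot product, the
- // maximum dot product is written to *dotResult, and the index of its first
- // occurrence is returned.
- //
- //   long maxdot_scalar( const float *vv, const float *vec, unsigned long count, float *dotResult )
- //   {
- //       long maxIndex = -1;
- //       float maxDot = -BT_INFINITY;
- //       for( unsigned long i = 0; i < count; i++, vv += 4 )
- //       {
- //           float dot = vv[0]*vec[0] + vv[1]*vec[1] + vv[2]*vec[2];
- //           if( dot > maxDot )
- //           {
- //               maxDot = dot;
- //               maxIndex = (long) i;
- //           }
- //       }
- //       *dotResult = maxDot;
- //       return maxIndex;
- //   }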
- long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult );
- long _mindot_large( const float *vv, const float *vec, unsigned long count, float *dotResult )
- {
- const float4 *vertices = (const float4*) vv;
- static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 };
- float4 dotmin = btAssign128( BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY );
- float4 vvec = _mm_loadu_ps( vec );
- float4 vHi = btCastiTo128f(_mm_shuffle_epi32( btCastfTo128i( vvec), 0xaa )); /// zzzz
- float4 vLo = _mm_movelh_ps( vvec, vvec ); /// xyxy
-
- long minIndex = -1L;
- size_t segment = 0;
- float4 stack_array[ STACK_ARRAY_COUNT ];
-
- #if DEBUG
- //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
- #endif
-
- size_t index;
- float4 min;
- // Faster loop without cleanup code for full tiles
- for ( segment = 0; segment + STACK_ARRAY_COUNT*4 <= count; segment += STACK_ARRAY_COUNT*4 )
- {
- min = dotmin;
-
- for( index = 0; index < STACK_ARRAY_COUNT; index+= 4 )
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3]; vertices += 4;
-
- float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+1] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+2] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+3] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
- }
-
- // If we found a new min
- if( 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
- {
- // copy the new min across all lanes of our min accumulator
- min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
- min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
-
- dotmin = min;
-
- // find first occurrence of that min
- size_t test;
- for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
- {}
- // record where it is.
- minIndex = 4*index + segment + indexTable[test];
- }
- }
-
- // account for work we've already done
- count -= segment;
-
- // Deal with the last < STACK_ARRAY_COUNT vectors
- min = dotmin;
- index = 0;
-
-
- if(btUnlikely( count > 16) )
- {
- for( ; index + 4 <= count / 4; index+=4 )
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3]; vertices += 4;
-
- float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+1] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+2] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3]; vertices += 4;
-
- lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index+3] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
-
- // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
- }
- }
-
- size_t localCount = (count & -4L) - 4*index;
- if( localCount )
- {
-
-
- #ifdef __APPLE__
- vertices += localCount; // counter the offset
- float4 t0, t1, t2, t3, t4;
- size_t byteIndex = -(localCount) * sizeof(float);
- float4 * sap = &stack_array[index + localCount / 4];
-
- asm volatile
- ( ".align 4 \n\
- 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
- movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
- movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
- movaps %[t0], %[min] // vertices[0] \n\
- movlhps %[t1], %[min] // x0y0x1y1 \n\
- movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
- movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
- mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
- movhlps %[t0], %[t1] // z0w0z1w1 \n\
- movaps %[t3], %[t0] // vertices[2] \n\
- movlhps %[t4], %[t0] // x2y2x3y3 \n\
- movhlps %[t3], %[t4] // z2w2z3w3 \n\
- mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
- shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
- mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
- movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
- shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
- shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
- addps %[t3], %[min] // x + y \n\
- addps %[t1], %[min] // x + y + z \n\
- movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
- minps %[t2], %[min] // record min, restore min \n\
- add $16, %[byteIndex] // advance loop counter\n\
- jnz 0b \n\
- "
- : [min] "+x" (min), [t0] "=&x" (t0), [t1] "=&x" (t1), [t2] "=&x" (t2), [t3] "=&x" (t3), [t4] "=&x" (t4), [byteIndex] "+r" (byteIndex)
- : [vLo] "x" (vLo), [vHi] "x" (vHi), [vertices] "r" (vertices), [sap] "r" (sap)
- : "memory", "cc"
- );
- index += localCount/4;
- #else
- {
- for( unsigned int i=0; i<localCount/4; i++,index++)
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3];
- vertices += 4;
-
- float4 lo0 = _mm_movelh_ps( v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps( v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps( v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps( v3, v2); // z2?2z3?3
-
- lo0 = lo0*vLo;
- lo1 = lo1*vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z*vHi;
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
- }
- }
- #endif
- }
-
- // process the last few points
- if( count & 3 )
- {
- float4 v0, v1, v2, x, y, z;
- switch( count & 3 )
- {
- case 3:
- {
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
-
- // Calculate 3 dot products, transpose, duplicate v2
- float4 lo0 = _mm_movelh_ps( v0, v1); // xyxy.lo
- float4 hi0 = _mm_movehl_ps( v1, v0); // z?z?.lo
- lo0 = lo0*vLo;
- z = _mm_shuffle_ps(hi0, v2, 0xa8 ); // z0z1z2z2
- z = z*vHi;
- float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
- lo1 = lo1*vLo;
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- }
- break;
- case 2:
- {
- v0 = vertices[0];
- v1 = vertices[1];
- float4 xy = _mm_movelh_ps(v0, v1);
- z = _mm_movehl_ps(v1, v0);
- xy = xy*vLo;
- z = _mm_shuffle_ps( z, z, 0xa8);
- x = _mm_shuffle_ps( xy, xy, 0xa8);
- y = _mm_shuffle_ps( xy, xy, 0xfd);
- z = z*vHi;
- }
- break;
- case 1:
- {
- float4 xy = vertices[0];
- z = _mm_shuffle_ps( xy, xy, 0xaa);
- xy = xy*vLo;
- z = z*vHi;
- x = _mm_shuffle_ps(xy, xy, 0);
- y = _mm_shuffle_ps(xy, xy, 0x55);
- }
- break;
- }
- x = x+y;
- x = x+z;
- stack_array[index] = x;
- min = _mm_min_ps( x, min ); // control the order here so that min is never NaN even if x is nan
- index++;
- }
-
- // if we found a new min.
- if( 0 == segment || 0xf != _mm_movemask_ps( (float4) _mm_cmpeq_ps(min, dotmin)))
- { // we found a new min. Search for it
- // find min across the min vector, place in all elements of min -- big latency hit here
- min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0x4e));
- min = _mm_min_ps(min, (float4) _mm_shuffle_ps( min, min, 0xb1));
-
- // It is slightly faster to do this part in scalar code when count < 8. However, the common case where
- // that actually makes a difference is handled by the early out at the top of the function, so it is
- // less than a 1% difference here. I opted for improved code size, fewer branches and reduced
- // complexity, and removed the scalar path.
-
- dotmin = min;
-
- // scan for the first occurrence of min in the array
- size_t test;
- for( index = 0; 0 == (test=_mm_movemask_ps( _mm_cmpeq_ps( stack_array[index], min))); index++ ) // local_count must be a multiple of 4
- {}
- minIndex = 4*index + segment + indexTable[test];
- }
-
- _mm_store_ss( dotResult, dotmin);
- return minIndex;
- }
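- // _mindot_large above is the mirror image of _maxdot_large: the accumulator is
- // seeded with +BT_INFINITY, _mm_max_ps/maxps become _mm_min_ps/minps, and the
- // final scan looks for the first occurrence of the minimum.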
- #elif defined BT_USE_NEON
- #define ARM_NEON_GCC_COMPATIBILITY 1
- #include <arm_neon.h>
- #include <sys/types.h>
- // Urho3D - enable NEON on generic ARM
- #ifdef __APPLE__
- #include <sys/sysctl.h> //for sysctlbyname
- #endif //__APPLE__
- static long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
- static long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
- static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
- static long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult );
- static long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult );
- static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult );
- long (*_maxdot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _maxdot_large_sel;
- long (*_mindot_large)( const float *vv, const float *vec, unsigned long count, float *dotResult ) = _mindot_large_sel;
- static inline uint32_t btGetCpuCapabilities( void )
- {
- static uint32_t capabilities = 0;
- static bool testedCapabilities = false;
- if( 0 == testedCapabilities)
- {
- // Urho3D - enable NEON on generic ARM
- #ifdef __APPLE__
- uint32_t hasFeature = 0;
- size_t featureSize = sizeof( hasFeature );
- int err = sysctlbyname( "hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0 );
- if( 0 == err && hasFeature)
- capabilities |= 0x2000;
- #endif //__APPLE__
- testedCapabilities = true;
- }
-
- return capabilities;
- }
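- // Note: outside __APPLE__ no capability bit is ever set here, so on generic ARM
- // builds the selectors below always fall back to the _v0 variants.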
- static long _maxdot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
- {
- if( btGetCpuCapabilities() & 0x2000 )
- _maxdot_large = _maxdot_large_v1;
- else
- _maxdot_large = _maxdot_large_v0;
-
- return _maxdot_large(vv, vec, count, dotResult);
- }
- static long _mindot_large_sel( const float *vv, const float *vec, unsigned long count, float *dotResult )
- {
- if( btGetCpuCapabilities() & 0x2000 )
- _mindot_large = _mindot_large_v1;
- else
- _mindot_large = _mindot_large_v0;
-
- return _mindot_large(vv, vec, count, dotResult);
- }
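- // One-shot dispatch pattern: _maxdot_large/_mindot_large initially point at the
- // _sel stubs; the first call probes the CPU, rebinds the function pointer to the
- // chosen variant and forwards the call, so later calls pay no selection cost.
- // The rebinding is racy in principle, but every caller stores the same value.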
- // Urho3D - enable NEON on generic ARM
- #if defined __arm__ && __APPLE__
- # define vld1q_f32_aligned_postincrement( _ptr ) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
- #else
- // support 64-bit (and generic) ARM
- # define vld1q_f32_aligned_postincrement( _ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
- #endif
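- // Both forms of vld1q_f32_aligned_postincrement load one 16-byte-aligned
- // float32x4_t and advance the pointer by 16 bytes; the ARM32/Apple form emits
- // the :128 alignment hint and post-increment addressing directly via inline asm,
- // while the generic form expresses the same load in portable C.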
- long _maxdot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
- {
- unsigned long i = 0;
- float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
- float32x2_t vLo = vget_low_f32(vvec);
- float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
- float32x2_t dotMaxLo = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
- float32x2_t dotMaxHi = (float32x2_t) { -BT_INFINITY, -BT_INFINITY };
- uint32x2_t indexLo = (uint32x2_t) {0, 1};
- uint32x2_t indexHi = (uint32x2_t) {2, 3};
- uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
- uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
- const uint32x2_t four = (uint32x2_t) {4,4};
- for( ; i+8 <= count; i+= 8 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
- float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
- float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- float32x2_t rHi = vpadd_f32( xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
- uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
- dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
- dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- v0 = vld1q_f32_aligned_postincrement( vv );
- v1 = vld1q_f32_aligned_postincrement( vv );
- v2 = vld1q_f32_aligned_postincrement( vv );
- v3 = vld1q_f32_aligned_postincrement( vv );
-
- xy0 = vmul_f32( vget_low_f32(v0), vLo);
- xy1 = vmul_f32( vget_low_f32(v1), vLo);
- xy2 = vmul_f32( vget_low_f32(v2), vLo);
- xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
- z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
- zLo = vmul_f32( z0.val[0], vHi);
- zHi = vmul_f32( z1.val[0], vHi);
-
- rLo = vpadd_f32( xy0, xy1);
- rHi = vpadd_f32( xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- maskLo = vcgt_f32( rLo, dotMaxLo );
- maskHi = vcgt_f32( rHi, dotMaxHi );
- dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
- dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- }
- for( ; i+4 <= count; i+= 4 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
- float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
- float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- float32x2_t rHi = vpadd_f32( xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
- uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
- dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
- dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- }
-
- switch( count & 3 )
- {
- case 3:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
- float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- float32x2_t rHi = vpadd_f32( xy2, xy2);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
- uint32x2_t maskHi = vcgt_f32( rHi, dotMaxHi );
- dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
- dotMaxHi = vbsl_f32( maskHi, rHi, dotMaxHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- }
- break;
- case 2:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- rLo = vadd_f32(rLo, zLo);
-
- uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
- dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- }
- break;
- case 1:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
- float32x2_t zLo = vmul_f32( z0, vHi);
- float32x2_t rLo = vpadd_f32( xy0, xy0);
- rLo = vadd_f32(rLo, zLo);
- uint32x2_t maskLo = vcgt_f32( rLo, dotMaxLo );
- dotMaxLo = vbsl_f32( maskLo, rLo, dotMaxLo);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- }
- break;
-
- default:
- break;
- }
-
- // select best answer between hi and lo results
- uint32x2_t mask = vcgt_f32( dotMaxHi, dotMaxLo );
- dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
- iLo = vbsl_u32(mask, iHi, iLo);
-
- // select best answer between even and odd results
- dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
- iHi = vdup_lane_u32(iLo, 1);
- mask = vcgt_f32( dotMaxHi, dotMaxLo );
- dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
- iLo = vbsl_u32(mask, iHi, iLo);
-
- *dotResult = vget_lane_f32( dotMaxLo, 0);
- return vget_lane_u32(iLo, 0);
- }
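- // _maxdot_large_v0 above keeps the bookkeeping in 64-bit d registers, tracking
- // candidate indices for the even/odd lane pairs in iLo/iHi so no result array is
- // needed. _maxdot_large_v1 below does the same with full 128-bit q registers;
- // the selector prefers it when sysctl reports hw.optional.neon_hpfp, a proxy for
- // newer NEON cores where the q-register form is presumably faster.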
- long _maxdot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
- {
- float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
- float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
- float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
- const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
- uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
- uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
- float32x4_t maxDot = (float32x4_t) { -BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY };
-
- unsigned long i = 0;
- for( ; i + 8 <= count; i += 8 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z1);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32( mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- v0 = vld1q_f32_aligned_postincrement( vv );
- v1 = vld1q_f32_aligned_postincrement( vv );
- v2 = vld1q_f32_aligned_postincrement( vv );
- v3 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- zb = vuzpq_f32( z0, z1);
- z = vmulq_f32( zb.val[0], vHi);
- xy = vuzpq_f32( xy0, xy1);
- x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32( mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- for( ; i + 4 <= count; i += 4 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z1);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32( mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
-
- switch (count & 3) {
- case 3:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z1);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32( mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
- case 2:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-
- xy0 = vmulq_f32(xy0, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z0);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy0);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32( mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
- case 1:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-
- // duplicate the low and high halves of the one remaining vertex
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
- float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
-
- xy0 = vmulq_f32(xy0, vLo);
-
- z = vmulq_f32( z, vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy0);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32( mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
- default:
- break;
- }
-
-
- // select best answer between hi and lo results
- uint32x2_t mask = vcgt_f32( vget_high_f32(maxDot), vget_low_f32(maxDot));
- float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
- uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
-
- // select best answer between even and odd results
- float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
- uint32x2_t indexHi = vdup_lane_u32(index2, 1);
- mask = vcgt_f32( maxDotO, maxDot2 );
- maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
- index2 = vbsl_u32(mask, indexHi, index2);
-
- *dotResult = vget_lane_f32( maxDot2, 0);
- return vget_lane_u32(index2, 0);
-
- }
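- // The vuzpq_f32 de-interleave above is the core trick of the _v1 variants: with
- // xy0 = x0y0x1y1 and xy1 = x2y2x3y3, vuzpq_f32(xy0, xy1).val[0] is x0x1x2x3 and
- // .val[1] is y0y1y2y3, so four dot products fall out of a few vector multiplies
- // and adds per iteration without any scalar transpose.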
- long _mindot_large_v0( const float *vv, const float *vec, unsigned long count, float *dotResult )
- {
- unsigned long i = 0;
- float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
- float32x2_t vLo = vget_low_f32(vvec);
- float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
- float32x2_t dotMinLo = (float32x2_t) { BT_INFINITY, BT_INFINITY };
- float32x2_t dotMinHi = (float32x2_t) { BT_INFINITY, BT_INFINITY };
- uint32x2_t indexLo = (uint32x2_t) {0, 1};
- uint32x2_t indexHi = (uint32x2_t) {2, 3};
- uint32x2_t iLo = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
- uint32x2_t iHi = (uint32x2_t) {static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
- const uint32x2_t four = (uint32x2_t) {4,4};
-
- for( ; i+8 <= count; i+= 8 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
- float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
- float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- float32x2_t rHi = vpadd_f32( xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
- uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
- dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
- dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
-
- v0 = vld1q_f32_aligned_postincrement( vv );
- v1 = vld1q_f32_aligned_postincrement( vv );
- v2 = vld1q_f32_aligned_postincrement( vv );
- v3 = vld1q_f32_aligned_postincrement( vv );
-
- xy0 = vmul_f32( vget_low_f32(v0), vLo);
- xy1 = vmul_f32( vget_low_f32(v1), vLo);
- xy2 = vmul_f32( vget_low_f32(v2), vLo);
- xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
- z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
- zLo = vmul_f32( z0.val[0], vHi);
- zHi = vmul_f32( z1.val[0], vHi);
-
- rLo = vpadd_f32( xy0, xy1);
- rHi = vpadd_f32( xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- maskLo = vclt_f32( rLo, dotMinLo );
- maskHi = vclt_f32( rHi, dotMinHi );
- dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
- dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- }
- for( ; i+4 <= count; i+= 4 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
- float32x2_t xy3 = vmul_f32( vget_low_f32(v3), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2x2_t z1 = vtrn_f32( vget_high_f32(v2), vget_high_f32(v3));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
- float32x2_t zHi = vmul_f32( z1.val[0], vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- float32x2_t rHi = vpadd_f32( xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
- uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
- dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
- dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- }
- switch( count & 3 )
- {
- case 3:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32( vget_low_f32(v2), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
- float32x2_t zHi = vmul_f32( vdup_lane_f32(vget_high_f32(v2), 0), vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- float32x2_t rHi = vpadd_f32( xy2, xy2);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
-
- uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
- uint32x2_t maskHi = vclt_f32( rHi, dotMinHi );
- dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
- dotMinHi = vbsl_f32( maskHi, rHi, dotMinHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- }
- break;
- case 2:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32( vget_low_f32(v1), vLo);
-
- float32x2x2_t z0 = vtrn_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x2_t zLo = vmul_f32( z0.val[0], vHi);
-
- float32x2_t rLo = vpadd_f32( xy0, xy1);
- rLo = vadd_f32(rLo, zLo);
-
- uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
- dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- }
- break;
- case 1:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x2_t xy0 = vmul_f32( vget_low_f32(v0), vLo);
- float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
- float32x2_t zLo = vmul_f32( z0, vHi);
- float32x2_t rLo = vpadd_f32( xy0, xy0);
- rLo = vadd_f32(rLo, zLo);
- uint32x2_t maskLo = vclt_f32( rLo, dotMinLo );
- dotMinLo = vbsl_f32( maskLo, rLo, dotMinLo);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- }
- break;
-
- default:
- break;
- }
-
- // select best answer between hi and lo results
- uint32x2_t mask = vclt_f32( dotMinHi, dotMinLo );
- dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
- iLo = vbsl_u32(mask, iHi, iLo);
-
- // select best answer between even and odd results
- dotMinHi = vdup_lane_f32(dotMinLo, 1);
- iHi = vdup_lane_u32(iLo, 1);
- mask = vclt_f32( dotMinHi, dotMinLo );
- dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
- iLo = vbsl_u32(mask, iHi, iLo);
-
- *dotResult = vget_lane_f32( dotMinLo, 0);
- return vget_lane_u32(iLo, 0);
- }
- long _mindot_large_v1( const float *vv, const float *vec, unsigned long count, float *dotResult )
- {
- float32x4_t vvec = vld1q_f32_aligned_postincrement( vec );
- float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
- float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
- const uint32x4_t four = (uint32x4_t){ 4, 4, 4, 4 };
- uint32x4_t local_index = (uint32x4_t) {0, 1, 2, 3};
- uint32x4_t index = (uint32x4_t) { static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1) };
- float32x4_t minDot = (float32x4_t) { BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY };
-
- unsigned long i = 0;
- for( ; i + 8 <= count; i += 8 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z1);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32( mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
-
- v0 = vld1q_f32_aligned_postincrement( vv );
- v1 = vld1q_f32_aligned_postincrement( vv );
- v2 = vld1q_f32_aligned_postincrement( vv );
- v3 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- zb = vuzpq_f32( z0, z1);
- z = vmulq_f32( zb.val[0], vHi);
- xy = vuzpq_f32( xy0, xy1);
- x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32( mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
-
- for( ; i + 4 <= count; i += 4 )
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v3 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v3));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z1);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32( mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
-
- switch (count & 3) {
- case 3:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v2 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32( vget_low_f32(v2), vget_low_f32(v2));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32( vget_high_f32(v2), vget_high_f32(v2));
-
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z1);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32( mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
-
- case 2:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
- float32x4_t v1 = vld1q_f32_aligned_postincrement( vv );
-
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v1));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32( vget_high_f32(v0), vget_high_f32(v1));
-
- xy0 = vmulq_f32(xy0, vLo);
-
- float32x4x2_t zb = vuzpq_f32( z0, z0);
- float32x4_t z = vmulq_f32( zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy0);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32( mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
-
- case 1:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement( vv );
-
- // duplicate the low and high halves of the one remaining vertex
- float32x4_t xy0 = vcombine_f32( vget_low_f32(v0), vget_low_f32(v0));
- float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
-
- xy0 = vmulq_f32(xy0, vLo);
-
- z = vmulq_f32( z, vHi);
- float32x4x2_t xy = vuzpq_f32( xy0, xy0);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
-
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32( mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
-
- default:
- break;
- }
-
-
- // select best answer between hi and lo results
- uint32x2_t mask = vclt_f32( vget_high_f32(minDot), vget_low_f32(minDot));
- float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
- uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
-
- // select best answer between even and odd results
- float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
- uint32x2_t indexHi = vdup_lane_u32(index2, 1);
- mask = vclt_f32( minDotO, minDot2 );
- minDot2 = vbsl_f32(mask, minDotO, minDot2);
- index2 = vbsl_u32(mask, indexHi, index2);
-
- *dotResult = vget_lane_f32( minDot2, 0);
- return vget_lane_u32(index2, 0);
-
- }
- #else
- #error Unhandled __APPLE__ arch
- #endif
- #endif /* BT_USE_SIMD_VECTOR3 */
|