- /*
- Copyright (c) 2011 Apple Inc.
- http://continuousphysics.com/Bullet/
-
- This software is provided 'as-is', without any express or implied warranty.
- In no event will the authors be held liable for any damages arising from the use of this software.
- Permission is granted to anyone to use this software for any purpose,
- including commercial applications, and to alter it and redistribute it freely,
- subject to the following restrictions:
-
- 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
- 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
- 3. This notice may not be removed or altered from any source distribution.
-
- This source version has been altered.
- */
- #if defined(_WIN32) || defined(__i386__)
- #define BT_USE_SSE_IN_API
- #endif
- #include "btVector3.h"
- #if defined BT_USE_SIMD_VECTOR3
- #if DEBUG
- #include <string.h> //for memset
- #endif
- #ifdef __APPLE__
- #include <stdint.h>
- typedef float float4 __attribute__((vector_size(16)));
- #else
- #define float4 __m128
- #endif
- //typedef uint32_t uint4 __attribute__ ((vector_size(16)));
- #if defined BT_USE_SSE || defined _WIN32
- #define LOG2_ARRAY_SIZE 6
- #define STACK_ARRAY_COUNT (1UL << LOG2_ARRAY_SIZE)
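- // A sketch of the tiling strategy shared by both kernels below: dot products
- // are computed in tiles of STACK_ARRAY_COUNT float4 slots (64 slots = 256
- // dots per tile), each slot is stored to stack_array while a vector
- // accumulator tracks the running extremum, and the stored slots are only
- // rescanned for the winning index when a tile actually improved the extremum.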
- #include <emmintrin.h>
- long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
- long _maxdot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
- {
- const float4 *vertices = (const float4 *)vv;
- static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
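- // indexTable maps the 4-bit output of _mm_movemask_ps (one bit per lane) to
- // the lane number of the lowest set bit; entry 0 is unused (-1) because the
- // table is only consulted after a compare is known to have matched somewhere.
- // Illustrative lookup:
- //     int test = _mm_movemask_ps(_mm_cmpeq_ps(v, target)); // e.g. 0b0100
- //     int lane = indexTable[test];                         // -> 2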
- float4 dotMax = btAssign128(-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY);
- float4 vvec = _mm_loadu_ps(vec);
- float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa)); /// zzzz
- float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy
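- // Per vertex the kernel evaluates dot = x*vec[0] + y*vec[1] + z*vec[2]: the
- // xy terms are multiplied against the vLo splat and the z term against the
- // vHi splat, and the w element is never allowed to contribute.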
- long maxIndex = -1L;
- size_t segment = 0;
- float4 stack_array[STACK_ARRAY_COUNT];
- #if DEBUG
- //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
- #endif
- size_t index;
- float4 max;
- // Faster loop without cleanup code for full tiles
- for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
- {
- max = dotMax;
- for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3];
- vertices += 4;
- float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index] = x;
- max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
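- // maxps/_mm_max_ps returns its second operand when either input is NaN, so
- // putting the accumulator second means a NaN dot product can never poison max.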
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3];
- vertices += 4;
- lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index + 1] = x;
- max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3];
- vertices += 4;
- lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index + 2] = x;
- max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3];
- vertices += 4;
- lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index + 3] = x;
- max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
- // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
- }
- // If we found a new max
- if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
- {
- // copy the new max across all lanes of our max accumulator
- max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
- max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
- dotMax = max;
- // find first occurrence of that max
- size_t test;
- for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4
- {
- }
- // record where it is.
- maxIndex = 4 * index + segment + indexTable[test];
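- // Index arithmetic: segment counts vertices finished in earlier tiles, each
- // stack_array slot scanned holds four dots (hence 4 * index), and
- // indexTable[test] supplies the lane within the matching slot.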
- }
- }
- // account for work we've already done
- count -= segment;
- // Deal with the last < STACK_ARRAY_COUNT vectors
- max = dotMax;
- index = 0;
- if (btUnlikely(count > 16))
- {
- for (; index + 4 <= count / 4; index += 4)
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3];
- vertices += 4;
- float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index] = x;
- max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3];
- vertices += 4;
- lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index + 1] = x;
- max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3];
- vertices += 4;
- lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index + 2] = x;
- max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3];
- vertices += 4;
- lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index + 3] = x;
- max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
- // It is too costly to keep the index of the max here. We will look for it again later. We save a lot of work this way.
- }
- }
- size_t localCount = (count & -4L) - 4 * index;
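- // (count & -4L) rounds count down to a multiple of four vertices; subtracting
- // the 4 * index already handled above leaves the whole groups of four that
- // remain for this tile.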
- if (localCount)
- {
- #ifdef __APPLE__
- float4 t0, t1, t2, t3, t4;
- float4 *sap = &stack_array[index + localCount / 4];
- vertices += localCount; // counter the offset
- size_t byteIndex = -(localCount) * sizeof(float);
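- // Negative-offset loop idiom: byteIndex runs from -localCount*sizeof(float)
- // up to zero in 16-byte steps (one float4 of results per iteration), so the
- // flags from the final `add` drive `jnz` with no separate compare. vertices
- // and sap were biased past the end above so negative offsets walk forward.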
- // AT&T-syntax inline assembly
- asm volatile(
- ".align 4 \n\
- 0: movaps %[max], %[t2] // move max out of the way to avoid propagating NaNs in max \n\
- movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
- movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
- movaps %[t0], %[max] // vertices[0] \n\
- movlhps %[t1], %[max] // x0y0x1y1 \n\
- movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
- movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
- mulps %[vLo], %[max] // x0y0x1y1 * vLo \n\
- movhlps %[t0], %[t1] // z0w0z1w1 \n\
- movaps %[t3], %[t0] // vertices[2] \n\
- movlhps %[t4], %[t0] // x2y2x3y3 \n\
- mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
- movhlps %[t3], %[t4] // z2w2z3w3 \n\
- shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
- mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
- movaps %[max], %[t3] // x0y0x1y1 * vLo \n\
- shufps $0x88, %[t0], %[max] // x0x1x2x3 * vLo.x \n\
- shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
- addps %[t3], %[max] // x + y \n\
- addps %[t1], %[max] // x + y + z \n\
- movaps %[max], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
- maxps %[t2], %[max] // record max, restore max \n\
- add $16, %[byteIndex] // advance loop counter\n\
- jnz 0b \n\
- "
- : [max] "+x"(max), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
- : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
- : "memory", "cc");
- index += localCount / 4;
- #else
- {
- for (unsigned int i = 0; i < localCount / 4; i++, index++)
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3];
- vertices += 4;
- float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index] = x;
- max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
- }
- }
- #endif //__APPLE__
- }
- // process the last few points
- if (count & 3)
- {
- float4 v0, v1, v2, x, y, z;
- switch (count & 3)
- {
- case 3:
- {
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- // Calculate 3 dot products, transpose, duplicate v2
- float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
- float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
- lo0 = lo0 * vLo;
- z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
- z = z * vHi;
- float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
- lo1 = lo1 * vLo;
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- }
- break;
- case 2:
- {
- v0 = vertices[0];
- v1 = vertices[1];
- float4 xy = _mm_movelh_ps(v0, v1);
- z = _mm_movehl_ps(v1, v0);
- xy = xy * vLo;
- z = _mm_shuffle_ps(z, z, 0xa8);
- x = _mm_shuffle_ps(xy, xy, 0xa8);
- y = _mm_shuffle_ps(xy, xy, 0xfd);
- z = z * vHi;
- }
- break;
- case 1:
- {
- float4 xy = vertices[0];
- z = _mm_shuffle_ps(xy, xy, 0xaa);
- xy = xy * vLo;
- z = z * vHi;
- x = _mm_shuffle_ps(xy, xy, 0);
- y = _mm_shuffle_ps(xy, xy, 0x55);
- }
- break;
- }
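- // The padding lanes above duplicate the last real vertex, so they can tie
- // but never beat it, and the first-occurrence scan below still resolves to
- // a genuine vertex index.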
- x = x + y;
- x = x + z;
- stack_array[index] = x;
- max = _mm_max_ps(x, max); // control the order here so that max is never NaN even if x is nan
- index++;
- }
- // if we found a new max.
- if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(max, dotMax)))
- { // we found a new max. Search for it
- // find max across the max vector, place in all elements of max -- big latency hit here
- max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0x4e));
- max = _mm_max_ps(max, (float4)_mm_shuffle_ps(max, max, 0xb1));
- // It is slightly faster to do this part in scalar code when count < 8. However, the common
- // case where it actually makes a difference is handled by the early out at the top of the
- // function, so the gain would be under 1% here. I opted for smaller code, fewer branches
- // and reduced complexity, and removed it.
- dotMax = max;
- // scan for the first occurrence of max in the array
- size_t test;
- for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], max))); index++) // local_count must be a multiple of 4
- {
- }
- maxIndex = 4 * index + segment + indexTable[test];
- }
- _mm_store_ss(dotResult, dotMax);
- return maxIndex;
- }
- long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult);
- long _mindot_large(const float *vv, const float *vec, unsigned long count, float *dotResult)
- {
- const float4 *vertices = (const float4 *)vv;
- static const unsigned char indexTable[16] = {(unsigned char)-1, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
- float4 dotmin = btAssign128(BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY);
- float4 vvec = _mm_loadu_ps(vec);
- float4 vHi = btCastiTo128f(_mm_shuffle_epi32(btCastfTo128i(vvec), 0xaa)); /// zzzz
- float4 vLo = _mm_movelh_ps(vvec, vvec); /// xyxy
- long minIndex = -1L;
- size_t segment = 0;
- float4 stack_array[STACK_ARRAY_COUNT];
- #if DEBUG
- //memset( stack_array, -1, STACK_ARRAY_COUNT * sizeof(stack_array[0]) );
- #endif
- size_t index;
- float4 min;
- // Faster loop without cleanup code for full tiles
- for (segment = 0; segment + STACK_ARRAY_COUNT * 4 <= count; segment += STACK_ARRAY_COUNT * 4)
- {
- min = dotmin;
- for (index = 0; index < STACK_ARRAY_COUNT; index += 4)
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3];
- vertices += 4;
- float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index] = x;
- min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3];
- vertices += 4;
- lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index + 1] = x;
- min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3];
- vertices += 4;
- lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index + 2] = x;
- min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3];
- vertices += 4;
- lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index + 3] = x;
- min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
- // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
- }
- // If we found a new min
- if (0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
- {
- // copy the new min across all lanes of our min accumulator
- min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
- min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
- dotmin = min;
- // find first occurrence of that min
- size_t test;
- for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4
- {
- }
- // record where it is.
- minIndex = 4 * index + segment + indexTable[test];
- }
- }
- // account for work we've already done
- count -= segment;
- // Deal with the last < STACK_ARRAY_COUNT vectors
- min = dotmin;
- index = 0;
- if (btUnlikely(count > 16))
- {
- for (; index + 4 <= count / 4; index += 4)
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3];
- vertices += 4;
- float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index] = x;
- min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3];
- vertices += 4;
- lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index + 1] = x;
- min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3];
- vertices += 4;
- lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index + 2] = x;
- min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- v3 = vertices[3];
- vertices += 4;
- lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- z = _mm_shuffle_ps(hi0, hi1, 0x88);
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index + 3] = x;
- min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
- // It is too costly to keep the index of the min here. We will look for it again later. We save a lot of work this way.
- }
- }
- size_t localCount = (count & -4L) - 4 * index;
- if (localCount)
- {
- #ifdef __APPLE__
- vertices += localCount; // counter the offset
- float4 t0, t1, t2, t3, t4;
- size_t byteIndex = -(localCount) * sizeof(float);
- float4 *sap = &stack_array[index + localCount / 4];
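- // Same negative byteIndex idiom as the max kernel: count up through zero and
- // let the add's flags drive jnz.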
- asm volatile(
- ".align 4 \n\
- 0: movaps %[min], %[t2] // move min out of the way to avoid propagating NaNs in min \n\
- movaps (%[vertices], %[byteIndex], 4), %[t0] // vertices[0] \n\
- movaps 16(%[vertices], %[byteIndex], 4), %[t1] // vertices[1] \n\
- movaps %[t0], %[min] // vertices[0] \n\
- movlhps %[t1], %[min] // x0y0x1y1 \n\
- movaps 32(%[vertices], %[byteIndex], 4), %[t3] // vertices[2] \n\
- movaps 48(%[vertices], %[byteIndex], 4), %[t4] // vertices[3] \n\
- mulps %[vLo], %[min] // x0y0x1y1 * vLo \n\
- movhlps %[t0], %[t1] // z0w0z1w1 \n\
- movaps %[t3], %[t0] // vertices[2] \n\
- movlhps %[t4], %[t0] // x2y2x3y3 \n\
- movhlps %[t3], %[t4] // z2w2z3w3 \n\
- mulps %[vLo], %[t0] // x2y2x3y3 * vLo \n\
- shufps $0x88, %[t4], %[t1] // z0z1z2z3 \n\
- mulps %[vHi], %[t1] // z0z1z2z3 * vHi \n\
- movaps %[min], %[t3] // x0y0x1y1 * vLo \n\
- shufps $0x88, %[t0], %[min] // x0x1x2x3 * vLo.x \n\
- shufps $0xdd, %[t0], %[t3] // y0y1y2y3 * vLo.y \n\
- addps %[t3], %[min] // x + y \n\
- addps %[t1], %[min] // x + y + z \n\
- movaps %[min], (%[sap], %[byteIndex]) // record result for later scrutiny \n\
- minps %[t2], %[min] // record min, restore min \n\
- add $16, %[byteIndex] // advance loop counter\n\
- jnz 0b \n\
- "
- : [min] "+x"(min), [t0] "=&x"(t0), [t1] "=&x"(t1), [t2] "=&x"(t2), [t3] "=&x"(t3), [t4] "=&x"(t4), [byteIndex] "+r"(byteIndex)
- : [vLo] "x"(vLo), [vHi] "x"(vHi), [vertices] "r"(vertices), [sap] "r"(sap)
- : "memory", "cc");
- index += localCount / 4;
- #else
- {
- for (unsigned int i = 0; i < localCount / 4; i++, index++)
- { // do four dot products at a time. Carefully avoid touching the w element.
- float4 v0 = vertices[0];
- float4 v1 = vertices[1];
- float4 v2 = vertices[2];
- float4 v3 = vertices[3];
- vertices += 4;
- float4 lo0 = _mm_movelh_ps(v0, v1); // x0y0x1y1
- float4 hi0 = _mm_movehl_ps(v1, v0); // z0?0z1?1
- float4 lo1 = _mm_movelh_ps(v2, v3); // x2y2x3y3
- float4 hi1 = _mm_movehl_ps(v3, v2); // z2?2z3?3
- lo0 = lo0 * vLo;
- lo1 = lo1 * vLo;
- float4 z = _mm_shuffle_ps(hi0, hi1, 0x88);
- float4 x = _mm_shuffle_ps(lo0, lo1, 0x88);
- float4 y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- z = z * vHi;
- x = x + y;
- x = x + z;
- stack_array[index] = x;
- min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
- }
- }
- #endif
- }
- // process the last few points
- if (count & 3)
- {
- float4 v0, v1, v2, x, y, z;
- switch (count & 3)
- {
- case 3:
- {
- v0 = vertices[0];
- v1 = vertices[1];
- v2 = vertices[2];
- // Calculate 3 dot products, transpose, duplicate v2
- float4 lo0 = _mm_movelh_ps(v0, v1); // xyxy.lo
- float4 hi0 = _mm_movehl_ps(v1, v0); // z?z?.lo
- lo0 = lo0 * vLo;
- z = _mm_shuffle_ps(hi0, v2, 0xa8); // z0z1z2z2
- z = z * vHi;
- float4 lo1 = _mm_movelh_ps(v2, v2); // xyxy
- lo1 = lo1 * vLo;
- x = _mm_shuffle_ps(lo0, lo1, 0x88);
- y = _mm_shuffle_ps(lo0, lo1, 0xdd);
- }
- break;
- case 2:
- {
- v0 = vertices[0];
- v1 = vertices[1];
- float4 xy = _mm_movelh_ps(v0, v1);
- z = _mm_movehl_ps(v1, v0);
- xy = xy * vLo;
- z = _mm_shuffle_ps(z, z, 0xa8);
- x = _mm_shuffle_ps(xy, xy, 0xa8);
- y = _mm_shuffle_ps(xy, xy, 0xfd);
- z = z * vHi;
- }
- break;
- case 1:
- {
- float4 xy = vertices[0];
- z = _mm_shuffle_ps(xy, xy, 0xaa);
- xy = xy * vLo;
- z = z * vHi;
- x = _mm_shuffle_ps(xy, xy, 0);
- y = _mm_shuffle_ps(xy, xy, 0x55);
- }
- break;
- }
- x = x + y;
- x = x + z;
- stack_array[index] = x;
- min = _mm_min_ps(x, min); // control the order here so that min is never NaN even if x is nan
- index++;
- }
- // if we found a new min.
- if (0 == segment || 0xf != _mm_movemask_ps((float4)_mm_cmpeq_ps(min, dotmin)))
- { // we found a new min. Search for it
- // find min across the min vector, place in all elements of min -- big latency hit here
- min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0x4e));
- min = _mm_min_ps(min, (float4)_mm_shuffle_ps(min, min, 0xb1));
- // It is slightly faster to do this part in scalar code when count < 8. However, the common
- // case where it actually makes a difference is handled by the early out at the top of the
- // function, so the gain would be under 1% here. I opted for smaller code, fewer branches
- // and reduced complexity, and removed it.
- dotmin = min;
- // scan for the first occurrence of min in the array
- size_t test;
- for (index = 0; 0 == (test = _mm_movemask_ps(_mm_cmpeq_ps(stack_array[index], min))); index++) // local_count must be a multiple of 4
- {
- }
- minIndex = 4 * index + segment + indexTable[test];
- }
- _mm_store_ss(dotResult, dotmin);
- return minIndex;
- }
- #elif defined BT_USE_NEON
- #define ARM_NEON_GCC_COMPATIBILITY 1
- #include <arm_neon.h>
- #include <sys/types.h>
- #include <sys/sysctl.h> //for sysctlbyname
- static long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
- static long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
- static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
- static long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult);
- static long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult);
- static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult);
- long (*_maxdot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _maxdot_large_sel;
- long (*_mindot_large)(const float *vv, const float *vec, unsigned long count, float *dotResult) = _mindot_large_sel;
- static inline uint32_t btGetCpuCapabilities(void)
- {
- static uint32_t capabilities = 0;
- static bool testedCapabilities = false;
- if (!testedCapabilities)
- {
- uint32_t hasFeature = 0;
- size_t featureSize = sizeof(hasFeature);
- int err = sysctlbyname("hw.optional.neon_hpfp", &hasFeature, &featureSize, NULL, 0);
- if (0 == err && hasFeature)
- capabilities |= 0x2000;
- testedCapabilities = true;
- }
- return capabilities;
- }
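- // btGetCpuCapabilities probes the kernel once via sysctlbyname and caches the
- // answer; 0x2000 is this file's private flag meaning the hw.optional.neon_hpfp
- // feature was reported, which the selectors below use to pick the v1 kernels.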
- static long _maxdot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
- {
- if (btGetCpuCapabilities() & 0x2000)
- _maxdot_large = _maxdot_large_v1;
- else
- _maxdot_large = _maxdot_large_v0;
- return _maxdot_large(vv, vec, count, dotResult);
- }
- static long _mindot_large_sel(const float *vv, const float *vec, unsigned long count, float *dotResult)
- {
- if (btGetCpuCapabilities() & 0x2000)
- _mindot_large = _mindot_large_v1;
- else
- _mindot_large = _mindot_large_v0;
- return _mindot_large(vv, vec, count, dotResult);
- }
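- // Lazy dispatch pattern: _maxdot_large/_mindot_large start out pointing at
- // the *_sel stubs; the first call probes the CPU, rebinds the pointer to the
- // best kernel, and forwards the call, so subsequent calls pay no selection
- // cost. A minimal sketch of the same idiom, with hypothetical names:
- //
- //     static long impl_generic(long), impl_fast(long), select_impl(long);
- //     static long (*do_work)(long) = select_impl;  // rebound on first call
- //     static long select_impl(long x)
- //     {
- //         do_work = cpu_has_fast_path() ? impl_fast : impl_generic;
- //         return do_work(x);
- //     }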
- #if defined __arm__
- #define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r; asm( "vld1.f32 {%0}, [%1, :128]!\n" : "=w" (_r), "+r" (_ptr) ); /*return*/ _r; })
- #else
- // support 64-bit ARM
- #define vld1q_f32_aligned_postincrement(_ptr) ({ float32x4_t _r = ((float32x4_t*)(_ptr))[0]; (_ptr) = (const float*) ((const char*)(_ptr) + 16L); /*return*/ _r; })
- #endif
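- // Both variants of vld1q_f32_aligned_postincrement load one 16-byte-aligned
- // float32x4_t and advance the pointer, mirroring A32 "vld1.f32 {dn}, [rn, :128]!"
- // writeback; the 64-bit fallback spells out the same load-and-bump inside a
- // GNU statement expression.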
- long _maxdot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
- {
- unsigned long i = 0;
- float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
- float32x2_t vLo = vget_low_f32(vvec);
- float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
- float32x2_t dotMaxLo = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
- float32x2_t dotMaxHi = (float32x2_t){-BT_INFINITY, -BT_INFINITY};
- uint32x2_t indexLo = (uint32x2_t){0, 1};
- uint32x2_t indexHi = (uint32x2_t){2, 3};
- uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
- uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
- const uint32x2_t four = (uint32x2_t){4, 4};
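- // Unlike the SSE path, which rescans a stored tile for the winner, these NEON
- // kernels track candidate indices in registers: indexLo/indexHi hold the
- // vertex numbers currently in flight, and vbsl copies them into iLo/iHi
- // whenever the compare mask fires.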
- for (; i + 8 <= count; i += 8)
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
- float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
- float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
- float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
- float32x2_t zLo = vmul_f32(z0.val[0], vHi);
- float32x2_t zHi = vmul_f32(z1.val[0], vHi);
- float32x2_t rLo = vpadd_f32(xy0, xy1);
- float32x2_t rHi = vpadd_f32(xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
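- // vpadd_f32(xy0, xy1) yields { x0*vx + y0*vy, x1*vx + y1*vy }, and the vtrn
- // results expose {z0,z1} and {z2,z3} directly, so rLo/rHi each finish two
- // complete dot products here.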
- uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
- uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
- dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
- dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- v0 = vld1q_f32_aligned_postincrement(vv);
- v1 = vld1q_f32_aligned_postincrement(vv);
- v2 = vld1q_f32_aligned_postincrement(vv);
- v3 = vld1q_f32_aligned_postincrement(vv);
- xy0 = vmul_f32(vget_low_f32(v0), vLo);
- xy1 = vmul_f32(vget_low_f32(v1), vLo);
- xy2 = vmul_f32(vget_low_f32(v2), vLo);
- xy3 = vmul_f32(vget_low_f32(v3), vLo);
- z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
- z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
- zLo = vmul_f32(z0.val[0], vHi);
- zHi = vmul_f32(z1.val[0], vHi);
- rLo = vpadd_f32(xy0, xy1);
- rHi = vpadd_f32(xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
- maskLo = vcgt_f32(rLo, dotMaxLo);
- maskHi = vcgt_f32(rHi, dotMaxHi);
- dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
- dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- }
- for (; i + 4 <= count; i += 4)
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
- float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
- float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
- float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
- float32x2_t zLo = vmul_f32(z0.val[0], vHi);
- float32x2_t zHi = vmul_f32(z1.val[0], vHi);
- float32x2_t rLo = vpadd_f32(xy0, xy1);
- float32x2_t rHi = vpadd_f32(xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
- uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
- uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
- dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
- dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- }
- switch (count & 3)
- {
- case 3:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
- float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
- float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x2_t zLo = vmul_f32(z0.val[0], vHi);
- float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
- float32x2_t rLo = vpadd_f32(xy0, xy1);
- float32x2_t rHi = vpadd_f32(xy2, xy2);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
- uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
- uint32x2_t maskHi = vcgt_f32(rHi, dotMaxHi);
- dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
- dotMaxHi = vbsl_f32(maskHi, rHi, dotMaxHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- }
- break;
- case 2:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
- float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x2_t zLo = vmul_f32(z0.val[0], vHi);
- float32x2_t rLo = vpadd_f32(xy0, xy1);
- rLo = vadd_f32(rLo, zLo);
- uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
- dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- }
- break;
- case 1:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
- float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
- float32x2_t zLo = vmul_f32(z0, vHi);
- float32x2_t rLo = vpadd_f32(xy0, xy0);
- rLo = vadd_f32(rLo, zLo);
- uint32x2_t maskLo = vcgt_f32(rLo, dotMaxLo);
- dotMaxLo = vbsl_f32(maskLo, rLo, dotMaxLo);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- }
- break;
- default:
- break;
- }
- // select best answer between hi and lo results
- uint32x2_t mask = vcgt_f32(dotMaxHi, dotMaxLo);
- dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
- iLo = vbsl_u32(mask, iHi, iLo);
- // select best answer between even and odd results
- dotMaxHi = vdup_lane_f32(dotMaxLo, 1);
- iHi = vdup_lane_u32(iLo, 1);
- mask = vcgt_f32(dotMaxHi, dotMaxLo);
- dotMaxLo = vbsl_f32(mask, dotMaxHi, dotMaxLo);
- iLo = vbsl_u32(mask, iHi, iLo);
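- // Note: vcgt is a strict compare, so each fold keeps the existing lo/even
- // candidate on ties rather than replacing it.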
- *dotResult = vget_lane_f32(dotMaxLo, 0);
- return vget_lane_u32(iLo, 0);
- }
- long _maxdot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
- {
- float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
- float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
- float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
- const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
- uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
- uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
- float32x4_t maxDot = (float32x4_t){-BT_INFINITY, -BT_INFINITY, -BT_INFINITY, -BT_INFINITY};
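- // The v1 kernel keeps all four dots per step in quad registers: vuzpq
- // deinterleaves the {x*vx, y*vy} products into an x-terms vector and a
- // y-terms vector, a second vuzpq isolates {z0,z1,z2,z3}, and two vaddq
- // finish four dot products at once.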
- unsigned long i = 0;
- for (; i + 8 <= count; i += 8)
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
- float32x4x2_t zb = vuzpq_f32(z0, z1);
- float32x4_t z = vmulq_f32(zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32(xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32(mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- v0 = vld1q_f32_aligned_postincrement(vv);
- v1 = vld1q_f32_aligned_postincrement(vv);
- v2 = vld1q_f32_aligned_postincrement(vv);
- v3 = vld1q_f32_aligned_postincrement(vv);
- // the next two lines should resolve to a single vswp d, d
- xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
- xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
- z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
- zb = vuzpq_f32(z0, z1);
- z = vmulq_f32(zb.val[0], vHi);
- xy = vuzpq_f32(xy0, xy1);
- x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
- mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32(mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- for (; i + 4 <= count; i += 4)
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
- float32x4x2_t zb = vuzpq_f32(z0, z1);
- float32x4_t z = vmulq_f32(zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32(xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32(mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- switch (count & 3)
- {
- case 3:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
- float32x4x2_t zb = vuzpq_f32(z0, z1);
- float32x4_t z = vmulq_f32(zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32(xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32(mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
- case 2:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
- xy0 = vmulq_f32(xy0, vLo);
- float32x4x2_t zb = vuzpq_f32(z0, z0);
- float32x4_t z = vmulq_f32(zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32(xy0, xy0);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32(mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
- case 1:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
- xy0 = vmulq_f32(xy0, vLo);
- z = vmulq_f32(z, vHi);
- float32x4x2_t xy = vuzpq_f32(xy0, xy0);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
- uint32x4_t mask = vcgtq_f32(x, maxDot);
- maxDot = vbslq_f32(mask, x, maxDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
- default:
- break;
- }
- // select best answer between hi and lo results
- uint32x2_t mask = vcgt_f32(vget_high_f32(maxDot), vget_low_f32(maxDot));
- float32x2_t maxDot2 = vbsl_f32(mask, vget_high_f32(maxDot), vget_low_f32(maxDot));
- uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
- // select best answer between even and odd results
- float32x2_t maxDotO = vdup_lane_f32(maxDot2, 1);
- uint32x2_t indexHi = vdup_lane_u32(index2, 1);
- mask = vcgt_f32(maxDotO, maxDot2);
- maxDot2 = vbsl_f32(mask, maxDotO, maxDot2);
- index2 = vbsl_u32(mask, indexHi, index2);
- *dotResult = vget_lane_f32(maxDot2, 0);
- return vget_lane_u32(index2, 0);
- }
- long _mindot_large_v0(const float *vv, const float *vec, unsigned long count, float *dotResult)
- {
- unsigned long i = 0;
- float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
- float32x2_t vLo = vget_low_f32(vvec);
- float32x2_t vHi = vdup_lane_f32(vget_high_f32(vvec), 0);
- float32x2_t dotMinLo = (float32x2_t){BT_INFINITY, BT_INFINITY};
- float32x2_t dotMinHi = (float32x2_t){BT_INFINITY, BT_INFINITY};
- uint32x2_t indexLo = (uint32x2_t){0, 1};
- uint32x2_t indexHi = (uint32x2_t){2, 3};
- uint32x2_t iLo = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
- uint32x2_t iHi = (uint32x2_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
- const uint32x2_t four = (uint32x2_t){4, 4};
- for (; i + 8 <= count; i += 8)
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
- float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
- float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
- float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
- float32x2_t zLo = vmul_f32(z0.val[0], vHi);
- float32x2_t zHi = vmul_f32(z1.val[0], vHi);
- float32x2_t rLo = vpadd_f32(xy0, xy1);
- float32x2_t rHi = vpadd_f32(xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
- uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
- uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
- dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
- dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- v0 = vld1q_f32_aligned_postincrement(vv);
- v1 = vld1q_f32_aligned_postincrement(vv);
- v2 = vld1q_f32_aligned_postincrement(vv);
- v3 = vld1q_f32_aligned_postincrement(vv);
- xy0 = vmul_f32(vget_low_f32(v0), vLo);
- xy1 = vmul_f32(vget_low_f32(v1), vLo);
- xy2 = vmul_f32(vget_low_f32(v2), vLo);
- xy3 = vmul_f32(vget_low_f32(v3), vLo);
- z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
- z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
- zLo = vmul_f32(z0.val[0], vHi);
- zHi = vmul_f32(z1.val[0], vHi);
- rLo = vpadd_f32(xy0, xy1);
- rHi = vpadd_f32(xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
- maskLo = vclt_f32(rLo, dotMinLo);
- maskHi = vclt_f32(rHi, dotMinHi);
- dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
- dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- }
- for (; i + 4 <= count; i += 4)
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
- float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
- float32x2_t xy3 = vmul_f32(vget_low_f32(v3), vLo);
- float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x2x2_t z1 = vtrn_f32(vget_high_f32(v2), vget_high_f32(v3));
- float32x2_t zLo = vmul_f32(z0.val[0], vHi);
- float32x2_t zHi = vmul_f32(z1.val[0], vHi);
- float32x2_t rLo = vpadd_f32(xy0, xy1);
- float32x2_t rHi = vpadd_f32(xy2, xy3);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
- uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
- uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
- dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
- dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- indexLo = vadd_u32(indexLo, four);
- indexHi = vadd_u32(indexHi, four);
- }
- switch (count & 3)
- {
- case 3:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
- float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
- float32x2_t xy2 = vmul_f32(vget_low_f32(v2), vLo);
- float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x2_t zLo = vmul_f32(z0.val[0], vHi);
- float32x2_t zHi = vmul_f32(vdup_lane_f32(vget_high_f32(v2), 0), vHi);
- float32x2_t rLo = vpadd_f32(xy0, xy1);
- float32x2_t rHi = vpadd_f32(xy2, xy2);
- rLo = vadd_f32(rLo, zLo);
- rHi = vadd_f32(rHi, zHi);
- uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
- uint32x2_t maskHi = vclt_f32(rHi, dotMinHi);
- dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
- dotMinHi = vbsl_f32(maskHi, rHi, dotMinHi);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- iHi = vbsl_u32(maskHi, indexHi, iHi);
- }
- break;
- case 2:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
- float32x2_t xy1 = vmul_f32(vget_low_f32(v1), vLo);
- float32x2x2_t z0 = vtrn_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x2_t zLo = vmul_f32(z0.val[0], vHi);
- float32x2_t rLo = vpadd_f32(xy0, xy1);
- rLo = vadd_f32(rLo, zLo);
- uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
- dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- }
- break;
- case 1:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x2_t xy0 = vmul_f32(vget_low_f32(v0), vLo);
- float32x2_t z0 = vdup_lane_f32(vget_high_f32(v0), 0);
- float32x2_t zLo = vmul_f32(z0, vHi);
- float32x2_t rLo = vpadd_f32(xy0, xy0);
- rLo = vadd_f32(rLo, zLo);
- uint32x2_t maskLo = vclt_f32(rLo, dotMinLo);
- dotMinLo = vbsl_f32(maskLo, rLo, dotMinLo);
- iLo = vbsl_u32(maskLo, indexLo, iLo);
- }
- break;
- default:
- break;
- }
- // select best answer between hi and lo results
- uint32x2_t mask = vclt_f32(dotMinHi, dotMinLo);
- dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
- iLo = vbsl_u32(mask, iHi, iLo);
- // select best answer between even and odd results
- dotMinHi = vdup_lane_f32(dotMinLo, 1);
- iHi = vdup_lane_u32(iLo, 1);
- mask = vclt_f32(dotMinHi, dotMinLo);
- dotMinLo = vbsl_f32(mask, dotMinHi, dotMinLo);
- iLo = vbsl_u32(mask, iHi, iLo);
- *dotResult = vget_lane_f32(dotMinLo, 0);
- return vget_lane_u32(iLo, 0);
- }
- long _mindot_large_v1(const float *vv, const float *vec, unsigned long count, float *dotResult)
- {
- float32x4_t vvec = vld1q_f32_aligned_postincrement(vec);
- float32x4_t vLo = vcombine_f32(vget_low_f32(vvec), vget_low_f32(vvec));
- float32x4_t vHi = vdupq_lane_f32(vget_high_f32(vvec), 0);
- const uint32x4_t four = (uint32x4_t){4, 4, 4, 4};
- uint32x4_t local_index = (uint32x4_t){0, 1, 2, 3};
- uint32x4_t index = (uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1)};
- float32x4_t minDot = (float32x4_t){BT_INFINITY, BT_INFINITY, BT_INFINITY, BT_INFINITY};
- unsigned long i = 0;
- for (; i + 8 <= count; i += 8)
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
- float32x4x2_t zb = vuzpq_f32(z0, z1);
- float32x4_t z = vmulq_f32(zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32(xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32(mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- v0 = vld1q_f32_aligned_postincrement(vv);
- v1 = vld1q_f32_aligned_postincrement(vv);
- v2 = vld1q_f32_aligned_postincrement(vv);
- v3 = vld1q_f32_aligned_postincrement(vv);
- // the next two lines should resolve to a single vswp d, d
- xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
- xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
- z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
- zb = vuzpq_f32(z0, z1);
- z = vmulq_f32(zb.val[0], vHi);
- xy = vuzpq_f32(xy0, xy1);
- x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
- mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32(mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- for (; i + 4 <= count; i += 4)
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v3 = vld1q_f32_aligned_postincrement(vv);
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v3));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v3));
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
- float32x4x2_t zb = vuzpq_f32(z0, z1);
- float32x4_t z = vmulq_f32(zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32(xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32(mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- switch (count & 3)
- {
- case 3:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v2 = vld1q_f32_aligned_postincrement(vv);
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
- float32x4_t xy1 = vcombine_f32(vget_low_f32(v2), vget_low_f32(v2));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
- float32x4_t z1 = vcombine_f32(vget_high_f32(v2), vget_high_f32(v2));
- xy0 = vmulq_f32(xy0, vLo);
- xy1 = vmulq_f32(xy1, vLo);
- float32x4x2_t zb = vuzpq_f32(z0, z1);
- float32x4_t z = vmulq_f32(zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32(xy0, xy1);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32(mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
- case 2:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- float32x4_t v1 = vld1q_f32_aligned_postincrement(vv);
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v1));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z0 = vcombine_f32(vget_high_f32(v0), vget_high_f32(v1));
- xy0 = vmulq_f32(xy0, vLo);
- float32x4x2_t zb = vuzpq_f32(z0, z0);
- float32x4_t z = vmulq_f32(zb.val[0], vHi);
- float32x4x2_t xy = vuzpq_f32(xy0, xy0);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32(mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
- case 1:
- {
- float32x4_t v0 = vld1q_f32_aligned_postincrement(vv);
- // the next two lines should resolve to a single vswp d, d
- float32x4_t xy0 = vcombine_f32(vget_low_f32(v0), vget_low_f32(v0));
- // the next two lines should resolve to a single vswp d, d
- float32x4_t z = vdupq_lane_f32(vget_high_f32(v0), 0);
- xy0 = vmulq_f32(xy0, vLo);
- z = vmulq_f32(z, vHi);
- float32x4x2_t xy = vuzpq_f32(xy0, xy0);
- float32x4_t x = vaddq_f32(xy.val[0], xy.val[1]);
- x = vaddq_f32(x, z);
- uint32x4_t mask = vcltq_f32(x, minDot);
- minDot = vbslq_f32(mask, x, minDot);
- index = vbslq_u32(mask, local_index, index);
- local_index = vaddq_u32(local_index, four);
- }
- break;
- default:
- break;
- }
- // select best answer between hi and lo results
- uint32x2_t mask = vclt_f32(vget_high_f32(minDot), vget_low_f32(minDot));
- float32x2_t minDot2 = vbsl_f32(mask, vget_high_f32(minDot), vget_low_f32(minDot));
- uint32x2_t index2 = vbsl_u32(mask, vget_high_u32(index), vget_low_u32(index));
- // select best answer between even and odd results
- float32x2_t minDotO = vdup_lane_f32(minDot2, 1);
- uint32x2_t indexHi = vdup_lane_u32(index2, 1);
- mask = vclt_f32(minDotO, minDot2);
- minDot2 = vbsl_f32(mask, minDotO, minDot2);
- index2 = vbsl_u32(mask, indexHi, index2);
- *dotResult = vget_lane_f32(minDot2, 0);
- return vget_lane_u32(index2, 0);
- }
- #else
- #error Unhandled __APPLE__ arch
- #endif
- #endif /* BT_USE_SIMD_VECTOR3 */
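- // Usage sketch (illustrative, not part of this file): Bullet calls these
- // kernels from btVector3's maxDot/minDot helpers for large arrays. The
- // vertex array is assumed 16-byte aligned, with xyz in the first three
- // floats of each 4-float stride:
- //
- //     float verts[4 * N];                  // N padded xyz_ vertices
- //     float dir[4] = {0.0f, 1.0f, 0.0f, 0.0f};
- //     float bestDot;
- //     long bestIdx = _maxdot_large(verts, dir, N, &bestDot);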