cppspmd_sse.h 80 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105
  1. // cppspmd_sse.h
  2. // Copyright 2020-2022 Binomial LLC
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. //
  16. // Notes for Basis Universal:
  // All of the "cppspmd" code and headers are OPTIONAL to Basis Universal. If BASISU_SUPPORT_SSE is 0, it will never be included and does not impact compilation.
  18. // The techniques used in this code were originally demonstrated for AVX2 by Nicolas Guillemot, Jefferson Amstutz in their "CppSPMD" project.
  19. // This is new code for use in Basis Universal, although it uses the same general SPMD techniques in SSE 2/4.
  20. #include <stdlib.h>
  21. #include <stdint.h>
  22. #include <assert.h>
  23. #include <math.h>
  24. #include <utility>
  25. #include <algorithm>
  26. #if CPPSPMD_SSE2
  27. #include <xmmintrin.h> // SSE
  28. #include <emmintrin.h> // SSE2
  29. #else
  30. #include <xmmintrin.h> // SSE
  31. #include <emmintrin.h> // SSE2
  32. #include <pmmintrin.h> // SSE3
  33. #include <tmmintrin.h> // SSSE3
  34. #include <smmintrin.h> // SSE4.1
  35. //#include <nmmintrin.h> // SSE4.2
  36. #endif
  // Backend selection: drop any earlier definition of each selector macro and
  // redefine it. Only the SSE backend is switched on by this header.
  #undef CPPSPMD_SSE
  #define CPPSPMD_SSE 1

  #undef CPPSPMD_AVX
  #define CPPSPMD_AVX 0

  #undef CPPSPMD_AVX1
  #define CPPSPMD_AVX1 0

  #undef CPPSPMD_AVX2
  #define CPPSPMD_AVX2 0

  #undef CPPSPMD_FLOAT4
  #define CPPSPMD_FLOAT4 0

  #undef CPPSPMD_INT16
  #define CPPSPMD_INT16 0
  // Alignment helpers. CPPSPMD_DECL(type, name) declares an aligned variable and
  // CPPSPMD_ALIGN(v) is a bare alignment specifier; both may be pre-defined by the includer.
  #if defined(_MSC_VER)
  #  if !defined(CPPSPMD_DECL)
  #    define CPPSPMD_DECL(type, name) __declspec(align(16)) type name
  #  endif
  #  if !defined(CPPSPMD_ALIGN)
  #    define CPPSPMD_ALIGN(v) __declspec(align(v))
  #  endif
  // On MSVC the "undefined" intrinsics are mapped onto the zeroing ones.
  #  define _mm_undefined_si128 _mm_setzero_si128
  #  define _mm_undefined_ps _mm_setzero_ps
  #else
  #  if !defined(CPPSPMD_DECL)
  #    define CPPSPMD_DECL(type, name) type name __attribute__((aligned(32)))
  #  endif
  #  if !defined(CPPSPMD_ALIGN)
  #    define CPPSPMD_ALIGN(v) __attribute__((aligned(v)))
  #  endif
  #endif
  // CPPSPMD_FORCE_INLINE: __forceinline only for non-debug MSVC builds; plain
  // `inline` everywhere else. An includer-supplied definition is left untouched.
  #if !defined(CPPSPMD_FORCE_INLINE)
  #  if defined(_MSC_VER) && !defined(_DEBUG)
  #    define CPPSPMD_FORCE_INLINE __forceinline
  #  else
  #    define CPPSPMD_FORCE_INLINE inline
  #  endif
  #endif
  // Selects the namespace name (CPPSPMD) and symbol suffix (CPPSPMD_ARCH) for the
  // instruction set being compiled: SSE2 when CPPSPMD_SSE2 is non-zero, SSE4.1 otherwise.
  // CPPSPMD_SSE41 is reset together with the other two so that including this header a
  // second time with the opposite CPPSPMD_SSE2 setting does not redefine it with a
  // conflicting value.
  #undef CPPSPMD
  #undef CPPSPMD_ARCH
  #undef CPPSPMD_SSE41
  #if CPPSPMD_SSE2
  #define CPPSPMD_SSE41 0
  #define CPPSPMD cppspmd_sse2
  #define CPPSPMD_ARCH _sse2
  #else
  #define CPPSPMD_SSE41 1
  #define CPPSPMD cppspmd_sse41
  #define CPPSPMD_ARCH _sse41
  #endif
  // Token-pasting helpers. CPPSPMD_GLUER2 expands its arguments before pasting, which
  // lets CPPSPMD_NAME(a) append the current CPPSPMD_ARCH suffix to an identifier.
  #if !defined(CPPSPMD_GLUER)
  #  define CPPSPMD_GLUER(a, b) a##b
  #endif
  #if !defined(CPPSPMD_GLUER2)
  #  define CPPSPMD_GLUER2(a, b) CPPSPMD_GLUER(a, b)
  #endif
  #if !defined(CPPSPMD_NAME)
  #  define CPPSPMD_NAME(a) CPPSPMD_GLUER2(a, CPPSPMD_ARCH)
  #endif
  // Required alignment, in bytes, of the vector types used by this backend.
  #define CPPSPMD_ALIGNMENT (16)

  // Writes the low 32 bits of vector `a` to the int-sized location `p`.
  #define storeu_si32(p, a) (void)(*(int*)(p) = _mm_cvtsi128_si32((a)))

  // VCOND(cond): true when `cond` holds on every lane enabled in m_exec.
  // VASSERT(cond): assert() of the same test. Both expect to be expanded where
  // exec_mask, vbool and m_exec are in scope.
  #undef VASSERT
  #define VCOND(cond) \
    ((exec_mask(vbool(cond)) & m_exec).get_movemask() == m_exec.get_movemask())
  #define VASSERT(cond) assert( VCOND(cond) )
  102. namespace CPPSPMD
  103. {
  // Number of lanes per vector, expressed as a shift and as a count (1 << 2 == 4).
  constexpr int PROGRAM_COUNT_SHIFT = 2;
  constexpr int PROGRAM_COUNT = 1 << PROGRAM_COUNT_SHIFT;
  // Allocates 64-byte-aligned storage with _mm_malloc() and default-constructs an N in it.
  // Returns nullptr when the allocation fails; constructing into a null pointer with
  // placement-new would be undefined behavior. Release the object with aligned_delete().
  template <typename N>
  inline N* aligned_new()
  {
    void* p = _mm_malloc(sizeof(N), 64);
    if (!p)
      return nullptr;
    // Use the pointer produced by the new-expression rather than casting the raw storage.
    return new (p) N;
  }

  // Runs N's destructor on *p and frees the storage with _mm_free(). A null p is ignored.
  template <typename N>
  void aligned_delete(N* p)
  {
    if (p)
    {
      p->~N();
      _mm_free(p);
    }
  }
  108. CPPSPMD_DECL(const uint32_t, g_allones_128[4]) = { UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX };
  109. CPPSPMD_DECL(const uint32_t, g_x_128[4]) = { UINT32_MAX, 0, 0, 0 };
  110. CPPSPMD_DECL(const float, g_onef_128[4]) = { 1.0f, 1.0f, 1.0f, 1.0f };
  111. CPPSPMD_DECL(const uint32_t, g_oneu_128[4]) = { 1, 1, 1, 1 };
  112. CPPSPMD_DECL(const uint32_t, g_lane_masks_128[4][4]) =
  113. {
  114. { UINT32_MAX, 0, 0, 0 },
  115. { 0, UINT32_MAX, 0, 0 },
  116. { 0, 0, UINT32_MAX, 0 },
  117. { 0, 0, 0, UINT32_MAX },
  118. };
  #if CPPSPMD_SSE41
  // 32-bit-lane integer blend expressed through _mm_blendv_ps: the operands are
  // reinterpreted as floats, blended on c, and the result is reinterpreted back.
  CPPSPMD_FORCE_INLINE __m128i _mm_blendv_epi32(__m128i a, __m128i b, __m128i c)
  {
    const __m128 a_ps = _mm_castsi128_ps(a);
    const __m128 b_ps = _mm_castsi128_ps(b);
    const __m128 selector = _mm_castsi128_ps(c);
    return _mm_castps_si128(_mm_blendv_ps(a_ps, b_ps, selector));
  }
  #endif
  122. CPPSPMD_FORCE_INLINE __m128i blendv_epi8(__m128i a, __m128i b, __m128i mask)
  123. {
  124. #if CPPSPMD_SSE2
  125. return _mm_castps_si128(_mm_or_ps(_mm_and_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(b)), _mm_andnot_ps(_mm_castsi128_ps(mask), _mm_castsi128_ps(a))));
  126. #else
  127. return _mm_blendv_epi8(a, b, mask);
  128. #endif
  129. }
  130. CPPSPMD_FORCE_INLINE __m128 blendv_mask_ps(__m128 a, __m128 b, __m128 mask)
  131. {
  132. #if CPPSPMD_SSE2
  133. // We know it's a mask, so we can just emulate the blend.
  134. return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
  135. #else
  136. return _mm_blendv_ps(a, b, mask);
  137. #endif
  138. }
  139. CPPSPMD_FORCE_INLINE __m128 blendv_ps(__m128 a, __m128 b, __m128 mask)
  140. {
  141. #if CPPSPMD_SSE2
  142. // Input is not a mask, but MSB bits - so emulate _mm_blendv_ps() by replicating bit 31.
  143. mask = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mask), 31));
  144. return _mm_or_ps(_mm_and_ps(mask, b), _mm_andnot_ps(mask, a));
  145. #else
  146. return _mm_blendv_ps(a, b, mask);
  147. #endif
  148. }
  149. CPPSPMD_FORCE_INLINE __m128i blendv_mask_epi32(__m128i a, __m128i b, __m128i mask)
  150. {
  151. return _mm_castps_si128(blendv_mask_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
  152. }
  153. CPPSPMD_FORCE_INLINE __m128i blendv_epi32(__m128i a, __m128i b, __m128i mask)
  154. {
  155. return _mm_castps_si128(blendv_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), _mm_castsi128_ps(mask)));
  156. }
  // Lane extraction helpers. x/y/z/w are 32-bit lanes 0..3.
  //   extract_*     : integer lane of an __m128i
  //   extract_ps_*  : raw bits of a float lane, returned as int (like _mm_extract_ps)
  //   extractf_ps_* : float lane as a float
  // Float bits are moved between the float and integer views with register casts rather
  // than by dereferencing a reinterpreted pointer, which would violate strict aliasing.
  #if CPPSPMD_SSE2
  // SSE2: broadcast the wanted lane with a shuffle, then read lane 0.
  CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_cvtsi128_si32(vec); }
  CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0x55)); }
  CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xAA)); }
  CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(vec, 0xFF)); }
  // Returns float bits as int, to emulate _mm_extract_ps()
  CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { return _mm_cvtsi128_si32(_mm_castps_si128(vec)); }
  CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(_mm_castps_si128(vec), 0x55)); }
  CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(_mm_castps_si128(vec), 0xAA)); }
  CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { return _mm_cvtsi128_si32(_mm_shuffle_epi32(_mm_castps_si128(vec), 0xFF)); }
  #else
  CPPSPMD_FORCE_INLINE int extract_x(const __m128i& vec) { return _mm_extract_epi32(vec, 0); }
  CPPSPMD_FORCE_INLINE int extract_y(const __m128i& vec) { return _mm_extract_epi32(vec, 1); }
  CPPSPMD_FORCE_INLINE int extract_z(const __m128i& vec) { return _mm_extract_epi32(vec, 2); }
  CPPSPMD_FORCE_INLINE int extract_w(const __m128i& vec) { return _mm_extract_epi32(vec, 3); }
  // Returns float bits as int
  CPPSPMD_FORCE_INLINE int extract_ps_x(const __m128& vec) { return _mm_extract_ps(vec, 0); }
  CPPSPMD_FORCE_INLINE int extract_ps_y(const __m128& vec) { return _mm_extract_ps(vec, 1); }
  CPPSPMD_FORCE_INLINE int extract_ps_z(const __m128& vec) { return _mm_extract_ps(vec, 2); }
  CPPSPMD_FORCE_INLINE int extract_ps_w(const __m128& vec) { return _mm_extract_ps(vec, 3); }
  #endif
  // Returns floats. These use only SSE instructions, so both builds share them.
  CPPSPMD_FORCE_INLINE float extractf_ps_x(const __m128& vec) { return _mm_cvtss_f32(vec); }
  CPPSPMD_FORCE_INLINE float extractf_ps_y(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0x55)); }
  CPPSPMD_FORCE_INLINE float extractf_ps_z(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xAA)); }
  CPPSPMD_FORCE_INLINE float extractf_ps_w(const __m128& vec) { return _mm_cvtss_f32(_mm_shuffle_ps(vec, vec, 0xFF)); }
  188. #if CPPSPMD_SSE2
  189. CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 0), (uint32_t)v >> 16U, 1); }
  190. CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 2), (uint32_t)v >> 16U, 3); }
  191. CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 4), (uint32_t)v >> 16U, 5); }
  192. CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi16(_mm_insert_epi16(vec, v, 6), (uint32_t)v >> 16U, 7); }
  193. #else
  194. CPPSPMD_FORCE_INLINE __m128i insert_x(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 0); }
  195. CPPSPMD_FORCE_INLINE __m128i insert_y(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 1); }
  196. CPPSPMD_FORCE_INLINE __m128i insert_z(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 2); }
  197. CPPSPMD_FORCE_INLINE __m128i insert_w(const __m128i& vec, int v) { return _mm_insert_epi32(vec, v, 3); }
  198. #endif
  199. #if CPPSPMD_SSE2
  200. inline __m128i shuffle_epi8(const __m128i& a, const __m128i& b)
  201. {
  202. // Just emulate _mm_shuffle_epi8. This is very slow, but what else can we do?
  203. CPPSPMD_ALIGN(16) uint8_t av[16];
  204. _mm_store_si128((__m128i*)av, a);
  205. CPPSPMD_ALIGN(16) uint8_t bvi[16];
  206. _mm_store_ps((float*)bvi, _mm_and_ps(_mm_castsi128_ps(b), _mm_castsi128_ps(_mm_set1_epi32(0x0F0F0F0F))));
  207. CPPSPMD_ALIGN(16) uint8_t result[16];
  208. result[0] = av[bvi[0]];
  209. result[1] = av[bvi[1]];
  210. result[2] = av[bvi[2]];
  211. result[3] = av[bvi[3]];
  212. result[4] = av[bvi[4]];
  213. result[5] = av[bvi[5]];
  214. result[6] = av[bvi[6]];
  215. result[7] = av[bvi[7]];
  216. result[8] = av[bvi[8]];
  217. result[9] = av[bvi[9]];
  218. result[10] = av[bvi[10]];
  219. result[11] = av[bvi[11]];
  220. result[12] = av[bvi[12]];
  221. result[13] = av[bvi[13]];
  222. result[14] = av[bvi[14]];
  223. result[15] = av[bvi[15]];
  224. return _mm_andnot_si128(_mm_cmplt_epi8(b, _mm_setzero_si128()), _mm_load_si128((__m128i*)result));
  225. }
  226. #else
  227. CPPSPMD_FORCE_INLINE __m128i shuffle_epi8(const __m128i& a, const __m128i& b)
  228. {
  229. return _mm_shuffle_epi8(a, b);
  230. }
  231. #endif
  232. #if CPPSPMD_SSE2
  233. CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b)
  234. {
  235. return blendv_mask_epi32(b, a, _mm_cmplt_epi32(a, b));
  236. }
  237. CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b)
  238. {
  239. return blendv_mask_epi32(b, a, _mm_cmpgt_epi32(a, b));
  240. }
  241. CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b)
  242. {
  243. __m128i n = _mm_set1_epi32(0x80000000);
  244. __m128i ac = _mm_add_epi32(a, n);
  245. __m128i bc = _mm_add_epi32(b, n);
  246. return blendv_mask_epi32(b, a, _mm_cmplt_epi32(ac, bc));
  247. }
  248. CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b)
  249. {
  250. __m128i n = _mm_set1_epi32(0x80000000);
  251. __m128i ac = _mm_add_epi32(a, n);
  252. __m128i bc = _mm_add_epi32(b, n);
  253. return blendv_mask_epi32(b, a, _mm_cmpgt_epi32(ac, bc));
  254. }
  255. #else
  256. CPPSPMD_FORCE_INLINE __m128i min_epi32(__m128i a, __m128i b)
  257. {
  258. return _mm_min_epi32(a, b);
  259. }
  260. CPPSPMD_FORCE_INLINE __m128i max_epi32(__m128i a, __m128i b)
  261. {
  262. return _mm_max_epi32(a, b);
  263. }
  264. CPPSPMD_FORCE_INLINE __m128i min_epu32(__m128i a, __m128i b)
  265. {
  266. return _mm_min_epu32(a, b);
  267. }
  268. CPPSPMD_FORCE_INLINE __m128i max_epu32(__m128i a, __m128i b)
  269. {
  270. return _mm_max_epu32(a, b);
  271. }
  272. #endif
  273. #if CPPSPMD_SSE2
  274. CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a)
  275. {
  276. __m128i sign_mask = _mm_srai_epi32(a, 31);
  277. return _mm_sub_epi32(_mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(sign_mask))), sign_mask);
  278. }
  279. #else
  280. CPPSPMD_FORCE_INLINE __m128i abs_epi32(__m128i a)
  281. {
  282. return _mm_abs_epi32(a);
  283. }
  284. #endif
  285. #if CPPSPMD_SSE2
  286. CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b)
  287. {
  288. __m128i tmp1 = _mm_mul_epu32(a, b);
  289. __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
  290. return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
  291. }
  292. #else
  293. CPPSPMD_FORCE_INLINE __m128i mullo_epi32(__m128i a, __m128i b)
  294. {
  295. return _mm_mullo_epi32(a, b);
  296. }
  297. #endif
  298. CPPSPMD_FORCE_INLINE __m128i mulhi_epu32(__m128i a, __m128i b)
  299. {
  300. __m128i tmp1 = _mm_mul_epu32(a, b);
  301. __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
  302. return _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 3, 1)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 3, 1)));
  303. }
  304. #if CPPSPMD_SSE2
  305. inline __m128i load_rgba32(const void* p)
  306. {
  307. __m128i xmm = _mm_cvtsi32_si128(*(const int*)p);
  308. xmm = _mm_unpacklo_epi8(xmm, _mm_setzero_si128());
  309. xmm = _mm_unpacklo_epi16(xmm, _mm_setzero_si128());
  310. return xmm;
  311. }
  312. #else
  313. inline __m128i load_rgba32(const void* p)
  314. {
  315. return _mm_cvtepu8_epi32(_mm_castps_si128(_mm_load_ss((const float*)p)));
  316. }
  317. #endif
  // Transposes a 4x4 matrix of 32-bit values: r0..r3 are the input rows and
  // x, y, z, w receive columns 0..3.
  inline void transpose4x4(__m128i& x, __m128i& y, __m128i& z, __m128i& w, const __m128i& r0, const __m128i& r1, const __m128i& r2, const __m128i& r3)
  {
    // Interleave row pairs at 32-bit granularity...
    const __m128i rows01_lo = _mm_unpacklo_epi32(r0, r1);
    const __m128i rows01_hi = _mm_unpackhi_epi32(r0, r1);
    const __m128i rows23_lo = _mm_unpacklo_epi32(r2, r3);
    const __m128i rows23_hi = _mm_unpackhi_epi32(r2, r3);

    // ...then combine the halves at 64-bit granularity to form each column.
    x = _mm_unpacklo_epi64(rows01_lo, rows23_lo);
    y = _mm_unpackhi_epi64(rows01_lo, rows23_lo);
    z = _mm_unpacklo_epi64(rows01_hi, rows23_hi);
    w = _mm_unpackhi_epi64(rows01_hi, rows23_hi);
  }
  // _mm_movemask_ps() value when all four lanes are set.
  constexpr uint32_t ALL_ON_MOVEMASK = 0xF;
  330. struct spmd_kernel
  331. {
  // Forward declarations of the kernel's value types, plus the aliases
  // int_t / vint_t / lint_t for the integer flavours.
  struct vint;
  struct lint;
  struct vbool;
  struct vfloat;

  using int_t = int;
  using vint_t = vint;
  using lint_t = lint;
  339. // Exec mask
  340. struct exec_mask
  341. {
  342. __m128i m_mask;
  343. exec_mask() = default;
  344. CPPSPMD_FORCE_INLINE explicit exec_mask(const vbool& b);
  345. CPPSPMD_FORCE_INLINE explicit exec_mask(const __m128i& mask) : m_mask(mask) { }
  346. CPPSPMD_FORCE_INLINE void enable_lane(uint32_t lane) { m_mask = _mm_load_si128((const __m128i *)&g_lane_masks_128[lane][0]); }
  347. static CPPSPMD_FORCE_INLINE exec_mask all_on() { return exec_mask{ _mm_load_si128((const __m128i*)g_allones_128) }; }
  348. static CPPSPMD_FORCE_INLINE exec_mask all_off() { return exec_mask{ _mm_setzero_si128() }; }
  349. CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return _mm_movemask_ps(_mm_castsi128_ps(m_mask)); }
  350. };
  351. friend CPPSPMD_FORCE_INLINE bool all(const exec_mask& e);
  352. friend CPPSPMD_FORCE_INLINE bool any(const exec_mask& e);
  353. CPPSPMD_FORCE_INLINE bool spmd_all() const { return all(m_exec); }
  354. CPPSPMD_FORCE_INLINE bool spmd_any() const { return any(m_exec); }
  355. CPPSPMD_FORCE_INLINE bool spmd_none() { return !any(m_exec); }
  356. // true if cond is true for all active lanes - false if no active lanes
  357. CPPSPMD_FORCE_INLINE bool spmd_all(const vbool& e) { uint32_t m = m_exec.get_movemask(); return (m != 0) && ((exec_mask(e) & m_exec).get_movemask() == m); }
  358. // true if cond is true for any active lanes
  359. CPPSPMD_FORCE_INLINE bool spmd_any(const vbool& e) { return (exec_mask(e) & m_exec).get_movemask() != 0; }
  360. CPPSPMD_FORCE_INLINE bool spmd_none(const vbool& e) { return !spmd_any(e); }
  361. friend CPPSPMD_FORCE_INLINE exec_mask operator^ (const exec_mask& a, const exec_mask& b);
  362. friend CPPSPMD_FORCE_INLINE exec_mask operator& (const exec_mask& a, const exec_mask& b);
  363. friend CPPSPMD_FORCE_INLINE exec_mask operator| (const exec_mask& a, const exec_mask& b);
  364. exec_mask m_exec;
  365. exec_mask m_kernel_exec;
  366. exec_mask m_continue_mask;
  367. #ifdef _DEBUG
  368. bool m_in_loop;
  369. #endif
  370. CPPSPMD_FORCE_INLINE uint32_t get_movemask() const { return m_exec.get_movemask(); }
  371. void init(const exec_mask& kernel_exec);
  372. // Varying bool
  373. struct vbool
  374. {
  375. __m128i m_value;
  376. vbool() = default;
  377. CPPSPMD_FORCE_INLINE vbool(bool value) : m_value(_mm_set1_epi32(value ? UINT32_MAX : 0)) { }
  378. CPPSPMD_FORCE_INLINE explicit vbool(const __m128i& value) : m_value(value) { }
  379. CPPSPMD_FORCE_INLINE explicit operator vfloat() const;
  380. CPPSPMD_FORCE_INLINE explicit operator vint() const;
  381. private:
  382. //vbool& operator=(const vbool&);
  383. };
  384. friend vbool operator!(const vbool& v);
  385. CPPSPMD_FORCE_INLINE vbool& store(vbool& dst, const vbool& src)
  386. {
  387. dst.m_value = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask);
  388. return dst;
  389. }
  390. CPPSPMD_FORCE_INLINE vbool& store_all(vbool& dst, const vbool& src)
  391. {
  392. dst.m_value = src.m_value;
  393. return dst;
  394. }
  395. // Varying float
  396. struct vfloat
  397. {
  398. __m128 m_value;
  399. vfloat() = default;
  400. CPPSPMD_FORCE_INLINE explicit vfloat(const __m128& v) : m_value(v) { }
  401. CPPSPMD_FORCE_INLINE vfloat(float value) : m_value(_mm_set1_ps(value)) { }
  402. CPPSPMD_FORCE_INLINE explicit vfloat(int value) : m_value(_mm_set1_ps((float)value)) { }
  403. private:
  404. //vfloat& operator=(const vfloat&);
  405. };
  406. CPPSPMD_FORCE_INLINE vfloat& store(vfloat& dst, const vfloat& src)
  407. {
  408. dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, _mm_castsi128_ps(m_exec.m_mask));
  409. return dst;
  410. }
  411. CPPSPMD_FORCE_INLINE vfloat& store(vfloat&& dst, const vfloat& src)
  412. {
  413. dst.m_value = blendv_mask_ps(dst.m_value, src.m_value, _mm_castsi128_ps(m_exec.m_mask));
  414. return dst;
  415. }
  416. CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat& dst, const vfloat& src)
  417. {
  418. dst.m_value = src.m_value;
  419. return dst;
  420. }
  421. CPPSPMD_FORCE_INLINE vfloat& store_all(vfloat&& dst, const vfloat& src)
  422. {
  423. dst.m_value = src.m_value;
  424. return dst;
  425. }
  426. // Linear ref to floats
  427. struct float_lref
  428. {
  429. float* m_pValue;
  430. private:
  431. //float_lref& operator=(const float_lref&);
  432. };
  433. CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref& dst, const vfloat& src)
  434. {
  435. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  436. if (mask == ALL_ON_MOVEMASK)
  437. _mm_storeu_ps(dst.m_pValue, src.m_value);
  438. else
  439. _mm_storeu_ps(dst.m_pValue, blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, _mm_castsi128_ps(m_exec.m_mask)));
  440. return dst;
  441. }
  442. CPPSPMD_FORCE_INLINE const float_lref& store(const float_lref&& dst, const vfloat& src)
  443. {
  444. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  445. if (mask == ALL_ON_MOVEMASK)
  446. _mm_storeu_ps(dst.m_pValue, src.m_value);
  447. else
  448. _mm_storeu_ps(dst.m_pValue, blendv_mask_ps(_mm_loadu_ps(dst.m_pValue), src.m_value, _mm_castsi128_ps(m_exec.m_mask)));
  449. return dst;
  450. }
  451. CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref& dst, const vfloat& src)
  452. {
  453. _mm_storeu_ps(dst.m_pValue, src.m_value);
  454. return dst;
  455. }
  456. CPPSPMD_FORCE_INLINE const float_lref& store_all(const float_lref&& dst, const vfloat& src)
  457. {
  458. _mm_storeu_ps(dst.m_pValue, src.m_value);
  459. return dst;
  460. }
  461. CPPSPMD_FORCE_INLINE vfloat load(const float_lref& src)
  462. {
  463. return vfloat{ _mm_and_ps(_mm_loadu_ps(src.m_pValue), _mm_castsi128_ps(m_exec.m_mask)) };
  464. }
  465. // Varying ref to floats
  466. struct float_vref
  467. {
  468. __m128i m_vindex;
  469. float* m_pValue;
  470. private:
  471. //float_vref& operator=(const float_vref&);
  472. };
  473. // Varying ref to varying float
  474. struct vfloat_vref
  475. {
  476. __m128i m_vindex;
  477. vfloat* m_pValue;
  478. private:
  479. //vfloat_vref& operator=(const vfloat_vref&);
  480. };
  481. // Varying ref to varying int
  482. struct vint_vref
  483. {
  484. __m128i m_vindex;
  485. vint* m_pValue;
  486. private:
  487. //vint_vref& operator=(const vint_vref&);
  488. };
  489. CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref& dst, const vfloat& src);
  490. CPPSPMD_FORCE_INLINE const float_vref& store(const float_vref&& dst, const vfloat& src);
  491. CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref& dst, const vfloat& src);
  492. CPPSPMD_FORCE_INLINE const float_vref& store_all(const float_vref&& dst, const vfloat& src);
  493. CPPSPMD_FORCE_INLINE vfloat load(const float_vref& src)
  494. {
  495. CPPSPMD_ALIGN(16) int vindex[4];
  496. _mm_store_si128((__m128i *)vindex, src.m_vindex);
  497. CPPSPMD_ALIGN(16) float loaded[4];
  498. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  499. for (int i = 0; i < 4; i++)
  500. {
  501. if (mask & (1 << i))
  502. loaded[i] = src.m_pValue[vindex[i]];
  503. }
  504. return vfloat{ _mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)loaded)) };
  505. }
  506. CPPSPMD_FORCE_INLINE vfloat load_all(const float_vref& src)
  507. {
  508. CPPSPMD_ALIGN(16) int vindex[4];
  509. _mm_store_si128((__m128i *)vindex, src.m_vindex);
  510. CPPSPMD_ALIGN(16) float loaded[4];
  511. for (int i = 0; i < 4; i++)
  512. loaded[i] = src.m_pValue[vindex[i]];
  513. return vfloat{ _mm_load_ps((const float*)loaded) };
  514. }
  515. // Linear ref to ints
  516. struct int_lref
  517. {
  518. int* m_pValue;
  519. private:
  520. //int_lref& operator=(const int_lref&);
  521. };
  522. CPPSPMD_FORCE_INLINE const int_lref& store(const int_lref& dst, const vint& src)
  523. {
  524. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  525. if (mask == ALL_ON_MOVEMASK)
  526. {
  527. _mm_storeu_si128((__m128i *)dst.m_pValue, src.m_value);
  528. }
  529. else
  530. {
  531. CPPSPMD_ALIGN(16) int stored[4];
  532. _mm_store_si128((__m128i *)stored, src.m_value);
  533. for (int i = 0; i < 4; i++)
  534. {
  535. if (mask & (1 << i))
  536. dst.m_pValue[i] = stored[i];
  537. }
  538. }
  539. return dst;
  540. }
  541. CPPSPMD_FORCE_INLINE vint load(const int_lref& src)
  542. {
  543. __m128i v = _mm_loadu_si128((const __m128i*)src.m_pValue);
  544. v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));
  545. return vint{ v };
  546. }
  547. // Linear ref to int16's
  548. struct int16_lref
  549. {
  550. int16_t* m_pValue;
  551. private:
  552. //int16_lref& operator=(const int16_lref&);
  553. };
  554. CPPSPMD_FORCE_INLINE const int16_lref& store(const int16_lref& dst, const vint& src)
  555. {
  556. CPPSPMD_ALIGN(16) int stored[4];
  557. _mm_store_si128((__m128i *)stored, src.m_value);
  558. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  559. for (int i = 0; i < 4; i++)
  560. {
  561. if (mask & (1 << i))
  562. dst.m_pValue[i] = static_cast<int16_t>(stored[i]);
  563. }
  564. return dst;
  565. }
  566. CPPSPMD_FORCE_INLINE const int16_lref& store_all(const int16_lref& dst, const vint& src)
  567. {
  568. CPPSPMD_ALIGN(16) int stored[4];
  569. _mm_store_si128((__m128i *)stored, src.m_value);
  570. for (int i = 0; i < 4; i++)
  571. dst.m_pValue[i] = static_cast<int16_t>(stored[i]);
  572. return dst;
  573. }
  574. CPPSPMD_FORCE_INLINE vint load(const int16_lref& src)
  575. {
  576. CPPSPMD_ALIGN(16) int values[4];
  577. for (int i = 0; i < 4; i++)
  578. values[i] = static_cast<int16_t>(src.m_pValue[i]);
  579. __m128i t = _mm_load_si128( (const __m128i *)values );
  580. return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps( t ), _mm_castsi128_ps(m_exec.m_mask))) };
  581. }
  582. CPPSPMD_FORCE_INLINE vint load_all(const int16_lref& src)
  583. {
  584. CPPSPMD_ALIGN(16) int values[4];
  585. for (int i = 0; i < 4; i++)
  586. values[i] = static_cast<int16_t>(src.m_pValue[i]);
  587. __m128i t = _mm_load_si128( (const __m128i *)values );
  588. return vint{ t };
  589. }
  590. // Linear ref to constant ints
  // Read-only counterpart of int_lref: the cint_lref load()/load_all() overloads
  // read four consecutive ints starting at m_pValue.
  591. struct cint_lref
  592. {
  // Address of the element that belongs to lane 0.
  593. const int* m_pValue;
  594. private:
  595. //cint_lref& operator=(const cint_lref&);
  596. };
  597. CPPSPMD_FORCE_INLINE vint load(const cint_lref& src)
  598. {
  599. __m128i v = _mm_loadu_si128((const __m128i *)src.m_pValue);
  600. v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));
  601. return vint{ v };
  602. }
  603. CPPSPMD_FORCE_INLINE vint load_all(const cint_lref& src)
  604. {
  605. return vint{ _mm_loadu_si128((const __m128i *)src.m_pValue) };
  606. }
  607. // Varying ref to ints
  // Per-lane indexed reference into an int array: the int_vref store()/load()
  // overloads access m_pValue[index] with a separate index for every lane.
  608. struct int_vref
  609. {
  // One element index per lane.
  610. __m128i m_vindex;
  // Base of the array being indexed.
  611. int* m_pValue;
  612. private:
  613. //int_vref& operator=(const int_vref&);
  614. };
  615. // Varying ref to constant ints
  // Read-only counterpart of int_vref: a base pointer plus one index per lane.
  616. struct cint_vref
  617. {
  // One index per lane (load_bytes_all/load_words_all treat it as a byte/word offset instead).
  618. __m128i m_vindex;
  // Base of the array being indexed.
  619. const int* m_pValue;
  620. private:
  621. //cint_vref& operator=(const cint_vref&);
  622. };
  623. // Varying int
  624. struct vint
  625. {
  626. __m128i m_value;
  627. vint() = default;
  628. CPPSPMD_FORCE_INLINE explicit vint(const __m128i& value) : m_value(value) { }
  629. CPPSPMD_FORCE_INLINE explicit vint(const lint &other) : m_value(other.m_value) { }
  630. CPPSPMD_FORCE_INLINE vint& operator=(const lint& other) { m_value = other.m_value; return *this; }
  631. CPPSPMD_FORCE_INLINE vint(int value) : m_value(_mm_set1_epi32(value)) { }
  632. CPPSPMD_FORCE_INLINE explicit vint(float value) : m_value(_mm_set1_epi32((int)value)) { }
  633. CPPSPMD_FORCE_INLINE explicit vint(const vfloat& other) : m_value(_mm_cvttps_epi32(other.m_value)) { }
  634. CPPSPMD_FORCE_INLINE explicit operator vbool() const
  635. {
  636. return vbool{ _mm_xor_si128( _mm_load_si128((const __m128i*)g_allones_128), _mm_cmpeq_epi32(m_value, _mm_setzero_si128())) };
  637. }
  638. CPPSPMD_FORCE_INLINE explicit operator vfloat() const
  639. {
  640. return vfloat{ _mm_cvtepi32_ps(m_value) };
  641. }
  642. CPPSPMD_FORCE_INLINE int_vref operator[](int* ptr) const
  643. {
  644. return int_vref{ m_value, ptr };
  645. }
  646. CPPSPMD_FORCE_INLINE cint_vref operator[](const int* ptr) const
  647. {
  648. return cint_vref{ m_value, ptr };
  649. }
  650. CPPSPMD_FORCE_INLINE float_vref operator[](float* ptr) const
  651. {
  652. return float_vref{ m_value, ptr };
  653. }
  654. CPPSPMD_FORCE_INLINE vfloat_vref operator[](vfloat* ptr) const
  655. {
  656. return vfloat_vref{ m_value, ptr };
  657. }
  658. CPPSPMD_FORCE_INLINE vint_vref operator[](vint* ptr) const
  659. {
  660. return vint_vref{ m_value, ptr };
  661. }
  662. private:
  663. //vint& operator=(const vint&);
  664. };
  665. // Load/store linear int
  666. CPPSPMD_FORCE_INLINE void storeu_linear(int *pDst, const vint& src)
  667. {
  668. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  669. if (mask == ALL_ON_MOVEMASK)
  670. _mm_storeu_si128((__m128i *)pDst, src.m_value);
  671. else
  672. {
  673. if (mask & 1) pDst[0] = extract_x(src.m_value);
  674. if (mask & 2) pDst[1] = extract_y(src.m_value);
  675. if (mask & 4) pDst[2] = extract_z(src.m_value);
  676. if (mask & 8) pDst[3] = extract_w(src.m_value);
  677. }
  678. }
  679. CPPSPMD_FORCE_INLINE void storeu_linear_all(int *pDst, const vint& src)
  680. {
  681. _mm_storeu_si128((__m128i*)pDst, src.m_value);
  682. }
  683. CPPSPMD_FORCE_INLINE void store_linear_all(int *pDst, const vint& src)
  684. {
  685. _mm_store_si128((__m128i*)pDst, src.m_value);
  686. }
  687. CPPSPMD_FORCE_INLINE vint loadu_linear(const int *pSrc)
  688. {
  689. __m128i v = _mm_loadu_si128((const __m128i*)pSrc);
  690. v = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(v), _mm_castsi128_ps(m_exec.m_mask)));
  691. return vint{ v };
  692. }
  693. CPPSPMD_FORCE_INLINE vint loadu_linear_all(const int *pSrc)
  694. {
  695. return vint{ _mm_loadu_si128((__m128i*)pSrc) };
  696. }
  697. CPPSPMD_FORCE_INLINE vint load_linear_all(const int *pSrc)
  698. {
  699. return vint{ _mm_load_si128((__m128i*)pSrc) };
  700. }
  701. // Load/store linear float
  702. CPPSPMD_FORCE_INLINE void storeu_linear(float *pDst, const vfloat& src)
  703. {
  704. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  705. if (mask == ALL_ON_MOVEMASK)
  706. _mm_storeu_ps((float*)pDst, src.m_value);
  707. else
  708. {
  709. int *pDstI = (int *)pDst;
  710. if (mask & 1) pDstI[0] = extract_ps_x(src.m_value);
  711. if (mask & 2) pDstI[1] = extract_ps_y(src.m_value);
  712. if (mask & 4) pDstI[2] = extract_ps_z(src.m_value);
  713. if (mask & 8) pDstI[3] = extract_ps_w(src.m_value);
  714. }
  715. }
  716. CPPSPMD_FORCE_INLINE void storeu_linear_all(float *pDst, const vfloat& src)
  717. {
  718. _mm_storeu_ps((float*)pDst, src.m_value);
  719. }
  720. CPPSPMD_FORCE_INLINE void store_linear_all(float *pDst, const vfloat& src)
  721. {
  722. _mm_store_ps((float*)pDst, src.m_value);
  723. }
  724. CPPSPMD_FORCE_INLINE vfloat loadu_linear(const float *pSrc)
  725. {
  726. __m128 v = _mm_loadu_ps((const float*)pSrc);
  727. v = _mm_and_ps(v, _mm_castsi128_ps(m_exec.m_mask));
  728. return vfloat{ v };
  729. }
  730. CPPSPMD_FORCE_INLINE vfloat loadu_linear_all(const float *pSrc)
  731. {
  732. return vfloat{ _mm_loadu_ps((float*)pSrc) };
  733. }
  734. CPPSPMD_FORCE_INLINE vfloat load_linear_all(const float *pSrc)
  735. {
  736. return vfloat{ _mm_load_ps((float*)pSrc) };
  737. }
  738. CPPSPMD_FORCE_INLINE vint& store(vint& dst, const vint& src)
  739. {
  740. dst.m_value = blendv_mask_epi32(dst.m_value, src.m_value, m_exec.m_mask);
  741. return dst;
  742. }
  743. CPPSPMD_FORCE_INLINE const int_vref& store(const int_vref& dst, const vint& src)
  744. {
  745. CPPSPMD_ALIGN(16) int vindex[4];
  746. _mm_store_si128((__m128i*)vindex, dst.m_vindex);
  747. CPPSPMD_ALIGN(16) int stored[4];
  748. _mm_store_si128((__m128i*)stored, src.m_value);
  749. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  750. for (int i = 0; i < 4; i++)
  751. {
  752. if (mask & (1 << i))
  753. dst.m_pValue[vindex[i]] = stored[i];
  754. }
  755. return dst;
  756. }
  757. CPPSPMD_FORCE_INLINE vint& store_all(vint& dst, const vint& src)
  758. {
  759. dst.m_value = src.m_value;
  760. return dst;
  761. }
  762. CPPSPMD_FORCE_INLINE const int_vref& store_all(const int_vref& dst, const vint& src)
  763. {
  764. CPPSPMD_ALIGN(16) int vindex[4];
  765. _mm_store_si128((__m128i*)vindex, dst.m_vindex);
  766. CPPSPMD_ALIGN(16) int stored[4];
  767. _mm_store_si128((__m128i*)stored, src.m_value);
  768. for (int i = 0; i < 4; i++)
  769. dst.m_pValue[vindex[i]] = stored[i];
  770. return dst;
  771. }
  772. CPPSPMD_FORCE_INLINE vint load(const int_vref& src)
  773. {
  774. CPPSPMD_ALIGN(16) int values[4];
  775. CPPSPMD_ALIGN(16) int indices[4];
  776. _mm_store_si128((__m128i *)indices, src.m_vindex);
  777. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  778. for (int i = 0; i < 4; i++)
  779. {
  780. if (mask & (1 << i))
  781. values[i] = src.m_pValue[indices[i]];
  782. }
  783. return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)values))) };
  784. }
  785. CPPSPMD_FORCE_INLINE vint load_all(const int_vref& src)
  786. {
  787. CPPSPMD_ALIGN(16) int values[4];
  788. CPPSPMD_ALIGN(16) int indices[4];
  789. _mm_store_si128((__m128i *)indices, src.m_vindex);
  790. for (int i = 0; i < 4; i++)
  791. values[i] = src.m_pValue[indices[i]];
  792. return vint{ _mm_castps_si128( _mm_load_ps((const float*)values)) };
  793. }
  794. CPPSPMD_FORCE_INLINE vint load(const cint_vref& src)
  795. {
  796. CPPSPMD_ALIGN(16) int values[4];
  797. CPPSPMD_ALIGN(16) int indices[4];
  798. _mm_store_si128((__m128i *)indices, src.m_vindex);
  799. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  800. for (int i = 0; i < 4; i++)
  801. {
  802. if (mask & (1 << i))
  803. values[i] = src.m_pValue[indices[i]];
  804. }
  805. return vint{ _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(m_exec.m_mask), _mm_load_ps((const float*)values))) };
  806. }
  807. CPPSPMD_FORCE_INLINE vint load_all(const cint_vref& src)
  808. {
  809. CPPSPMD_ALIGN(16) int values[4];
  810. CPPSPMD_ALIGN(16) int indices[4];
  811. _mm_store_si128((__m128i *)indices, src.m_vindex);
  812. for (int i = 0; i < 4; i++)
  813. values[i] = src.m_pValue[indices[i]];
  814. return vint{ _mm_castps_si128( _mm_load_ps((const float*)values)) };
  815. }
  816. CPPSPMD_FORCE_INLINE vint load_bytes_all(const cint_vref& src)
  817. {
  818. __m128i v0_l;
  819. const uint8_t* pSrc = (const uint8_t*)src.m_pValue;
  820. v0_l = insert_x(_mm_undefined_si128(), ((int*)(pSrc + extract_x(src.m_vindex)))[0]);
  821. v0_l = insert_y(v0_l, ((int*)(pSrc + extract_y(src.m_vindex)))[0]);
  822. v0_l = insert_z(v0_l, ((int*)(pSrc + extract_z(src.m_vindex)))[0]);
  823. v0_l = insert_w(v0_l, ((int*)(pSrc + extract_w(src.m_vindex)))[0]);
  824. return vint{ v0_l };
  825. }
  826. CPPSPMD_FORCE_INLINE vint load_words_all(const cint_vref& src)
  827. {
  828. __m128i v0_l;
  829. const uint8_t* pSrc = (const uint8_t*)src.m_pValue;
  830. v0_l = insert_x(_mm_undefined_si128(), ((int16_t*)(pSrc + 2 * extract_x(src.m_vindex)))[0]);
  831. v0_l = insert_y(v0_l, ((int16_t*)(pSrc + 2 * extract_y(src.m_vindex)))[0]);
  832. v0_l = insert_z(v0_l, ((int16_t*)(pSrc + 2 * extract_z(src.m_vindex)))[0]);
  833. v0_l = insert_w(v0_l, ((int16_t*)(pSrc + 2 * extract_w(src.m_vindex)))[0]);
  834. return vint{ v0_l };
  835. }
  836. CPPSPMD_FORCE_INLINE void store_strided(int *pDst, uint32_t stride, const vint &v)
  837. {
  838. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  839. if (mask & 1) pDst[0] = extract_x(v.m_value);
  840. if (mask & 2) pDst[stride] = extract_y(v.m_value);
  841. if (mask & 4) pDst[stride*2] = extract_z(v.m_value);
  842. if (mask & 8) pDst[stride*3] = extract_w(v.m_value);
  843. }
  844. CPPSPMD_FORCE_INLINE void store_strided(float *pDstF, uint32_t stride, const vfloat &v)
  845. {
  846. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  847. if (mask & 1) ((int *)pDstF)[0] = extract_ps_x(v.m_value);
  848. if (mask & 2) ((int *)pDstF)[stride] = extract_ps_y(v.m_value);
  849. if (mask & 4) ((int *)pDstF)[stride*2] = extract_ps_z(v.m_value);
  850. if (mask & 8) ((int *)pDstF)[stride*3] = extract_ps_w(v.m_value);
  851. }
  852. CPPSPMD_FORCE_INLINE void store_all_strided(int *pDst, uint32_t stride, const vint &v)
  853. {
  854. pDst[0] = extract_x(v.m_value);
  855. pDst[stride] = extract_y(v.m_value);
  856. pDst[stride*2] = extract_z(v.m_value);
  857. pDst[stride*3] = extract_w(v.m_value);
  858. }
  859. CPPSPMD_FORCE_INLINE void store_all_strided(float *pDstF, uint32_t stride, const vfloat &v)
  860. {
  861. ((int *)pDstF)[0] = extract_ps_x(v.m_value);
  862. ((int *)pDstF)[stride] = extract_ps_y(v.m_value);
  863. ((int *)pDstF)[stride*2] = extract_ps_z(v.m_value);
  864. ((int *)pDstF)[stride*3] = extract_ps_w(v.m_value);
  865. }
  866. CPPSPMD_FORCE_INLINE vint load_strided(const int *pSrc, uint32_t stride)
  867. {
  868. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  869. #if CPPSPMD_SSE2
  870. CPPSPMD_ALIGN(16) int vals[4] = { 0, 0, 0, 0 };
  871. if (mask & 1) vals[0] = pSrc[0];
  872. if (mask & 2) vals[1] = pSrc[stride];
  873. if (mask & 4) vals[2] = pSrc[stride * 2];
  874. if (mask & 8) vals[3] = pSrc[stride * 3];
  875. return vint{ _mm_load_si128((__m128i*)vals) };
  876. #else
  877. const float* pSrcF = (const float*)pSrc;
  878. __m128 v = _mm_setzero_ps();
  879. if (mask & 1) v = _mm_load_ss(pSrcF);
  880. if (mask & 2) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + stride), 0x10);
  881. if (mask & 4) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 2 * stride), 0x20);
  882. if (mask & 8) v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 3 * stride), 0x30);
  883. return vint{ _mm_castps_si128(v) };
  884. #endif
  885. }
  886. CPPSPMD_FORCE_INLINE vfloat load_strided(const float *pSrc, uint32_t stride)
  887. {
  888. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  889. #if CPPSPMD_SSE2
  890. CPPSPMD_ALIGN(16) float vals[4] = { 0, 0, 0, 0 };
  891. if (mask & 1) vals[0] = pSrc[0];
  892. if (mask & 2) vals[1] = pSrc[stride];
  893. if (mask & 4) vals[2] = pSrc[stride * 2];
  894. if (mask & 8) vals[3] = pSrc[stride * 3];
  895. return vfloat{ _mm_load_ps(vals) };
  896. #else
  897. __m128 v = _mm_setzero_ps();
  898. if (mask & 1) v = _mm_load_ss(pSrc);
  899. if (mask & 2) v = _mm_insert_ps(v, _mm_load_ss(pSrc + stride), 0x10);
  900. if (mask & 4) v = _mm_insert_ps(v, _mm_load_ss(pSrc + 2 * stride), 0x20);
  901. if (mask & 8) v = _mm_insert_ps(v, _mm_load_ss(pSrc + 3 * stride), 0x30);
  902. return vfloat{ v };
  903. #endif
  904. }
  905. CPPSPMD_FORCE_INLINE vint load_all_strided(const int *pSrc, uint32_t stride)
  906. {
  907. #if CPPSPMD_SSE2
  908. CPPSPMD_ALIGN(16) int vals[4];
  909. vals[0] = pSrc[0];
  910. vals[1] = pSrc[stride];
  911. vals[2] = pSrc[stride * 2];
  912. vals[3] = pSrc[stride * 3];
  913. return vint{ _mm_load_si128((__m128i*)vals) };
  914. #else
  915. const float* pSrcF = (const float*)pSrc;
  916. __m128 v = _mm_load_ss(pSrcF);
  917. v = _mm_insert_ps(v, _mm_load_ss(pSrcF + stride), 0x10);
  918. v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 2 * stride), 0x20);
  919. v = _mm_insert_ps(v, _mm_load_ss(pSrcF + 3 * stride), 0x30);
  920. return vint{ _mm_castps_si128(v) };
  921. #endif
  922. }
  923. CPPSPMD_FORCE_INLINE vfloat load_all_strided(const float *pSrc, uint32_t stride)
  924. {
  925. #if CPPSPMD_SSE2
  926. CPPSPMD_ALIGN(16) float vals[4];
  927. vals[0] = pSrc[0];
  928. vals[1] = pSrc[stride];
  929. vals[2] = pSrc[stride * 2];
  930. vals[3] = pSrc[stride * 3];
  931. return vfloat{ _mm_load_ps(vals) };
  932. #else
  933. __m128 v = _mm_load_ss(pSrc);
  934. v = _mm_insert_ps(v, _mm_load_ss(pSrc + stride), 0x10);
  935. v = _mm_insert_ps(v, _mm_load_ss(pSrc + 2 * stride), 0x20);
  936. v = _mm_insert_ps(v, _mm_load_ss(pSrc + 3 * stride), 0x30);
  937. return vfloat{ v };
  938. #endif
  939. }
  940. CPPSPMD_FORCE_INLINE const vfloat_vref& store(const vfloat_vref& dst, const vfloat& src)
  941. {
  942. // TODO: There's surely a better way
  943. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  944. if (mask & 1) ((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(_mm_castps_si128(src.m_value));
  945. if (mask & 2) ((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(_mm_castps_si128(src.m_value));
  946. if (mask & 4) ((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(_mm_castps_si128(src.m_value));
  947. if (mask & 8) ((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(_mm_castps_si128(src.m_value));
  948. return dst;
  949. }
  950. CPPSPMD_FORCE_INLINE vfloat load(const vfloat_vref& src)
  951. {
  952. // TODO: There's surely a better way
  953. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  954. __m128i k = _mm_setzero_si128();
  955. if (mask & 1) k = insert_x(k, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
  956. if (mask & 2) k = insert_y(k, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
  957. if (mask & 4) k = insert_z(k, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
  958. if (mask & 8) k = insert_w(k, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);
  959. return vfloat{ _mm_castsi128_ps(k) };
  960. }
  961. CPPSPMD_FORCE_INLINE const vint_vref& store(const vint_vref& dst, const vint& src)
  962. {
  963. // TODO: There's surely a better way
  964. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  965. if (mask & 1) ((int *)(&dst.m_pValue[extract_x(dst.m_vindex)]))[0] = extract_x(src.m_value);
  966. if (mask & 2) ((int *)(&dst.m_pValue[extract_y(dst.m_vindex)]))[1] = extract_y(src.m_value);
  967. if (mask & 4) ((int *)(&dst.m_pValue[extract_z(dst.m_vindex)]))[2] = extract_z(src.m_value);
  968. if (mask & 8) ((int *)(&dst.m_pValue[extract_w(dst.m_vindex)]))[3] = extract_w(src.m_value);
  969. return dst;
  970. }
  971. CPPSPMD_FORCE_INLINE vint load(const vint_vref& src)
  972. {
  973. // TODO: There's surely a better way
  974. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  975. __m128i k = _mm_setzero_si128();
  976. if (mask & 1) k = insert_x(k, ((int *)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
  977. if (mask & 2) k = insert_y(k, ((int *)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
  978. if (mask & 4) k = insert_z(k, ((int *)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
  979. if (mask & 8) k = insert_w(k, ((int *)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);
  980. return vint{ k };
  981. }
  982. CPPSPMD_FORCE_INLINE vint load_all(const vint_vref& src)
  983. {
  984. // TODO: There's surely a better way
  985. __m128i k = _mm_setzero_si128();
  986. k = insert_x(k, ((int*)(&src.m_pValue[extract_x(src.m_vindex)]))[0]);
  987. k = insert_y(k, ((int*)(&src.m_pValue[extract_y(src.m_vindex)]))[1]);
  988. k = insert_z(k, ((int*)(&src.m_pValue[extract_z(src.m_vindex)]))[2]);
  989. k = insert_w(k, ((int*)(&src.m_pValue[extract_w(src.m_vindex)]))[3]);
  990. return vint{ k };
  991. }
  992. // Linear integer
  993. struct lint
  994. {
  995. __m128i m_value;
  996. CPPSPMD_FORCE_INLINE explicit lint(__m128i value)
  997. : m_value(value)
  998. { }
  999. CPPSPMD_FORCE_INLINE explicit operator vfloat() const
  1000. {
  1001. return vfloat{ _mm_cvtepi32_ps(m_value) };
  1002. }
  1003. CPPSPMD_FORCE_INLINE explicit operator vint() const
  1004. {
  1005. return vint{ m_value };
  1006. }
  1007. CPPSPMD_FORCE_INLINE int get_first_value() const
  1008. {
  1009. return _mm_cvtsi128_si32(m_value);
  1010. }
  1011. CPPSPMD_FORCE_INLINE float_lref operator[](float* ptr) const
  1012. {
  1013. return float_lref{ ptr + get_first_value() };
  1014. }
  1015. CPPSPMD_FORCE_INLINE int_lref operator[](int* ptr) const
  1016. {
  1017. return int_lref{ ptr + get_first_value() };
  1018. }
  1019. CPPSPMD_FORCE_INLINE int16_lref operator[](int16_t* ptr) const
  1020. {
  1021. return int16_lref{ ptr + get_first_value() };
  1022. }
  1023. CPPSPMD_FORCE_INLINE cint_lref operator[](const int* ptr) const
  1024. {
  1025. return cint_lref{ ptr + get_first_value() };
  1026. }
  1027. private:
  1028. //lint& operator=(const lint&);
  1029. };
  1030. CPPSPMD_FORCE_INLINE lint& store_all(lint& dst, const lint& src)
  1031. {
  1032. dst.m_value = src.m_value;
  1033. return dst;
  1034. }
  // Per-lane index constant. _mm_set_epi32 takes its arguments from the highest
  // lane down, so lane 0 holds 0, lane 1 holds 1, lane 2 holds 2 and lane 3 holds 3.
  1035. const lint program_index = lint{ _mm_set_epi32( 3, 2, 1, 0 ) };
  1036. // SPMD condition helpers
  // Everything in this group except the non-_DEBUG check_masks() stub is a
  // declaration only; the definitions are not in this part of the file.
  // The *Body parameters are callables taken by const reference.
  1037. template<typename IfBody>
  1038. CPPSPMD_FORCE_INLINE void spmd_if(const vbool& cond, const IfBody& ifBody);
  1039. CPPSPMD_FORCE_INLINE void spmd_if_break(const vbool& cond);
  1040. // No breaks, continues, etc. allowed
  1041. template<typename IfBody>
  1042. CPPSPMD_FORCE_INLINE void spmd_sif(const vbool& cond, const IfBody& ifBody);
  1043. // No breaks, continues, etc. allowed
  1044. template<typename IfBody, typename ElseBody>
  1045. CPPSPMD_FORCE_INLINE void spmd_sifelse(const vbool& cond, const IfBody& ifBody, const ElseBody &elseBody);
  1046. template<typename IfBody, typename ElseBody>
  1047. CPPSPMD_FORCE_INLINE void spmd_ifelse(const vbool& cond, const IfBody& ifBody, const ElseBody& elseBody);
  1048. template<typename WhileCondBody, typename WhileBody>
  1049. CPPSPMD_FORCE_INLINE void spmd_while(const WhileCondBody& whileCondBody, const WhileBody& whileBody);
  1050. template<typename ForInitBody, typename ForCondBody, typename ForIncrBody, typename ForBody>
  1051. CPPSPMD_FORCE_INLINE void spmd_for(const ForInitBody& forInitBody, const ForCondBody& forCondBody, const ForIncrBody& forIncrBody, const ForBody& forBody);
  1052. template<typename ForeachBody>
  1053. CPPSPMD_FORCE_INLINE void spmd_foreach(int begin, int end, const ForeachBody& foreachBody);
  // check_masks() is only declared when _DEBUG is defined (its definition lives
  // elsewhere); otherwise it is an empty inline stub.
  1054. #ifdef _DEBUG
  1055. CPPSPMD_FORCE_INLINE void check_masks();
  1056. #else
  1057. CPPSPMD_FORCE_INLINE void check_masks() { }
  1058. #endif
  1059. CPPSPMD_FORCE_INLINE void spmd_break();
  1060. CPPSPMD_FORCE_INLINE void spmd_continue();
  1061. CPPSPMD_FORCE_INLINE void spmd_return();
  1062. template<typename UnmaskedBody>
  1063. CPPSPMD_FORCE_INLINE void spmd_unmasked(const UnmaskedBody& unmaskedBody);
  // SPMDKernel cannot be deduced from the arguments, so callers must name it
  // explicitly. The commented-out line is an alternative signature that would
  // return a value; the active declaration returns void.
  1064. template<typename SPMDKernel, typename... Args>
  1065. //CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args);
  1066. CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args);
  1067. CPPSPMD_FORCE_INLINE void swap(vint &a, vint &b) { vint temp = a; store(a, b); store(b, temp); }
  1068. CPPSPMD_FORCE_INLINE void swap(vfloat &a, vfloat &b) { vfloat temp = a; store(a, b); store(b, temp); }
  1069. CPPSPMD_FORCE_INLINE void swap(vbool &a, vbool &b) { vbool temp = a; store(a, b); store(b, temp); }
  1070. CPPSPMD_FORCE_INLINE float reduce_add(vfloat v)
  1071. {
  1072. __m128 k3210 = _mm_castsi128_ps(blendv_mask_epi32(_mm_setzero_si128(), _mm_castps_si128(v.m_value), m_exec.m_mask));
  1073. __m128 temp = _mm_add_ps(_mm_shuffle_ps(k3210, k3210, _MM_SHUFFLE(0, 1, 2, 3)), k3210);
  1074. return _mm_cvtss_f32(_mm_add_ss(_mm_movehl_ps(temp, temp), temp));
  1075. }
  1076. CPPSPMD_FORCE_INLINE int reduce_add(vint v)
  1077. {
  1078. __m128i k3210 = blendv_mask_epi32(_mm_setzero_si128(), v.m_value, m_exec.m_mask);
  1079. __m128i temp = _mm_add_epi32(_mm_shuffle_epi32(k3210, _MM_SHUFFLE(0, 1, 2, 3)), k3210);
  1080. return extract_x(_mm_add_epi32(_mm_castps_si128(_mm_movehl_ps(_mm_castsi128_ps(temp), _mm_castsi128_ps(temp))), temp));
  1081. }
  1082. #include "cppspmd_math_declares.h"
  1083. }; // struct spmd_kernel
  // Bring spmd_kernel's nested types into the enclosing scope so the free
  // functions and operators that follow can name them unqualified.
  1084. using exec_mask = spmd_kernel::exec_mask;
  1085. using vint = spmd_kernel::vint;
  1086. using int_lref = spmd_kernel::int_lref;
  1087. using cint_vref = spmd_kernel::cint_vref;
  1088. using cint_lref = spmd_kernel::cint_lref;
  1089. using int_vref = spmd_kernel::int_vref;
  1090. using lint = spmd_kernel::lint;
  1091. using vbool = spmd_kernel::vbool;
  1092. using vfloat = spmd_kernel::vfloat;
  1093. using float_lref = spmd_kernel::float_lref;
  1094. using float_vref = spmd_kernel::float_vref;
  1095. using vfloat_vref = spmd_kernel::vfloat_vref;
  1096. using vint_vref = spmd_kernel::vint_vref;
  1097. CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vfloat() const
  1098. {
  1099. return vfloat { _mm_and_ps( _mm_castsi128_ps(m_value), *(const __m128 *)g_onef_128 ) };
  1100. }
  1101. // Returns UINT32_MAX's for true, 0 for false. (Should it return 1's?)
  1102. CPPSPMD_FORCE_INLINE spmd_kernel::vbool::operator vint() const
  1103. {
  1104. return vint { m_value };
  1105. }
  1106. CPPSPMD_FORCE_INLINE vbool operator!(const vbool& v)
  1107. {
  1108. return vbool{ _mm_castps_si128(_mm_xor_ps(_mm_load_ps((const float*)g_allones_128), _mm_castsi128_ps(v.m_value))) };
  1109. }
  1110. CPPSPMD_FORCE_INLINE exec_mask::exec_mask(const vbool& b) { m_mask = b.m_value; }
  1111. CPPSPMD_FORCE_INLINE exec_mask operator^(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_xor_si128(a.m_mask, b.m_mask) }; }
  1112. CPPSPMD_FORCE_INLINE exec_mask operator&(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_and_si128(a.m_mask, b.m_mask) }; }
  1113. CPPSPMD_FORCE_INLINE exec_mask operator|(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_or_si128(a.m_mask, b.m_mask) }; }
  1114. CPPSPMD_FORCE_INLINE bool all(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) == ALL_ON_MOVEMASK; }
  1115. CPPSPMD_FORCE_INLINE bool any(const exec_mask& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_mask)) != 0; }
  1116. // Bad pattern - doesn't factor in the current exec mask. Prefer spmd_any() instead.
  1117. CPPSPMD_FORCE_INLINE bool all(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) == ALL_ON_MOVEMASK; }
  1118. CPPSPMD_FORCE_INLINE bool any(const vbool& e) { return _mm_movemask_ps(_mm_castsi128_ps(e.m_value)) != 0; }
  1119. CPPSPMD_FORCE_INLINE exec_mask andnot(const exec_mask& a, const exec_mask& b) { return exec_mask{ _mm_andnot_si128(a.m_mask, b.m_mask) }; }
  1120. CPPSPMD_FORCE_INLINE vbool operator||(const vbool& a, const vbool& b) { return vbool{ _mm_or_si128(a.m_value, b.m_value) }; }
  1121. CPPSPMD_FORCE_INLINE vbool operator&&(const vbool& a, const vbool& b) { return vbool{ _mm_and_si128(a.m_value, b.m_value) }; }
  1122. CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, const vfloat& b) { return vfloat{ _mm_add_ps(a.m_value, b.m_value) }; }
  1123. CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vfloat& b) { return vfloat{ _mm_sub_ps(a.m_value, b.m_value) }; }
  1124. CPPSPMD_FORCE_INLINE vfloat operator+(float a, const vfloat& b) { return vfloat(a) + b; }
  1125. CPPSPMD_FORCE_INLINE vfloat operator+(const vfloat& a, float b) { return a + vfloat(b); }
  1126. CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, const vint& b) { return a - vfloat(b); }
  1127. CPPSPMD_FORCE_INLINE vfloat operator-(const vint& a, const vfloat& b) { return vfloat(a) - b; }
  1128. CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, int b) { return a - vfloat(b); }
  1129. CPPSPMD_FORCE_INLINE vfloat operator-(int a, const vfloat& b) { return vfloat(a) - b; }
  1130. CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& a, float b) { return a - vfloat(b); }
  1131. CPPSPMD_FORCE_INLINE vfloat operator-(float a, const vfloat& b) { return vfloat(a) - b; }
  1132. CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, const vfloat& b) { return vfloat{ _mm_mul_ps(a.m_value, b.m_value) }; }
  1133. CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, float b) { return a * vfloat(b); }
  1134. CPPSPMD_FORCE_INLINE vfloat operator*(float a, const vfloat& b) { return vfloat(a) * b; }
  1135. CPPSPMD_FORCE_INLINE vfloat operator*(const vfloat& a, int b) { return a * vfloat(b); }
  1136. CPPSPMD_FORCE_INLINE vfloat operator*(int a, const vfloat& b) { return vfloat(a) * b; }
  1137. CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, const vfloat& b) { return vfloat{ _mm_div_ps(a.m_value, b.m_value) }; }
  1138. CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, int b) { return a / vfloat(b); }
  1139. CPPSPMD_FORCE_INLINE vfloat operator/(int a, const vfloat& b) { return vfloat(a) / b; }
  1140. CPPSPMD_FORCE_INLINE vfloat operator/(const vfloat& a, float b) { return a / vfloat(b); }
  1141. CPPSPMD_FORCE_INLINE vfloat operator/(float a, const vfloat& b) { return vfloat(a) / b; }
  1142. CPPSPMD_FORCE_INLINE vfloat operator-(const vfloat& v) { return vfloat{ _mm_sub_ps(_mm_xor_ps(v.m_value, v.m_value), v.m_value) }; }
  1143. CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; }
  1144. CPPSPMD_FORCE_INLINE vbool operator==(const vfloat& a, float b) { return a == vfloat(b); }
  1145. CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, const vfloat& b) { return !vbool{ _mm_castps_si128(_mm_cmpeq_ps(a.m_value, b.m_value)) }; }
  1146. CPPSPMD_FORCE_INLINE vbool operator!=(const vfloat& a, float b) { return a != vfloat(b); }
  1147. CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmplt_ps(a.m_value, b.m_value)) }; }
  1148. CPPSPMD_FORCE_INLINE vbool operator<(const vfloat& a, float b) { return a < vfloat(b); }
  1149. CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpgt_ps(a.m_value, b.m_value)) }; }
  1150. CPPSPMD_FORCE_INLINE vbool operator>(const vfloat& a, float b) { return a > vfloat(b); }
  1151. CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmple_ps(a.m_value, b.m_value)) }; }
  1152. CPPSPMD_FORCE_INLINE vbool operator<=(const vfloat& a, float b) { return a <= vfloat(b); }
  1153. CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, const vfloat& b) { return vbool{ _mm_castps_si128(_mm_cmpge_ps(a.m_value, b.m_value)) }; }
  1154. CPPSPMD_FORCE_INLINE vbool operator>=(const vfloat& a, float b) { return a >= vfloat(b); }
  1155. CPPSPMD_FORCE_INLINE vfloat spmd_ternaryf(const vbool& cond, const vfloat& a, const vfloat& b) { return vfloat{ blendv_mask_ps(b.m_value, a.m_value, _mm_castsi128_ps(cond.m_value)) }; }
  1156. CPPSPMD_FORCE_INLINE vint spmd_ternaryi(const vbool& cond, const vint& a, const vint& b) { return vint{ blendv_mask_epi32(b.m_value, a.m_value, cond.m_value) }; }
  1157. CPPSPMD_FORCE_INLINE vfloat sqrt(const vfloat& v) { return vfloat{ _mm_sqrt_ps(v.m_value) }; }
  1158. CPPSPMD_FORCE_INLINE vfloat abs(const vfloat& v) { return vfloat{ _mm_andnot_ps(_mm_set1_ps(-0.0f), v.m_value) }; }
  1159. CPPSPMD_FORCE_INLINE vfloat max(const vfloat& a, const vfloat& b) { return vfloat{ _mm_max_ps(a.m_value, b.m_value) }; }
  1160. CPPSPMD_FORCE_INLINE vfloat min(const vfloat& a, const vfloat& b) { return vfloat{ _mm_min_ps(a.m_value, b.m_value) }; }
  1161. #if CPPSPMD_SSE2
  1162. CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat& a)
  1163. {
  1164. __m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU) );
  1165. __m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));
  1166. __m128i ai = _mm_cvttps_epi32(a.m_value);
  1167. __m128 af = _mm_cvtepi32_ps(ai);
  1168. return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
  1169. }
  1170. CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& a)
  1171. {
  1172. __m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
  1173. __m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));
  1174. __m128i ai = _mm_cvtps_epi32(a.m_value);
  1175. __m128 af = _mm_cvtepi32_ps(ai);
  1176. __m128 changed = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmpgt_ps(af, a.m_value)));
  1177. af = _mm_add_ps(af, changed);
  1178. return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
  1179. }
  1180. CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a)
  1181. {
  1182. __m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
  1183. __m128i has_fractional = _mm_cmplt_epi32(abs_a, _mm_castps_si128(_mm_set1_ps(8388608.0f)));
  1184. __m128i ai = _mm_cvtps_epi32(a.m_value);
  1185. __m128 af = _mm_cvtepi32_ps(ai);
  1186. __m128 changed = _mm_cvtepi32_ps(_mm_castps_si128(_mm_cmplt_ps(af, a.m_value)));
  1187. af = _mm_sub_ps(af, changed);
  1188. return vfloat{ blendv_mask_ps(a.m_value, af, _mm_castsi128_ps(has_fractional)) };
  1189. }
  // We need to disable unsafe math optimizations for the key operations used for rounding to nearest.
  // I wish there was a better way.
  //
  // add_sub(a, b) returns (a + b) - b, evaluated as two separate rounded
  // operations. round_nearest() relies on the intermediate rounding of (a + b),
  // so the compiler must not simplify the expression to just `a`.
  //
  // The GNU attributes are written BEFORE the declaration: g++ rejects an
  // attribute placed between the declarator and the body of a function
  // definition ("attributes are not allowed on a function-definition"), and
  // clang warns about that position under -Wgcc-compat.
#if defined(__GNUC__) && !defined(__INTEL_COMPILER) && !defined(__clang__)
  __attribute__((optimize("-fno-unsafe-math-optimizations"))) inline __m128 add_sub(__m128 a, __m128 b)
#elif defined(__clang__)
  __attribute__((optnone)) inline __m128 add_sub(__m128 a, __m128 b)
#elif defined (_MSC_VER)
#pragma float_control(push)
#pragma float_control(precise, on)
  inline __m128 add_sub(__m128 a, __m128 b)
#else
  inline __m128 add_sub(__m128 a, __m128 b)
#endif
  {
    return _mm_sub_ps(_mm_add_ps(a, b), b);
  }
#if defined (_MSC_VER)
#pragma float_control(pop)
#endif
  1209. CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat& a)
  1210. {
  1211. __m128i no_fract_fp_bits = _mm_castps_si128(_mm_set1_ps(8388608.0f));
  1212. __m128i sign_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x80000000U));
  1213. __m128 force_int = _mm_castsi128_ps(_mm_or_si128(no_fract_fp_bits, sign_a));
  1214. // Can't use individual _mm_add_ps/_mm_sub_ps - this will be optimized out with /fp:fast by clang and probably other compilers.
  1215. //__m128 temp1 = _mm_add_ps(a.m_value, force_int);
  1216. //__m128 temp2 = _mm_sub_ps(temp1, force_int);
  1217. __m128 temp2 = add_sub(a.m_value, force_int);
  1218. __m128i abs_a = _mm_and_si128(_mm_castps_si128(a.m_value), _mm_set1_epi32(0x7FFFFFFFU));
  1219. __m128i has_fractional = _mm_cmplt_epi32(abs_a, no_fract_fp_bits);
  1220. return vfloat{ blendv_mask_ps(a.m_value, temp2, _mm_castsi128_ps(has_fractional)) };
  1221. }
  1222. #else
  1223. CPPSPMD_FORCE_INLINE vfloat floor(const vfloat& v) { return vfloat{ _mm_floor_ps(v.m_value) }; }
  1224. CPPSPMD_FORCE_INLINE vfloat ceil(const vfloat& a) { return vfloat{ _mm_ceil_ps(a.m_value) }; }
  1225. CPPSPMD_FORCE_INLINE vfloat round_nearest(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC ) }; }
  1226. CPPSPMD_FORCE_INLINE vfloat round_truncate(const vfloat &a) { return vfloat{ _mm_round_ps(a.m_value, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC ) }; }
  1227. #endif
  1228. CPPSPMD_FORCE_INLINE vfloat frac(const vfloat& a) { return a - floor(a); }
  1229. CPPSPMD_FORCE_INLINE vfloat fmod(vfloat a, vfloat b) { vfloat c = frac(abs(a / b)) * abs(b); return spmd_ternaryf(a < 0, -c, c); }
  1230. CPPSPMD_FORCE_INLINE vfloat sign(const vfloat& a) { return spmd_ternaryf(a < 0.0f, 1.0f, 1.0f); }
  1231. CPPSPMD_FORCE_INLINE vint max(const vint& a, const vint& b) { return vint{ max_epi32(a.m_value, b.m_value) }; }
  1232. CPPSPMD_FORCE_INLINE vint min(const vint& a, const vint& b) { return vint{ min_epi32(a.m_value, b.m_value) }; }
  1233. CPPSPMD_FORCE_INLINE vint maxu(const vint& a, const vint& b) { return vint{ max_epu32(a.m_value, b.m_value) }; }
  1234. CPPSPMD_FORCE_INLINE vint minu(const vint& a, const vint& b) { return vint{ min_epu32(a.m_value, b.m_value) }; }
  1235. CPPSPMD_FORCE_INLINE vint abs(const vint& v) { return vint{ abs_epi32(v.m_value) }; }
  1236. CPPSPMD_FORCE_INLINE vint byteswap(const vint& v) { return vint{ shuffle_epi8(v.m_value, _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)) }; }
  1237. CPPSPMD_FORCE_INLINE vint cast_vfloat_to_vint(const vfloat& v) { return vint{ _mm_castps_si128(v.m_value) }; }
  1238. CPPSPMD_FORCE_INLINE vfloat cast_vint_to_vfloat(const vint& v) { return vfloat{ _mm_castsi128_ps(v.m_value) }; }
  1239. CPPSPMD_FORCE_INLINE vfloat clamp(const vfloat& v, const vfloat& a, const vfloat& b)
  1240. {
  1241. return vfloat{ _mm_min_ps(b.m_value, _mm_max_ps(v.m_value, a.m_value) ) };
  1242. }
  1243. CPPSPMD_FORCE_INLINE vint clamp(const vint& v, const vint& a, const vint& b)
  1244. {
  1245. return vint{ min_epi32(b.m_value, max_epi32(v.m_value, a.m_value) ) };
  1246. }
  1247. CPPSPMD_FORCE_INLINE vfloat vfma(const vfloat& a, const vfloat& b, const vfloat& c)
  1248. {
  1249. return vfloat{ _mm_add_ps(_mm_mul_ps(a.m_value, b.m_value), c.m_value) };
  1250. }
  1251. CPPSPMD_FORCE_INLINE vfloat vfms(const vfloat& a, const vfloat& b, const vfloat& c)
  1252. {
  1253. return vfloat{ _mm_sub_ps(_mm_mul_ps(a.m_value, b.m_value), c.m_value) };
  1254. }
  1255. CPPSPMD_FORCE_INLINE vfloat vfnma(const vfloat& a, const vfloat& b, const vfloat& c)
  1256. {
  1257. return vfloat{ _mm_sub_ps(c.m_value, _mm_mul_ps(a.m_value, b.m_value)) };
  1258. }
  1259. CPPSPMD_FORCE_INLINE vfloat vfnms(const vfloat& a, const vfloat& b, const vfloat& c)
  1260. {
  1261. return vfloat{ _mm_sub_ps(_mm_sub_ps(_mm_xor_ps(a.m_value, a.m_value), _mm_mul_ps(a.m_value, b.m_value)), c.m_value) };
  1262. }
  1263. CPPSPMD_FORCE_INLINE vfloat lerp(const vfloat &x, const vfloat &y, const vfloat &s) { return vfma(y - x, s, x); }
  1264. CPPSPMD_FORCE_INLINE lint operator+(int a, const lint& b) { return lint{ _mm_add_epi32(_mm_set1_epi32(a), b.m_value) }; }
  1265. CPPSPMD_FORCE_INLINE lint operator+(const lint& a, int b) { return lint{ _mm_add_epi32(a.m_value, _mm_set1_epi32(b)) }; }
  1266. CPPSPMD_FORCE_INLINE vfloat operator+(float a, const lint& b) { return vfloat(a) + vfloat(b); }
  1267. CPPSPMD_FORCE_INLINE vfloat operator+(const lint& a, float b) { return vfloat(a) + vfloat(b); }
  1268. CPPSPMD_FORCE_INLINE vfloat operator*(const lint& a, float b) { return vfloat(a) * vfloat(b); }
  1269. CPPSPMD_FORCE_INLINE vfloat operator*(float b, const lint& a) { return vfloat(a) * vfloat(b); }
  1270. CPPSPMD_FORCE_INLINE vint operator&(const vint& a, const vint& b) { return vint{ _mm_and_si128(a.m_value, b.m_value) }; }
  1271. CPPSPMD_FORCE_INLINE vint operator&(const vint& a, int b) { return a & vint(b); }
  1272. CPPSPMD_FORCE_INLINE vint andnot(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(a.m_value, b.m_value) }; }
  1273. CPPSPMD_FORCE_INLINE vint operator|(const vint& a, const vint& b) { return vint{ _mm_or_si128(a.m_value, b.m_value) }; }
  1274. CPPSPMD_FORCE_INLINE vint operator|(const vint& a, int b) { return a | vint(b); }
  1275. CPPSPMD_FORCE_INLINE vint operator^(const vint& a, const vint& b) { return vint{ _mm_xor_si128(a.m_value, b.m_value) }; }
  1276. CPPSPMD_FORCE_INLINE vint operator^(const vint& a, int b) { return a ^ vint(b); }
  1277. CPPSPMD_FORCE_INLINE vbool operator==(const vint& a, const vint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
  1278. CPPSPMD_FORCE_INLINE vbool operator!=(const vint& a, const vint& b) { return !vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
  1279. CPPSPMD_FORCE_INLINE vbool operator<(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
  1280. CPPSPMD_FORCE_INLINE vbool operator<=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
  1281. CPPSPMD_FORCE_INLINE vbool operator>=(const vint& a, const vint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
  1282. CPPSPMD_FORCE_INLINE vbool operator>(const vint& a, const vint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
  1283. CPPSPMD_FORCE_INLINE vint operator+(const vint& a, const vint& b) { return vint{ _mm_add_epi32(a.m_value, b.m_value) }; }
  1284. CPPSPMD_FORCE_INLINE vint operator-(const vint& a, const vint& b) { return vint{ _mm_sub_epi32(a.m_value, b.m_value) }; }
  1285. CPPSPMD_FORCE_INLINE vint operator+(const vint& a, int b) { return a + vint(b); }
  1286. CPPSPMD_FORCE_INLINE vint operator-(const vint& a, int b) { return a - vint(b); }
  1287. CPPSPMD_FORCE_INLINE vint operator+(int a, const vint& b) { return vint(a) + b; }
  1288. CPPSPMD_FORCE_INLINE vint operator-(int a, const vint& b) { return vint(a) - b; }
  1289. CPPSPMD_FORCE_INLINE vint operator*(const vint& a, const vint& b) { return vint{ mullo_epi32(a.m_value, b.m_value) }; }
  1290. CPPSPMD_FORCE_INLINE vint operator*(const vint& a, int b) { return a * vint(b); }
  1291. CPPSPMD_FORCE_INLINE vint operator*(int a, const vint& b) { return vint(a) * b; }
  1292. CPPSPMD_FORCE_INLINE vint mulhiu(const vint& a, const vint& b) { return vint{ mulhi_epu32(a.m_value, b.m_value) }; }
  1293. CPPSPMD_FORCE_INLINE vint operator-(const vint& v) { return vint{ _mm_sub_epi32(_mm_setzero_si128(), v.m_value) }; }
  1294. CPPSPMD_FORCE_INLINE vint operator~(const vint& a) { return vint{ -a - 1 }; }
  1295. // A few of these break the lane-based abstraction model. They are supported in SSE2, so it makes sense to support them and let the user figure it out.
  1296. CPPSPMD_FORCE_INLINE vint adds_epu8(const vint& a, const vint& b) { return vint{ _mm_adds_epu8(a.m_value, b.m_value) }; }
  1297. CPPSPMD_FORCE_INLINE vint subs_epu8(const vint& a, const vint& b) { return vint{ _mm_subs_epu8(a.m_value, b.m_value) }; }
  1298. CPPSPMD_FORCE_INLINE vint avg_epu8(const vint & a, const vint & b) { return vint{ _mm_avg_epu8(a.m_value, b.m_value) }; }
  1299. CPPSPMD_FORCE_INLINE vint max_epu8(const vint& a, const vint& b) { return vint{ _mm_max_epu8(a.m_value, b.m_value) }; }
  1300. CPPSPMD_FORCE_INLINE vint min_epu8(const vint& a, const vint& b) { return vint{ _mm_min_epu8(a.m_value, b.m_value) }; }
  1301. CPPSPMD_FORCE_INLINE vint sad_epu8(const vint& a, const vint& b) { return vint{ _mm_sad_epu8(a.m_value, b.m_value) }; }
  1302. CPPSPMD_FORCE_INLINE vint add_epi8(const vint& a, const vint& b) { return vint{ _mm_add_epi8(a.m_value, b.m_value) }; }
  1303. CPPSPMD_FORCE_INLINE vint adds_epi8(const vint& a, const vint& b) { return vint{ _mm_adds_epi8(a.m_value, b.m_value) }; }
  1304. CPPSPMD_FORCE_INLINE vint sub_epi8(const vint& a, const vint& b) { return vint{ _mm_sub_epi8(a.m_value, b.m_value) }; }
  1305. CPPSPMD_FORCE_INLINE vint subs_epi8(const vint& a, const vint& b) { return vint{ _mm_subs_epi8(a.m_value, b.m_value) }; }
  1306. CPPSPMD_FORCE_INLINE vint cmpeq_epi8(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi8(a.m_value, b.m_value) }; }
  1307. CPPSPMD_FORCE_INLINE vint cmpgt_epi8(const vint& a, const vint& b) { return vint{ _mm_cmpgt_epi8(a.m_value, b.m_value) }; }
  1308. CPPSPMD_FORCE_INLINE vint cmplt_epi8(const vint& a, const vint& b) { return vint{ _mm_cmplt_epi8(a.m_value, b.m_value) }; }
  1309. CPPSPMD_FORCE_INLINE vint unpacklo_epi8(const vint& a, const vint& b) { return vint{ _mm_unpacklo_epi8(a.m_value, b.m_value) }; }
  1310. CPPSPMD_FORCE_INLINE vint unpackhi_epi8(const vint& a, const vint& b) { return vint{ _mm_unpackhi_epi8(a.m_value, b.m_value) }; }
  1311. CPPSPMD_FORCE_INLINE int movemask_epi8(const vint& a) { return _mm_movemask_epi8(a.m_value); }
  1312. CPPSPMD_FORCE_INLINE int movemask_epi32(const vint& a) { return _mm_movemask_ps(_mm_castsi128_ps(a.m_value)); }
  1313. CPPSPMD_FORCE_INLINE vint cmple_epu8(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi8(_mm_min_epu8(a.m_value, b.m_value), a.m_value) }; }
  1314. CPPSPMD_FORCE_INLINE vint cmpge_epu8(const vint& a, const vint& b) { return vint{ cmple_epu8(b, a) }; }
  1315. CPPSPMD_FORCE_INLINE vint cmpgt_epu8(const vint& a, const vint& b) { return vint{ _mm_andnot_si128(_mm_cmpeq_epi8(a.m_value, b.m_value), _mm_cmpeq_epi8(_mm_max_epu8(a.m_value, b.m_value), a.m_value)) }; }
  1316. CPPSPMD_FORCE_INLINE vint cmplt_epu8(const vint& a, const vint& b) { return vint{ cmpgt_epu8(b, a) }; }
  1317. CPPSPMD_FORCE_INLINE vint absdiff_epu8(const vint& a, const vint& b) { return vint{ _mm_or_si128(_mm_subs_epu8(a.m_value, b.m_value), _mm_subs_epu8(b.m_value, a.m_value)) }; }
  1318. CPPSPMD_FORCE_INLINE vint blendv_epi8(const vint& a, const vint& b, const vint &mask) { return vint{ blendv_epi8(a.m_value, b.m_value, _mm_cmplt_epi8(mask.m_value, _mm_setzero_si128())) }; }
  1319. CPPSPMD_FORCE_INLINE vint blendv_epi32(const vint& a, const vint& b, const vint &mask) { return vint{ blendv_epi32(a.m_value, b.m_value, mask.m_value) }; }
  1320. CPPSPMD_FORCE_INLINE vint add_epi16(const vint& a, const vint& b) { return vint{ _mm_add_epi16(a.m_value, b.m_value) }; }
  1321. CPPSPMD_FORCE_INLINE vint adds_epi16(const vint& a, const vint& b) { return vint{ _mm_adds_epi16(a.m_value, b.m_value) }; }
  1322. CPPSPMD_FORCE_INLINE vint adds_epu16(const vint& a, const vint& b) { return vint{ _mm_adds_epu16(a.m_value, b.m_value) }; }
  1323. CPPSPMD_FORCE_INLINE vint avg_epu16(const vint& a, const vint& b) { return vint{ _mm_avg_epu16(a.m_value, b.m_value) }; }
  1324. CPPSPMD_FORCE_INLINE vint sub_epi16(const vint& a, const vint& b) { return vint{ _mm_sub_epi16(a.m_value, b.m_value) }; }
  1325. CPPSPMD_FORCE_INLINE vint subs_epi16(const vint& a, const vint& b) { return vint{ _mm_subs_epi16(a.m_value, b.m_value) }; }
  1326. CPPSPMD_FORCE_INLINE vint subs_epu16(const vint& a, const vint& b) { return vint{ _mm_subs_epu16(a.m_value, b.m_value) }; }
  1327. CPPSPMD_FORCE_INLINE vint mullo_epi16(const vint& a, const vint& b) { return vint{ _mm_mullo_epi16(a.m_value, b.m_value) }; }
  1328. CPPSPMD_FORCE_INLINE vint mulhi_epi16(const vint& a, const vint& b) { return vint{ _mm_mulhi_epi16(a.m_value, b.m_value) }; }
  1329. CPPSPMD_FORCE_INLINE vint mulhi_epu16(const vint& a, const vint& b) { return vint{ _mm_mulhi_epu16(a.m_value, b.m_value) }; }
  1330. CPPSPMD_FORCE_INLINE vint min_epi16(const vint& a, const vint& b) { return vint{ _mm_min_epi16(a.m_value, b.m_value) }; }
  1331. CPPSPMD_FORCE_INLINE vint max_epi16(const vint& a, const vint& b) { return vint{ _mm_max_epi16(a.m_value, b.m_value) }; }
  1332. CPPSPMD_FORCE_INLINE vint madd_epi16(const vint& a, const vint& b) { return vint{ _mm_madd_epi16(a.m_value, b.m_value) }; }
  1333. CPPSPMD_FORCE_INLINE vint cmpeq_epi16(const vint& a, const vint& b) { return vint{ _mm_cmpeq_epi16(a.m_value, b.m_value) }; }
  1334. CPPSPMD_FORCE_INLINE vint cmpgt_epi16(const vint& a, const vint& b) { return vint{ _mm_cmpgt_epi16(a.m_value, b.m_value) }; }
  1335. CPPSPMD_FORCE_INLINE vint cmplt_epi16(const vint& a, const vint& b) { return vint{ _mm_cmplt_epi16(a.m_value, b.m_value) }; }
  1336. CPPSPMD_FORCE_INLINE vint packs_epi16(const vint& a, const vint& b) { return vint{ _mm_packs_epi16(a.m_value, b.m_value) }; }
  1337. CPPSPMD_FORCE_INLINE vint packus_epi16(const vint& a, const vint& b) { return vint{ _mm_packus_epi16(a.m_value, b.m_value) }; }
  1338. CPPSPMD_FORCE_INLINE vint uniform_shift_left_epi16(const vint& a, const vint& b) { return vint{ _mm_sll_epi16(a.m_value, b.m_value) }; }
  1339. CPPSPMD_FORCE_INLINE vint uniform_arith_shift_right_epi16(const vint& a, const vint& b) { return vint{ _mm_sra_epi16(a.m_value, b.m_value) }; }
  1340. CPPSPMD_FORCE_INLINE vint uniform_shift_right_epi16(const vint& a, const vint& b) { return vint{ _mm_srl_epi16(a.m_value, b.m_value) }; }
  1341. #define VINT_SHIFT_LEFT_EPI16(a, b) vint(_mm_slli_epi16((a).m_value, b))
  1342. #define VINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srai_epi16((a).m_value, b))
  1343. #define VUINT_SHIFT_RIGHT_EPI16(a, b) vint(_mm_srli_epi16((a).m_value, b))
  1344. CPPSPMD_FORCE_INLINE vint undefined_vint() { return vint{ _mm_undefined_si128() }; }
  1345. CPPSPMD_FORCE_INLINE vfloat undefined_vfloat() { return vfloat{ _mm_undefined_ps() }; }
  1346. CPPSPMD_FORCE_INLINE vint zero_vint() { return vint{ _mm_setzero_si128() }; }
  1347. CPPSPMD_FORCE_INLINE vfloat zero_vfloat() { return vfloat{ _mm_setzero_ps() }; }
  1348. CPPSPMD_FORCE_INLINE vint vint_lane_set(int v0, int v1, int v2, int v3) { return vint{ _mm_set_epi32(v3, v2, v1, v0) }; }
  1349. CPPSPMD_FORCE_INLINE vfloat vfloat_lane_set(float v0, float v1, float v2, float v3) { return vfloat{ _mm_set_ps(v3, v2, v1, v0) }; }
  1350. CPPSPMD_FORCE_INLINE vint vint_lane_set_r(int v3, int v2, int v1, int v0) { return vint{ _mm_set_epi32(v3, v2, v1, v0) }; }
  1351. CPPSPMD_FORCE_INLINE vfloat vfloat_lane_set_r(float v3, float v2, float v1, float v0) { return vfloat{ _mm_set_ps(v3, v2, v1, v0) }; }
  1352. // control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int32's in each 128-bit lane.
  1353. #define VINT_LANE_SHUFFLE_EPI32(a, control) vint(_mm_shuffle_epi32((a).m_value, control))
  1354. #define VFLOAT_LANE_SHUFFLE_PS(a, b, control) vfloat(_mm_shuffle_ps((a).m_value, (b).m_value, control))
  1355. // control is an 8-bit immediate value containing 4 2-bit indices which shuffles the int16's in either the high or low 64-bit lane.
  1356. #define VINT_LANE_SHUFFLELO_EPI16(a, control) vint(_mm_shufflelo_epi16((a).m_value, control))
  1357. #define VINT_LANE_SHUFFLEHI_EPI16(a, control) vint(_mm_shufflehi_epi16((a).m_value, control))
  1358. #define VINT_LANE_SHUFFLE_MASK(a, b, c, d) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
  1359. #define VINT_LANE_SHUFFLE_MASK_R(d, c, b, a) ((a) | ((b) << 2) | ((c) << 4) | ((d) << 6))
  1360. #define VINT_LANE_SHIFT_LEFT_BYTES(a, l) vint(_mm_slli_si128((a).m_value, l))
  1361. #define VINT_LANE_SHIFT_RIGHT_BYTES(a, l) vint(_mm_srli_si128((a).m_value, l))
  1362. // Unpack and interleave 8-bit integers from the low or high half of a and b
  1363. CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi8(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi8(a.m_value, b.m_value)); }
  1364. CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi8(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi8(a.m_value, b.m_value)); }
  1365. // Unpack and interleave 16-bit integers from the low or high half of a and b
  1366. CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi16(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi16(a.m_value, b.m_value)); }
  1367. CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi16(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi16(a.m_value, b.m_value)); }
  1368. // Unpack and interleave 32-bit integers from the low or high half of a and b
  1369. CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi32(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi32(a.m_value, b.m_value)); }
  1370. CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi32(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi32(a.m_value, b.m_value)); }
  1371. // Unpack and interleave 64-bit integers from the low or high half of a and b
  1372. CPPSPMD_FORCE_INLINE vint vint_lane_unpacklo_epi64(const vint& a, const vint& b) { return vint(_mm_unpacklo_epi64(a.m_value, b.m_value)); }
  1373. CPPSPMD_FORCE_INLINE vint vint_lane_unpackhi_epi64(const vint& a, const vint& b) { return vint(_mm_unpackhi_epi64(a.m_value, b.m_value)); }
  1374. CPPSPMD_FORCE_INLINE vint vint_set1_epi8(int8_t a) { return vint(_mm_set1_epi8(a)); }
  1375. CPPSPMD_FORCE_INLINE vint vint_set1_epi16(int16_t a) { return vint(_mm_set1_epi16(a)); }
  1376. CPPSPMD_FORCE_INLINE vint vint_set1_epi32(int32_t a) { return vint(_mm_set1_epi32(a)); }
  1377. CPPSPMD_FORCE_INLINE vint vint_set1_epi64(int64_t a) { return vint(_mm_set1_epi64x(a)); }
  1378. CPPSPMD_FORCE_INLINE vint mul_epu32(const vint &a, const vint& b) { return vint(_mm_mul_epu32(a.m_value, b.m_value)); }
  1379. CPPSPMD_FORCE_INLINE vint div_epi32(const vint &a, const vint& b)
  1380. {
  1381. __m128d al = _mm_cvtepi32_pd(a.m_value);
  1382. __m128d ah = _mm_cvtepi32_pd(_mm_unpackhi_epi64(a.m_value, a.m_value));
  1383. __m128d bl = _mm_cvtepi32_pd(b.m_value);
  1384. __m128d bh = _mm_cvtepi32_pd(_mm_unpackhi_epi64(b.m_value, b.m_value));
  1385. __m128d rl = _mm_div_pd(al, bl);
  1386. __m128d rh = _mm_div_pd(ah, bh);
  1387. __m128i rli = _mm_cvttpd_epi32(rl);
  1388. __m128i rhi = _mm_cvttpd_epi32(rh);
  1389. return vint(_mm_unpacklo_epi64(rli, rhi));
  1390. }
  1391. CPPSPMD_FORCE_INLINE vint mod_epi32(const vint &a, const vint& b)
  1392. {
  1393. vint aa = abs(a), ab = abs(b);
  1394. vint q = div_epi32(aa, ab);
  1395. vint r = aa - q * ab;
  1396. return spmd_ternaryi(a < 0, -r, r);
  1397. }
  1398. CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, const vint& b)
  1399. {
  1400. return div_epi32(a, b);
  1401. }
  1402. CPPSPMD_FORCE_INLINE vint operator/ (const vint& a, int b)
  1403. {
  1404. return div_epi32(a, vint(b));
  1405. }
  1406. CPPSPMD_FORCE_INLINE vint operator% (const vint& a, const vint& b)
  1407. {
  1408. return mod_epi32(a, b);
  1409. }
  1410. CPPSPMD_FORCE_INLINE vint operator% (const vint& a, int b)
  1411. {
  1412. return mod_epi32(a, vint(b));
  1413. }
  1414. CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, const vint& b)
  1415. {
  1416. #if 0
  1417. CPPSPMD_ALIGN(32) int result[4];
  1418. result[0] = extract_x(a.m_value) << extract_x(b.m_value);
  1419. result[1] = extract_y(a.m_value) << extract_y(b.m_value);
  1420. result[2] = extract_z(a.m_value) << extract_z(b.m_value);
  1421. result[3] = extract_w(a.m_value) << extract_w(b.m_value);
  1422. return vint{ _mm_load_si128((__m128i*)result) };
  1423. #elif 0
  1424. int x = extract_x(a.m_value) << extract_x(b.m_value);
  1425. int y = extract_y(a.m_value) << extract_y(b.m_value);
  1426. int z = extract_z(a.m_value) << extract_z(b.m_value);
  1427. int w = extract_w(a.m_value) << extract_w(b.m_value);
  1428. __m128i v = insert_x(_mm_undefined_si128(), x);
  1429. v = insert_y(v, y);
  1430. v = insert_z(v, z);
  1431. return vint{ insert_w(v, w) };
  1432. #else
  1433. // What this does: shift left each b lane by 23 bits (to move the shift amount into the FP exponent position), then epi32 add to the integer rep of 1.0f, then cast that to float, then convert that to int to get fast 2^x.
  1434. return a * vint(cast_vint_to_vfloat(vint(_mm_slli_epi32(b.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f))));
  1435. #endif
  1436. }
  1437. // uniform shift left
  1438. CPPSPMD_FORCE_INLINE vint operator<< (const vint& a, int b)
  1439. {
  1440. __m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
  1441. return vint{ _mm_sll_epi32(a.m_value, bv) };
  1442. }
  1443. // uniform arithmetic shift right
  1444. CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, int b)
  1445. {
  1446. __m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
  1447. return vint{ _mm_sra_epi32(a.m_value, bv) };
  1448. }
  1449. // uniform shift right
  1450. CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, int b)
  1451. {
  1452. __m128i bv = _mm_castps_si128(_mm_and_ps(_mm_castsi128_ps(_mm_set1_epi32(b)), _mm_castsi128_ps(_mm_load_si128((const __m128i *)g_x_128))));
  1453. return vint{ _mm_srl_epi32(a.m_value, bv) };
  1454. }
  1455. CPPSPMD_FORCE_INLINE vint vuint_shift_right(const vint& a, const vint& b)
  1456. {
  1457. #if 0
  1458. CPPSPMD_ALIGN(32) int result[4];
  1459. result[0] = ((uint32_t)extract_x(a.m_value)) >> extract_x(b.m_value);
  1460. result[1] = ((uint32_t)extract_y(a.m_value)) >> extract_y(b.m_value);
  1461. result[2] = ((uint32_t)extract_z(a.m_value)) >> extract_z(b.m_value);
  1462. result[3] = ((uint32_t)extract_w(a.m_value)) >> extract_w(b.m_value);
  1463. return vint{ _mm_load_si128((__m128i*)result) };
  1464. #elif 0
  1465. uint32_t x = ((uint32_t)extract_x(a.m_value)) >> ((uint32_t)extract_x(b.m_value));
  1466. uint32_t y = ((uint32_t)extract_y(a.m_value)) >> ((uint32_t)extract_y(b.m_value));
  1467. uint32_t z = ((uint32_t)extract_z(a.m_value)) >> ((uint32_t)extract_z(b.m_value));
  1468. uint32_t w = ((uint32_t)extract_w(a.m_value)) >> ((uint32_t)extract_w(b.m_value));
  1469. __m128i v = insert_x(_mm_undefined_si128(), x);
  1470. v = insert_y(v, y);
  1471. v = insert_z(v, z);
  1472. return vint{ insert_w(v, w) };
  1473. #else
  1474. //vint inv_shift = 32 - b;
  1475. //vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)));
  1476. // Take float rep of 1.0f (0x3f800000), subtract (32<<23), subtract (shift<<23), cast to float.
  1477. vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23))));
  1478. // Now convert scale factor to integer.
  1479. vint r = vint(f);
  1480. // mulhi_epu32 (using two _mm_mul_epu32), to emulate varying shift left.
  1481. vint q(mulhi_epu32(a.m_value, r.m_value));
  1482. // Handle shift amounts of 0.
  1483. return spmd_ternaryi(b > 0, q, a);
  1484. #endif
  1485. }
  1486. CPPSPMD_FORCE_INLINE vint vuint_shift_right_not_zero(const vint& a, const vint& b)
  1487. {
  1488. //vint inv_shift = 32 - b;
  1489. //vfloat f = cast_vint_to_vfloat(vint(_mm_slli_epi32(inv_shift.m_value, 23)) + cast_vfloat_to_vint(vfloat(1.0f)));
  1490. // Take float rep of 1.0f (0x3f800000), subtract (32<<23), subtract (shift<<23), cast to float.
  1491. vfloat f = cast_vint_to_vfloat(vint(_mm_sub_epi32(_mm_set1_epi32(0x4f800000), _mm_slli_epi32(b.m_value, 23))));
  1492. // Now convert scale factor to integer.
  1493. vint r = vint(f);
  1494. // mulhi_epu32 (using two _mm_mul_epu32), to emulate varying shift left.
  1495. return vint(mulhi_epu32(a.m_value, r.m_value));
  1496. }
  1497. CPPSPMD_FORCE_INLINE vint operator>> (const vint& a, const vint& b)
  1498. {
  1499. #if 0
  1500. CPPSPMD_ALIGN(32) int result[4];
  1501. result[0] = extract_x(a.m_value) >> extract_x(b.m_value);
  1502. result[1] = extract_y(a.m_value) >> extract_y(b.m_value);
  1503. result[2] = extract_z(a.m_value) >> extract_z(b.m_value);
  1504. result[3] = extract_w(a.m_value) >> extract_w(b.m_value);
  1505. return vint{ _mm_load_si128((__m128i*)result) };
  1506. #elif 0
  1507. int x = extract_x(a.m_value) >> extract_x(b.m_value);
  1508. int y = extract_y(a.m_value) >> extract_y(b.m_value);
  1509. int z = extract_z(a.m_value) >> extract_z(b.m_value);
  1510. int w = extract_w(a.m_value) >> extract_w(b.m_value);
  1511. __m128i v = insert_x(_mm_undefined_si128(), x);
  1512. v = insert_y(v, y);
  1513. v = insert_z(v, z);
  1514. return vint{ insert_w(v, w) };
  1515. #else
  1516. vint sign_mask(_mm_cmplt_epi32(a.m_value, _mm_setzero_si128()));
  1517. vint a_shifted = vuint_shift_right(a ^ sign_mask, b) ^ sign_mask;
  1518. return a_shifted;
  1519. #endif
  1520. }
  1521. #undef VINT_SHIFT_LEFT
  1522. #undef VINT_SHIFT_RIGHT
  1523. #undef VUINT_SHIFT_RIGHT
  1524. // Shift left/right by a uniform immediate constant
  1525. #define VINT_SHIFT_LEFT(a, b) vint(_mm_slli_epi32( (a).m_value, (b) ) )
  1526. #define VINT_SHIFT_RIGHT(a, b) vint( _mm_srai_epi32( (a).m_value, (b) ) )
  1527. #define VUINT_SHIFT_RIGHT(a, b) vint( _mm_srli_epi32( (a).m_value, (b) ) )
  1528. #define VINT_ROT(x, k) (VINT_SHIFT_LEFT((x), (k)) | VUINT_SHIFT_RIGHT((x), 32 - (k)))
  1529. CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, const lint& b) { return vbool{ _mm_cmpeq_epi32(a.m_value, b.m_value) }; }
  1530. CPPSPMD_FORCE_INLINE vbool operator==(const lint& a, int b) { return vint(a) == vint(b); }
  1531. CPPSPMD_FORCE_INLINE vbool operator==(int a, const lint& b) { return vint(a) == vint(b); }
  1532. CPPSPMD_FORCE_INLINE vbool operator<(const lint& a, const lint& b) { return vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
  1533. CPPSPMD_FORCE_INLINE vbool operator>(const lint& a, const lint& b) { return vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
  1534. CPPSPMD_FORCE_INLINE vbool operator<=(const lint& a, const lint& b) { return !vbool{ _mm_cmpgt_epi32(a.m_value, b.m_value) }; }
  1535. CPPSPMD_FORCE_INLINE vbool operator>=(const lint& a, const lint& b) { return !vbool{ _mm_cmpgt_epi32(b.m_value, a.m_value) }; }
  1536. CPPSPMD_FORCE_INLINE float extract(const vfloat& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) float values[4]; _mm_store_ps(values, v.m_value); return values[instance]; }
  1537. CPPSPMD_FORCE_INLINE int extract(const vint& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance]; }
  1538. CPPSPMD_FORCE_INLINE int extract(const lint& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance]; }
  1539. CPPSPMD_FORCE_INLINE bool extract(const vbool& v, int instance) { assert(instance < 4); CPPSPMD_ALIGN(16) int values[4]; _mm_store_si128((__m128i*)values, v.m_value); return values[instance] != 0; }
  1540. #undef VINT_EXTRACT
  1541. #undef VBOOL_EXTRACT
  1542. #undef VFLOAT_EXTRACT
  1543. #if CPPSPMD_SSE2
  1544. // Pass in an immediate constant and the compiler will optimize these expressions.
  1545. #define VINT_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) )
  1546. #define VBOOL_EXTRACT(v, instance) ( ((instance) == 0) ? extract_x((v).m_value) : (((instance) == 1) ? extract_y((v).m_value) : (((instance) == 2) ? extract_z((v).m_value) : extract_w((v).m_value))) )
  1547. #define VFLOAT_EXTRACT(v, instance) ( ((instance) == 0) ? extractf_ps_x((v).m_value) : (((instance) == 1) ? extractf_ps_y((v).m_value) : (((instance) == 2) ? extractf_ps_z((v).m_value) : extractf_ps_w((v).m_value))) )
  1548. #else
  1549. CPPSPMD_FORCE_INLINE float cast_int_bits_as_float(int v) { return *(const float*)&v; }
  1550. #define VINT_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance)
  1551. #define VBOOL_EXTRACT(v, instance) _mm_extract_epi32((v).m_value, instance)
  1552. #define VFLOAT_EXTRACT(v, instance) cast_int_bits_as_float(_mm_extract_ps((v).m_value, instance))
  1553. #endif
  1554. CPPSPMD_FORCE_INLINE vfloat &insert(vfloat& v, int instance, float f)
  1555. {
  1556. assert(instance < 4);
  1557. CPPSPMD_ALIGN(16) float values[4];
  1558. _mm_store_ps(values, v.m_value);
  1559. values[instance] = f;
  1560. v.m_value = _mm_load_ps(values);
  1561. return v;
  1562. }
  1563. CPPSPMD_FORCE_INLINE vint &insert(vint& v, int instance, int i)
  1564. {
  1565. assert(instance < 4);
  1566. CPPSPMD_ALIGN(16) int values[4];
  1567. _mm_store_si128((__m128i *)values, v.m_value);
  1568. values[instance] = i;
  1569. v.m_value = _mm_load_si128((__m128i *)values);
  1570. return v;
  1571. }
  1572. CPPSPMD_FORCE_INLINE vint init_lookup4(const uint8_t pTab[16])
  1573. {
  1574. __m128i l = _mm_loadu_si128((const __m128i*)pTab);
  1575. return vint{ l };
  1576. }
  1577. CPPSPMD_FORCE_INLINE vint table_lookup4_8(const vint& a, const vint& table)
  1578. {
  1579. return vint{ shuffle_epi8(table.m_value, a.m_value) };
  1580. }
  1581. CPPSPMD_FORCE_INLINE void init_lookup5(const uint8_t pTab[32], vint& table_0, vint& table_1)
  1582. {
  1583. __m128i l = _mm_loadu_si128((const __m128i*)pTab);
  1584. __m128i h = _mm_loadu_si128((const __m128i*)(pTab + 16));
  1585. table_0.m_value = l;
  1586. table_1.m_value = h;
  1587. }
  1588. CPPSPMD_FORCE_INLINE vint table_lookup5_8(const vint& a, const vint& table_0, const vint& table_1)
  1589. {
  1590. __m128i l_0 = shuffle_epi8(table_0.m_value, a.m_value);
  1591. __m128i h_0 = shuffle_epi8(table_1.m_value, a.m_value);
  1592. __m128i m_0 = _mm_slli_epi32(a.m_value, 31 - 4);
  1593. __m128 v_0 = blendv_ps(_mm_castsi128_ps(l_0), _mm_castsi128_ps(h_0), _mm_castsi128_ps(m_0));
  1594. return vint{ _mm_castps_si128(v_0) };
  1595. }
  1596. CPPSPMD_FORCE_INLINE void init_lookup6(const uint8_t pTab[64], vint& table_0, vint& table_1, vint& table_2, vint& table_3)
  1597. {
  1598. __m128i a = _mm_loadu_si128((const __m128i*)pTab);
  1599. __m128i b = _mm_loadu_si128((const __m128i*)(pTab + 16));
  1600. __m128i c = _mm_loadu_si128((const __m128i*)(pTab + 32));
  1601. __m128i d = _mm_loadu_si128((const __m128i*)(pTab + 48));
  1602. table_0.m_value = a;
  1603. table_1.m_value = b;
  1604. table_2.m_value = c;
  1605. table_3.m_value = d;
  1606. }
  1607. CPPSPMD_FORCE_INLINE vint table_lookup6_8(const vint& a, const vint& table_0, const vint& table_1, const vint& table_2, const vint& table_3)
  1608. {
  1609. __m128i m_0 = _mm_slli_epi32(a.m_value, 31 - 4);
  1610. __m128 av_0;
  1611. {
  1612. __m128i al_0 = shuffle_epi8(table_0.m_value, a.m_value);
  1613. __m128i ah_0 = shuffle_epi8(table_1.m_value, a.m_value);
  1614. av_0 = blendv_ps(_mm_castsi128_ps(al_0), _mm_castsi128_ps(ah_0), _mm_castsi128_ps(m_0));
  1615. }
  1616. __m128 bv_0;
  1617. {
  1618. __m128i bl_0 = shuffle_epi8(table_2.m_value, a.m_value);
  1619. __m128i bh_0 = shuffle_epi8(table_3.m_value, a.m_value);
  1620. bv_0 = blendv_ps(_mm_castsi128_ps(bl_0), _mm_castsi128_ps(bh_0), _mm_castsi128_ps(m_0));
  1621. }
  1622. __m128i m2_0 = _mm_slli_epi32(a.m_value, 31 - 5);
  1623. __m128 v2_0 = blendv_ps(av_0, bv_0, _mm_castsi128_ps(m2_0));
  1624. return vint{ _mm_castps_si128(v2_0) };
  1625. }
  1626. #if 0
  1627. template<typename SPMDKernel, typename... Args>
  1628. CPPSPMD_FORCE_INLINE decltype(auto) spmd_call(Args&&... args)
  1629. {
  1630. SPMDKernel kernel;
  1631. kernel.init(exec_mask::all_on());
  1632. return kernel._call(std::forward<Args>(args)...);
  1633. }
  1634. #else
  1635. template<typename SPMDKernel, typename... Args>
  1636. CPPSPMD_FORCE_INLINE void spmd_call(Args&&... args)
  1637. {
  1638. SPMDKernel kernel;
  1639. kernel.init(exec_mask::all_on());
  1640. kernel._call(std::forward<Args>(args)...);
  1641. }
  1642. #endif
  1643. CPPSPMD_FORCE_INLINE void spmd_kernel::init(const spmd_kernel::exec_mask& kernel_exec)
  1644. {
  1645. m_exec = kernel_exec;
  1646. m_kernel_exec = kernel_exec;
  1647. m_continue_mask = exec_mask::all_off();
  1648. #ifdef _DEBUG
  1649. m_in_loop = false;
  1650. #endif
  1651. }
  1652. CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref& dst, const vfloat& src)
  1653. {
  1654. CPPSPMD_ALIGN(16) int vindex[4];
  1655. _mm_store_si128((__m128i*)vindex, dst.m_vindex);
  1656. CPPSPMD_ALIGN(16) float stored[4];
  1657. _mm_store_ps(stored, src.m_value);
  1658. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  1659. for (int i = 0; i < 4; i++)
  1660. {
  1661. if (mask & (1 << i))
  1662. dst.m_pValue[vindex[i]] = stored[i];
  1663. }
  1664. return dst;
  1665. }
  1666. CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref& dst, const vfloat& src)
  1667. {
  1668. CPPSPMD_ALIGN(16) int vindex[4];
  1669. _mm_store_si128((__m128i*)vindex, dst.m_vindex);
  1670. CPPSPMD_ALIGN(16) float stored[4];
  1671. _mm_store_ps(stored, src.m_value);
  1672. for (int i = 0; i < 4; i++)
  1673. dst.m_pValue[vindex[i]] = stored[i];
  1674. return dst;
  1675. }
  1676. CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store(const float_vref&& dst, const vfloat& src)
  1677. {
  1678. CPPSPMD_ALIGN(16) int vindex[4];
  1679. _mm_store_si128((__m128i*)vindex, dst.m_vindex);
  1680. CPPSPMD_ALIGN(16) float stored[4];
  1681. _mm_store_ps(stored, src.m_value);
  1682. int mask = _mm_movemask_ps(_mm_castsi128_ps(m_exec.m_mask));
  1683. for (int i = 0; i < 4; i++)
  1684. {
  1685. if (mask & (1 << i))
  1686. dst.m_pValue[vindex[i]] = stored[i];
  1687. }
  1688. return dst;
  1689. }
  1690. CPPSPMD_FORCE_INLINE const float_vref& spmd_kernel::store_all(const float_vref&& dst, const vfloat& src)
  1691. {
  1692. CPPSPMD_ALIGN(16) int vindex[4];
  1693. _mm_store_si128((__m128i*)vindex, dst.m_vindex);
  1694. CPPSPMD_ALIGN(16) float stored[4];
  1695. _mm_store_ps(stored, src.m_value);
  1696. for (int i = 0; i < 4; i++)
  1697. dst.m_pValue[vindex[i]] = stored[i];
  1698. return dst;
  1699. }
  1700. #include "cppspmd_flow.h"
  1701. #include "cppspmd_math.h"
  1702. } // namespace cppspmd_sse41