  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2022 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. // Hardware abstraction layer for portable SIMD math.
  24. // Covers a small intersection of SSE2 and NEON in order to reduce the number
  25. // of bugs from multiple implementations when nothing advanced is required.
  26. #ifndef DFPSR_SIMD
  27. #define DFPSR_SIMD
  28. #include <stdint.h>
  29. #include <cassert>
  30. #include "SafePointer.h"
  31. #include "../math/FVector.h"
  32. #include "../math/IVector.h"
  33. #include "../math/UVector.h"
  34. #define ALIGN16 __attribute__((aligned(16)))
  35. // To allow turning off SIMD intrinsics for testing
  36. #ifdef __SSE2__
  37. // Comment out this line to test without SSE2
  38. #define USE_SSE2
  39. #elif __ARM_NEON
  40. // Comment out this line to test without NEON
  41. #define USE_NEON 1
  42. #endif
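// Build-flag sketch (illustrative; the exact flags depend on the toolchain and are an assumption, not part of this header):
//   g++ -msse2 ...         x86 targets define __SSE2__, so the SSE2 paths above are enabled.
//   g++ -mfpu=neon ...     32-bit ARM with NEON defines __ARM_NEON; AArch64 targets define it by default.
//   With neither macro defined, the scalar emulation paths further down are compiled instead.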
  43. // Everything declared in here handles things specific for SSE.
  44. // Direct use of the macros will not provide portability to all hardware.
  45. #ifdef USE_SSE2
  46. #define USE_BASIC_SIMD
  47. #define USE_DIRECT_SIMD_MEMORY_ACCESS
  48. #include <emmintrin.h> // SSE2
  49. #ifdef __SSSE3__
  50. #include <tmmintrin.h> // SSSE3
  51. // Comment out this line to test without SSSE3
  52. #define USE_SSSE3
  53. #endif
  54. #ifdef __AVX2__
  55. #include <immintrin.h> // AVX2
  56. // Comment out this line to test without AVX2
  57. #define USE_AVX2
  58. #endif
  59. // Vector types
  60. #define SIMD_F32x4 __m128
  61. #define SIMD_U8x16 __m128i
  62. #define SIMD_U16x8 __m128i
  63. #define SIMD_U32x4 __m128i
  64. #define SIMD_I32x4 __m128i
  65. // Vector uploads in address order
  66. #define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
  67. #define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
  68. #define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
  69. #define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
  70. #define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
  71. #define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
  72. #define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  73. #define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
  74. #define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  75. #define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
  76. // Conversions
  77. #define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
  78. #define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
  79. #define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  80. #define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  81. // Unpacking conversions
  82. #define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
  83. #define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
  84. #define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
  85. #define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
  86. // Saturated packing
  87. // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
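// Why the masking below is needed: _mm_packus_epi16 reads its input as signed 16-bit lanes, so a lane of
// 0x8000 or above would be treated as negative and clamp to 0 instead of 255. Clearing the sign bit and then
// forcing such lanes up to 0x7fff keeps every lane non-negative, letting the unsigned saturation clamp them to 255.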
  88. inline SIMD_U8x16 PACK_SAT_U16_TO_U8(const SIMD_U16x8& a, const SIMD_U16x8& b) {
  89. SIMD_U16x8 mask, a2, b2;
  90. mask = _mm_set1_epi16(0x7fff);
  91. a2 = _mm_and_si128(a, mask);
  92. a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask));
  93. b2 = _mm_and_si128(b, mask);
  94. b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
  95. return _mm_packus_epi16(a2, b2);
  96. }
  97. // Reinterpret casting
  98. #define REINTERPRET_U32_TO_U8_SIMD(A) (A)
  99. #define REINTERPRET_U32_TO_U16_SIMD(A) (A)
  100. #define REINTERPRET_U8_TO_U32_SIMD(A) (A)
  101. #define REINTERPRET_U16_TO_U32_SIMD(A) (A)
  102. #define REINTERPRET_U32_TO_I32_SIMD(A) (A)
  103. #define REINTERPRET_I32_TO_U32_SIMD(A) (A)
  104. // Vector float operations returning SIMD_F32x4
  105. #define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
  106. #define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
  107. #define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
  108. // Vector integer operations returning SIMD_I32x4
  109. #define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
  110. #define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
  111. // 32-bit integer multiplications are not available on SSE2.
  112. // Vector integer operations returning SIMD_U32x4
  113. #define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
  114. #define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
  115. // 32-bit integer multiplications are not available on SSE2.
  116. // Vector integer operations returning SIMD_U16x8
  117. #define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
  118. #define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
  119. #define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
  120. // Vector integer operations returning SIMD_U8x16
  121. #define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
  122. #define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
  123. #define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
  124. // No 8-bit multiplications
  125. // Statistics
  126. #define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
  127. #define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
  128. // Bitwise
  129. #define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
  130. #define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
  131. #endif
  132. // Everything declared in here handles things specific for NEON.
  133. // Direct use of the macros will not provide portability to all hardware.
  134. #ifdef USE_NEON
  135. #define USE_BASIC_SIMD
  136. #include <arm_neon.h> // NEON
  137. // Vector types
  138. #define SIMD_F32x4 float32x4_t
  139. #define SIMD_U8x16 uint8x16_t
  140. #define SIMD_U16x8 uint16x8_t
  141. #define SIMD_U32x4 uint32x4_t
  142. #define SIMD_I32x4 int32x4_t
  143. // Vector uploads in address order
  144. inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
  145. float data[4] ALIGN16 = {a, b, c, d};
  146. return vld1q_f32(data);
  147. }
  148. inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
  149. return vdupq_n_f32(a);
  150. }
  151. inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
  152. uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
  153. uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
  154. return vld1q_u8(data);
  155. }
  156. inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
  157. return vdupq_n_u8(a);
  158. }
  159. inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
  160. uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
  161. return vld1q_u16(data);
  162. }
  163. inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
  164. return vdupq_n_u16(a);
  165. }
  166. inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  167. uint32_t data[4] ALIGN16 = {a, b, c, d};
  168. return vld1q_u32(data);
  169. }
  170. inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
  171. return vdupq_n_u32(a);
  172. }
  173. inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
  174. int32_t data[4] ALIGN16 = {a, b, c, d};
  175. return vld1q_s32(data);
  176. }
  177. inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
  178. return vdupq_n_s32(a);
  179. }
  180. // Conversions
  181. #define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
  182. #define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
  183. #define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
  184. #define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
  185. // Unpacking conversions
  186. #define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
  187. #define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
  188. #define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
  189. #define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
  190. // Saturated packing
  191. #define PACK_SAT_U16_TO_U8(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
  192. // Reinterpret casting
  193. #define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
  194. #define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
  195. #define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
  196. #define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
  197. #define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
  198. #define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
  199. // Vector float operations returning SIMD_F32x4
  200. #define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
  201. #define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
  202. #define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
  203. // Vector integer operations returning SIMD_I32x4
  204. #define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
  205. #define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
  206. #define MUL_I32_NEON(A, B) vmulq_s32(A, B)
  207. // Vector integer operations returning SIMD_U32x4
  208. #define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
  209. #define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
  210. #define MUL_U32_NEON(A, B) vmulq_u32(A, B)
  211. // Vector integer operations returning SIMD_U16x8
  212. #define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
  213. #define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
  214. #define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
  215. // Vector integer operations returning SIMD_U8x16
  216. #define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
  217. #define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
  218. #define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
  219. // No 8-bit multiplications
  220. // Statistics
  221. #define MIN_F32_SIMD(A, B) vminq_f32(A, B)
  222. #define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
  223. // Bitwise
  224. #define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
  225. #define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
  226. #endif
  227. /*
  228. The vector types (F32x4, I32x4, U32x4, U16x8, U8x16) below are intended to be portable across different CPU architectures.
  229. When this abstraction layer is mixed with handwritten SIMD intrinsics:
  230. Use "USE_SSE2" instead of "__SSE2__"
  231. Use "USE_AVX2" instead of "__AVX2__"
  232. Use "USE_NEON" instead of "__ARM_NEON"
  233. Portability exceptions:
  234. * The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
  235. Only use when USE_BASIC_SIMD is defined.
  236. Will not work on scalar emulation.
  237. * The "shared_memory" array is only defined for targets with direct access to SIMD registers. (SSE)
  238. Only use when USE_DIRECT_SIMD_MEMORY_ACCESS is defined.
  239. Will not work on NEON or scalar emulation.
  240. * The "emulated" array is ony defined when SIMD is turned off.
  241. Cannot be used when USE_BASIC_SIMD is defined.
  242. Will not work when either SSE or NEON is enabled.
  243. */
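// A minimal sketch of how the rules above are meant to be used when mixing in handwritten intrinsics.
// The helper name "addCustom" is made up for illustration and is not part of this header.
//   inline F32x4 addCustom(const F32x4 &a, const F32x4 &b) {
//       #ifdef USE_SSE2
//           return F32x4(_mm_add_ps(a.v, b.v));   // Handwritten SSE2 through the native "v" member.
//       #elif defined(USE_NEON)
//           return F32x4(vaddq_f32(a.v, b.v));    // Handwritten NEON through the native "v" member.
//       #else
//           return a + b;                         // Scalar emulation only offers the portable operators.
//       #endif
//   }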
  244. union F32x4 {
  245. #ifdef USE_BASIC_SIMD
  246. public:
  247. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  248. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  249. // Direct access cannot be done on NEON!
  250. float shared_memory[4];
  251. #endif
  252. // The SIMD vector of undefined type
  253. // Not accessible while emulating!
  254. SIMD_F32x4 v;
  255. // Construct a portable vector from a native SIMD vector
  256. explicit F32x4(const SIMD_F32x4& v) : v(v) {}
  257. // Construct a portable vector from a set of scalars
  258. F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
  259. // Construct a portable vector from a single duplicated scalar
  260. explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
  261. #else
  262. public:
  263. // Emulate a SIMD vector as an array of scalars without hardware support
  264. // Only accessible while emulating!
  265. float emulated[4];
  266. // Construct a portable vector from a set of scalars
  267. F32x4(float a1, float a2, float a3, float a4) {
  268. this->emulated[0] = a1;
  269. this->emulated[1] = a2;
  270. this->emulated[2] = a3;
  271. this->emulated[3] = a4;
  272. }
  273. // Construct a portable vector from a single duplicated scalar
  274. explicit F32x4(float scalar) {
  275. this->emulated[0] = scalar;
  276. this->emulated[1] = scalar;
  277. this->emulated[2] = scalar;
  278. this->emulated[3] = scalar;
  279. }
  280. #endif
  281. // Construct a portable SIMD vector from a pointer to aligned data
  282. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  283. static inline F32x4 readAlignedUnsafe(const float* data) {
  284. #ifdef USE_BASIC_SIMD
  285. #ifdef USE_SSE2
  286. return F32x4(_mm_load_ps(data));
  287. #elif USE_NEON
  288. return F32x4(vld1q_f32(data));
  289. #endif
  290. #else
  291. return F32x4(data[0], data[1], data[2], data[3]);
  292. #endif
  293. }
  294. // Write to aligned memory from the existing vector
  295. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  296. inline void writeAlignedUnsafe(float* data) const {
  297. #ifdef USE_BASIC_SIMD
  298. #ifdef USE_SSE2
  299. _mm_store_ps(data, this->v);
  300. #elif USE_NEON
  301. vst1q_f32(data, this->v);
  302. #endif
  303. #else
  304. data[0] = this->emulated[0];
  305. data[1] = this->emulated[1];
  306. data[2] = this->emulated[2];
  307. data[3] = this->emulated[3];
  308. #endif
  309. }
  310. #ifdef DFPSR_GEOMETRY_FVECTOR
  311. dsr::FVector4D get() const {
  312. float data[4] ALIGN16;
  313. this->writeAlignedUnsafe(data);
  314. return dsr::FVector4D(data[0], data[1], data[2], data[3]);
  315. }
  316. #endif
  317. // Bound and alignment checked reading
  318. static inline F32x4 readAligned(const dsr::SafePointer<float> data, const char* methodName) {
  319. const float* pointer = data.getUnsafe();
  320. assert(((uintptr_t)pointer & 15) == 0);
  321. #ifdef SAFE_POINTER_CHECKS
  322. data.assertInside(methodName, pointer, 16);
  323. #endif
  324. return F32x4::readAlignedUnsafe(pointer);
  325. }
  326. // Bound and alignment checked writing
  327. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  328. float* pointer = data.getUnsafe();
  329. assert(((uintptr_t)pointer & 15) == 0);
  330. #ifdef SAFE_POINTER_CHECKS
  331. data.assertInside(methodName, pointer, 16);
  332. #endif
  333. this->writeAlignedUnsafe(pointer);
  334. }
  335. // 1 / x
  336. // Useful for multiple divisions with the same denominator
  337. // Useless if the denominator is a constant
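// Both implementations below refine a hardware estimate r0 with Newton-Raphson steps, r1 = r0 * (2 - v * r0):
// the SSE2 path applies one step to _mm_rcp_ps, and the NEON path applies two steps via vrecpsq_f32.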
  338. F32x4 reciprocal() const {
  339. #ifdef USE_BASIC_SIMD
  340. #ifdef USE_SSE2
  341. // Approximate
  342. SIMD_F32x4 lowQ = _mm_rcp_ps(this->v);
  343. // Refine
  344. return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(this->v, MUL_F32_SIMD(lowQ, lowQ))));
  345. #elif USE_NEON
  346. // Approximate
  347. SIMD_F32x4 result = vrecpeq_f32(this->v);
  348. // Refine
  349. result = MUL_F32_SIMD(vrecpsq_f32(this->v, result), result);
  350. return F32x4(MUL_F32_SIMD(vrecpsq_f32(this->v, result), result));
  351. #else
  352. assert(false);
  353. return F32x4(0);
  354. #endif
  355. #else
  356. return F32x4(1.0f / this->emulated[0], 1.0f / this->emulated[1], 1.0f / this->emulated[2], 1.0f / this->emulated[3]);
  357. #endif
  358. }
  359. // 1 / sqrt(x)
  360. // Useful for normalizing vectors
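// Both implementations below refine a hardware estimate r0 with one Newton-Raphson step,
// r1 = 0.5 * r0 * (3 - v * r0 * r0), expressed through vrsqrtsq_f32 on NEON.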
  361. F32x4 reciprocalSquareRoot() const {
  362. #ifdef USE_BASIC_SIMD
  363. #ifdef USE_SSE2
  364. //__m128 reciRoot = _mm_rsqrt_ps(this->v);
  365. SIMD_F32x4 reciRoot = _mm_rsqrt_ps(this->v);
  366. SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(this->v, reciRoot), reciRoot);
  367. reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul));
  368. return F32x4(reciRoot);
  369. #elif USE_NEON
  370. // Approximate
  371. SIMD_F32x4 reciRoot = vrsqrteq_f32(this->v);
  372. // Refine
  373. reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(this->v, reciRoot), reciRoot), reciRoot);
  374. return reciRoot;
  375. #else
  376. assert(false);
  377. return F32x4(0);
  378. #endif
  379. #else
  380. return F32x4(1.0f / sqrt(this->emulated[0]), 1.0f / sqrt(this->emulated[1]), 1.0f / sqrt(this->emulated[2]), 1.0f / sqrt(this->emulated[3]));
  381. #endif
  382. }
  383. // sqrt(x)
  384. // Useful for getting lengths of vectors
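// The SSE2 path below polishes _mm_sqrt_ps with one Heron step, r1 = (r0 + v / r0) * 0.5,
// while the NEON path computes v * (1 / sqrt(v)) from the refined reciprocal square root above.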
  385. F32x4 squareRoot() const {
  386. #ifdef USE_BASIC_SIMD
  387. #ifdef USE_SSE2
  388. SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
  389. // Approximate
  390. SIMD_F32x4 root = _mm_sqrt_ps(this->v);
  391. // Refine
  392. root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(this->v, root)), half);
  393. return F32x4(root);
  394. #elif USE_NEON
  395. return F32x4(MUL_F32_SIMD(this->v, this->reciprocalSquareRoot().v));
  396. #else
  397. assert(false);
  398. return F32x4(0);
  399. #endif
  400. #else
  401. return F32x4(sqrt(this->emulated[0]), sqrt(this->emulated[1]), sqrt(this->emulated[2]), sqrt(this->emulated[3]));
  402. #endif
  403. }
  404. F32x4 clamp(float min, float max) const {
  405. #ifdef USE_BASIC_SIMD
  406. return F32x4(MIN_F32_SIMD(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(min)), LOAD_SCALAR_F32_SIMD(max)));
  407. #else
  408. float val0 = this->emulated[0];
  409. float val1 = this->emulated[1];
  410. float val2 = this->emulated[2];
  411. float val3 = this->emulated[3];
  412. if (min > val0) { val0 = min; }
  413. if (max < val0) { val0 = max; }
  414. if (min > val1) { val1 = min; }
  415. if (max < val1) { val1 = max; }
  416. if (min > val2) { val2 = min; }
  417. if (max < val2) { val2 = max; }
  418. if (min > val3) { val3 = min; }
  419. if (max < val3) { val3 = max; }
  420. return F32x4(val0, val1, val2, val3);
  421. #endif
  422. }
  423. F32x4 clampLower(float min) const {
  424. #ifdef USE_BASIC_SIMD
  425. return F32x4(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(min)));
  426. #else
  427. float val0 = this->emulated[0];
  428. float val1 = this->emulated[1];
  429. float val2 = this->emulated[2];
  430. float val3 = this->emulated[3];
  431. if (min > val0) { val0 = min; }
  432. if (min > val1) { val1 = min; }
  433. if (min > val2) { val2 = min; }
  434. if (min > val3) { val3 = min; }
  435. return F32x4(val0, val1, val2, val3);
  436. #endif
  437. }
  438. F32x4 clampUpper(float max) const {
  439. #ifdef USE_BASIC_SIMD
  440. return F32x4(MIN_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(max)));
  441. #else
  442. float val0 = this->emulated[0];
  443. float val1 = this->emulated[1];
  444. float val2 = this->emulated[2];
  445. float val3 = this->emulated[3];
  446. if (max < val0) { val0 = max; }
  447. if (max < val1) { val1 = max; }
  448. if (max < val2) { val2 = max; }
  449. if (max < val3) { val3 = max; }
  450. return F32x4(val0, val1, val2, val3);
  451. #endif
  452. }
  453. };
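// A minimal usage sketch of the portable F32x4 wrapper (illustrative only; the buffer name and values are made up):
//   ALIGN16 float buffer[4] = {1.0f, 2.0f, 3.0f, 4.0f};
//   F32x4 a = F32x4::readAlignedUnsafe(buffer);      // Load four floats from 16-byte aligned memory.
//   F32x4 b = (a * 2.0f + 1.0f).clamp(0.0f, 5.0f);   // Element-wise math through the overloaded operators.
//   b.writeAlignedUnsafe(buffer);                    // Store the result back to the same buffer.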
  454. inline dsr::String& string_toStreamIndented(dsr::String& target, const F32x4& source, const dsr::ReadableString& indentation) {
  455. string_append(target, indentation, source.get());
  456. return target;
  457. }
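// Note that equality for F32x4 is approximate: each lane must match within a fixed tolerance of 0.0001.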
  458. inline bool operator==(const F32x4& left, const F32x4& right) {
  459. float a[4] ALIGN16;
  460. float b[4] ALIGN16;
  461. left.writeAlignedUnsafe(a);
  462. right.writeAlignedUnsafe(b);
  463. return fabs(a[0] - b[0]) < 0.0001f && fabs(a[1] - b[1]) < 0.0001f && fabs(a[2] - b[2]) < 0.0001f && fabs(a[3] - b[3]) < 0.0001f;
  464. }
  465. inline bool operator!=(const F32x4& left, const F32x4& right) {
  466. return !(left == right);
  467. }
  468. inline F32x4 operator+(const F32x4& left, const F32x4& right) {
  469. #ifdef USE_BASIC_SIMD
  470. return F32x4(ADD_F32_SIMD(left.v, right.v));
  471. #else
  472. return F32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
  473. #endif
  474. }
  475. inline F32x4 operator+(float left, const F32x4& right) {
  476. #ifdef USE_BASIC_SIMD
  477. return F32x4(ADD_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
  478. #else
  479. return F32x4(left + right.emulated[0], left + right.emulated[1], left + right.emulated[2], left + right.emulated[3]);
  480. #endif
  481. }
  482. inline F32x4 operator+(const F32x4& left, float right) {
  483. #ifdef USE_BASIC_SIMD
  484. return F32x4(ADD_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
  485. #else
  486. return F32x4(left.emulated[0] + right, left.emulated[1] + right, left.emulated[2] + right, left.emulated[3] + right);
  487. #endif
  488. }
  489. inline F32x4 operator-(const F32x4& left, const F32x4& right) {
  490. #ifdef USE_BASIC_SIMD
  491. return F32x4(SUB_F32_SIMD(left.v, right.v));
  492. #else
  493. return F32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
  494. #endif
  495. }
  496. inline F32x4 operator-(float left, const F32x4& right) {
  497. #ifdef USE_BASIC_SIMD
  498. return F32x4(SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
  499. #else
  500. return F32x4(left - right.emulated[0], left - right.emulated[1], left - right.emulated[2], left - right.emulated[3]);
  501. #endif
  502. }
  503. inline F32x4 operator-(const F32x4& left, float right) {
  504. #ifdef USE_BASIC_SIMD
  505. return F32x4(SUB_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
  506. #else
  507. return F32x4(left.emulated[0] - right, left.emulated[1] - right, left.emulated[2] - right, left.emulated[3] - right);
  508. #endif
  509. }
  510. inline F32x4 operator*(const F32x4& left, const F32x4& right) {
  511. #ifdef USE_BASIC_SIMD
  512. return F32x4(MUL_F32_SIMD(left.v, right.v));
  513. #else
  514. return F32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
  515. #endif
  516. }
  517. inline F32x4 operator*(float left, const F32x4& right) {
  518. #ifdef USE_BASIC_SIMD
  519. return F32x4(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
  520. #else
  521. return F32x4(left * right.emulated[0], left * right.emulated[1], left * right.emulated[2], left * right.emulated[3]);
  522. #endif
  523. }
  524. inline F32x4 operator*(const F32x4& left, float right) {
  525. #ifdef USE_BASIC_SIMD
  526. return F32x4(MUL_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
  527. #else
  528. return F32x4(left.emulated[0] * right, left.emulated[1] * right, left.emulated[2] * right, left.emulated[3] * right);
  529. #endif
  530. }
  531. inline F32x4 min(const F32x4& left, const F32x4& right) {
  532. #ifdef USE_BASIC_SIMD
  533. return F32x4(MIN_F32_SIMD(left.v, right.v));
  534. #else
  535. float v0 = left.emulated[0];
  536. float v1 = left.emulated[1];
  537. float v2 = left.emulated[2];
  538. float v3 = left.emulated[3];
  539. float r0 = right.emulated[0];
  540. float r1 = right.emulated[1];
  541. float r2 = right.emulated[2];
  542. float r3 = right.emulated[3];
  543. if (r0 < v0) { v0 = r0; }
  544. if (r1 < v1) { v1 = r1; }
  545. if (r2 < v2) { v2 = r2; }
  546. if (r3 < v3) { v3 = r3; }
  547. return F32x4(v0, v1, v2, v3);
  548. #endif
  549. }
  550. inline F32x4 max(const F32x4& left, const F32x4& right) {
  551. #ifdef USE_BASIC_SIMD
  552. return F32x4(MAX_F32_SIMD(left.v, right.v));
  553. #else
  554. float v0 = left.emulated[0];
  555. float v1 = left.emulated[1];
  556. float v2 = left.emulated[2];
  557. float v3 = left.emulated[3];
  558. float r0 = right.emulated[0];
  559. float r1 = right.emulated[1];
  560. float r2 = right.emulated[2];
  561. float r3 = right.emulated[3];
  562. if (r0 > v0) { v0 = r0; }
  563. if (r1 > v1) { v1 = r1; }
  564. if (r2 > v2) { v2 = r2; }
  565. if (r3 > v3) { v3 = r3; }
  566. return F32x4(v0, v1, v2, v3);
  567. #endif
  568. }
  569. union I32x4 {
  570. #ifdef USE_BASIC_SIMD
  571. public:
  572. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  573. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  574. // Direct access cannot be done on NEON!
  575. int32_t shared_memory[4];
  576. #endif
  577. // The SIMD vector of undefined type
  578. // Not accessible while emulating!
  579. SIMD_I32x4 v;
  580. // Construct a portable vector from a native SIMD vector
  581. explicit I32x4(const SIMD_I32x4& v) : v(v) {}
  582. // Construct a portable vector from a set of scalars
  583. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
  584. // Construct a portable vector from a single duplicated scalar
  585. explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
  586. #else
  587. public:
  588. // Emulate a SIMD vector as an array of scalars without hardware support
  589. // Only accessible while emulating!
  590. int32_t emulated[4];
  591. // Construct a portable vector from a set of scalars
  592. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
  593. this->emulated[0] = a1;
  594. this->emulated[1] = a2;
  595. this->emulated[2] = a3;
  596. this->emulated[3] = a4;
  597. }
  598. // Construct a portable vector from a single duplicated scalar
  599. explicit I32x4(int32_t scalar) {
  600. this->emulated[0] = scalar;
  601. this->emulated[1] = scalar;
  602. this->emulated[2] = scalar;
  603. this->emulated[3] = scalar;
  604. }
  605. #endif
  606. // Construct a portable SIMD vector from a pointer to aligned data
  607. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  608. static inline I32x4 readAlignedUnsafe(const int32_t* data) {
  609. #ifdef USE_BASIC_SIMD
  610. #ifdef USE_SSE2
  611. return I32x4(_mm_load_si128((const __m128i*)data));
  612. #elif USE_NEON
  613. return I32x4(vld1q_s32(data));
  614. #endif
  615. #else
  616. return I32x4(data[0], data[1], data[2], data[3]);
  617. #endif
  618. }
  619. // Write to aligned memory from the existing vector
  620. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  621. inline void writeAlignedUnsafe(int32_t* data) const {
  622. #ifdef USE_BASIC_SIMD
  623. #ifdef USE_SSE2
  624. _mm_store_si128((__m128i*)data, this->v);
  625. #elif USE_NEON
  626. vst1q_s32(data, this->v);
  627. #endif
  628. #else
  629. data[0] = this->emulated[0];
  630. data[1] = this->emulated[1];
  631. data[2] = this->emulated[2];
  632. data[3] = this->emulated[3];
  633. #endif
  634. }
  635. #ifdef DFPSR_GEOMETRY_IVECTOR
  636. dsr::IVector4D get() const {
  637. int32_t data[4] ALIGN16;
  638. this->writeAlignedUnsafe(data);
  639. return dsr::IVector4D(data[0], data[1], data[2], data[3]);
  640. }
  641. #endif
  642. // Bound and alignment checked reading
  643. static inline I32x4 readAligned(const dsr::SafePointer<int32_t> data, const char* methodName) {
  644. const int32_t* pointer = data.getUnsafe();
  645. assert(((uintptr_t)pointer & 15) == 0);
  646. #ifdef SAFE_POINTER_CHECKS
  647. data.assertInside(methodName, pointer, 16);
  648. #endif
  649. return I32x4::readAlignedUnsafe(pointer);
  650. }
  651. // Bound and alignment checked writing
  652. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  653. int32_t* pointer = data.getUnsafe();
  654. assert(((uintptr_t)pointer & 15) == 0);
  655. #ifdef SAFE_POINTER_CHECKS
  656. data.assertInside(methodName, pointer, 16);
  657. #endif
  658. this->writeAlignedUnsafe(pointer);
  659. }
  660. };
  661. inline dsr::String& string_toStreamIndented(dsr::String& target, const I32x4& source, const dsr::ReadableString& indentation) {
  662. string_append(target, indentation, source.get());
  663. return target;
  664. }
  665. inline bool operator==(const I32x4& left, const I32x4& right) {
  666. int32_t a[4] ALIGN16;
  667. int32_t b[4] ALIGN16;
  668. left.writeAlignedUnsafe(a);
  669. right.writeAlignedUnsafe(b);
  670. return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
  671. }
  672. inline bool operator!=(const I32x4& left, const I32x4& right) {
  673. return !(left == right);
  674. }
  675. inline I32x4 operator+(const I32x4& left, const I32x4& right) {
  676. #ifdef USE_BASIC_SIMD
  677. return I32x4(ADD_I32_SIMD(left.v, right.v));
  678. #else
  679. return I32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
  680. #endif
  681. }
  682. inline I32x4 operator+(int32_t left, const I32x4& right) {
  683. return I32x4(left) + right;
  684. }
  685. inline I32x4 operator+(const I32x4& left, int32_t right) {
  686. return left + I32x4(right);
  687. }
  688. inline I32x4 operator-(const I32x4& left, const I32x4& right) {
  689. #ifdef USE_BASIC_SIMD
  690. return I32x4(SUB_I32_SIMD(left.v, right.v));
  691. #else
  692. return I32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
  693. #endif
  694. }
  695. inline I32x4 operator-(int32_t left, const I32x4& right) {
  696. return I32x4(left) - right;
  697. }
  698. inline I32x4 operator-(const I32x4& left, int32_t right) {
  699. return left - I32x4(right);
  700. }
  701. inline I32x4 operator*(const I32x4& left, const I32x4& right) {
  702. #ifdef USE_BASIC_SIMD
  703. #ifdef USE_SSE2
  704. // Emulate a NEON instruction
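// SSE2 has no 32-bit low multiply (_mm_mullo_epi32 first appeared in SSE4.1),
// so each lane is multiplied separately through the shared_memory view. The same applies to U32x4 further down.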
  705. return I32x4(left.shared_memory[0] * right.shared_memory[0], left.shared_memory[1] * right.shared_memory[1], left.shared_memory[2] * right.shared_memory[2], left.shared_memory[3] * right.shared_memory[3]);
  706. #elif USE_NEON
  707. return I32x4(MUL_I32_NEON(left.v, right.v));
  708. #endif
  709. #else
  710. return I32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
  711. #endif
  712. }
  713. inline I32x4 operator*(int32_t left, const I32x4& right) {
  714. return I32x4(left) * right;
  715. }
  716. inline I32x4 operator*(const I32x4& left, int32_t right) {
  717. return left * I32x4(right);
  718. }
  719. union U32x4 {
  720. #ifdef USE_BASIC_SIMD
  721. public:
  722. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  723. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  724. // Direct access cannot be done on NEON!
  725. uint32_t shared_memory[4];
  726. #endif
  727. // The SIMD vector of undefined type
  728. // Not accessible while emulating!
  729. SIMD_U32x4 v;
  730. // Construct a portable vector from a native SIMD vector
  731. explicit U32x4(const SIMD_U32x4& v) : v(v) {}
  732. // Construct a portable vector from a set of scalars
  733. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
  734. // Construct a portable vector from a single duplicated scalar
  735. explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
  736. #else
  737. public:
  738. // Emulate a SIMD vector as an array of scalars without hardware support
  739. // Only accessible while emulating!
  740. uint32_t emulated[4];
  741. // Construct a portable vector from a set of scalars
  742. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
  743. this->emulated[0] = a1;
  744. this->emulated[1] = a2;
  745. this->emulated[2] = a3;
  746. this->emulated[3] = a4;
  747. }
  748. // Construct a portable vector from a single duplicated scalar
  749. explicit U32x4(uint32_t scalar) {
  750. this->emulated[0] = scalar;
  751. this->emulated[1] = scalar;
  752. this->emulated[2] = scalar;
  753. this->emulated[3] = scalar;
  754. }
  755. #endif
  756. // Construct a portable SIMD vector from a pointer to aligned data
  757. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  758. static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
  759. #ifdef USE_BASIC_SIMD
  760. #ifdef USE_SSE2
  761. return U32x4(_mm_load_si128((const __m128i*)data));
  762. #elif USE_NEON
  763. return U32x4(vld1q_u32(data));
  764. #endif
  765. #else
  766. return U32x4(data[0], data[1], data[2], data[3]);
  767. #endif
  768. }
  769. // Write to aligned memory from the existing vector
  770. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  771. inline void writeAlignedUnsafe(uint32_t* data) const {
  772. #ifdef USE_BASIC_SIMD
  773. #ifdef USE_SSE2
  774. _mm_store_si128((__m128i*)data, this->v);
  775. #elif USE_NEON
  776. vst1q_u32(data, this->v);
  777. #endif
  778. #else
  779. data[0] = this->emulated[0];
  780. data[1] = this->emulated[1];
  781. data[2] = this->emulated[2];
  782. data[3] = this->emulated[3];
  783. #endif
  784. }
  785. #ifdef DFPSR_GEOMETRY_UVECTOR
  786. dsr::UVector4D get() const {
  787. uint32_t data[4] ALIGN16;
  788. this->writeAlignedUnsafe(data);
  789. return dsr::UVector4D(data[0], data[1], data[2], data[3]);
  790. }
  791. #endif
  792. // Bound and alignment checked reading
  793. static inline U32x4 readAligned(const dsr::SafePointer<uint32_t> data, const char* methodName) {
  794. const uint32_t* pointer = data.getUnsafe();
  795. assert(((uintptr_t)pointer & 15) == 0);
  796. #ifdef SAFE_POINTER_CHECKS
  797. data.assertInside(methodName, pointer, 16);
  798. #endif
  799. return U32x4::readAlignedUnsafe(pointer);
  800. }
  801. // Bound and alignment checked writing
  802. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  803. uint32_t* pointer = data.getUnsafe();
  804. assert(((uintptr_t)pointer & 15) == 0);
  805. #ifdef SAFE_POINTER_CHECKS
  806. data.assertInside(methodName, pointer, 16);
  807. #endif
  808. this->writeAlignedUnsafe(pointer);
  809. }
  810. };
  811. inline dsr::String& string_toStreamIndented(dsr::String& target, const U32x4& source, const dsr::ReadableString& indentation) {
  812. string_append(target, indentation, source.get());
  813. return target;
  814. }
  815. inline bool operator==(const U32x4& left, const U32x4& right) {
  816. uint32_t a[4] ALIGN16;
  817. uint32_t b[4] ALIGN16;
  818. left.writeAlignedUnsafe(a);
  819. right.writeAlignedUnsafe(b);
  820. return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
  821. }
  822. inline bool operator!=(const U32x4& left, const U32x4& right) {
  823. return !(left == right);
  824. }
  825. inline U32x4 operator+(const U32x4& left, const U32x4& right) {
  826. #ifdef USE_BASIC_SIMD
  827. return U32x4(ADD_U32_SIMD(left.v, right.v));
  828. #else
  829. return U32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
  830. #endif
  831. }
  832. inline U32x4 operator+(uint32_t left, const U32x4& right) {
  833. return U32x4(left) + right;
  834. }
  835. inline U32x4 operator+(const U32x4& left, uint32_t right) {
  836. return left + U32x4(right);
  837. }
  838. inline U32x4 operator-(const U32x4& left, const U32x4& right) {
  839. #ifdef USE_BASIC_SIMD
  840. return U32x4(SUB_U32_SIMD(left.v, right.v));
  841. #else
  842. return U32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
  843. #endif
  844. }
  845. inline U32x4 operator-(uint32_t left, const U32x4& right) {
  846. return U32x4(left) - right;
  847. }
  848. inline U32x4 operator-(const U32x4& left, uint32_t right) {
  849. return left - U32x4(right);
  850. }
  851. inline U32x4 operator*(const U32x4& left, const U32x4& right) {
  852. #ifdef USE_BASIC_SIMD
  853. #ifdef USE_SSE2
  854. // Emulate a NEON instruction on SSE2 registers
  855. return U32x4(left.shared_memory[0] * right.shared_memory[0], left.shared_memory[1] * right.shared_memory[1], left.shared_memory[2] * right.shared_memory[2], left.shared_memory[3] * right.shared_memory[3]);
  856. #else // NEON
  857. return U32x4(MUL_U32_NEON(left.v, right.v));
  858. #endif
  859. #else
  860. return U32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
  861. #endif
  862. }
  863. inline U32x4 operator*(uint32_t left, const U32x4& right) {
  864. return U32x4(left) * right;
  865. }
  866. inline U32x4 operator*(const U32x4& left, uint32_t right) {
  867. return left * U32x4(right);
  868. }
  869. inline U32x4 operator&(const U32x4& left, const U32x4& right) {
  870. #ifdef USE_BASIC_SIMD
  871. return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
  872. #else
  873. return U32x4(left.emulated[0] & right.emulated[0], left.emulated[1] & right.emulated[1], left.emulated[2] & right.emulated[2], left.emulated[3] & right.emulated[3]);
  874. #endif
  875. }
  876. inline U32x4 operator&(const U32x4& left, uint32_t mask) {
  877. #ifdef USE_BASIC_SIMD
  878. return U32x4(BITWISE_AND_U32_SIMD(left.v, LOAD_SCALAR_U32_SIMD(mask)));
  879. #else
  880. return U32x4(left.emulated[0] & mask, left.emulated[1] & mask, left.emulated[2] & mask, left.emulated[3] & mask);
  881. #endif
  882. }
  883. inline U32x4 operator|(const U32x4& left, const U32x4& right) {
  884. #ifdef USE_BASIC_SIMD
  885. return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
  886. #else
  887. return U32x4(left.emulated[0] | right.emulated[0], left.emulated[1] | right.emulated[1], left.emulated[2] | right.emulated[2], left.emulated[3] | right.emulated[3]);
  888. #endif
  889. }
  890. inline U32x4 operator|(const U32x4& left, uint32_t mask) {
  891. #ifdef USE_BASIC_SIMD
  892. return U32x4(BITWISE_OR_U32_SIMD(left.v, LOAD_SCALAR_U32_SIMD(mask)));
  893. #else
  894. return U32x4(left.emulated[0] | mask, left.emulated[1] | mask, left.emulated[2] | mask, left.emulated[3] | mask);
  895. #endif
  896. }
  897. inline U32x4 operator<<(const U32x4& left, uint32_t bitOffset) {
  898. #ifdef USE_SSE2
  899. return U32x4(_mm_slli_epi32(left.v, bitOffset));
  900. #else
  901. #ifdef USE_NEON
  902. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
  903. #else
  904. return U32x4(left.emulated[0] << bitOffset, left.emulated[1] << bitOffset, left.emulated[2] << bitOffset, left.emulated[3] << bitOffset);
  905. #endif
  906. #endif
  907. }
  908. inline U32x4 operator>>(const U32x4& left, uint32_t bitOffset) {
  909. #ifdef USE_SSE2
  910. return U32x4(_mm_srli_epi32(left.v, bitOffset));
  911. #else
  912. #ifdef USE_NEON
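// NEON expresses a right shift by a run-time amount as a left shift by a negative count.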
  913. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-bitOffset)));
  914. #else
  915. return U32x4(left.emulated[0] >> bitOffset, left.emulated[1] >> bitOffset, left.emulated[2] >> bitOffset, left.emulated[3] >> bitOffset);
  916. #endif
  917. #endif
  918. }
  919. union U16x8 {
  920. #ifdef USE_BASIC_SIMD
  921. public:
  922. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  923. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  924. // Direct access cannot be done on NEON!
  925. uint16_t shared_memory[8];
  926. #endif
  927. // The SIMD vector of undefined type
  928. // Not accessible while emulating!
  929. SIMD_U16x8 v;
  930. // Construct a portable vector from a native SIMD vector
  931. explicit U16x8(const SIMD_U16x8& v) : v(v) {}
  932. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  933. // Reinterpret casting is used
  934. explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
  935. // Construct a portable vector from a set of scalars
  936. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  937. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  938. // Reinterpret casting is used
  939. // TODO: Remove all reinterprets from constructors to improve readability
  940. explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
  941. // Construct a portable vector from a single duplicated scalar
  942. explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
  943. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  944. U32x4 get_U32() const {
  945. return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
  946. }
  947. #else
  948. public:
  949. // Emulate a SIMD vector as an array of scalars without hardware support
  950. // Only accessible while emulating!
  951. uint16_t emulated[8];
  952. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  953. // Reinterpret casting is used
  954. explicit U16x8(const U32x4& vector) {
  955. uint64_t *target = (uint64_t*)this->emulated;
  956. uint64_t *source = (uint64_t*)vector.emulated;
  957. target[0] = source[0];
  958. target[1] = source[1];
  959. }
  960. // Construct a portable vector from a set of scalars
  961. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
  962. this->emulated[0] = a1;
  963. this->emulated[1] = a2;
  964. this->emulated[2] = a3;
  965. this->emulated[3] = a4;
  966. this->emulated[4] = a5;
  967. this->emulated[5] = a6;
  968. this->emulated[6] = a7;
  969. this->emulated[7] = a8;
  970. }
  971. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  972. // Reinterpret casting is used
  973. explicit U16x8(uint32_t scalar) {
  974. uint32_t *target = (uint32_t*)this->emulated;
  975. target[0] = scalar;
  976. target[1] = scalar;
  977. target[2] = scalar;
  978. target[3] = scalar;
  979. }
  980. // Construct a portable vector from a single duplicated scalar
  981. explicit U16x8(uint16_t scalar) {
  982. this->emulated[0] = scalar;
  983. this->emulated[1] = scalar;
  984. this->emulated[2] = scalar;
  985. this->emulated[3] = scalar;
  986. this->emulated[4] = scalar;
  987. this->emulated[5] = scalar;
  988. this->emulated[6] = scalar;
  989. this->emulated[7] = scalar;
  990. }
  991. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  992. U32x4 get_U32() const {
  993. U32x4 result(0);
  994. uint64_t *target = (uint64_t*)result.emulated;
  995. uint64_t *source = (uint64_t*)this->emulated;
  996. target[0] = source[0];
  997. target[1] = source[1];
  998. return result;
  999. }
  1000. #endif
  1001. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  1002. //static inline U16x8 readSlow(uint16_t* data) {
  1003. // return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1004. //}
  1005. static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
  1006. #ifdef USE_BASIC_SIMD
  1007. #ifdef USE_SSE2
  1008. return U16x8(_mm_load_si128((const __m128i*)data));
  1009. #elif USE_NEON
  1010. return U16x8(vld1q_u16(data));
  1011. #endif
  1012. #else
  1013. return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1014. #endif
  1015. }
  1016. // data must be aligned to at least 8 bytes, but preferably 16 bytes
  1017. inline void writeAlignedUnsafe(uint16_t* data) const {
  1018. #ifdef USE_BASIC_SIMD
  1019. #ifdef USE_SSE2
  1020. _mm_store_si128((__m128i*)data, this->v);
  1021. #elif USE_NEON
  1022. vst1q_u16(data, this->v);
  1023. #endif
  1024. #else
  1025. data[0] = this->emulated[0];
  1026. data[1] = this->emulated[1];
  1027. data[2] = this->emulated[2];
  1028. data[3] = this->emulated[3];
  1029. data[4] = this->emulated[4];
  1030. data[5] = this->emulated[5];
  1031. data[6] = this->emulated[6];
  1032. data[7] = this->emulated[7];
  1033. #endif
  1034. }
  1035. // Bound and alignment checked reading
  1036. static inline U16x8 readAligned(const dsr::SafePointer<uint16_t> data, const char* methodName) {
  1037. const uint16_t* pointer = data.getUnsafe();
  1038. assert(((uintptr_t)pointer & 15) == 0);
  1039. #ifdef SAFE_POINTER_CHECKS
  1040. data.assertInside(methodName, pointer, 16);
  1041. #endif
  1042. return U16x8::readAlignedUnsafe(pointer);
  1043. }
  1044. // Bound and alignment checked writing
  1045. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  1046. uint16_t* pointer = data.getUnsafe();
  1047. assert(((uintptr_t)pointer & 15) == 0);
  1048. #ifdef SAFE_POINTER_CHECKS
  1049. data.assertInside(methodName, pointer, 16);
  1050. #endif
  1051. this->writeAlignedUnsafe(pointer);
  1052. }
  1053. };
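// A minimal sketch of the U32x4 <-> U16x8 reinterpretation above (names and values are made up for illustration):
//   U32x4 packed(0x00100020u);                       // Four copies of a packed 32-bit value.
//   U16x8 lanes(packed);                             // Reinterpret as eight 16-bit lanes.
//   U32x4 doubled = (lanes * (uint16_t)2).get_U32(); // Scale each 16-bit lane, then reinterpret back.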
  1054. inline dsr::String& string_toStreamIndented(dsr::String& target, const U16x8& source, const dsr::ReadableString& indentation) {
  1055. ALIGN16 uint16_t data[8];
  1056. source.writeAlignedUnsafe(data);
  1057. string_append(target, indentation, "(", data[0], ", ", data[1], ", ", data[2], ", ", data[3], ", ", data[4], ", ", data[5], ", ", data[6], ", ", data[7], ")");
  1058. return target;
  1059. }
  1060. inline bool operator==(const U16x8& left, const U16x8& right) {
  1061. ALIGN16 uint16_t a[8];
  1062. ALIGN16 uint16_t b[8];
  1063. left.writeAlignedUnsafe(a);
  1064. right.writeAlignedUnsafe(b);
  1065. return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3] && a[4] == b[4] && a[5] == b[5] && a[6] == b[6] && a[7] == b[7];
  1066. }
  1067. inline bool operator!=(const U16x8& left, const U16x8& right) {
  1068. return !(left == right);
  1069. }
  1070. inline U16x8 operator+(const U16x8& left, const U16x8& right) {
  1071. #ifdef USE_BASIC_SIMD
  1072. return U16x8(ADD_U16_SIMD(left.v, right.v));
  1073. #else
  1074. return U16x8(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3],
  1075. left.emulated[4] + right.emulated[4], left.emulated[5] + right.emulated[5], left.emulated[6] + right.emulated[6], left.emulated[7] + right.emulated[7]);
  1076. #endif
  1077. }
  1078. inline U16x8 operator+(uint16_t left, const U16x8& right) {
  1079. #ifdef USE_BASIC_SIMD
  1080. return U16x8(ADD_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
  1081. #else
  1082. return U16x8(left + right.emulated[0], left + right.emulated[1], left + right.emulated[2], left + right.emulated[3],
  1083. left + right.emulated[4], left + right.emulated[5], left + right.emulated[6], left + right.emulated[7]);
  1084. #endif
  1085. }
  1086. inline U16x8 operator+(const U16x8& left, uint16_t right) {
  1087. #ifdef USE_BASIC_SIMD
  1088. return U16x8(ADD_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
  1089. #else
  1090. return U16x8(left.emulated[0] + right, left.emulated[1] + right, left.emulated[2] + right, left.emulated[3] + right,
  1091. left.emulated[4] + right, left.emulated[5] + right, left.emulated[6] + right, left.emulated[7] + right);
  1092. #endif
  1093. }
  1094. inline U16x8 operator-(const U16x8& left, const U16x8& right) {
  1095. #ifdef USE_BASIC_SIMD
  1096. return U16x8(SUB_U16_SIMD(left.v, right.v));
  1097. #else
  1098. return U16x8(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3],
  1099. left.emulated[4] - right.emulated[4], left.emulated[5] - right.emulated[5], left.emulated[6] - right.emulated[6], left.emulated[7] - right.emulated[7]);
  1100. #endif
  1101. }
  1102. inline U16x8 operator-(uint16_t left, const U16x8& right) {
  1103. #ifdef USE_BASIC_SIMD
  1104. return U16x8(SUB_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
  1105. #else
  1106. return U16x8(left - right.emulated[0], left - right.emulated[1], left - right.emulated[2], left - right.emulated[3],
  1107. left - right.emulated[4], left - right.emulated[5], left - right.emulated[6], left - right.emulated[7]);
  1108. #endif
  1109. }
  1110. inline U16x8 operator-(const U16x8& left, uint16_t right) {
  1111. #ifdef USE_BASIC_SIMD
  1112. return U16x8(SUB_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
  1113. #else
  1114. return U16x8(left.emulated[0] - right, left.emulated[1] - right, left.emulated[2] - right, left.emulated[3] - right,
  1115. left.emulated[4] - right, left.emulated[5] - right, left.emulated[6] - right, left.emulated[7] - right);
  1116. #endif
  1117. }
  1118. inline U16x8 operator*(const U16x8& left, const U16x8& right) {
  1119. #ifdef USE_BASIC_SIMD
  1120. return U16x8(MUL_U16_SIMD(left.v, right.v));
  1121. #else
  1122. return U16x8(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3],
  1123. left.emulated[4] * right.emulated[4], left.emulated[5] * right.emulated[5], left.emulated[6] * right.emulated[6], left.emulated[7] * right.emulated[7]);
  1124. #endif
  1125. }
  1126. inline U16x8 operator*(uint16_t left, const U16x8& right) {
  1127. #ifdef USE_BASIC_SIMD
  1128. return U16x8(MUL_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
  1129. #else
  1130. return U16x8(left * right.emulated[0], left * right.emulated[1], left * right.emulated[2], left * right.emulated[3],
  1131. left * right.emulated[4], left * right.emulated[5], left * right.emulated[6], left * right.emulated[7]);
  1132. #endif
  1133. }
  1134. inline U16x8 operator*(const U16x8& left, uint16_t right) {
  1135. #ifdef USE_BASIC_SIMD
  1136. return U16x8(MUL_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
  1137. #else
  1138. return U16x8(
  1139. left.emulated[0] * right, left.emulated[1] * right, left.emulated[2] * right, left.emulated[3] * right,
  1140. left.emulated[4] * right, left.emulated[5] * right, left.emulated[6] * right, left.emulated[7] * right
  1141. );
  1142. #endif
  1143. }
union U8x16 {
	#ifdef USE_BASIC_SIMD
	public:
		#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
			// Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
			// Direct access cannot be done on NEON!
			uint8_t shared_memory[16];
		#endif
		// The SIMD vector of undefined type
		// Not accessible while emulating!
		SIMD_U8x16 v;
		// Construct a portable vector from a native SIMD vector
		explicit U8x16(const SIMD_U8x16& v) : v(v) {}
		// Construct a portable vector from a set of scalars
		U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
			uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
		: v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
		// Construct a portable vector from a single duplicated scalar
		explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
	#else
	public:
		// Emulate a SIMD vector as an array of scalars without hardware support
		// Only accessible while emulating!
		uint8_t emulated[16];
		// Construct a portable vector from a set of scalars
		U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
			uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
			this->emulated[0] = a1;
			this->emulated[1] = a2;
			this->emulated[2] = a3;
			this->emulated[3] = a4;
			this->emulated[4] = a5;
			this->emulated[5] = a6;
			this->emulated[6] = a7;
			this->emulated[7] = a8;
			this->emulated[8] = a9;
			this->emulated[9] = a10;
			this->emulated[10] = a11;
			this->emulated[11] = a12;
			this->emulated[12] = a13;
			this->emulated[13] = a14;
			this->emulated[14] = a15;
			this->emulated[15] = a16;
		}
		// Construct a portable vector from a single duplicated scalar
		explicit U8x16(uint8_t scalar) {
			this->emulated[0] = scalar;
			this->emulated[1] = scalar;
			this->emulated[2] = scalar;
			this->emulated[3] = scalar;
			this->emulated[4] = scalar;
			this->emulated[5] = scalar;
			this->emulated[6] = scalar;
			this->emulated[7] = scalar;
			this->emulated[8] = scalar;
			this->emulated[9] = scalar;
			this->emulated[10] = scalar;
			this->emulated[11] = scalar;
			this->emulated[12] = scalar;
			this->emulated[13] = scalar;
			this->emulated[14] = scalar;
			this->emulated[15] = scalar;
		}
	#endif
	static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				return U8x16(_mm_load_si128((const __m128i*)data));
			#elif USE_NEON
				return U8x16(vld1q_u8(data));
			#endif
		#else
			return U8x16(
				data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
				data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
			);
		#endif
	}
	// data must be aligned to 16 bytes (the SSE2 path uses an aligned store)
	inline void writeAlignedUnsafe(uint8_t* data) const {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				_mm_store_si128((__m128i*)data, this->v);
			#elif USE_NEON
				vst1q_u8(data, this->v);
			#endif
		#else
			data[0] = this->emulated[0];
			data[1] = this->emulated[1];
			data[2] = this->emulated[2];
			data[3] = this->emulated[3];
			data[4] = this->emulated[4];
			data[5] = this->emulated[5];
			data[6] = this->emulated[6];
			data[7] = this->emulated[7];
			data[8] = this->emulated[8];
			data[9] = this->emulated[9];
			data[10] = this->emulated[10];
			data[11] = this->emulated[11];
			data[12] = this->emulated[12];
			data[13] = this->emulated[13];
			data[14] = this->emulated[14];
			data[15] = this->emulated[15];
		#endif
	}
	// Bound and alignment checked reading
	static inline U8x16 readAligned(const dsr::SafePointer<uint8_t> data, const char* methodName) {
		const uint8_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		return U8x16::readAlignedUnsafe(pointer);
	}
	// Bound and alignment checked writing
	inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
		uint8_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		this->writeAlignedUnsafe(pointer);
	}
};
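// Usage sketch (illustrative, not part of the original header): U8x16 moves between memory and
// registers through the aligned read and write methods, with ALIGN16 keeping the buffer valid
// for the SSE2 aligned load and store:
//     ALIGN16 uint8_t pixels[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
//     U8x16 v = U8x16::readAlignedUnsafe(pixels);
//     (v + uint8_t(1)).writeAlignedUnsafe(pixels); // pixels now hold 1..16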
inline dsr::String& string_toStreamIndented(dsr::String& target, const U8x16& source, const dsr::ReadableString& indentation) {
	ALIGN16 uint8_t data[16];
	source.writeAlignedUnsafe(data);
	string_append(target, indentation,
		"(", data[0], ", ", data[1], ", ", data[2], ", ", data[3], ", ", data[4], ", ", data[5], ", ", data[6], ", ", data[7],
		", ", data[8], ", ", data[9], ", ", data[10], ", ", data[11], ", ", data[12], ", ", data[13], ", ", data[14], ", ", data[15], ")"
	);
	return target;
}
inline bool operator==(const U8x16& left, const U8x16& right) {
	ALIGN16 uint8_t a[16];
	ALIGN16 uint8_t b[16];
	left.writeAlignedUnsafe(a);
	right.writeAlignedUnsafe(b);
	return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3] && a[4] == b[4] && a[5] == b[5] && a[6] == b[6] && a[7] == b[7]
		&& a[8] == b[8] && a[9] == b[9] && a[10] == b[10] && a[11] == b[11] && a[12] == b[12] && a[13] == b[13] && a[14] == b[14] && a[15] == b[15];
}
inline bool operator!=(const U8x16& left, const U8x16& right) {
	return !(left == right);
}
inline U8x16 operator+(const U8x16& left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(ADD_U8_SIMD(left.v, right.v));
	#else
		return U8x16(
			left.emulated[0] + right.emulated[0],
			left.emulated[1] + right.emulated[1],
			left.emulated[2] + right.emulated[2],
			left.emulated[3] + right.emulated[3],
			left.emulated[4] + right.emulated[4],
			left.emulated[5] + right.emulated[5],
			left.emulated[6] + right.emulated[6],
			left.emulated[7] + right.emulated[7],
			left.emulated[8] + right.emulated[8],
			left.emulated[9] + right.emulated[9],
			left.emulated[10] + right.emulated[10],
			left.emulated[11] + right.emulated[11],
			left.emulated[12] + right.emulated[12],
			left.emulated[13] + right.emulated[13],
			left.emulated[14] + right.emulated[14],
			left.emulated[15] + right.emulated[15]
		);
	#endif
}
inline U8x16 operator+(uint8_t left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(ADD_U8_SIMD(LOAD_SCALAR_U8_SIMD(left), right.v));
	#else
		return U8x16(
			left + right.emulated[0],
			left + right.emulated[1],
			left + right.emulated[2],
			left + right.emulated[3],
			left + right.emulated[4],
			left + right.emulated[5],
			left + right.emulated[6],
			left + right.emulated[7],
			left + right.emulated[8],
			left + right.emulated[9],
			left + right.emulated[10],
			left + right.emulated[11],
			left + right.emulated[12],
			left + right.emulated[13],
			left + right.emulated[14],
			left + right.emulated[15]
		);
	#endif
}
inline U8x16 operator+(const U8x16& left, uint8_t right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(ADD_U8_SIMD(left.v, LOAD_SCALAR_U8_SIMD(right)));
	#else
		return U8x16(
			left.emulated[0] + right,
			left.emulated[1] + right,
			left.emulated[2] + right,
			left.emulated[3] + right,
			left.emulated[4] + right,
			left.emulated[5] + right,
			left.emulated[6] + right,
			left.emulated[7] + right,
			left.emulated[8] + right,
			left.emulated[9] + right,
			left.emulated[10] + right,
			left.emulated[11] + right,
			left.emulated[12] + right,
			left.emulated[13] + right,
			left.emulated[14] + right,
			left.emulated[15] + right
		);
	#endif
}
inline U8x16 operator-(const U8x16& left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(SUB_U8_SIMD(left.v, right.v));
	#else
		return U8x16(
			left.emulated[0] - right.emulated[0],
			left.emulated[1] - right.emulated[1],
			left.emulated[2] - right.emulated[2],
			left.emulated[3] - right.emulated[3],
			left.emulated[4] - right.emulated[4],
			left.emulated[5] - right.emulated[5],
			left.emulated[6] - right.emulated[6],
			left.emulated[7] - right.emulated[7],
			left.emulated[8] - right.emulated[8],
			left.emulated[9] - right.emulated[9],
			left.emulated[10] - right.emulated[10],
			left.emulated[11] - right.emulated[11],
			left.emulated[12] - right.emulated[12],
			left.emulated[13] - right.emulated[13],
			left.emulated[14] - right.emulated[14],
			left.emulated[15] - right.emulated[15]
		);
	#endif
}
inline U8x16 operator-(uint8_t left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(SUB_U8_SIMD(LOAD_SCALAR_U8_SIMD(left), right.v));
	#else
		return U8x16(
			left - right.emulated[0],
			left - right.emulated[1],
			left - right.emulated[2],
			left - right.emulated[3],
			left - right.emulated[4],
			left - right.emulated[5],
			left - right.emulated[6],
			left - right.emulated[7],
			left - right.emulated[8],
			left - right.emulated[9],
			left - right.emulated[10],
			left - right.emulated[11],
			left - right.emulated[12],
			left - right.emulated[13],
			left - right.emulated[14],
			left - right.emulated[15]
		);
	#endif
}
inline U8x16 operator-(const U8x16& left, uint8_t right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(SUB_U8_SIMD(left.v, LOAD_SCALAR_U8_SIMD(right)));
	#else
		return U8x16(
			left.emulated[0] - right,
			left.emulated[1] - right,
			left.emulated[2] - right,
			left.emulated[3] - right,
			left.emulated[4] - right,
			left.emulated[5] - right,
			left.emulated[6] - right,
			left.emulated[7] - right,
			left.emulated[8] - right,
			left.emulated[9] - right,
			left.emulated[10] - right,
			left.emulated[11] - right,
			left.emulated[12] - right,
			left.emulated[13] - right,
			left.emulated[14] - right,
			left.emulated[15] - right
		);
	#endif
}
inline uint8_t saturateToU8(uint32_t x) {
	// No need to check lower bound for unsigned input
	return x > 255 ? 255 : x;
}
inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
	#else
		return U8x16(
			saturateToU8((uint32_t)left.emulated[0] + (uint32_t)right.emulated[0]),
			saturateToU8((uint32_t)left.emulated[1] + (uint32_t)right.emulated[1]),
			saturateToU8((uint32_t)left.emulated[2] + (uint32_t)right.emulated[2]),
			saturateToU8((uint32_t)left.emulated[3] + (uint32_t)right.emulated[3]),
			saturateToU8((uint32_t)left.emulated[4] + (uint32_t)right.emulated[4]),
			saturateToU8((uint32_t)left.emulated[5] + (uint32_t)right.emulated[5]),
			saturateToU8((uint32_t)left.emulated[6] + (uint32_t)right.emulated[6]),
			saturateToU8((uint32_t)left.emulated[7] + (uint32_t)right.emulated[7]),
			saturateToU8((uint32_t)left.emulated[8] + (uint32_t)right.emulated[8]),
			saturateToU8((uint32_t)left.emulated[9] + (uint32_t)right.emulated[9]),
			saturateToU8((uint32_t)left.emulated[10] + (uint32_t)right.emulated[10]),
			saturateToU8((uint32_t)left.emulated[11] + (uint32_t)right.emulated[11]),
			saturateToU8((uint32_t)left.emulated[12] + (uint32_t)right.emulated[12]),
			saturateToU8((uint32_t)left.emulated[13] + (uint32_t)right.emulated[13]),
			saturateToU8((uint32_t)left.emulated[14] + (uint32_t)right.emulated[14]),
			saturateToU8((uint32_t)left.emulated[15] + (uint32_t)right.emulated[15])
		);
	#endif
}
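// Usage sketch (illustrative, not part of the original header): unlike operator+, which wraps
// around on overflow, saturatedAddition clamps at 255, which is usually what 8-bit pixel data needs.
// Assuming a hypothetical U8x16 named pixels:
//     U8x16 brighter = saturatedAddition(pixels, U8x16(uint8_t(64))); // 250 + 64 gives 255, not 58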
inline I32x4 truncateToI32(const F32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return I32x4(F32_TO_I32_SIMD(vector.v));
	#else
		return I32x4((int32_t)vector.emulated[0], (int32_t)vector.emulated[1], (int32_t)vector.emulated[2], (int32_t)vector.emulated[3]);
	#endif
}
inline U32x4 truncateToU32(const F32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(F32_TO_U32_SIMD(vector.v));
	#else
		return U32x4((uint32_t)vector.emulated[0], (uint32_t)vector.emulated[1], (uint32_t)vector.emulated[2], (uint32_t)vector.emulated[3]);
	#endif
}
inline F32x4 floatFromI32(const I32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return F32x4(I32_TO_F32_SIMD(vector.v));
	#else
		return F32x4((float)vector.emulated[0], (float)vector.emulated[1], (float)vector.emulated[2], (float)vector.emulated[3]);
	#endif
}
inline F32x4 floatFromU32(const U32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return F32x4(U32_TO_F32_SIMD(vector.v));
	#else
		return F32x4((float)vector.emulated[0], (float)vector.emulated[1], (float)vector.emulated[2], (float)vector.emulated[3]);
	#endif
}
inline I32x4 I32FromU32(const U32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
	#else
		return I32x4((int32_t)vector.emulated[0], (int32_t)vector.emulated[1], (int32_t)vector.emulated[2], (int32_t)vector.emulated[3]);
	#endif
}
inline U32x4 U32FromI32(const I32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
	#else
		return U32x4((uint32_t)vector.emulated[0], (uint32_t)vector.emulated[1], (uint32_t)vector.emulated[2], (uint32_t)vector.emulated[3]);
	#endif
}
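// Usage sketch (illustrative, not part of the original header): truncateToI32 rounds toward zero
// and floatFromI32 converts back, so a round trip simply drops the fractional part:
//     F32x4 f(3.7f, -2.2f, 5.5f, 0.9f);
//     I32x4 t = truncateToI32(f); // (3, -2, 5, 0)
//     F32x4 g = floatFromI32(t);  // (3.0f, -2.0f, 5.0f, 0.0f)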
// Warning! Behaviour depends on endianness.
inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
	#else
		const uint8_t *source = (const uint8_t*)vector.emulated;
		return U8x16(
			source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
			source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
		);
	#endif
}
// Warning! Behaviour depends on endianness.
inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
	#else
		const uint32_t *source = (const uint32_t*)vector.emulated;
		return U32x4(source[0], source[1], source[2], source[3]);
	#endif
}
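// Usage sketch (illustrative, not part of the original header): the reinterpret casts keep the
// same 128 bits and only change how the lanes are read, so the resulting lane values depend on
// byte order. On a little-endian machine:
//     U32x4 packed(0x04030201u, 0x08070605u, 0x0C0B0A09u, 0x100F0E0Du);
//     U8x16 bytes = reinterpret_U8FromU32(packed); // (1, 2, 3, ..., 16) on little-endian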
// Unpacking to larger integers
inline U32x4 lowerToU32(const U16x8& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
	#else
		return U32x4(vector.emulated[0], vector.emulated[1], vector.emulated[2], vector.emulated[3]);
	#endif
}
inline U32x4 higherToU32(const U16x8& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
	#else
		return U32x4(vector.emulated[4], vector.emulated[5], vector.emulated[6], vector.emulated[7]);
	#endif
}
inline U16x8 lowerToU16(const U8x16& vector) {
	#ifdef USE_BASIC_SIMD
		return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
	#else
		return U16x8(
			vector.emulated[0], vector.emulated[1], vector.emulated[2], vector.emulated[3],
			vector.emulated[4], vector.emulated[5], vector.emulated[6], vector.emulated[7]
		);
	#endif
}
inline U16x8 higherToU16(const U8x16& vector) {
	#ifdef USE_BASIC_SIMD
		return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
	#else
		return U16x8(
			vector.emulated[8], vector.emulated[9], vector.emulated[10], vector.emulated[11],
			vector.emulated[12], vector.emulated[13], vector.emulated[14], vector.emulated[15]
		);
	#endif
}
// Saturated packing
inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
	#ifdef USE_BASIC_SIMD
		return U8x16(PACK_SAT_U16_TO_U8(lower.v, upper.v));
	#else
		return U8x16(
			saturateToU8(lower.emulated[0]),
			saturateToU8(lower.emulated[1]),
			saturateToU8(lower.emulated[2]),
			saturateToU8(lower.emulated[3]),
			saturateToU8(lower.emulated[4]),
			saturateToU8(lower.emulated[5]),
			saturateToU8(lower.emulated[6]),
			saturateToU8(lower.emulated[7]),
			saturateToU8(upper.emulated[0]),
			saturateToU8(upper.emulated[1]),
			saturateToU8(upper.emulated[2]),
			saturateToU8(upper.emulated[3]),
			saturateToU8(upper.emulated[4]),
			saturateToU8(upper.emulated[5]),
			saturateToU8(upper.emulated[6]),
			saturateToU8(upper.emulated[7])
		);
	#endif
}
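// Usage sketch (illustrative, not part of the original header): a common pattern is to widen
// 8-bit data to 16 bits, do arithmetic that would overflow a byte, and pack back with saturation.
// Assuming a hypothetical U8x16 named pixels:
//     U16x8 low = lowerToU16(pixels);
//     U16x8 high = higherToU16(pixels);
//     U8x16 doubled = saturateToU8(low * uint16_t(2), high * uint16_t(2));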
// Helper macros for generating the vector extract functions.
// Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
#ifdef USE_BASIC_SIMD
	#ifdef USE_SSE2
		#ifdef USE_SSSE3
			#define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
		#else
			// If SSSE3 is not used, emulate it using stack memory and unaligned reading of data.
			static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
				ALIGN16 uint8_t vectorBuffer[32];
				_mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
				_mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
				return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
			}
		#endif
		#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
		#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
		#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
		#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
		#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
	#elif USE_NEON
		#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
		#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
		#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
		#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
		#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
	#endif
#else
	#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
	#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
	#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
	#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
	#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
#endif
// Vector extraction concatenates two input vectors and reads a vector between them using an offset.
// The first and last offsets that only return one of the inputs can be used for readability, because they will be inlined and removed by the compiler.
// To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
// To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
// See the usage sketch after these functions.
U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.emulated[1], a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0])) }
U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1])) }
U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2])) }
U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3])) }
U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4])) }
U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5])) }
U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6])) }
U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7])) }
U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8])) }
U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9])) }
U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10])) }
U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11])) }
U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12])) }
U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12], b.emulated[13])) }
U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12], b.emulated[13], b.emulated[14])) }
U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.emulated[1], a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0])) }
U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1])) }
U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2])) }
U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3])) }
U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4])) }
U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5])) }
U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6])) }
U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
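// Usage sketch (illustrative, not part of the original header): with three consecutive aligned
// blocks of a row loaded as left, center and right, the unaligned neighbors of center are
//     U8x16 previous = vectorExtract_15(left, center); // each lane's left neighbor
//     U8x16 next     = vectorExtract_1(center, right); // each lane's right neighbor
// which gives filter kernels access to neighboring elements without unaligned loads.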
// Gather instructions load memory from a pointer at multiple index offsets at the same time.
// The given pointers should be aligned with 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
#ifdef USE_AVX2
	#define GATHER_I32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
	#define GATHER_U32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
	#define GATHER_F32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
#endif
static inline U32x4 gather(const dsr::SafePointer<uint32_t> data, const U32x4 &elementOffset) {
	#ifdef USE_AVX2
		return U32x4(GATHER_U32_AVX2(data.getUnsafe(), elementOffset.v, 4));
	#else
		dsr::UVector4D elementOffsetS = elementOffset.get();
		return U32x4(
			*(data + elementOffsetS.x),
			*(data + elementOffsetS.y),
			*(data + elementOffsetS.z),
			*(data + elementOffsetS.w)
		);
	#endif
}
static inline I32x4 gather(const dsr::SafePointer<int32_t> data, const U32x4 &elementOffset) {
	#ifdef USE_AVX2
		return I32x4(GATHER_U32_AVX2(data.getUnsafe(), elementOffset.v, 4));
	#else
		dsr::UVector4D elementOffsetS = elementOffset.get();
		return I32x4(
			*(data + elementOffsetS.x),
			*(data + elementOffsetS.y),
			*(data + elementOffsetS.z),
			*(data + elementOffsetS.w)
		);
	#endif
}
static inline F32x4 gather(const dsr::SafePointer<float> data, const U32x4 &elementOffset) {
	#ifdef USE_AVX2
		return F32x4(GATHER_F32_AVX2(data.getUnsafe(), elementOffset.v, 4));
	#else
		dsr::UVector4D elementOffsetS = elementOffset.get();
		return F32x4(
			*(data + elementOffsetS.x),
			*(data + elementOffsetS.y),
			*(data + elementOffsetS.z),
			*(data + elementOffsetS.w)
		);
	#endif
}
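// Usage sketch (illustrative, not part of the original header): gather performs a parallel table
// lookup. Given a 4-byte aligned dsr::SafePointer<uint32_t> named palette (hypothetical):
//     U32x4 indices(0u, 3u, 2u, 1u);
//     U32x4 colors = gather(palette, indices); // palette[0], palette[3], palette[2], palette[1]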
#endif