// zlib open source license
//
// Copyright (c) 2017 to 2022 David Forsgren Piuva
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgment in the product documentation would be
// appreciated but is not required.
//
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
//
// 3. This notice may not be removed or altered from any source
// distribution.

// Hardware abstraction layer for portable SIMD math.
// Covers a small intersection of SSE2 and NEON in order to reduce the number
// of bugs from multiple implementations when nothing advanced is required.
#ifndef DFPSR_SIMD
#define DFPSR_SIMD
#include <stdint.h>
#include <cassert>
#include <cmath> // sqrt and fabs are used by the scalar fallback and the float comparisons
#include "SafePointer.h"
#include "../math/FVector.h"
#include "../math/IVector.h"
#include "../math/UVector.h"
#define ALIGN16 __attribute__((aligned(16)))
// To allow turning off SIMD intrinsics for testing
#ifdef __SSE2__
// Comment out this line to test without SSE2
#define USE_SSE2
#elif __ARM_NEON
// Comment out this line to test without NEON
#define USE_NEON
#endif
// Everything declared in here handles things specific for SSE.
// Direct use of the macros will not provide portability to all hardware.
#ifdef USE_SSE2
#define USE_BASIC_SIMD
#define USE_DIRECT_SIMD_MEMORY_ACCESS
#include <emmintrin.h> // SSE2
#ifdef __SSSE3__
#include <tmmintrin.h> // SSSE3
// Comment out this line to test without SSSE3
#define USE_SSSE3
#endif
#ifdef __AVX2__
#include <immintrin.h> // AVX2
// Comment out this line to test without AVX2
#define USE_AVX2
#endif
// Vector types
#define SIMD_F32x4 __m128
#define SIMD_U8x16 __m128i
#define SIMD_U16x8 __m128i
#define SIMD_U32x4 __m128i
#define SIMD_I32x4 __m128i
// Vector uploads in address order
#define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
#define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
#define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
#define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
#define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
#define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
#define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
#define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
#define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
#define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
// Conversions
#define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
#define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
#define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
#define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
// Unpacking conversions
#define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
#define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
#define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
#define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
// Saturated packing
// Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel Corporation, but not trying to emulate NEON
inline SIMD_U8x16 PACK_SAT_U16_TO_U8(const SIMD_U16x8& a, const SIMD_U16x8& b) {
	// _mm_packus_epi16 treats its input as signed 16-bit values, so inputs of 0x8000 and above
	// are first clamped to 0x7FFF, which keeps the packed result correctly saturated at 255.
	SIMD_U16x8 mask, a2, b2;
	mask = _mm_set1_epi16(0x7fff);
	a2 = _mm_and_si128(a, mask);
	a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask));
	b2 = _mm_and_si128(b, mask);
	b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
	return _mm_packus_epi16(a2, b2);
}
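// A minimal usage sketch (hypothetical helper, SSE2 path only, not part of the original API):
// doubling sixteen 8-bit values with saturation by widening to 16 bits, adding, and packing back.
inline SIMD_U8x16 exampleDoubleSaturatedU8(const SIMD_U8x16& bytes) {
	SIMD_U16x8 low = U8_LOW_TO_U16_SIMD(bytes);   // Lower eight bytes widened to 16 bits
	SIMD_U16x8 high = U8_HIGH_TO_U16_SIMD(bytes); // Upper eight bytes widened to 16 bits
	return PACK_SAT_U16_TO_U8(_mm_add_epi16(low, low), _mm_add_epi16(high, high));
}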
// Reinterpret casting
#define REINTERPRET_U32_TO_U8_SIMD(A) (A)
#define REINTERPRET_U32_TO_U16_SIMD(A) (A)
#define REINTERPRET_U8_TO_U32_SIMD(A) (A)
#define REINTERPRET_U16_TO_U32_SIMD(A) (A)
#define REINTERPRET_U32_TO_I32_SIMD(A) (A)
#define REINTERPRET_I32_TO_U32_SIMD(A) (A)
// Vector float operations returning SIMD_F32x4
#define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
#define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
#define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
// Vector integer operations returning SIMD_I32x4
#define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
#define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
// 32-bit integer multiplications are not available on SSE2.
// Vector integer operations returning SIMD_U32x4
#define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
#define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
// 32-bit integer multiplications are not available on SSE2.
// Vector integer operations returning SIMD_U16x8
#define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
#define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
#define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
// Vector integer operations returning SIMD_U8x16
#define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
#define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
#define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
// No 8-bit multiplications
// Statistics
#define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
#define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
// Bitwise
#define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
#define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
#endif
// Everything declared in here handles things specific for NEON.
// Direct use of the macros will not provide portability to all hardware.
#ifdef USE_NEON
#define USE_BASIC_SIMD
#include <arm_neon.h> // NEON
// Vector types
#define SIMD_F32x4 float32x4_t
#define SIMD_U8x16 uint8x16_t
#define SIMD_U16x8 uint16x8_t
#define SIMD_U32x4 uint32x4_t
#define SIMD_I32x4 int32x4_t
// Vector uploads in address order
inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
	float data[4] ALIGN16 = {a, b, c, d};
	return vld1q_f32(data);
}
inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
	return vdupq_n_f32(a);
}
inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
		uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
	uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
	return vld1q_u8(data);
}
inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
	return vdupq_n_u8(a);
}
inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
	uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
	return vld1q_u16(data);
}
inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
	return vdupq_n_u16(a);
}
inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
	uint32_t data[4] ALIGN16 = {a, b, c, d};
	return vld1q_u32(data);
}
inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
	return vdupq_n_u32(a);
}
inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
	int32_t data[4] ALIGN16 = {a, b, c, d};
	return vld1q_s32(data);
}
inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
	return vdupq_n_s32(a);
}
// Conversions
#define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
#define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
#define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
#define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
// Unpacking conversions
#define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
#define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
#define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
#define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
// Saturated packing
#define PACK_SAT_U16_TO_U8(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
// Reinterpret casting
#define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
#define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
#define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
#define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
#define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
#define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
// Vector float operations returning SIMD_F32x4
#define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
#define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
#define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
// Vector integer operations returning SIMD_I32x4
#define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
#define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
#define MUL_I32_NEON(A, B) vmulq_s32(A, B)
// Vector integer operations returning SIMD_U32x4
#define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
#define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
#define MUL_U32_NEON(A, B) vmulq_u32(A, B)
// Vector integer operations returning SIMD_U16x8
#define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
#define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
#define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
// Vector integer operations returning SIMD_U8x16
#define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
#define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
#define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
// No 8-bit multiplications
// Statistics
#define MIN_F32_SIMD(A, B) vminq_f32(A, B)
#define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
// Bitwise
#define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
#define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
#endif
/*
	The vector types (F32x4, I32x4, U32x4, U16x8, U8x16) below are supposed to be portable across different CPU architectures.
	When this abstraction layer is mixed with handwritten SIMD intrinsics:
		Use "USE_SSE2" instead of "__SSE2__"
		Use "USE_AVX2" instead of "__AVX2__"
		Use "USE_NEON" instead of "__ARM_NEON"
	Portability exceptions:
	* The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
		Only use when USE_BASIC_SIMD is defined.
		Will not work on scalar emulation.
	* The "shared_memory" array is only defined for targets with direct access to SIMD registers. (SSE)
		Only use when USE_DIRECT_SIMD_MEMORY_ACCESS is defined.
		Will not work on NEON or scalar emulation.
	* The "emulated" array is only defined when SIMD is turned off.
		Cannot be used when USE_BASIC_SIMD is defined.
		Will not work when either SSE or NEON is enabled.
*/
union F32x4 {
#ifdef USE_BASIC_SIMD
public:
#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
	// Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
	// Direct access cannot be done on NEON!
	float shared_memory[4];
#endif
	// The SIMD vector of undefined type
	// Not accessible while emulating!
	SIMD_F32x4 v;
	// Construct a portable vector from a native SIMD vector
	explicit F32x4(const SIMD_F32x4& v) : v(v) {}
	// Construct a portable vector from a set of scalars
	F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
	// Construct a portable vector from a single duplicated scalar
	explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
#else
public:
	// Emulate a SIMD vector as an array of scalars without hardware support
	// Only accessible while emulating!
	float emulated[4];
	// Construct a portable vector from a set of scalars
	F32x4(float a1, float a2, float a3, float a4) {
		this->emulated[0] = a1;
		this->emulated[1] = a2;
		this->emulated[2] = a3;
		this->emulated[3] = a4;
	}
	// Construct a portable vector from a single duplicated scalar
	explicit F32x4(float scalar) {
		this->emulated[0] = scalar;
		this->emulated[1] = scalar;
		this->emulated[2] = scalar;
		this->emulated[3] = scalar;
	}
#endif
	// Construct a portable SIMD vector from a pointer to aligned data
	// data must be aligned to 16 bytes, because the SSE2 version uses an aligned load
	static inline F32x4 readAlignedUnsafe(const float* data) {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
		return F32x4(_mm_load_ps(data));
#elif defined(USE_NEON)
		return F32x4(vld1q_f32(data));
#endif
#else
		return F32x4(data[0], data[1], data[2], data[3]);
#endif
	}
	// Write to aligned memory from the existing vector
	// data must be aligned to 16 bytes, because the SSE2 version uses an aligned store
	inline void writeAlignedUnsafe(float* data) const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
		_mm_store_ps(data, this->v);
#elif defined(USE_NEON)
		vst1q_f32(data, this->v);
#endif
#else
		data[0] = this->emulated[0];
		data[1] = this->emulated[1];
		data[2] = this->emulated[2];
		data[3] = this->emulated[3];
#endif
	}
#ifdef DFPSR_GEOMETRY_FVECTOR
	dsr::FVector4D get() const {
		float data[4] ALIGN16;
		this->writeAlignedUnsafe(data);
		return dsr::FVector4D(data[0], data[1], data[2], data[3]);
	}
#endif
	// Bound and alignment checked reading
	static inline F32x4 readAligned(const dsr::SafePointer<float> data, const char* methodName) {
		const float* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
		data.assertInside(methodName, pointer, 16);
#endif
		return F32x4::readAlignedUnsafe(pointer);
	}
	// Bound and alignment checked writing
	inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
		float* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
		data.assertInside(methodName, pointer, 16);
#endif
		this->writeAlignedUnsafe(pointer);
	}
	// 1 / x
	// Useful for multiple divisions with the same denominator
	// Useless if the denominator is a constant
	F32x4 reciprocal() const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
		// Approximate
		SIMD_F32x4 lowQ = _mm_rcp_ps(this->v);
		// Refine with one Newton-Raphson iteration: x' = 2*x - v*x*x
		return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(this->v, MUL_F32_SIMD(lowQ, lowQ))));
#elif defined(USE_NEON)
		// Approximate
		SIMD_F32x4 result = vrecpeq_f32(this->v);
		// Refine with two Newton-Raphson steps using vrecpsq_f32
		result = MUL_F32_SIMD(vrecpsq_f32(this->v, result), result);
		return F32x4(MUL_F32_SIMD(vrecpsq_f32(this->v, result), result));
#else
		assert(false);
		return F32x4(0);
#endif
#else
		return F32x4(1.0f / this->emulated[0], 1.0f / this->emulated[1], 1.0f / this->emulated[2], 1.0f / this->emulated[3]);
#endif
	}
	// 1 / sqrt(x)
	// Useful for normalizing vectors
	F32x4 reciprocalSquareRoot() const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
		// Approximate, then refine with one Newton-Raphson iteration: x' = 0.5 * x * (3 - v*x*x)
		SIMD_F32x4 reciRoot = _mm_rsqrt_ps(this->v);
		SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(this->v, reciRoot), reciRoot);
		reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul));
		return F32x4(reciRoot);
#elif defined(USE_NEON)
		// TODO: Test on ARM
		// Approximate
		SIMD_F32x4 reciRoot = vrsqrteq_f32(this->v);
		// Refine using vrsqrtsq_f32
		reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(this->v, reciRoot), reciRoot), reciRoot);
		return F32x4(reciRoot);
#else
		assert(false);
		return F32x4(0);
#endif
#else
		return F32x4(1.0f / sqrt(this->emulated[0]), 1.0f / sqrt(this->emulated[1]), 1.0f / sqrt(this->emulated[2]), 1.0f / sqrt(this->emulated[3]));
#endif
	}
	// sqrt(x)
	// Useful for getting lengths of vectors
	F32x4 squareRoot() const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
		SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
		// Approximate
		SIMD_F32x4 root = _mm_sqrt_ps(this->v);
		// Refine with one Heron step: root' = (root + v / root) * 0.5
		root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(this->v, root)), half);
		return F32x4(root);
#elif defined(USE_NEON)
		// TODO: Test on ARM
		return F32x4(MUL_F32_SIMD(this->v, this->reciprocalSquareRoot().v));
#else
		assert(false);
		return F32x4(0);
#endif
#else
		return F32x4(sqrt(this->emulated[0]), sqrt(this->emulated[1]), sqrt(this->emulated[2]), sqrt(this->emulated[3]));
#endif
	}
	F32x4 clamp(float min, float max) const {
#ifdef USE_BASIC_SIMD
		return F32x4(MIN_F32_SIMD(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(min)), LOAD_SCALAR_F32_SIMD(max)));
#else
		float val0 = this->emulated[0];
		float val1 = this->emulated[1];
		float val2 = this->emulated[2];
		float val3 = this->emulated[3];
		if (min > val0) { val0 = min; }
		if (max < val0) { val0 = max; }
		if (min > val1) { val1 = min; }
		if (max < val1) { val1 = max; }
		if (min > val2) { val2 = min; }
		if (max < val2) { val2 = max; }
		if (min > val3) { val3 = min; }
		if (max < val3) { val3 = max; }
		return F32x4(val0, val1, val2, val3);
#endif
	}
	F32x4 clampLower(float min) const {
#ifdef USE_BASIC_SIMD
		return F32x4(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(min)));
#else
		float val0 = this->emulated[0];
		float val1 = this->emulated[1];
		float val2 = this->emulated[2];
		float val3 = this->emulated[3];
		if (min > val0) { val0 = min; }
		if (min > val1) { val1 = min; }
		if (min > val2) { val2 = min; }
		if (min > val3) { val3 = min; }
		return F32x4(val0, val1, val2, val3);
#endif
	}
	F32x4 clampUpper(float max) const {
#ifdef USE_BASIC_SIMD
		return F32x4(MIN_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(max)));
#else
		float val0 = this->emulated[0];
		float val1 = this->emulated[1];
		float val2 = this->emulated[2];
		float val3 = this->emulated[3];
		if (max < val0) { val0 = max; }
		if (max < val1) { val1 = max; }
		if (max < val2) { val2 = max; }
		if (max < val3) { val3 = max; }
		return F32x4(val0, val1, val2, val3);
#endif
	}
};
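// A minimal sketch (hypothetical helper, not part of the original API) showing how the portability
// exceptions described in the comment block above are meant to be used together.
inline float exampleGetFirstLane(const F32x4& value) {
#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
	// Direct register memory access is only allowed on SSE.
	return value.shared_memory[0];
#elif defined(USE_BASIC_SIMD)
	// On NEON, go through an aligned store instead of reading the register directly.
	float data[4] ALIGN16;
	value.writeAlignedUnsafe(data);
	return data[0];
#else
	// Scalar emulation exposes the lanes as a plain array.
	return value.emulated[0];
#endif
}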
inline dsr::String& string_toStreamIndented(dsr::String& target, const F32x4& source, const dsr::ReadableString& indentation) {
	string_append(target, indentation, source.get());
	return target;
}
// Approximate equality using a fixed absolute tolerance of 0.0001
inline bool operator==(const F32x4& left, const F32x4& right) {
	float a[4] ALIGN16;
	float b[4] ALIGN16;
	left.writeAlignedUnsafe(a);
	right.writeAlignedUnsafe(b);
	return fabs(a[0] - b[0]) < 0.0001f && fabs(a[1] - b[1]) < 0.0001f && fabs(a[2] - b[2]) < 0.0001f && fabs(a[3] - b[3]) < 0.0001f;
}
inline bool operator!=(const F32x4& left, const F32x4& right) {
	return !(left == right);
}
inline F32x4 operator+(const F32x4& left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
	return F32x4(ADD_F32_SIMD(left.v, right.v));
#else
	return F32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
#endif
}
inline F32x4 operator+(float left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
	return F32x4(ADD_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
#else
	return F32x4(left + right.emulated[0], left + right.emulated[1], left + right.emulated[2], left + right.emulated[3]);
#endif
}
inline F32x4 operator+(const F32x4& left, float right) {
#ifdef USE_BASIC_SIMD
	return F32x4(ADD_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
#else
	return F32x4(left.emulated[0] + right, left.emulated[1] + right, left.emulated[2] + right, left.emulated[3] + right);
#endif
}
inline F32x4 operator-(const F32x4& left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
	return F32x4(SUB_F32_SIMD(left.v, right.v));
#else
	return F32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
#endif
}
inline F32x4 operator-(float left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
	return F32x4(SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
#else
	return F32x4(left - right.emulated[0], left - right.emulated[1], left - right.emulated[2], left - right.emulated[3]);
#endif
}
inline F32x4 operator-(const F32x4& left, float right) {
#ifdef USE_BASIC_SIMD
	return F32x4(SUB_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
#else
	return F32x4(left.emulated[0] - right, left.emulated[1] - right, left.emulated[2] - right, left.emulated[3] - right);
#endif
}
inline F32x4 operator*(const F32x4& left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
	return F32x4(MUL_F32_SIMD(left.v, right.v));
#else
	return F32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
#endif
}
inline F32x4 operator*(float left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
	return F32x4(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
#else
	return F32x4(left * right.emulated[0], left * right.emulated[1], left * right.emulated[2], left * right.emulated[3]);
#endif
}
inline F32x4 operator*(const F32x4& left, float right) {
#ifdef USE_BASIC_SIMD
	return F32x4(MUL_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
#else
	return F32x4(left.emulated[0] * right, left.emulated[1] * right, left.emulated[2] * right, left.emulated[3] * right);
#endif
}
inline F32x4 min(const F32x4& left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
	return F32x4(MIN_F32_SIMD(left.v, right.v));
#else
	float v0 = left.emulated[0];
	float v1 = left.emulated[1];
	float v2 = left.emulated[2];
	float v3 = left.emulated[3];
	float r0 = right.emulated[0];
	float r1 = right.emulated[1];
	float r2 = right.emulated[2];
	float r3 = right.emulated[3];
	if (r0 < v0) { v0 = r0; }
	if (r1 < v1) { v1 = r1; }
	if (r2 < v2) { v2 = r2; }
	if (r3 < v3) { v3 = r3; }
	return F32x4(v0, v1, v2, v3);
#endif
}
inline F32x4 max(const F32x4& left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
	return F32x4(MAX_F32_SIMD(left.v, right.v));
#else
	float v0 = left.emulated[0];
	float v1 = left.emulated[1];
	float v2 = left.emulated[2];
	float v3 = left.emulated[3];
	float r0 = right.emulated[0];
	float r1 = right.emulated[1];
	float r2 = right.emulated[2];
	float r3 = right.emulated[3];
	if (r0 > v0) { v0 = r0; }
	if (r1 > v1) { v1 = r1; }
	if (r2 > v2) { v2 = r2; }
	if (r3 > v3) { v3 = r3; }
	return F32x4(v0, v1, v2, v3);
#endif
}
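// A hedged usage sketch (hypothetical helper, not part of the original API): normalizing four 2D
// vectors at once by keeping the x and y components in separate F32x4 registers.
inline void exampleNormalizeFour2D(F32x4& x, F32x4& y) {
	// Squared lengths of all four vectors in one register.
	F32x4 squaredLength = x * x + y * y;
	// Approximate 1 / length for all four vectors at once.
	F32x4 reciprocalLength = squaredLength.reciprocalSquareRoot();
	x = x * reciprocalLength;
	y = y * reciprocalLength;
}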
union I32x4 {
#ifdef USE_BASIC_SIMD
public:
#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
	// Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
	// Direct access cannot be done on NEON!
	int32_t shared_memory[4];
#endif
	// The SIMD vector of undefined type
	// Not accessible while emulating!
	SIMD_I32x4 v;
	// Construct a portable vector from a native SIMD vector
	explicit I32x4(const SIMD_I32x4& v) : v(v) {}
	// Construct a portable vector from a set of scalars
	I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
	// Construct a portable vector from a single duplicated scalar
	explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
#else
public:
	// Emulate a SIMD vector as an array of scalars without hardware support
	// Only accessible while emulating!
	int32_t emulated[4];
	// Construct a portable vector from a set of scalars
	I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
		this->emulated[0] = a1;
		this->emulated[1] = a2;
		this->emulated[2] = a3;
		this->emulated[3] = a4;
	}
	// Construct a portable vector from a single duplicated scalar
	explicit I32x4(int32_t scalar) {
		this->emulated[0] = scalar;
		this->emulated[1] = scalar;
		this->emulated[2] = scalar;
		this->emulated[3] = scalar;
	}
#endif
	// Construct a portable SIMD vector from a pointer to aligned data
	// data must be aligned to 16 bytes, because the SSE2 version uses an aligned load
	static inline I32x4 readAlignedUnsafe(const int32_t* data) {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
		return I32x4(_mm_load_si128((const __m128i*)data));
#elif defined(USE_NEON)
		return I32x4(vld1q_s32(data));
#endif
#else
		return I32x4(data[0], data[1], data[2], data[3]);
#endif
	}
	// Write to aligned memory from the existing vector
	// data must be aligned to 16 bytes, because the SSE2 version uses an aligned store
	inline void writeAlignedUnsafe(int32_t* data) const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
		_mm_store_si128((__m128i*)data, this->v);
#elif defined(USE_NEON)
		vst1q_s32(data, this->v);
#endif
#else
		data[0] = this->emulated[0];
		data[1] = this->emulated[1];
		data[2] = this->emulated[2];
		data[3] = this->emulated[3];
#endif
	}
#ifdef DFPSR_GEOMETRY_IVECTOR
	dsr::IVector4D get() const {
		int32_t data[4] ALIGN16;
		this->writeAlignedUnsafe(data);
		return dsr::IVector4D(data[0], data[1], data[2], data[3]);
	}
#endif
	// Bound and alignment checked reading
	static inline I32x4 readAligned(const dsr::SafePointer<int32_t> data, const char* methodName) {
		const int32_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
		data.assertInside(methodName, pointer, 16);
#endif
		return I32x4::readAlignedUnsafe(pointer);
	}
	// Bound and alignment checked writing
	inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
		int32_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
		data.assertInside(methodName, pointer, 16);
#endif
		this->writeAlignedUnsafe(pointer);
	}
};
inline dsr::String& string_toStreamIndented(dsr::String& target, const I32x4& source, const dsr::ReadableString& indentation) {
	string_append(target, indentation, source.get());
	return target;
}
inline bool operator==(const I32x4& left, const I32x4& right) {
	int32_t a[4] ALIGN16;
	int32_t b[4] ALIGN16;
	left.writeAlignedUnsafe(a);
	right.writeAlignedUnsafe(b);
	return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
}
inline bool operator!=(const I32x4& left, const I32x4& right) {
	return !(left == right);
}
inline I32x4 operator+(const I32x4& left, const I32x4& right) {
#ifdef USE_BASIC_SIMD
	return I32x4(ADD_I32_SIMD(left.v, right.v));
#else
	return I32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
#endif
}
inline I32x4 operator+(int32_t left, const I32x4& right) {
	return I32x4(left) + right;
}
inline I32x4 operator+(const I32x4& left, int32_t right) {
	return left + I32x4(right);
}
inline I32x4 operator-(const I32x4& left, const I32x4& right) {
#ifdef USE_BASIC_SIMD
	return I32x4(SUB_I32_SIMD(left.v, right.v));
#else
	return I32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
#endif
}
inline I32x4 operator-(int32_t left, const I32x4& right) {
	return I32x4(left) - right;
}
inline I32x4 operator-(const I32x4& left, int32_t right) {
	return left - I32x4(right);
}
inline I32x4 operator*(const I32x4& left, const I32x4& right) {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
	// SSE2 has no 32-bit integer multiplication, so emulate the NEON instruction
	// by multiplying element by element through the union's shared memory.
	return I32x4(left.shared_memory[0] * right.shared_memory[0], left.shared_memory[1] * right.shared_memory[1], left.shared_memory[2] * right.shared_memory[2], left.shared_memory[3] * right.shared_memory[3]);
#elif defined(USE_NEON)
	return I32x4(MUL_I32_NEON(left.v, right.v));
#endif
#else
	return I32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
#endif
}
inline I32x4 operator*(int32_t left, const I32x4& right) {
	return I32x4(left) * right;
}
inline I32x4 operator*(const I32x4& left, int32_t right) {
	return left * I32x4(right);
}
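// A minimal sketch (hypothetical helper, not part of the original API): multiply-add with the
// portable I32x4 operators above, a common building block for fixed-point math.
inline I32x4 exampleMultiplyAdd(const I32x4& factorA, const I32x4& factorB, const I32x4& term) {
	return factorA * factorB + term;
}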
union U32x4 {
#ifdef USE_BASIC_SIMD
public:
#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
	// Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
	// Direct access cannot be done on NEON!
	uint32_t shared_memory[4];
#endif
	// The SIMD vector of undefined type
	// Not accessible while emulating!
	SIMD_U32x4 v;
	// Construct a portable vector from a native SIMD vector
	explicit U32x4(const SIMD_U32x4& v) : v(v) {}
	// Construct a portable vector from a set of scalars
	U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
	// Construct a portable vector from a single duplicated scalar
	explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
#else
public:
	// Emulate a SIMD vector as an array of scalars without hardware support
	// Only accessible while emulating!
	uint32_t emulated[4];
	// Construct a portable vector from a set of scalars
	U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
		this->emulated[0] = a1;
		this->emulated[1] = a2;
		this->emulated[2] = a3;
		this->emulated[3] = a4;
	}
	// Construct a portable vector from a single duplicated scalar
	explicit U32x4(uint32_t scalar) {
		this->emulated[0] = scalar;
		this->emulated[1] = scalar;
		this->emulated[2] = scalar;
		this->emulated[3] = scalar;
	}
#endif
	// Construct a portable SIMD vector from a pointer to aligned data
	// data must be aligned to 16 bytes, because the SSE2 version uses an aligned load
	static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
		return U32x4(_mm_load_si128((const __m128i*)data));
#elif defined(USE_NEON)
		return U32x4(vld1q_u32(data));
#endif
#else
		return U32x4(data[0], data[1], data[2], data[3]);
#endif
	}
	// Write to aligned memory from the existing vector
	// data must be aligned to 16 bytes, because the SSE2 version uses an aligned store
	inline void writeAlignedUnsafe(uint32_t* data) const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
		_mm_store_si128((__m128i*)data, this->v);
#elif defined(USE_NEON)
		vst1q_u32(data, this->v);
#endif
#else
		data[0] = this->emulated[0];
		data[1] = this->emulated[1];
		data[2] = this->emulated[2];
		data[3] = this->emulated[3];
#endif
	}
#ifdef DFPSR_GEOMETRY_UVECTOR
	dsr::UVector4D get() const {
		uint32_t data[4] ALIGN16;
		this->writeAlignedUnsafe(data);
		return dsr::UVector4D(data[0], data[1], data[2], data[3]);
	}
#endif
	// Bound and alignment checked reading
	static inline U32x4 readAligned(const dsr::SafePointer<uint32_t> data, const char* methodName) {
		const uint32_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
		data.assertInside(methodName, pointer, 16);
#endif
		return U32x4::readAlignedUnsafe(pointer);
	}
	// Bound and alignment checked writing
	inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
		uint32_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
		data.assertInside(methodName, pointer, 16);
#endif
		this->writeAlignedUnsafe(pointer);
	}
};
inline dsr::String& string_toStreamIndented(dsr::String& target, const U32x4& source, const dsr::ReadableString& indentation) {
	string_append(target, indentation, source.get());
	return target;
}
inline bool operator==(const U32x4& left, const U32x4& right) {
	uint32_t a[4] ALIGN16;
	uint32_t b[4] ALIGN16;
	left.writeAlignedUnsafe(a);
	right.writeAlignedUnsafe(b);
	return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
}
inline bool operator!=(const U32x4& left, const U32x4& right) {
	return !(left == right);
}
inline U32x4 operator+(const U32x4& left, const U32x4& right) {
#ifdef USE_BASIC_SIMD
	return U32x4(ADD_U32_SIMD(left.v, right.v));
#else
	return U32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
#endif
}
inline U32x4 operator+(uint32_t left, const U32x4& right) {
	return U32x4(left) + right;
}
inline U32x4 operator+(const U32x4& left, uint32_t right) {
	return left + U32x4(right);
}
inline U32x4 operator-(const U32x4& left, const U32x4& right) {
#ifdef USE_BASIC_SIMD
	return U32x4(SUB_U32_SIMD(left.v, right.v));
#else
	return U32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
#endif
}
inline U32x4 operator-(uint32_t left, const U32x4& right) {
	return U32x4(left) - right;
}
inline U32x4 operator-(const U32x4& left, uint32_t right) {
	return left - U32x4(right);
}
inline U32x4 operator*(const U32x4& left, const U32x4& right) {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
	// SSE2 has no 32-bit integer multiplication, so emulate the NEON instruction
	// by multiplying element by element through the union's shared memory.
	return U32x4(left.shared_memory[0] * right.shared_memory[0], left.shared_memory[1] * right.shared_memory[1], left.shared_memory[2] * right.shared_memory[2], left.shared_memory[3] * right.shared_memory[3]);
#else // NEON
	return U32x4(MUL_U32_NEON(left.v, right.v));
#endif
#else
	return U32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
#endif
}
inline U32x4 operator*(uint32_t left, const U32x4& right) {
	return U32x4(left) * right;
}
inline U32x4 operator*(const U32x4& left, uint32_t right) {
	return left * U32x4(right);
}
inline U32x4 operator&(const U32x4& left, const U32x4& right) {
#ifdef USE_BASIC_SIMD
	return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
#else
	return U32x4(left.emulated[0] & right.emulated[0], left.emulated[1] & right.emulated[1], left.emulated[2] & right.emulated[2], left.emulated[3] & right.emulated[3]);
#endif
}
inline U32x4 operator&(const U32x4& left, uint32_t mask) {
#ifdef USE_BASIC_SIMD
	return U32x4(BITWISE_AND_U32_SIMD(left.v, LOAD_SCALAR_U32_SIMD(mask)));
#else
	return U32x4(left.emulated[0] & mask, left.emulated[1] & mask, left.emulated[2] & mask, left.emulated[3] & mask);
#endif
}
inline U32x4 operator|(const U32x4& left, const U32x4& right) {
#ifdef USE_BASIC_SIMD
	return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
#else
	return U32x4(left.emulated[0] | right.emulated[0], left.emulated[1] | right.emulated[1], left.emulated[2] | right.emulated[2], left.emulated[3] | right.emulated[3]);
#endif
}
inline U32x4 operator|(const U32x4& left, uint32_t mask) {
#ifdef USE_BASIC_SIMD
	return U32x4(BITWISE_OR_U32_SIMD(left.v, LOAD_SCALAR_U32_SIMD(mask)));
#else
	return U32x4(left.emulated[0] | mask, left.emulated[1] | mask, left.emulated[2] | mask, left.emulated[3] | mask);
#endif
}
inline U32x4 operator<<(const U32x4& left, uint32_t bitOffset) {
#ifdef USE_SSE2
	return U32x4(_mm_slli_epi32(left.v, bitOffset));
#else
#ifdef USE_NEON
	return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
#else
	return U32x4(left.emulated[0] << bitOffset, left.emulated[1] << bitOffset, left.emulated[2] << bitOffset, left.emulated[3] << bitOffset);
#endif
#endif
}
inline U32x4 operator>>(const U32x4& left, uint32_t bitOffset) {
#ifdef USE_SSE2
	return U32x4(_mm_srli_epi32(left.v, bitOffset));
#else
#ifdef USE_NEON
	// NEON only shifts left by a signed vector, so shift right using a negated offset.
	return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-(int32_t)bitOffset)));
#else
	return U32x4(left.emulated[0] >> bitOffset, left.emulated[1] >> bitOffset, left.emulated[2] >> bitOffset, left.emulated[3] >> bitOffset);
#endif
#endif
}
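// A hedged usage sketch (hypothetical helper, not part of the original API): extracting the second
// 8-bit channel from four packed 32-bit pixels using the portable shift and mask operators above.
inline U32x4 exampleExtractSecondChannel(const U32x4& packedPixels) {
	return (packedPixels >> 8) & 255u;
}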
union U16x8 {
#ifdef USE_BASIC_SIMD
public:
#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
	// Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
	// Direct access cannot be done on NEON!
	uint16_t shared_memory[8];
#endif
	// The SIMD vector of undefined type
	// Not accessible while emulating!
	SIMD_U16x8 v;
	// Construct a portable vector from a native SIMD vector
	explicit U16x8(const SIMD_U16x8& v) : v(v) {}
	// Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
	// Reinterpret casting is used
	explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
	// Construct a portable vector from a set of scalars
	U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
	// Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
	// Reinterpret casting is used
	// TODO: Remove all reinterprets from constructors to improve readability
	explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
	// Construct a portable vector from a single duplicated scalar
	explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
	// Reinterpret cast to a vector of 4 x 32-bit unsigned integers
	U32x4 get_U32() const {
		return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
	}
#else
public:
	// Emulate a SIMD vector as an array of scalars without hardware support
	// Only accessible while emulating!
	uint16_t emulated[8];
	// Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
	// Reinterpret casting is used
	explicit U16x8(const U32x4& vector) {
		uint64_t *target = (uint64_t*)this->emulated;
		const uint64_t *source = (const uint64_t*)vector.emulated;
		target[0] = source[0];
		target[1] = source[1];
	}
	// Construct a portable vector from a set of scalars
	U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
		this->emulated[0] = a1;
		this->emulated[1] = a2;
		this->emulated[2] = a3;
		this->emulated[3] = a4;
		this->emulated[4] = a5;
		this->emulated[5] = a6;
		this->emulated[6] = a7;
		this->emulated[7] = a8;
	}
	// Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
	// Reinterpret casting is used
	explicit U16x8(uint32_t scalar) {
		uint32_t *target = (uint32_t*)this->emulated;
		target[0] = scalar;
		target[1] = scalar;
		target[2] = scalar;
		target[3] = scalar;
	}
	// Construct a portable vector from a single duplicated scalar
	explicit U16x8(uint16_t scalar) {
		this->emulated[0] = scalar;
		this->emulated[1] = scalar;
		this->emulated[2] = scalar;
		this->emulated[3] = scalar;
		this->emulated[4] = scalar;
		this->emulated[5] = scalar;
		this->emulated[6] = scalar;
		this->emulated[7] = scalar;
	}
	// Reinterpret cast to a vector of 4 x 32-bit unsigned integers
	U32x4 get_U32() const {
		U32x4 result(0);
		uint64_t *target = (uint64_t*)result.emulated;
		const uint64_t *source = (const uint64_t*)this->emulated;
		target[0] = source[0];
		target[1] = source[1];
		return result;
	}
#endif
	// data must be aligned to 16 bytes, because the SSE2 version uses an aligned load
	//static inline U16x8 readSlow(uint16_t* data) {
	//	return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
	//}
	static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
		return U16x8(_mm_load_si128((const __m128i*)data));
#elif defined(USE_NEON)
		return U16x8(vld1q_u16(data));
#endif
#else
		return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
#endif
	}
	// data must be aligned to 16 bytes, because the SSE2 version uses an aligned store
	inline void writeAlignedUnsafe(uint16_t* data) const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
		_mm_store_si128((__m128i*)data, this->v);
#elif defined(USE_NEON)
		vst1q_u16(data, this->v);
#endif
#else
		data[0] = this->emulated[0];
		data[1] = this->emulated[1];
		data[2] = this->emulated[2];
		data[3] = this->emulated[3];
		data[4] = this->emulated[4];
		data[5] = this->emulated[5];
		data[6] = this->emulated[6];
		data[7] = this->emulated[7];
#endif
	}
	// Bound and alignment checked reading
	static inline U16x8 readAligned(const dsr::SafePointer<uint16_t> data, const char* methodName) {
		const uint16_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
		data.assertInside(methodName, pointer, 16);
#endif
		return U16x8::readAlignedUnsafe(pointer);
	}
	// Bound and alignment checked writing
	inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
		uint16_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
		data.assertInside(methodName, pointer, 16);
#endif
		this->writeAlignedUnsafe(pointer);
	}
};
inline dsr::String& string_toStreamIndented(dsr::String& target, const U16x8& source, const dsr::ReadableString& indentation) {
	ALIGN16 uint16_t data[8];
	source.writeAlignedUnsafe(data);
	string_append(target, indentation, "(", data[0], ", ", data[1], ", ", data[2], ", ", data[3], ", ", data[4], ", ", data[5], ", ", data[6], ", ", data[7], ")");
	return target;
}
inline bool operator==(const U16x8& left, const U16x8& right) {
	ALIGN16 uint16_t a[8];
	ALIGN16 uint16_t b[8];
	left.writeAlignedUnsafe(a);
	right.writeAlignedUnsafe(b);
	return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3] && a[4] == b[4] && a[5] == b[5] && a[6] == b[6] && a[7] == b[7];
}
inline bool operator!=(const U16x8& left, const U16x8& right) {
	return !(left == right);
}
inline U16x8 operator+(const U16x8& left, const U16x8& right) {
#ifdef USE_BASIC_SIMD
	return U16x8(ADD_U16_SIMD(left.v, right.v));
#else
	return U16x8(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3],
	             left.emulated[4] + right.emulated[4], left.emulated[5] + right.emulated[5], left.emulated[6] + right.emulated[6], left.emulated[7] + right.emulated[7]);
#endif
}
inline U16x8 operator+(uint16_t left, const U16x8& right) {
#ifdef USE_BASIC_SIMD
	return U16x8(ADD_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
#else
	return U16x8(left + right.emulated[0], left + right.emulated[1], left + right.emulated[2], left + right.emulated[3],
	             left + right.emulated[4], left + right.emulated[5], left + right.emulated[6], left + right.emulated[7]);
#endif
}
inline U16x8 operator+(const U16x8& left, uint16_t right) {
#ifdef USE_BASIC_SIMD
	return U16x8(ADD_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
#else
	return U16x8(left.emulated[0] + right, left.emulated[1] + right, left.emulated[2] + right, left.emulated[3] + right,
	             left.emulated[4] + right, left.emulated[5] + right, left.emulated[6] + right, left.emulated[7] + right);
#endif
}
inline U16x8 operator-(const U16x8& left, const U16x8& right) {
#ifdef USE_BASIC_SIMD
	return U16x8(SUB_U16_SIMD(left.v, right.v));
#else
	return U16x8(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3],
	             left.emulated[4] - right.emulated[4], left.emulated[5] - right.emulated[5], left.emulated[6] - right.emulated[6], left.emulated[7] - right.emulated[7]);
#endif
}
inline U16x8 operator-(uint16_t left, const U16x8& right) {
#ifdef USE_BASIC_SIMD
	return U16x8(SUB_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
#else
	return U16x8(left - right.emulated[0], left - right.emulated[1], left - right.emulated[2], left - right.emulated[3],
	             left - right.emulated[4], left - right.emulated[5], left - right.emulated[6], left - right.emulated[7]);
#endif
}
inline U16x8 operator-(const U16x8& left, uint16_t right) {
#ifdef USE_BASIC_SIMD
	return U16x8(SUB_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
#else
	return U16x8(left.emulated[0] - right, left.emulated[1] - right, left.emulated[2] - right, left.emulated[3] - right,
	             left.emulated[4] - right, left.emulated[5] - right, left.emulated[6] - right, left.emulated[7] - right);
#endif
}
inline U16x8 operator*(const U16x8& left, const U16x8& right) {
#ifdef USE_BASIC_SIMD
	return U16x8(MUL_U16_SIMD(left.v, right.v));
#else
	return U16x8(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3],
	             left.emulated[4] * right.emulated[4], left.emulated[5] * right.emulated[5], left.emulated[6] * right.emulated[6], left.emulated[7] * right.emulated[7]);
#endif
}
inline U16x8 operator*(uint16_t left, const U16x8& right) {
#ifdef USE_BASIC_SIMD
	return U16x8(MUL_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
#else
	return U16x8(left * right.emulated[0], left * right.emulated[1], left * right.emulated[2], left * right.emulated[3],
	             left * right.emulated[4], left * right.emulated[5], left * right.emulated[6], left * right.emulated[7]);
#endif
}
inline U16x8 operator*(const U16x8& left, uint16_t right) {
#ifdef USE_BASIC_SIMD
	return U16x8(MUL_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
#else
	return U16x8(
		left.emulated[0] * right, left.emulated[1] * right, left.emulated[2] * right, left.emulated[3] * right,
		left.emulated[4] * right, left.emulated[5] * right, left.emulated[6] * right, left.emulated[7] * right
	);
#endif
}
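// A minimal sketch (hypothetical helper, not part of the original API): reinterpreting four packed
// 32-bit values as eight 16-bit lanes, scaling each lane, and reinterpreting the result back.
inline U32x4 exampleScaleHalves(const U32x4& packedValues, uint16_t factor) {
	return (U16x8(packedValues) * factor).get_U32();
}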
union U8x16 {
	#ifdef USE_BASIC_SIMD
	public:
	#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
	// Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
	// Direct access cannot be done on NEON!
	uint8_t shared_memory[16];
	#endif
	// The SIMD vector of undefined type
	// Not accessible while emulating!
	SIMD_U8x16 v;
	// Construct a portable vector from a native SIMD vector
	explicit U8x16(const SIMD_U8x16& v) : v(v) {}
	// Construct a portable vector from a set of scalars
	U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
		uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
		: v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
	// Construct a portable vector from a single duplicated scalar
	explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
	#else
	public:
	// Emulate a SIMD vector as an array of scalars without hardware support
	// Only accessible while emulating!
	uint8_t emulated[16];
	// Construct a portable vector from a set of scalars
	U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
		uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
		this->emulated[0] = a1;
		this->emulated[1] = a2;
		this->emulated[2] = a3;
		this->emulated[3] = a4;
		this->emulated[4] = a5;
		this->emulated[5] = a6;
		this->emulated[6] = a7;
		this->emulated[7] = a8;
		this->emulated[8] = a9;
		this->emulated[9] = a10;
		this->emulated[10] = a11;
		this->emulated[11] = a12;
		this->emulated[12] = a13;
		this->emulated[13] = a14;
		this->emulated[14] = a15;
		this->emulated[15] = a16;
	}
	// Construct a portable vector from a single duplicated scalar
	explicit U8x16(uint8_t scalar) {
		this->emulated[0] = scalar;
		this->emulated[1] = scalar;
		this->emulated[2] = scalar;
		this->emulated[3] = scalar;
		this->emulated[4] = scalar;
		this->emulated[5] = scalar;
		this->emulated[6] = scalar;
		this->emulated[7] = scalar;
		this->emulated[8] = scalar;
		this->emulated[9] = scalar;
		this->emulated[10] = scalar;
		this->emulated[11] = scalar;
		this->emulated[12] = scalar;
		this->emulated[13] = scalar;
		this->emulated[14] = scalar;
		this->emulated[15] = scalar;
	}
	#endif
	// data must be aligned to 16 bytes
	static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
		#ifdef USE_BASIC_SIMD
		#ifdef USE_SSE2
		return U8x16(_mm_load_si128((const __m128i*)data));
		#elif USE_NEON
		return U8x16(vld1q_u8(data));
		#endif
		#else
		return U8x16(
			data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
			data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
		);
		#endif
	}
	// data must be aligned to 16 bytes (the SSE2 path uses an aligned store)
	inline void writeAlignedUnsafe(uint8_t* data) const {
		#ifdef USE_BASIC_SIMD
		#ifdef USE_SSE2
		_mm_store_si128((__m128i*)data, this->v);
		#elif USE_NEON
		vst1q_u8(data, this->v);
		#endif
		#else
		data[0] = this->emulated[0];
		data[1] = this->emulated[1];
		data[2] = this->emulated[2];
		data[3] = this->emulated[3];
		data[4] = this->emulated[4];
		data[5] = this->emulated[5];
		data[6] = this->emulated[6];
		data[7] = this->emulated[7];
		data[8] = this->emulated[8];
		data[9] = this->emulated[9];
		data[10] = this->emulated[10];
		data[11] = this->emulated[11];
		data[12] = this->emulated[12];
		data[13] = this->emulated[13];
		data[14] = this->emulated[14];
		data[15] = this->emulated[15];
		#endif
	}
	// Bound and alignment checked reading
	static inline U8x16 readAligned(const dsr::SafePointer<uint8_t> data, const char* methodName) {
		const uint8_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
		data.assertInside(methodName, pointer, 16);
		#endif
		return U8x16::readAlignedUnsafe(pointer);
	}
	// Bound and alignment checked writing
	inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
		uint8_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
		data.assertInside(methodName, pointer, 16);
		#endif
		this->writeAlignedUnsafe(pointer);
	}
};
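// Usage sketch (illustrative only, not part of the library's API): round-tripping a
// vector through a stack buffer. ALIGN16 provides the 16-byte alignment that the
// aligned accessors above require. The function name is hypothetical.
static inline U8x16 exampleRoundTripU8x16(const U8x16 &value) {
	ALIGN16 uint8_t buffer[16];
	value.writeAlignedUnsafe(buffer);
	return U8x16::readAlignedUnsafe(buffer);
}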
inline dsr::String& string_toStreamIndented(dsr::String& target, const U8x16& source, const dsr::ReadableString& indentation) {
	ALIGN16 uint8_t data[16];
	source.writeAlignedUnsafe(data);
	string_append(target, indentation,
		"(", data[0], ", ", data[1], ", ", data[2], ", ", data[3], ", ", data[4], ", ", data[5], ", ", data[6], ", ", data[7],
		", ", data[8], ", ", data[9], ", ", data[10], ", ", data[11], ", ", data[12], ", ", data[13], ", ", data[14], ", ", data[15], ")"
	);
	return target;
}
inline bool operator==(const U8x16& left, const U8x16& right) {
	ALIGN16 uint8_t a[16];
	ALIGN16 uint8_t b[16];
	left.writeAlignedUnsafe(a);
	right.writeAlignedUnsafe(b);
	return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3] && a[4] == b[4] && a[5] == b[5] && a[6] == b[6] && a[7] == b[7]
		&& a[8] == b[8] && a[9] == b[9] && a[10] == b[10] && a[11] == b[11] && a[12] == b[12] && a[13] == b[13] && a[14] == b[14] && a[15] == b[15];
}
inline bool operator!=(const U8x16& left, const U8x16& right) {
	return !(left == right);
}
inline U8x16 operator+(const U8x16& left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
	return U8x16(ADD_U8_SIMD(left.v, right.v));
	#else
	return U8x16(
		left.emulated[0] + right.emulated[0],
		left.emulated[1] + right.emulated[1],
		left.emulated[2] + right.emulated[2],
		left.emulated[3] + right.emulated[3],
		left.emulated[4] + right.emulated[4],
		left.emulated[5] + right.emulated[5],
		left.emulated[6] + right.emulated[6],
		left.emulated[7] + right.emulated[7],
		left.emulated[8] + right.emulated[8],
		left.emulated[9] + right.emulated[9],
		left.emulated[10] + right.emulated[10],
		left.emulated[11] + right.emulated[11],
		left.emulated[12] + right.emulated[12],
		left.emulated[13] + right.emulated[13],
		left.emulated[14] + right.emulated[14],
		left.emulated[15] + right.emulated[15]
	);
	#endif
}
inline U8x16 operator+(uint8_t left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
	return U8x16(ADD_U8_SIMD(LOAD_SCALAR_U8_SIMD(left), right.v));
	#else
	return U8x16(
		left + right.emulated[0],
		left + right.emulated[1],
		left + right.emulated[2],
		left + right.emulated[3],
		left + right.emulated[4],
		left + right.emulated[5],
		left + right.emulated[6],
		left + right.emulated[7],
		left + right.emulated[8],
		left + right.emulated[9],
		left + right.emulated[10],
		left + right.emulated[11],
		left + right.emulated[12],
		left + right.emulated[13],
		left + right.emulated[14],
		left + right.emulated[15]
	);
	#endif
}
inline U8x16 operator+(const U8x16& left, uint8_t right) {
	#ifdef USE_BASIC_SIMD
	return U8x16(ADD_U8_SIMD(left.v, LOAD_SCALAR_U8_SIMD(right)));
	#else
	return U8x16(
		left.emulated[0] + right,
		left.emulated[1] + right,
		left.emulated[2] + right,
		left.emulated[3] + right,
		left.emulated[4] + right,
		left.emulated[5] + right,
		left.emulated[6] + right,
		left.emulated[7] + right,
		left.emulated[8] + right,
		left.emulated[9] + right,
		left.emulated[10] + right,
		left.emulated[11] + right,
		left.emulated[12] + right,
		left.emulated[13] + right,
		left.emulated[14] + right,
		left.emulated[15] + right
	);
	#endif
}
inline U8x16 operator-(const U8x16& left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
	return U8x16(SUB_U8_SIMD(left.v, right.v));
	#else
	return U8x16(
		left.emulated[0] - right.emulated[0],
		left.emulated[1] - right.emulated[1],
		left.emulated[2] - right.emulated[2],
		left.emulated[3] - right.emulated[3],
		left.emulated[4] - right.emulated[4],
		left.emulated[5] - right.emulated[5],
		left.emulated[6] - right.emulated[6],
		left.emulated[7] - right.emulated[7],
		left.emulated[8] - right.emulated[8],
		left.emulated[9] - right.emulated[9],
		left.emulated[10] - right.emulated[10],
		left.emulated[11] - right.emulated[11],
		left.emulated[12] - right.emulated[12],
		left.emulated[13] - right.emulated[13],
		left.emulated[14] - right.emulated[14],
		left.emulated[15] - right.emulated[15]
	);
	#endif
}
inline U8x16 operator-(uint8_t left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
	return U8x16(SUB_U8_SIMD(LOAD_SCALAR_U8_SIMD(left), right.v));
	#else
	return U8x16(
		left - right.emulated[0],
		left - right.emulated[1],
		left - right.emulated[2],
		left - right.emulated[3],
		left - right.emulated[4],
		left - right.emulated[5],
		left - right.emulated[6],
		left - right.emulated[7],
		left - right.emulated[8],
		left - right.emulated[9],
		left - right.emulated[10],
		left - right.emulated[11],
		left - right.emulated[12],
		left - right.emulated[13],
		left - right.emulated[14],
		left - right.emulated[15]
	);
	#endif
}
inline U8x16 operator-(const U8x16& left, uint8_t right) {
	#ifdef USE_BASIC_SIMD
	return U8x16(SUB_U8_SIMD(left.v, LOAD_SCALAR_U8_SIMD(right)));
	#else
	return U8x16(
		left.emulated[0] - right,
		left.emulated[1] - right,
		left.emulated[2] - right,
		left.emulated[3] - right,
		left.emulated[4] - right,
		left.emulated[5] - right,
		left.emulated[6] - right,
		left.emulated[7] - right,
		left.emulated[8] - right,
		left.emulated[9] - right,
		left.emulated[10] - right,
		left.emulated[11] - right,
		left.emulated[12] - right,
		left.emulated[13] - right,
		left.emulated[14] - right,
		left.emulated[15] - right
	);
	#endif
}
inline uint8_t saturateToU8(uint32_t x) {
	// No need to check the lower bound for unsigned input
	return x > 255 ? 255 : x;
}
inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
	return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
	#else
	return U8x16(
		saturateToU8((uint32_t)left.emulated[0] + (uint32_t)right.emulated[0]),
		saturateToU8((uint32_t)left.emulated[1] + (uint32_t)right.emulated[1]),
		saturateToU8((uint32_t)left.emulated[2] + (uint32_t)right.emulated[2]),
		saturateToU8((uint32_t)left.emulated[3] + (uint32_t)right.emulated[3]),
		saturateToU8((uint32_t)left.emulated[4] + (uint32_t)right.emulated[4]),
		saturateToU8((uint32_t)left.emulated[5] + (uint32_t)right.emulated[5]),
		saturateToU8((uint32_t)left.emulated[6] + (uint32_t)right.emulated[6]),
		saturateToU8((uint32_t)left.emulated[7] + (uint32_t)right.emulated[7]),
		saturateToU8((uint32_t)left.emulated[8] + (uint32_t)right.emulated[8]),
		saturateToU8((uint32_t)left.emulated[9] + (uint32_t)right.emulated[9]),
		saturateToU8((uint32_t)left.emulated[10] + (uint32_t)right.emulated[10]),
		saturateToU8((uint32_t)left.emulated[11] + (uint32_t)right.emulated[11]),
		saturateToU8((uint32_t)left.emulated[12] + (uint32_t)right.emulated[12]),
		saturateToU8((uint32_t)left.emulated[13] + (uint32_t)right.emulated[13]),
		saturateToU8((uint32_t)left.emulated[14] + (uint32_t)right.emulated[14]),
		saturateToU8((uint32_t)left.emulated[15] + (uint32_t)right.emulated[15])
	);
	#endif
}
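// Usage sketch (illustrative only): saturated addition clamps instead of wrapping.
// Brightening a pixel vector by 100 turns a lane holding 200 into 255 rather than the
// wrapped value 44 that operator+ would produce. The function name is hypothetical.
static inline U8x16 exampleSaturatedBrighten(const U8x16 &pixels) {
	return saturatedAddition(pixels, U8x16((uint8_t)100));
}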
inline I32x4 truncateToI32(const F32x4& vector) {
	#ifdef USE_BASIC_SIMD
	return I32x4(F32_TO_I32_SIMD(vector.v));
	#else
	return I32x4((int32_t)vector.emulated[0], (int32_t)vector.emulated[1], (int32_t)vector.emulated[2], (int32_t)vector.emulated[3]);
	#endif
}
inline U32x4 truncateToU32(const F32x4& vector) {
	#ifdef USE_BASIC_SIMD
	return U32x4(F32_TO_U32_SIMD(vector.v));
	#else
	return U32x4((uint32_t)vector.emulated[0], (uint32_t)vector.emulated[1], (uint32_t)vector.emulated[2], (uint32_t)vector.emulated[3]);
	#endif
}
inline F32x4 floatFromI32(const I32x4& vector) {
	#ifdef USE_BASIC_SIMD
	return F32x4(I32_TO_F32_SIMD(vector.v));
	#else
	return F32x4((float)vector.emulated[0], (float)vector.emulated[1], (float)vector.emulated[2], (float)vector.emulated[3]);
	#endif
}
inline F32x4 floatFromU32(const U32x4& vector) {
	#ifdef USE_BASIC_SIMD
	return F32x4(U32_TO_F32_SIMD(vector.v));
	#else
	return F32x4((float)vector.emulated[0], (float)vector.emulated[1], (float)vector.emulated[2], (float)vector.emulated[3]);
	#endif
}
inline I32x4 I32FromU32(const U32x4& vector) {
	#ifdef USE_BASIC_SIMD
	return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
	#else
	return I32x4((int32_t)vector.emulated[0], (int32_t)vector.emulated[1], (int32_t)vector.emulated[2], (int32_t)vector.emulated[3]);
	#endif
}
inline U32x4 U32FromI32(const I32x4& vector) {
	#ifdef USE_BASIC_SIMD
	return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
	#else
	return U32x4((uint32_t)vector.emulated[0], (uint32_t)vector.emulated[1], (uint32_t)vector.emulated[2], (uint32_t)vector.emulated[3]);
	#endif
}
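// Usage sketch (illustrative only): truncateToI32 discards the fractional part, so 1.9f
// becomes 1 and -1.9f becomes -1, matching the per-lane (int32_t) cast of the emulated
// fallback. The function name is hypothetical.
static inline I32x4 exampleTruncation() {
	return truncateToI32(F32x4(1.9f, -1.9f, 2.0f, 0.5f));
}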
// Warning! Behaviour depends on endianness.
inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
	#ifdef USE_BASIC_SIMD
	return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
	#else
	const uint8_t *source = (const uint8_t*)vector.emulated;
	return U8x16(
		source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
		source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
	);
	#endif
}
// Warning! Behaviour depends on endianness.
inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
	#ifdef USE_BASIC_SIMD
	return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
	#else
	const uint32_t *source = (const uint32_t*)vector.emulated;
	return U32x4(source[0], source[1], source[2], source[3]);
	#endif
}
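// Usage sketch (illustrative only): on a little-endian machine, reinterpreting
// U32x4(0x04030201u, ...) as bytes yields 1, 2, 3, 4, ...; on a big-endian machine the
// bytes within each 32-bit lane come out in the opposite order, which is what the
// warnings above refer to. The function name is hypothetical.
static inline U8x16 exampleBytesFromU32() {
	return reinterpret_U8FromU32(U32x4(0x04030201u, 0x08070605u, 0x0C0B0A09u, 0x100F0E0Du));
}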
// Unpacking to larger integers
inline U32x4 lowerToU32(const U16x8& vector) {
	#ifdef USE_BASIC_SIMD
	return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
	#else
	return U32x4(vector.emulated[0], vector.emulated[1], vector.emulated[2], vector.emulated[3]);
	#endif
}
inline U32x4 higherToU32(const U16x8& vector) {
	#ifdef USE_BASIC_SIMD
	return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
	#else
	return U32x4(vector.emulated[4], vector.emulated[5], vector.emulated[6], vector.emulated[7]);
	#endif
}
inline U16x8 lowerToU16(const U8x16& vector) {
	#ifdef USE_BASIC_SIMD
	return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
	#else
	return U16x8(
		vector.emulated[0], vector.emulated[1], vector.emulated[2], vector.emulated[3],
		vector.emulated[4], vector.emulated[5], vector.emulated[6], vector.emulated[7]
	);
	#endif
}
inline U16x8 higherToU16(const U8x16& vector) {
	#ifdef USE_BASIC_SIMD
	return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
	#else
	return U16x8(
		vector.emulated[8], vector.emulated[9], vector.emulated[10], vector.emulated[11],
		vector.emulated[12], vector.emulated[13], vector.emulated[14], vector.emulated[15]
	);
	#endif
}
// Saturated packing
inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
	#ifdef USE_BASIC_SIMD
	return U8x16(PACK_SAT_U16_TO_U8(lower.v, upper.v));
	#else
	return U8x16(
		saturateToU8(lower.emulated[0]),
		saturateToU8(lower.emulated[1]),
		saturateToU8(lower.emulated[2]),
		saturateToU8(lower.emulated[3]),
		saturateToU8(lower.emulated[4]),
		saturateToU8(lower.emulated[5]),
		saturateToU8(lower.emulated[6]),
		saturateToU8(lower.emulated[7]),
		saturateToU8(upper.emulated[0]),
		saturateToU8(upper.emulated[1]),
		saturateToU8(upper.emulated[2]),
		saturateToU8(upper.emulated[3]),
		saturateToU8(upper.emulated[4]),
		saturateToU8(upper.emulated[5]),
		saturateToU8(upper.emulated[6]),
		saturateToU8(upper.emulated[7])
	);
	#endif
}
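// Usage sketch (illustrative only): the common widen-compute-narrow pattern. Two byte
// vectors are widened to 16-bit lanes, summed without overflow, and packed back down
// with saturation. For plain addition, saturatedAddition above does this in one step;
// the pattern matters for operations such as weighted sums that need the extra bits.
// The function name is hypothetical.
static inline U8x16 exampleWidenedSaturatedSum(const U8x16 &a, const U8x16 &b) {
	U16x8 lowSum = lowerToU16(a) + lowerToU16(b);    // lanes 0..7 as 16-bit sums
	U16x8 highSum = higherToU16(a) + higherToU16(b); // lanes 8..15 as 16-bit sums
	return saturateToU8(lowSum, highSum);
}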
// Helper macros for generating the vector extract functions.
// Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
#ifdef USE_SSSE3
#define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
#else
// If SSSE3 is not used, emulate it using stack memory and an unaligned read of the data.
static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
	ALIGN16 uint8_t vectorBuffer[32];
	_mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
	_mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
	return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
}
#endif
#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
#elif USE_NEON
#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
#endif
#else
#define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
#define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
#define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
#define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
#define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
#endif
// Vector extraction concatenates two input vectors and reads a vector between them using an offset.
// The first and last offsets, which simply return one of the inputs, can be used for readability, because they will be inlined and removed by the compiler.
// To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
// To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
// A usage sketch follows the last overload below.
U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.emulated[1], a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0])) }
U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1])) }
U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2])) }
U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3])) }
U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4])) }
U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5])) }
U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6])) }
U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7])) }
U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8])) }
U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9])) }
U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10])) }
U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11])) }
U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12])) }
U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12], b.emulated[13])) }
U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12], b.emulated[13], b.emulated[14])) }
U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.emulated[1], a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0])) }
U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1])) }
U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2])) }
U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3])) }
U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4])) }
U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5])) }
U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6])) }
U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
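// Usage sketch (illustrative only): reading four floats shifted one element to the
// right, as described in the comments above. The last element of the left neighbor
// slides in at lane 0, which is how filters typically read just outside the current
// block without unaligned loads. The function name is hypothetical.
static inline F32x4 exampleShiftRightOneLane(const F32x4 &leftNeighbor, const F32x4 &center) {
	// Result lanes: leftNeighbor[3], center[0], center[1], center[2]
	return vectorExtract_3(leftNeighbor, center);
}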
// Gather instructions load memory from a pointer at multiple index offsets at the same time.
// The given pointers should be 4-byte aligned, so that the fallback solution works on machines with strict alignment requirements.
#ifdef USE_AVX2
#define GATHER_I32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
#define GATHER_U32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
#define GATHER_F32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
#endif
static inline U32x4 gather(const dsr::SafePointer<uint32_t> data, const U32x4 &elementOffset) {
	#ifdef USE_AVX2
	return U32x4(GATHER_U32_AVX2(data.getUnsafe(), elementOffset.v, 4));
	#else
	dsr::UVector4D elementOffsetS = elementOffset.get();
	return U32x4(
		*(data + elementOffsetS.x),
		*(data + elementOffsetS.y),
		*(data + elementOffsetS.z),
		*(data + elementOffsetS.w)
	);
	#endif
}
static inline I32x4 gather(const dsr::SafePointer<int32_t> data, const U32x4 &elementOffset) {
	#ifdef USE_AVX2
	return I32x4(GATHER_I32_AVX2(data.getUnsafe(), elementOffset.v, 4));
	#else
	dsr::UVector4D elementOffsetS = elementOffset.get();
	return I32x4(
		*(data + elementOffsetS.x),
		*(data + elementOffsetS.y),
		*(data + elementOffsetS.z),
		*(data + elementOffsetS.w)
	);
	#endif
}
static inline F32x4 gather(const dsr::SafePointer<float> data, const U32x4 &elementOffset) {
	#ifdef USE_AVX2
	return F32x4(GATHER_F32_AVX2(data.getUnsafe(), elementOffset.v, 4));
	#else
	dsr::UVector4D elementOffsetS = elementOffset.get();
	return F32x4(
		*(data + elementOffsetS.x),
		*(data + elementOffsetS.y),
		*(data + elementOffsetS.z),
		*(data + elementOffsetS.w)
	);
	#endif
}
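// Usage sketch (illustrative only): applying a look-up table to four packed indices at
// once. The offsets are element indices rather than byte offsets, and the table is
// assumed to be at least 4-byte aligned so that the non-AVX2 fallback also works on
// machines with strict alignment. The function name is hypothetical.
static inline U32x4 exampleTableLookUp(const dsr::SafePointer<uint32_t> table, const U32x4 &indices) {
	return gather(table, indices);
}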
#endif