simd.h
  1. // zlib open source license
  2. //
  3. // Copyright (c) 2017 to 2019 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. // Hardware abstraction layer for portable SIMD math.
  24. // Covers a small intersection of SSE2 and NEON in order to reduce the number
  25. // of bugs from multiple implementations when nothing advanced is required.
  26. #ifndef DFPSR_SIMD
  27. #define DFPSR_SIMD
  28. #include <stdint.h>
  29. #include <cassert>
#include <cmath> // sqrt and fabs are used by the scalar fallbacks and the approximate comparisons below
  30. #include "SafePointer.h"
  31. #include "../math/FVector.h"
  32. #include "../math/IVector.h"
  33. #include "../math/UVector.h"
  34. #define ALIGN16 __attribute__((aligned(16)))
  35. // To allow turning off SIMD intrinsics for testing
  36. #ifdef __SSE2__
  37. // Comment out this line to test without SSE2
  38. #define USE_SSE2 1
  39. #elif defined(__ARM_NEON)
  40. // Comment out this line to test without NEON
  41. #define USE_NEON 1
  42. #endif
  43. // Everything declared in here handles things specific for SSE.
  44. // Direct use of the macros will not provide portability to all hardware.
  45. #ifdef USE_SSE2
  46. #define USE_BASIC_SIMD
  47. #define USE_DIRECT_SIMD_MEMORY_ACCESS
  48. #include <emmintrin.h> // SSE2
  49. #ifdef __AVX2__
  50. #include <immintrin.h> // AVX2
  51. #define GATHER_U32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  52. // Comment out this line to test without AVX2
  53. #define USE_AVX2 1
  54. #endif
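// Example (an illustrative sketch, not part of this header): GATHER_U32_AVX2 reads four 32-bit values,
// addressing each one as SOURCE + index * SCALE bytes using the four indices in FOUR_OFFSETS.
// Here "pixels" is a hypothetical const uint32_t* image buffer and SCALE must be a compile-time
// constant of 1, 2, 4 or 8.
//   #ifdef USE_AVX2
//       __m128i offsets = _mm_set_epi32(12, 8, 4, 0);          // element indices 0, 4, 8, 12
//       __m128i texels = GATHER_U32_AVX2(pixels, offsets, 4);  // scale 4 turns the indices into byte offsets
//   #endif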
  55. // Vector types
  56. #define SIMD_F32x4 __m128
  57. #define SIMD_U8x16 __m128i
  58. #define SIMD_U16x8 __m128i
  59. #define SIMD_U32x4 __m128i
  60. #define SIMD_I32x4 __m128i
  61. // Vector uploads in address order
  62. #define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
  63. #define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
  64. #define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
  65. #define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
  66. #define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
  67. #define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
  68. #define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  69. #define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
  70. #define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
  71. #define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
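// Example (an illustrative sketch): the LOAD_VECTOR macros take their arguments in address order,
// so LOAD_VECTOR_F32_SIMD(1.0f, 2.0f, 3.0f, 4.0f) places 1.0f at the lowest address, which is why
// the arguments are reversed before being passed to _mm_set_ps and the other _mm_set intrinsics.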
  72. // Conversions
  73. #define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
  74. #define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
  75. #define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  76. #define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
  77. // Unpacking conversions
  78. #define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
  79. #define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
  80. #define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
  81. #define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
  82. // Saturated packing
  83. // Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from Intel Corporation, but without trying to emulate NEON
  84. inline SIMD_U8x16 PACK_SAT_U16_TO_U8(const SIMD_U16x8& a, const SIMD_U16x8& b) {
  85. SIMD_U16x8 mask, a2, b2;
  86. mask = _mm_set1_epi16(0x7fff);
  87. a2 = _mm_and_si128(a, mask);
  88. a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask));
  89. b2 = _mm_and_si128(b, mask);
  90. b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
  91. return _mm_packus_epi16(a2, b2);
  92. }
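// Worked example of the masking above, one 16-bit lane at a time:
//   0x0012 (18)    -> sign bit clear, kept as 0x0012, packs to 18
//   0x0100 (256)   -> kept as 0x0100, _mm_packus_epi16 saturates it to 255
//   0x8000 (32768) -> the signed compare detects that masking changed the value and forces 0x7fff, which saturates to 255
//   0xffff (65535) -> also forced to 0x7fff, saturates to 255
// Without the masking, _mm_packus_epi16 would treat 0x8000..0xffff as negative and clamp them to 0.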
  93. // Reinterpret casting
  94. #define REINTERPRET_U32_TO_U8_SIMD(A) (A)
  95. #define REINTERPRET_U32_TO_U16_SIMD(A) (A)
  96. #define REINTERPRET_U8_TO_U32_SIMD(A) (A)
  97. #define REINTERPRET_U16_TO_U32_SIMD(A) (A)
  98. #define REINTERPRET_U32_TO_I32_SIMD(A) (A)
  99. #define REINTERPRET_I32_TO_U32_SIMD(A) (A)
  100. // Vector float operations returning SIMD_F32x4
  101. #define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
  102. #define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
  103. #define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
  104. // Vector integer operations returning SIMD_I32x4
  105. #define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
  106. #define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
  107. // 32-bit integer multiplications are not available on SSE2.
  108. // Vector integer operations returning SIMD_U32x4
  109. #define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
  110. #define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
  111. // 32-bit integer multiplications are not available on SSE2.
  112. // Vector integer operations returning SIMD_U16x8
  113. #define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
  114. #define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
  115. #define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
  116. // Vector integer operations returning SIMD_U8x16
  117. #define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
  118. #define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
  119. #define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
  120. // No 8-bit multiplications
  121. // Statistics
  122. #define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
  123. #define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
  124. // Bitwise
  125. #define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
  126. #define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
  127. #endif
  128. // Everything declared in here handles things specific for NEON.
  129. // Direct use of the macros will not provide portability to all hardware.
  130. #ifdef USE_NEON
  131. #define USE_BASIC_SIMD
  132. #include <arm_neon.h> // NEON
  133. // Vector types
  134. #define SIMD_F32x4 float32x4_t
  135. #define SIMD_U8x16 uint8x16_t
  136. #define SIMD_U16x8 uint16x8_t
  137. #define SIMD_U32x4 uint32x4_t
  138. #define SIMD_I32x4 int32x4_t
  139. // Vector uploads in address order
  140. inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
  141. float data[4] ALIGN16 = {a, b, c, d};
  142. return vld1q_f32(data);
  143. }
  144. inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
  145. return vdupq_n_f32(a);
  146. }
  147. inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
  148. uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
  149. uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
  150. return vld1q_u8(data);
  151. }
  152. inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
  153. return vdupq_n_u8(a);
  154. }
  155. inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
  156. uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
  157. return vld1q_u16(data);
  158. }
  159. inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
  160. return vdupq_n_u16(a);
  161. }
  162. inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
  163. uint32_t data[4] ALIGN16 = {a, b, c, d};
  164. return vld1q_u32(data);
  165. }
  166. inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
  167. return vdupq_n_u32(a);
  168. }
  169. inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
  170. int32_t data[4] ALIGN16 = {a, b, c, d};
  171. return vld1q_s32(data);
  172. }
  173. inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
  174. return vdupq_n_s32(a);
  175. }
  176. // Conversions
  177. #define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
  178. #define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
  179. #define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
  180. #define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
  181. // Unpacking conversions
  182. #define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
  183. #define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
  184. #define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
  185. #define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
  186. // Saturated packing
  187. #define PACK_SAT_U16_TO_U8(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
  188. // Reinterpret casting
  189. #define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
  190. #define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
  191. #define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
  192. #define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
  193. #define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
  194. #define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
  195. // Vector float operations returning SIMD_F32x4
  196. #define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
  197. #define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
  198. #define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
  199. // Vector integer operations returning SIMD_I32x4
  200. #define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
  201. #define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
  202. #define MUL_I32_NEON(A, B) vmulq_s32(A, B)
  203. // Vector integer operations returning SIMD_U32x4
  204. #define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
  205. #define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
  206. #define MUL_U32_NEON(A, B) vmulq_u32(A, B)
  207. // Vector integer operations returning SIMD_U16x8
  208. #define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
  209. #define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
  210. #define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
  211. // Vector integer operations returning SIMD_U8x16
  212. #define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
  213. #define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
  214. #define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
  215. // No 8-bit multiplications
  216. // Statistics
  217. #define MIN_F32_SIMD(A, B) vminq_f32(A, B)
  218. #define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
  219. // Bitwise
  220. #define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
  221. #define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
  222. #endif
  223. /*
  224. The vector types (F32x4, I32x4, U32x4, U16x8, U8x16) below are designed to be portable across different CPU architectures.
  225. When this abstraction layer is mixed with handwritten SIMD intrinsics:
  226. Use "USE_SSE2" instead of "__SSE2__"
  227. Use "USE_AVX2" instead of "__AVX2__"
  228. Use "USE_NEON" instead of "__ARM_NEON"
  229. Portability exceptions:
  230. * The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
  231. Only use when USE_BASIC_SIMD is defined.
  232. Will not work on scalar emulation.
  233. * The "shared_memory" array is only defined for targets with direct access to SIMD registers (SSE).
  234. Only use when USE_DIRECT_SIMD_MEMORY_ACCESS is defined.
  235. Will not work on NEON or scalar emulation.
  236. * The "emulated" array is only defined when SIMD is turned off.
  237. Cannot be used when USE_BASIC_SIMD is defined.
  238. Will not work when either SSE or NEON is enabled.
  239. */
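// Example usage (an illustrative sketch, not part of this header):
//   // Portable code only touches the wrapper types and their operators.
//   F32x4 a(1.0f, 2.0f, 3.0f, 4.0f);
//   F32x4 b(2.0f);
//   F32x4 c = a * b + 1.0f;               // works on SSE2, NEON and scalar emulation
//   // Backend-specific code must be guarded by the macros described above.
//   #ifdef USE_BASIC_SIMD
//       SIMD_F32x4 native = c.v;          // the native register only exists with hardware SIMD
//   #endif
//   #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
//       float first = c.shared_memory[0]; // direct element access is SSE only
//   #endif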
  240. union F32x4 {
  241. #ifdef USE_BASIC_SIMD
  242. public:
  243. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  244. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  245. // Direct access cannot be done on NEON!
  246. float shared_memory[4];
  247. #endif
  248. // The SIMD vector of undefined type
  249. // Not accessible while emulating!
  250. SIMD_F32x4 v;
  251. // Construct a portable vector from a native SIMD vector
  252. explicit F32x4(const SIMD_F32x4& v) : v(v) {}
  253. // Construct a portable vector from a set of scalars
  254. F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
  255. // Construct a portable vector from a single duplicated scalar
  256. explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
  257. #else
  258. public:
  259. // Emulate a SIMD vector as an array of scalars without hardware support
  260. // Only accessible while emulating!
  261. float emulated[4];
  262. // Construct a portable vector from a set of scalars
  263. F32x4(float a1, float a2, float a3, float a4) {
  264. this->emulated[0] = a1;
  265. this->emulated[1] = a2;
  266. this->emulated[2] = a3;
  267. this->emulated[3] = a4;
  268. }
  269. // Construct a portable vector from a single duplicated scalar
  270. explicit F32x4(float scalar) {
  271. this->emulated[0] = scalar;
  272. this->emulated[1] = scalar;
  273. this->emulated[2] = scalar;
  274. this->emulated[3] = scalar;
  275. }
  276. #endif
  277. // Construct a portable SIMD vector from a pointer to aligned data
  278. // data must be aligned to 16 bytes (the SSE2 backend requires it)
  279. static inline F32x4 readAlignedUnsafe(const float* data) {
  280. #ifdef USE_BASIC_SIMD
  281. #ifdef USE_SSE2
  282. return F32x4(_mm_load_ps(data));
  283. #elif USE_NEON
  284. return F32x4(vld1q_f32(data));
  285. #endif
  286. #else
  287. return F32x4(data[0], data[1], data[2], data[3]);
  288. #endif
  289. }
  290. // Write to aligned memory from the existing vector
  291. // data must be aligned to 16 bytes (the SSE2 backend requires it)
  292. inline void writeAlignedUnsafe(float* data) const {
  293. #ifdef USE_BASIC_SIMD
  294. #ifdef USE_SSE2
  295. _mm_store_ps(data, this->v);
  296. #elif USE_NEON
  297. vst1q_f32(data, this->v);
  298. #endif
  299. #else
  300. data[0] = this->emulated[0];
  301. data[1] = this->emulated[1];
  302. data[2] = this->emulated[2];
  303. data[3] = this->emulated[3];
  304. #endif
  305. }
  306. #ifdef DFPSR_GEOMETRY_FVECTOR
  307. dsr::FVector4D get() const {
  308. float data[4] ALIGN16;
  309. this->writeAlignedUnsafe(data);
  310. return dsr::FVector4D(data[0], data[1], data[2], data[3]);
  311. }
  312. #endif
  313. // Bound and alignment checked reading
  314. static inline F32x4 readAligned(const dsr::SafePointer<float> data, const char* methodName) {
  315. const float* pointer = data.getUnsafe();
  316. assert(((uintptr_t)pointer & 15) == 0);
  317. #ifdef SAFE_POINTER_CHECKS
  318. data.assertInside(methodName, pointer, 16);
  319. #endif
  320. return F32x4::readAlignedUnsafe(pointer);
  321. }
  322. // Bound and alignment checked writing
  323. inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
  324. float* pointer = data.getUnsafe();
  325. assert(((uintptr_t)pointer & 15) == 0);
  326. #ifdef SAFE_POINTER_CHECKS
  327. data.assertInside(methodName, pointer, 16);
  328. #endif
  329. this->writeAlignedUnsafe(pointer);
  330. }
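// Example of the checked accessors above (an illustrative sketch; "buffer" is a hypothetical
// 16-byte aligned dsr::SafePointer<float> with at least four readable and writable elements):
//   F32x4 value = F32x4::readAligned(buffer, "myFilter (reading input)");
//   (value * 0.5f).writeAligned(buffer, "myFilter (writing result)");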
  331. // 1 / x
  332. // Useful for multiple divisions with the same denominator
  333. // Useless if the denominator is a constant
  334. F32x4 reciprocal() const {
  335. #ifdef USE_BASIC_SIMD
  336. #ifdef USE_SSE2
  337. // Approximate
  338. SIMD_F32x4 lowQ = _mm_rcp_ps(this->v);
  339. // Refine
  340. return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(this->v, MUL_F32_SIMD(lowQ, lowQ))));
  341. #elif USE_NEON
  342. // Approximate
  343. SIMD_F32x4 result = vrecpeq_f32(this->v);
  344. // Refine
  345. result = MUL_F32_SIMD(vrecpsq_f32(this->v, result), result);
  346. return F32x4(MUL_F32_SIMD(vrecpsq_f32(this->v, result), result));
  347. #else
  348. assert(false);
  349. return F32x4(0);
  350. #endif
  351. #else
  352. return F32x4(1.0f / this->emulated[0], 1.0f / this->emulated[1], 1.0f / this->emulated[2], 1.0f / this->emulated[3]);
  353. #endif
  354. }
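// Example (an illustrative sketch): dividing several vectors by the same denominator using one
// approximate reciprocal instead of one division per product. "numeratorX", "numeratorY" and
// "denominator" are hypothetical F32x4 values.
//   F32x4 invDenominator = denominator.reciprocal();
//   F32x4 x = numeratorX * invDenominator;
//   F32x4 y = numeratorY * invDenominator;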
  355. // 1 / sqrt(x)
  356. // Useful for normalizing vectors
  357. F32x4 reciprocalSquareRoot() const {
  358. #ifdef USE_BASIC_SIMD
  359. #ifdef USE_SSE2
  361. SIMD_F32x4 reciRoot = _mm_rsqrt_ps(this->v);
  362. SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(this->v, reciRoot), reciRoot);
  363. reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul));
  364. return F32x4(reciRoot);
  365. #elif USE_NEON
  366. // TODO: Test on ARM
  367. // Approximate
  368. SIMD_F32x4 reciRoot = vrsqrteq_f32(this->v);
  369. // Refine
  370. reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(this->v, reciRoot), reciRoot), reciRoot);
  371. return F32x4(reciRoot);
  372. #else
  373. assert(false);
  374. return F32x4(0);
  375. #endif
  376. #else
  377. return F32x4(1.0f / sqrt(this->emulated[0]), 1.0f / sqrt(this->emulated[1]), 1.0f / sqrt(this->emulated[2]), 1.0f / sqrt(this->emulated[3]));
  378. #endif
  379. }
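// Example (an illustrative sketch): normalizing four 3D vectors at once in SoA layout, where the
// hypothetical F32x4 values x, y and z each hold one component per lane.
//   F32x4 invLength = (x * x + y * y + z * z).reciprocalSquareRoot();
//   x = x * invLength;
//   y = y * invLength;
//   z = z * invLength;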
  380. // sqrt(x)
  381. // Useful for getting lengths of vectors
  382. F32x4 squareRoot() const {
  383. #ifdef USE_BASIC_SIMD
  384. #ifdef USE_SSE2
  385. SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
  386. // Approximate
  387. SIMD_F32x4 root = _mm_sqrt_ps(this->v);
  388. // Refine
  389. root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(this->v, root)), half);
  390. return F32x4(root);
  391. #elif USE_NEON
  392. // TODO: Test on ARM
  393. return F32x4(MUL_F32_SIMD(this->v, this->reciprocalSquareRoot().v));
  394. #else
  395. assert(false);
  396. return F32x4(0);
  397. #endif
  398. #else
  399. return F32x4(sqrt(this->emulated[0]), sqrt(this->emulated[1]), sqrt(this->emulated[2]), sqrt(this->emulated[3]));
  400. #endif
  401. }
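// Example (an illustrative sketch): getting the lengths of four 2D vectors at once, where the
// hypothetical F32x4 values x and y hold one component per lane.
//   F32x4 length = (x * x + y * y).squareRoot();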
  402. F32x4 clamp(float min, float max) const {
  403. #ifdef USE_BASIC_SIMD
  404. return F32x4(MIN_F32_SIMD(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(min)), LOAD_SCALAR_F32_SIMD(max)));
  405. #else
  406. float val0 = this->emulated[0];
  407. float val1 = this->emulated[1];
  408. float val2 = this->emulated[2];
  409. float val3 = this->emulated[3];
  410. if (min > val0) { val0 = min; }
  411. if (max < val0) { val0 = max; }
  412. if (min > val1) { val1 = min; }
  413. if (max < val1) { val1 = max; }
  414. if (min > val2) { val2 = min; }
  415. if (max < val2) { val2 = max; }
  416. if (min > val3) { val3 = min; }
  417. if (max < val3) { val3 = max; }
  418. return F32x4(val0, val1, val2, val3);
  419. #endif
  420. }
  421. F32x4 clampLower(float min) const {
  422. #ifdef USE_BASIC_SIMD
  423. return F32x4(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(min)));
  424. #else
  425. float val0 = this->emulated[0];
  426. float val1 = this->emulated[1];
  427. float val2 = this->emulated[2];
  428. float val3 = this->emulated[3];
  429. if (min > val0) { val0 = min; }
  430. if (min > val1) { val1 = min; }
  431. if (min > val2) { val2 = min; }
  432. if (min > val3) { val3 = min; }
  433. return F32x4(val0, val1, val2, val3);
  434. #endif
  435. }
  436. F32x4 clampUpper(float max) const {
  437. #ifdef USE_BASIC_SIMD
  438. return F32x4(MIN_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(max)));
  439. #else
  440. float val0 = this->emulated[0];
  441. float val1 = this->emulated[1];
  442. float val2 = this->emulated[2];
  443. float val3 = this->emulated[3];
  444. if (max < val0) { val0 = max; }
  445. if (max < val1) { val1 = max; }
  446. if (max < val2) { val2 = max; }
  447. if (max < val3) { val3 = max; }
  448. return F32x4(val0, val1, val2, val3);
  449. #endif
  450. }
  451. };
  452. inline dsr::String& string_toStreamIndented(dsr::String& target, const F32x4& source, const dsr::ReadableString& indentation) {
  453. string_append(target, indentation, source.get());
  454. return target;
  455. }
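// Equality for F32x4 is approximate, using a fixed tolerance of 0.0001 per element, so results that
// only differ by rounding between the SSE, NEON and scalar backends still compare equal.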
  456. inline bool operator==(const F32x4& left, const F32x4& right) {
  457. float a[4] ALIGN16;
  458. float b[4] ALIGN16;
  459. left.writeAlignedUnsafe(a);
  460. right.writeAlignedUnsafe(b);
  461. return fabs(a[0] - b[0]) < 0.0001f && fabs(a[1] - b[1]) < 0.0001f && fabs(a[2] - b[2]) < 0.0001f && fabs(a[3] - b[3]) < 0.0001f;
  462. }
  463. inline bool operator!=(const F32x4& left, const F32x4& right) {
  464. return !(left == right);
  465. }
  466. inline F32x4 operator+(const F32x4& left, const F32x4& right) {
  467. #ifdef USE_BASIC_SIMD
  468. return F32x4(ADD_F32_SIMD(left.v, right.v));
  469. #else
  470. return F32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
  471. #endif
  472. }
  473. inline F32x4 operator+(float left, const F32x4& right) {
  474. #ifdef USE_BASIC_SIMD
  475. return F32x4(ADD_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
  476. #else
  477. return F32x4(left + right.emulated[0], left + right.emulated[1], left + right.emulated[2], left + right.emulated[3]);
  478. #endif
  479. }
  480. inline F32x4 operator+(const F32x4& left, float right) {
  481. #ifdef USE_BASIC_SIMD
  482. return F32x4(ADD_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
  483. #else
  484. return F32x4(left.emulated[0] + right, left.emulated[1] + right, left.emulated[2] + right, left.emulated[3] + right);
  485. #endif
  486. }
  487. inline F32x4 operator-(const F32x4& left, const F32x4& right) {
  488. #ifdef USE_BASIC_SIMD
  489. return F32x4(SUB_F32_SIMD(left.v, right.v));
  490. #else
  491. return F32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
  492. #endif
  493. }
  494. inline F32x4 operator-(float left, const F32x4& right) {
  495. #ifdef USE_BASIC_SIMD
  496. return F32x4(SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
  497. #else
  498. return F32x4(left - right.emulated[0], left - right.emulated[1], left - right.emulated[2], left - right.emulated[3]);
  499. #endif
  500. }
  501. inline F32x4 operator-(const F32x4& left, float right) {
  502. #ifdef USE_BASIC_SIMD
  503. return F32x4(SUB_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
  504. #else
  505. return F32x4(left.emulated[0] - right, left.emulated[1] - right, left.emulated[2] - right, left.emulated[3] - right);
  506. #endif
  507. }
  508. inline F32x4 operator*(const F32x4& left, const F32x4& right) {
  509. #ifdef USE_BASIC_SIMD
  510. return F32x4(MUL_F32_SIMD(left.v, right.v));
  511. #else
  512. return F32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
  513. #endif
  514. }
  515. inline F32x4 operator*(float left, const F32x4& right) {
  516. #ifdef USE_BASIC_SIMD
  517. return F32x4(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
  518. #else
  519. return F32x4(left * right.emulated[0], left * right.emulated[1], left * right.emulated[2], left * right.emulated[3]);
  520. #endif
  521. }
  522. inline F32x4 operator*(const F32x4& left, float right) {
  523. #ifdef USE_BASIC_SIMD
  524. return F32x4(MUL_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
  525. #else
  526. return F32x4(left.emulated[0] * right, left.emulated[1] * right, left.emulated[2] * right, left.emulated[3] * right);
  527. #endif
  528. }
  529. inline F32x4 min(const F32x4& left, const F32x4& right) {
  530. #ifdef USE_BASIC_SIMD
  531. return F32x4(MIN_F32_SIMD(left.v, right.v));
  532. #else
  533. float v0 = left.emulated[0];
  534. float v1 = left.emulated[1];
  535. float v2 = left.emulated[2];
  536. float v3 = left.emulated[3];
  537. float r0 = right.emulated[0];
  538. float r1 = right.emulated[1];
  539. float r2 = right.emulated[2];
  540. float r3 = right.emulated[3];
  541. if (r0 < v0) { v0 = r0; }
  542. if (r1 < v1) { v1 = r1; }
  543. if (r2 < v2) { v2 = r2; }
  544. if (r3 < v3) { v3 = r3; }
  545. return F32x4(v0, v1, v2, v3);
  546. #endif
  547. }
  548. inline F32x4 max(const F32x4& left, const F32x4& right) {
  549. #ifdef USE_BASIC_SIMD
  550. return F32x4(MAX_F32_SIMD(left.v, right.v));
  551. #else
  552. float v0 = left.emulated[0];
  553. float v1 = left.emulated[1];
  554. float v2 = left.emulated[2];
  555. float v3 = left.emulated[3];
  556. float r0 = right.emulated[0];
  557. float r1 = right.emulated[1];
  558. float r2 = right.emulated[2];
  559. float r3 = right.emulated[3];
  560. if (r0 > v0) { v0 = r0; }
  561. if (r1 > v1) { v1 = r1; }
  562. if (r2 > v2) { v2 = r2; }
  563. if (r3 > v3) { v3 = r3; }
  564. return F32x4(v0, v1, v2, v3);
  565. #endif
  566. }
  567. union I32x4 {
  568. #ifdef USE_BASIC_SIMD
  569. public:
  570. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  571. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  572. // Direct access cannot be done on NEON!
  573. int32_t shared_memory[4];
  574. #endif
  575. // The SIMD vector of undefined type
  576. // Not accessible while emulating!
  577. SIMD_I32x4 v;
  578. // Construct a portable vector from a native SIMD vector
  579. explicit I32x4(const SIMD_I32x4& v) : v(v) {}
  580. // Construct a portable vector from a set of scalars
  581. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
  582. // Construct a portable vector from a single duplicated scalar
  583. explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
  584. #else
  585. public:
  586. // Emulate a SIMD vector as an array of scalars without hardware support
  587. // Only accessible while emulating!
  588. int32_t emulated[4];
  589. // Construct a portable vector from a set of scalars
  590. I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
  591. this->emulated[0] = a1;
  592. this->emulated[1] = a2;
  593. this->emulated[2] = a3;
  594. this->emulated[3] = a4;
  595. }
  596. // Construct a portable vector from a single duplicated scalar
  597. explicit I32x4(int32_t scalar) {
  598. this->emulated[0] = scalar;
  599. this->emulated[1] = scalar;
  600. this->emulated[2] = scalar;
  601. this->emulated[3] = scalar;
  602. }
  603. #endif
  604. // Construct a portable SIMD vector from a pointer to aligned data
  605. // data must be aligned to 16 bytes (the SSE2 backend requires it)
  606. static inline I32x4 readAlignedUnsafe(const int32_t* data) {
  607. #ifdef USE_BASIC_SIMD
  608. #ifdef USE_SSE2
  609. return I32x4(_mm_load_si128((const __m128i*)data));
  610. #elif USE_NEON
  611. return I32x4(vld1q_s32(data));
  612. #endif
  613. #else
  614. return I32x4(data[0], data[1], data[2], data[3]);
  615. #endif
  616. }
  617. // Write to aligned memory from the existing vector
  618. // data must be aligned to 16 bytes (the SSE2 backend requires it)
  619. inline void writeAlignedUnsafe(int32_t* data) const {
  620. #ifdef USE_BASIC_SIMD
  621. #ifdef USE_SSE2
  622. _mm_store_si128((__m128i*)data, this->v);
  623. #elif USE_NEON
  624. vst1q_s32(data, this->v);
  625. #endif
  626. #else
  627. data[0] = this->emulated[0];
  628. data[1] = this->emulated[1];
  629. data[2] = this->emulated[2];
  630. data[3] = this->emulated[3];
  631. #endif
  632. }
  633. #ifdef DFPSR_GEOMETRY_IVECTOR
  634. dsr::IVector4D get() const {
  635. int32_t data[4] ALIGN16;
  636. this->writeAlignedUnsafe(data);
  637. return dsr::IVector4D(data[0], data[1], data[2], data[3]);
  638. }
  639. #endif
  640. // Bound and alignment checked reading
  641. static inline I32x4 readAligned(const dsr::SafePointer<int32_t> data, const char* methodName) {
  642. const int32_t* pointer = data.getUnsafe();
  643. assert(((uintptr_t)pointer & 15) == 0);
  644. #ifdef SAFE_POINTER_CHECKS
  645. data.assertInside(methodName, pointer, 16);
  646. #endif
  647. return I32x4::readAlignedUnsafe(pointer);
  648. }
  649. // Bound and alignment checked writing
  650. inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
  651. int32_t* pointer = data.getUnsafe();
  652. assert(((uintptr_t)pointer & 15) == 0);
  653. #ifdef SAFE_POINTER_CHECKS
  654. data.assertInside(methodName, pointer, 16);
  655. #endif
  656. this->writeAlignedUnsafe(pointer);
  657. }
  658. };
  659. inline dsr::String& string_toStreamIndented(dsr::String& target, const I32x4& source, const dsr::ReadableString& indentation) {
  660. string_append(target, indentation, source.get());
  661. return target;
  662. }
  663. inline bool operator==(const I32x4& left, const I32x4& right) {
  664. int32_t a[4] ALIGN16;
  665. int32_t b[4] ALIGN16;
  666. left.writeAlignedUnsafe(a);
  667. right.writeAlignedUnsafe(b);
  668. return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
  669. }
  670. inline bool operator!=(const I32x4& left, const I32x4& right) {
  671. return !(left == right);
  672. }
  673. inline I32x4 operator+(const I32x4& left, const I32x4& right) {
  674. #ifdef USE_BASIC_SIMD
  675. return I32x4(ADD_I32_SIMD(left.v, right.v));
  676. #else
  677. return I32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
  678. #endif
  679. }
  680. inline I32x4 operator+(int32_t left, const I32x4& right) {
  681. return I32x4(left) + right;
  682. }
  683. inline I32x4 operator+(const I32x4& left, int32_t right) {
  684. return left + I32x4(right);
  685. }
  686. inline I32x4 operator-(const I32x4& left, const I32x4& right) {
  687. #ifdef USE_BASIC_SIMD
  688. return I32x4(SUB_I32_SIMD(left.v, right.v));
  689. #else
  690. return I32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
  691. #endif
  692. }
  693. inline I32x4 operator-(int32_t left, const I32x4& right) {
  694. return I32x4(left) - right;
  695. }
  696. inline I32x4 operator-(const I32x4& left, int32_t right) {
  697. return left - I32x4(right);
  698. }
  699. inline I32x4 operator*(const I32x4& left, const I32x4& right) {
  700. #ifdef USE_BASIC_SIMD
  701. #ifdef USE_SSE2
  702. // SSE2 has no packed 32-bit multiply (_mm_mullo_epi32 requires SSE4.1), so multiply element by element through memory
  703. return I32x4(left.shared_memory[0] * right.shared_memory[0], left.shared_memory[1] * right.shared_memory[1], left.shared_memory[2] * right.shared_memory[2], left.shared_memory[3] * right.shared_memory[3]);
  704. #elif USE_NEON
  705. return I32x4(MUL_I32_NEON(left.v, right.v));
  706. #endif
  707. #else
  708. return I32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
  709. #endif
  710. }
  711. inline I32x4 operator*(int32_t left, const I32x4& right) {
  712. return I32x4(left) * right;
  713. }
  714. inline I32x4 operator*(const I32x4& left, int32_t right) {
  715. return left * I32x4(right);
  716. }
  717. union U32x4 {
  718. #ifdef USE_BASIC_SIMD
  719. public:
  720. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  721. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  722. // Direct access cannot be done on NEON!
  723. uint32_t shared_memory[4];
  724. #endif
  725. // The SIMD vector of undefined type
  726. // Not accessible while emulating!
  727. SIMD_U32x4 v;
  728. // Construct a portable vector from a native SIMD vector
  729. explicit U32x4(const SIMD_U32x4& v) : v(v) {}
  730. // Construct a portable vector from a set of scalars
  731. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
  732. // Construct a portable vector from a single duplicated scalar
  733. explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
  734. #else
  735. public:
  736. // Emulate a SIMD vector as an array of scalars without hardware support
  737. // Only accessible while emulating!
  738. uint32_t emulated[4];
  739. // Construct a portable vector from a set of scalars
  740. U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
  741. this->emulated[0] = a1;
  742. this->emulated[1] = a2;
  743. this->emulated[2] = a3;
  744. this->emulated[3] = a4;
  745. }
  746. // Construct a portable vector from a single duplicated scalar
  747. explicit U32x4(uint32_t scalar) {
  748. this->emulated[0] = scalar;
  749. this->emulated[1] = scalar;
  750. this->emulated[2] = scalar;
  751. this->emulated[3] = scalar;
  752. }
  753. #endif
  754. // Construct a portable SIMD vector from a pointer to aligned data
  755. // data must be aligned to 16 bytes (the SSE2 backend requires it)
  756. static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
  757. #ifdef USE_BASIC_SIMD
  758. #ifdef USE_SSE2
  759. return U32x4(_mm_load_si128((const __m128i*)data));
  760. #elif USE_NEON
  761. return U32x4(vld1q_u32(data));
  762. #endif
  763. #else
  764. return U32x4(data[0], data[1], data[2], data[3]);
  765. #endif
  766. }
  767. // Write to aligned memory from the existing vector
  768. // data must be aligned to 16 bytes (the SSE2 backend requires it)
  769. inline void writeAlignedUnsafe(uint32_t* data) const {
  770. #ifdef USE_BASIC_SIMD
  771. #ifdef USE_SSE2
  772. _mm_store_si128((__m128i*)data, this->v);
  773. #elif USE_NEON
  774. vst1q_u32(data, this->v);
  775. #endif
  776. #else
  777. data[0] = this->emulated[0];
  778. data[1] = this->emulated[1];
  779. data[2] = this->emulated[2];
  780. data[3] = this->emulated[3];
  781. #endif
  782. }
  783. #ifdef DFPSR_GEOMETRY_UVECTOR
  784. dsr::UVector4D get() const {
  785. uint32_t data[4] ALIGN16;
  786. this->writeAlignedUnsafe(data);
  787. return dsr::UVector4D(data[0], data[1], data[2], data[3]);
  788. }
  789. #endif
  790. // Bound and alignment checked reading
  791. static inline U32x4 readAligned(const dsr::SafePointer<uint32_t> data, const char* methodName) {
  792. const uint32_t* pointer = data.getUnsafe();
  793. assert(((uintptr_t)pointer & 15) == 0);
  794. #ifdef SAFE_POINTER_CHECKS
  795. data.assertInside(methodName, pointer, 16);
  796. #endif
  797. return U32x4::readAlignedUnsafe(pointer);
  798. }
  799. // Bound and alignment checked writing
  800. inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
  801. uint32_t* pointer = data.getUnsafe();
  802. assert(((uintptr_t)pointer & 15) == 0);
  803. #ifdef SAFE_POINTER_CHECKS
  804. data.assertInside(methodName, pointer, 16);
  805. #endif
  806. this->writeAlignedUnsafe(pointer);
  807. }
  808. };
  809. inline dsr::String& string_toStreamIndented(dsr::String& target, const U32x4& source, const dsr::ReadableString& indentation) {
  810. string_append(target, indentation, source.get());
  811. return target;
  812. }
  813. inline bool operator==(const U32x4& left, const U32x4& right) {
  814. uint32_t a[4] ALIGN16;
  815. uint32_t b[4] ALIGN16;
  816. left.writeAlignedUnsafe(a);
  817. right.writeAlignedUnsafe(b);
  818. return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
  819. }
  820. inline bool operator!=(const U32x4& left, const U32x4& right) {
  821. return !(left == right);
  822. }
  823. inline U32x4 operator+(const U32x4& left, const U32x4& right) {
  824. #ifdef USE_BASIC_SIMD
  825. return U32x4(ADD_U32_SIMD(left.v, right.v));
  826. #else
  827. return U32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
  828. #endif
  829. }
  830. inline U32x4 operator+(uint32_t left, const U32x4& right) {
  831. return U32x4(left) + right;
  832. }
  833. inline U32x4 operator+(const U32x4& left, uint32_t right) {
  834. return left + U32x4(right);
  835. }
  836. inline U32x4 operator-(const U32x4& left, const U32x4& right) {
  837. #ifdef USE_BASIC_SIMD
  838. return U32x4(SUB_U32_SIMD(left.v, right.v));
  839. #else
  840. return U32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
  841. #endif
  842. }
  843. inline U32x4 operator-(uint32_t left, const U32x4& right) {
  844. return U32x4(left) - right;
  845. }
  846. inline U32x4 operator-(const U32x4& left, uint32_t right) {
  847. return left - U32x4(right);
  848. }
  849. inline U32x4 operator*(const U32x4& left, const U32x4& right) {
  850. #ifdef USE_BASIC_SIMD
  851. #ifdef USE_SSE2
  852. // SSE2 has no packed 32-bit multiply (_mm_mullo_epi32 requires SSE4.1), so multiply element by element through memory
  853. return U32x4(left.shared_memory[0] * right.shared_memory[0], left.shared_memory[1] * right.shared_memory[1], left.shared_memory[2] * right.shared_memory[2], left.shared_memory[3] * right.shared_memory[3]);
  854. #else // NEON
  855. return U32x4(MUL_U32_NEON(left.v, right.v));
  856. #endif
  857. #else
  858. return U32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
  859. #endif
  860. }
  861. inline U32x4 operator*(uint32_t left, const U32x4& right) {
  862. return U32x4(left) * right;
  863. }
  864. inline U32x4 operator*(const U32x4& left, uint32_t right) {
  865. return left * U32x4(right);
  866. }
  867. inline U32x4 operator&(const U32x4& left, const U32x4& right) {
  868. #ifdef USE_BASIC_SIMD
  869. return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
  870. #else
  871. return U32x4(left.emulated[0] & right.emulated[0], left.emulated[1] & right.emulated[1], left.emulated[2] & right.emulated[2], left.emulated[3] & right.emulated[3]);
  872. #endif
  873. }
  874. inline U32x4 operator&(const U32x4& left, uint32_t mask) {
  875. #ifdef USE_BASIC_SIMD
  876. return U32x4(BITWISE_AND_U32_SIMD(left.v, LOAD_SCALAR_U32_SIMD(mask)));
  877. #else
  878. return U32x4(left.emulated[0] & mask, left.emulated[1] & mask, left.emulated[2] & mask, left.emulated[3] & mask);
  879. #endif
  880. }
  881. inline U32x4 operator|(const U32x4& left, const U32x4& right) {
  882. #ifdef USE_BASIC_SIMD
  883. return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
  884. #else
  885. return U32x4(left.emulated[0] | right.emulated[0], left.emulated[1] | right.emulated[1], left.emulated[2] | right.emulated[2], left.emulated[3] | right.emulated[3]);
  886. #endif
  887. }
  888. inline U32x4 operator|(const U32x4& left, uint32_t mask) {
  889. #ifdef USE_BASIC_SIMD
  890. return U32x4(BITWISE_OR_U32_SIMD(left.v, LOAD_SCALAR_U32_SIMD(mask)));
  891. #else
  892. return U32x4(left.emulated[0] | mask, left.emulated[1] | mask, left.emulated[2] | mask, left.emulated[3] | mask);
  893. #endif
  894. }
  895. inline U32x4 operator<<(const U32x4& left, uint32_t bitOffset) {
  896. #ifdef USE_SSE2
  897. return U32x4(_mm_slli_epi32(left.v, bitOffset));
  898. #else
  899. #ifdef USE_NEON
  900. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
  901. #else
  902. return U32x4(left.emulated[0] << bitOffset, left.emulated[1] << bitOffset, left.emulated[2] << bitOffset, left.emulated[3] << bitOffset);
  903. #endif
  904. #endif
  905. }
  906. inline U32x4 operator>>(const U32x4& left, uint32_t bitOffset) {
  907. #ifdef USE_SSE2
  908. return U32x4(_mm_srli_epi32(left.v, bitOffset));
  909. #else
  910. #ifdef USE_NEON
  911. return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-(int32_t)bitOffset)));
  912. #else
  913. return U32x4(left.emulated[0] >> bitOffset, left.emulated[1] >> bitOffset, left.emulated[2] >> bitOffset, left.emulated[3] >> bitOffset);
  914. #endif
  915. #endif
  916. }
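// Example (an illustrative sketch): using the shift and mask operators to unpack an 8-bit channel
// from four packed 32-bit pixels, where "pixels" is a hypothetical U32x4 and the channel is assumed
// to start at bit 8.
//   U32x4 channel = (pixels >> 8) & 255u;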
  917. union U16x8 {
  918. #ifdef USE_BASIC_SIMD
  919. public:
  920. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  921. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  922. // Direct access cannot be done on NEON!
  923. uint16_t shared_memory[8];
  924. #endif
  925. // The SIMD vector of undefined type
  926. // Not accessible while emulating!
  927. SIMD_U16x8 v;
  928. // Construct a portable vector from a native SIMD vector
  929. explicit U16x8(const SIMD_U16x8& v) : v(v) {}
  930. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  931. // Reinterpret casting is used
  932. explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
  933. // Construct a portable vector from a set of scalars
  934. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
  935. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  936. // Reinterpret casting is used
  937. // TODO: Remove all reinterprets from constructors to improve readability
  938. explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
  939. // Construct a portable vector from a single duplicated scalar
  940. explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
  941. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  942. U32x4 get_U32() const {
  943. return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
  944. }
  945. #else
  946. public:
  947. // Emulate a SIMD vector as an array of scalars without hardware support
  948. // Only accessible while emulating!
  949. uint16_t emulated[8];
  950. // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
  951. // Reinterpret casting is used
  952. explicit U16x8(const U32x4& vector) {
  953. uint64_t *target = (uint64_t*)this->emulated;
  954. uint64_t *source = (uint64_t*)vector.emulated;
  955. target[0] = source[0];
  956. target[1] = source[1];
  957. }
  958. // Construct a portable vector from a set of scalars
  959. U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
  960. this->emulated[0] = a1;
  961. this->emulated[1] = a2;
  962. this->emulated[2] = a3;
  963. this->emulated[3] = a4;
  964. this->emulated[4] = a5;
  965. this->emulated[5] = a6;
  966. this->emulated[6] = a7;
  967. this->emulated[7] = a8;
  968. }
  969. // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
  970. // Reinterpret casting is used
  971. explicit U16x8(uint32_t scalar) {
  972. uint32_t *target = (uint32_t*)this->emulated;
  973. target[0] = scalar;
  974. target[1] = scalar;
  975. target[2] = scalar;
  976. target[3] = scalar;
  977. }
  978. // Construct a portable vector from a single duplicated scalar
  979. explicit U16x8(uint16_t scalar) {
  980. this->emulated[0] = scalar;
  981. this->emulated[1] = scalar;
  982. this->emulated[2] = scalar;
  983. this->emulated[3] = scalar;
  984. this->emulated[4] = scalar;
  985. this->emulated[5] = scalar;
  986. this->emulated[6] = scalar;
  987. this->emulated[7] = scalar;
  988. }
  989. // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
  990. U32x4 get_U32() const {
  991. U32x4 result(0);
  992. uint64_t *target = (uint64_t*)result.emulated;
  993. uint64_t *source = (uint64_t*)this->emulated;
  994. target[0] = source[0];
  995. target[1] = source[1];
  996. return result;
  997. }
  998. #endif
  999. // data must be aligned to 16 bytes (the SSE2 backend requires it)
  1003. static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
  1004. #ifdef USE_BASIC_SIMD
  1005. #ifdef USE_SSE2
  1006. return U16x8(_mm_load_si128((const __m128i*)data));
  1007. #elif USE_NEON
  1008. return U16x8(vld1q_u16(data));
  1009. #endif
  1010. #else
  1011. return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
  1012. #endif
  1013. }
  1014. // data must be aligned to 16 bytes (the SSE2 backend requires it)
  1015. inline void writeAlignedUnsafe(uint16_t* data) const {
  1016. #ifdef USE_BASIC_SIMD
  1017. #ifdef USE_SSE2
  1018. _mm_store_si128((__m128i*)data, this->v);
  1019. #elif USE_NEON
  1020. vst1q_u16(data, this->v);
  1021. #endif
  1022. #else
  1023. data[0] = this->emulated[0];
  1024. data[1] = this->emulated[1];
  1025. data[2] = this->emulated[2];
  1026. data[3] = this->emulated[3];
  1027. data[4] = this->emulated[4];
  1028. data[5] = this->emulated[5];
  1029. data[6] = this->emulated[6];
  1030. data[7] = this->emulated[7];
  1031. #endif
  1032. }
  1033. // Bound and alignment checked reading
  1034. static inline U16x8 readAligned(const dsr::SafePointer<uint16_t> data, const char* methodName) {
  1035. const uint16_t* pointer = data.getUnsafe();
  1036. assert(((uintptr_t)pointer & 15) == 0);
  1037. #ifdef SAFE_POINTER_CHECKS
  1038. data.assertInside(methodName, pointer, 16);
  1039. #endif
  1040. return U16x8::readAlignedUnsafe(pointer);
  1041. }
  1042. // Bound and alignment checked writing
  1043. inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
  1044. uint16_t* pointer = data.getUnsafe();
  1045. assert(((uintptr_t)pointer & 15) == 0);
  1046. #ifdef SAFE_POINTER_CHECKS
  1047. data.assertInside(methodName, pointer, 16);
  1048. #endif
  1049. this->writeAlignedUnsafe(pointer);
  1050. }
  1051. };
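// Note on the reinterpret constructors above (an illustrative sketch): they only change how the same
// 128 bits are viewed. On a little-endian target, constructing U16x8 from the 32-bit pattern
// 0x00FF00FF yields eight 16-bit lanes that each hold 0x00FF.
//   U16x8 mask(uint32_t(0x00FF00FFu)); // the same bits viewed as eight 16-bit lanes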
  1052. inline dsr::String& string_toStreamIndented(dsr::String& target, const U16x8& source, const dsr::ReadableString& indentation) {
  1053. ALIGN16 uint16_t data[8];
  1054. source.writeAlignedUnsafe(data);
  1055. string_append(target, indentation, "(", data[0], ", ", data[1], ", ", data[2], ", ", data[3], ", ", data[4], ", ", data[5], ", ", data[6], ", ", data[7], ")");
  1056. return target;
  1057. }
  1058. inline bool operator==(const U16x8& left, const U16x8& right) {
  1059. ALIGN16 uint16_t a[8];
  1060. ALIGN16 uint16_t b[8];
  1061. left.writeAlignedUnsafe(a);
  1062. right.writeAlignedUnsafe(b);
  1063. return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3] && a[4] == b[4] && a[5] == b[5] && a[6] == b[6] && a[7] == b[7];
  1064. }
  1065. inline bool operator!=(const U16x8& left, const U16x8& right) {
  1066. return !(left == right);
  1067. }
  1068. inline U16x8 operator+(const U16x8& left, const U16x8& right) {
  1069. #ifdef USE_BASIC_SIMD
  1070. return U16x8(ADD_U16_SIMD(left.v, right.v));
  1071. #else
  1072. return U16x8(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3],
  1073. left.emulated[4] + right.emulated[4], left.emulated[5] + right.emulated[5], left.emulated[6] + right.emulated[6], left.emulated[7] + right.emulated[7]);
  1074. #endif
  1075. }
  1076. inline U16x8 operator+(uint16_t left, const U16x8& right) {
  1077. #ifdef USE_BASIC_SIMD
  1078. return U16x8(ADD_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
  1079. #else
  1080. return U16x8(left + right.emulated[0], left + right.emulated[1], left + right.emulated[2], left + right.emulated[3],
  1081. left + right.emulated[4], left + right.emulated[5], left + right.emulated[6], left + right.emulated[7]);
  1082. #endif
  1083. }
  1084. inline U16x8 operator+(const U16x8& left, uint16_t right) {
  1085. #ifdef USE_BASIC_SIMD
  1086. return U16x8(ADD_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
  1087. #else
  1088. return U16x8(left.emulated[0] + right, left.emulated[1] + right, left.emulated[2] + right, left.emulated[3] + right,
  1089. left.emulated[4] + right, left.emulated[5] + right, left.emulated[6] + right, left.emulated[7] + right);
  1090. #endif
  1091. }
  1092. inline U16x8 operator-(const U16x8& left, const U16x8& right) {
  1093. #ifdef USE_BASIC_SIMD
  1094. return U16x8(SUB_U16_SIMD(left.v, right.v));
  1095. #else
  1096. return U16x8(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3],
  1097. left.emulated[4] - right.emulated[4], left.emulated[5] - right.emulated[5], left.emulated[6] - right.emulated[6], left.emulated[7] - right.emulated[7]);
  1098. #endif
  1099. }
  1100. inline U16x8 operator-(uint16_t left, const U16x8& right) {
  1101. #ifdef USE_BASIC_SIMD
  1102. return U16x8(SUB_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
  1103. #else
  1104. return U16x8(left - right.emulated[0], left - right.emulated[1], left - right.emulated[2], left - right.emulated[3],
  1105. left - right.emulated[4], left - right.emulated[5], left - right.emulated[6], left - right.emulated[7]);
  1106. #endif
  1107. }
  1108. inline U16x8 operator-(const U16x8& left, uint16_t right) {
  1109. #ifdef USE_BASIC_SIMD
  1110. return U16x8(SUB_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
  1111. #else
  1112. return U16x8(left.emulated[0] - right, left.emulated[1] - right, left.emulated[2] - right, left.emulated[3] - right,
  1113. left.emulated[4] - right, left.emulated[5] - right, left.emulated[6] - right, left.emulated[7] - right);
  1114. #endif
  1115. }
  1116. inline U16x8 operator*(const U16x8& left, const U16x8& right) {
  1117. #ifdef USE_BASIC_SIMD
  1118. return U16x8(MUL_U16_SIMD(left.v, right.v));
  1119. #else
  1120. return U16x8(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3],
  1121. left.emulated[4] * right.emulated[4], left.emulated[5] * right.emulated[5], left.emulated[6] * right.emulated[6], left.emulated[7] * right.emulated[7]);
  1122. #endif
  1123. }
  1124. inline U16x8 operator*(uint16_t left, const U16x8& right) {
  1125. #ifdef USE_BASIC_SIMD
  1126. return U16x8(MUL_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
  1127. #else
  1128. return U16x8(left * right.emulated[0], left * right.emulated[1], left * right.emulated[2], left * right.emulated[3],
  1129. left * right.emulated[4], left * right.emulated[5], left * right.emulated[6], left * right.emulated[7]);
  1130. #endif
  1131. }
  1132. inline U16x8 operator*(const U16x8& left, uint16_t right) {
  1133. #ifdef USE_BASIC_SIMD
  1134. return U16x8(MUL_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
  1135. #else
  1136. return U16x8(
  1137. left.emulated[0] * right, left.emulated[1] * right, left.emulated[2] * right, left.emulated[3] * right,
  1138. left.emulated[4] * right, left.emulated[5] * right, left.emulated[6] * right, left.emulated[7] * right
  1139. );
  1140. #endif
  1141. }
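// Example (an illustrative sketch): scaling eight 16-bit values by a small constant. The
// multiplication keeps only the low 16 bits of each product, so the inputs must be small enough for
// the results to fit in 16 bits. "values" is a hypothetical U16x8.
//   U16x8 scaled = values * uint16_t(3);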
union U8x16 {
	#ifdef USE_BASIC_SIMD
		public:
		#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
			// Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
			// Direct access cannot be done on NEON!
			uint8_t shared_memory[16];
		#endif
		// The SIMD vector of undefined type
		// Not accessible while emulating!
		SIMD_U8x16 v;
		// Construct a portable vector from a native SIMD vector
		explicit U8x16(const SIMD_U8x16& v) : v(v) {}
		// Construct a portable vector from a set of scalars
		U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
			uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
		: v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
		// Construct a portable vector from a single duplicated scalar
		explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
	#else
		public:
		// Emulate a SIMD vector as an array of scalars without hardware support
		// Only accessible while emulating!
		uint8_t emulated[16];
		// Construct a portable vector from a set of scalars
		U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
			uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
			this->emulated[0] = a1;
			this->emulated[1] = a2;
			this->emulated[2] = a3;
			this->emulated[3] = a4;
			this->emulated[4] = a5;
			this->emulated[5] = a6;
			this->emulated[6] = a7;
			this->emulated[7] = a8;
			this->emulated[8] = a9;
			this->emulated[9] = a10;
			this->emulated[10] = a11;
			this->emulated[11] = a12;
			this->emulated[12] = a13;
			this->emulated[13] = a14;
			this->emulated[14] = a15;
			this->emulated[15] = a16;
		}
		// Construct a portable vector from a single duplicated scalar
		explicit U8x16(uint8_t scalar) {
			this->emulated[0] = scalar;
			this->emulated[1] = scalar;
			this->emulated[2] = scalar;
			this->emulated[3] = scalar;
			this->emulated[4] = scalar;
			this->emulated[5] = scalar;
			this->emulated[6] = scalar;
			this->emulated[7] = scalar;
			this->emulated[8] = scalar;
			this->emulated[9] = scalar;
			this->emulated[10] = scalar;
			this->emulated[11] = scalar;
			this->emulated[12] = scalar;
			this->emulated[13] = scalar;
			this->emulated[14] = scalar;
			this->emulated[15] = scalar;
		}
	#endif
	static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				return U8x16(_mm_load_si128((const __m128i*)data));
			#elif USE_NEON
				return U8x16(vld1q_u8(data));
			#endif
		#else
			return U8x16(
				data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
				data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
			);
		#endif
	}
	// data must be aligned to 16 bytes, which the aligned SIMD store requires; the emulated fallback has no alignment requirement
	inline void writeAlignedUnsafe(uint8_t* data) const {
		#ifdef USE_BASIC_SIMD
			#ifdef USE_SSE2
				_mm_store_si128((__m128i*)data, this->v);
			#elif USE_NEON
				vst1q_u8(data, this->v);
			#endif
		#else
			data[0] = this->emulated[0];
			data[1] = this->emulated[1];
			data[2] = this->emulated[2];
			data[3] = this->emulated[3];
			data[4] = this->emulated[4];
			data[5] = this->emulated[5];
			data[6] = this->emulated[6];
			data[7] = this->emulated[7];
			data[8] = this->emulated[8];
			data[9] = this->emulated[9];
			data[10] = this->emulated[10];
			data[11] = this->emulated[11];
			data[12] = this->emulated[12];
			data[13] = this->emulated[13];
			data[14] = this->emulated[14];
			data[15] = this->emulated[15];
		#endif
	}
	// Bounds and alignment checked reading
	static inline U8x16 readAligned(const dsr::SafePointer<uint8_t> data, const char* methodName) {
		const uint8_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		return U8x16::readAlignedUnsafe(pointer);
	}
	// Bounds and alignment checked writing
	inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
		uint8_t* pointer = data.getUnsafe();
		assert(((uintptr_t)pointer & 15) == 0);
		#ifdef SAFE_POINTER_CHECKS
			data.assertInside(methodName, pointer, 16);
		#endif
		this->writeAlignedUnsafe(pointer);
	}
};
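// Illustrative usage sketch (not part of the header's API): round-tripping a U8x16 through 16-byte aligned memory
// using the ALIGN16 macro that this header already relies on.
//   ALIGN16 uint8_t buffer[16];
//   U8x16 gray(uint8_t(128));                      // broadcast one byte to all 16 lanes
//   gray.writeAlignedUnsafe(buffer);               // buffer must be 16-byte aligned
//   U8x16 copy = U8x16::readAlignedUnsafe(buffer); // copy == gray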
inline dsr::String& string_toStreamIndented(dsr::String& target, const U8x16& source, const dsr::ReadableString& indentation) {
	ALIGN16 uint8_t data[16];
	source.writeAlignedUnsafe(data);
	string_append(target, indentation,
		"(", data[0], ", ", data[1], ", ", data[2], ", ", data[3], ", ", data[4], ", ", data[5], ", ", data[6], ", ", data[7],
		", ", data[8], ", ", data[9], ", ", data[10], ", ", data[11], ", ", data[12], ", ", data[13], ", ", data[14], ", ", data[15], ")"
	);
	return target;
}
inline bool operator==(const U8x16& left, const U8x16& right) {
	ALIGN16 uint8_t a[16];
	ALIGN16 uint8_t b[16];
	left.writeAlignedUnsafe(a);
	right.writeAlignedUnsafe(b);
	return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3] && a[4] == b[4] && a[5] == b[5] && a[6] == b[6] && a[7] == b[7]
		&& a[8] == b[8] && a[9] == b[9] && a[10] == b[10] && a[11] == b[11] && a[12] == b[12] && a[13] == b[13] && a[14] == b[14] && a[15] == b[15];
}
inline bool operator!=(const U8x16& left, const U8x16& right) {
	return !(left == right);
}
inline U8x16 operator+(const U8x16& left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(ADD_U8_SIMD(left.v, right.v));
	#else
		return U8x16(
			left.emulated[0] + right.emulated[0],
			left.emulated[1] + right.emulated[1],
			left.emulated[2] + right.emulated[2],
			left.emulated[3] + right.emulated[3],
			left.emulated[4] + right.emulated[4],
			left.emulated[5] + right.emulated[5],
			left.emulated[6] + right.emulated[6],
			left.emulated[7] + right.emulated[7],
			left.emulated[8] + right.emulated[8],
			left.emulated[9] + right.emulated[9],
			left.emulated[10] + right.emulated[10],
			left.emulated[11] + right.emulated[11],
			left.emulated[12] + right.emulated[12],
			left.emulated[13] + right.emulated[13],
			left.emulated[14] + right.emulated[14],
			left.emulated[15] + right.emulated[15]
		);
	#endif
}
inline U8x16 operator+(uint8_t left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(ADD_U8_SIMD(LOAD_SCALAR_U8_SIMD(left), right.v));
	#else
		return U8x16(
			left + right.emulated[0],
			left + right.emulated[1],
			left + right.emulated[2],
			left + right.emulated[3],
			left + right.emulated[4],
			left + right.emulated[5],
			left + right.emulated[6],
			left + right.emulated[7],
			left + right.emulated[8],
			left + right.emulated[9],
			left + right.emulated[10],
			left + right.emulated[11],
			left + right.emulated[12],
			left + right.emulated[13],
			left + right.emulated[14],
			left + right.emulated[15]
		);
	#endif
}
inline U8x16 operator+(const U8x16& left, uint8_t right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(ADD_U8_SIMD(left.v, LOAD_SCALAR_U8_SIMD(right)));
	#else
		return U8x16(
			left.emulated[0] + right,
			left.emulated[1] + right,
			left.emulated[2] + right,
			left.emulated[3] + right,
			left.emulated[4] + right,
			left.emulated[5] + right,
			left.emulated[6] + right,
			left.emulated[7] + right,
			left.emulated[8] + right,
			left.emulated[9] + right,
			left.emulated[10] + right,
			left.emulated[11] + right,
			left.emulated[12] + right,
			left.emulated[13] + right,
			left.emulated[14] + right,
			left.emulated[15] + right
		);
	#endif
}
inline U8x16 operator-(const U8x16& left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(SUB_U8_SIMD(left.v, right.v));
	#else
		return U8x16(
			left.emulated[0] - right.emulated[0],
			left.emulated[1] - right.emulated[1],
			left.emulated[2] - right.emulated[2],
			left.emulated[3] - right.emulated[3],
			left.emulated[4] - right.emulated[4],
			left.emulated[5] - right.emulated[5],
			left.emulated[6] - right.emulated[6],
			left.emulated[7] - right.emulated[7],
			left.emulated[8] - right.emulated[8],
			left.emulated[9] - right.emulated[9],
			left.emulated[10] - right.emulated[10],
			left.emulated[11] - right.emulated[11],
			left.emulated[12] - right.emulated[12],
			left.emulated[13] - right.emulated[13],
			left.emulated[14] - right.emulated[14],
			left.emulated[15] - right.emulated[15]
		);
	#endif
}
inline U8x16 operator-(uint8_t left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(SUB_U8_SIMD(LOAD_SCALAR_U8_SIMD(left), right.v));
	#else
		return U8x16(
			left - right.emulated[0],
			left - right.emulated[1],
			left - right.emulated[2],
			left - right.emulated[3],
			left - right.emulated[4],
			left - right.emulated[5],
			left - right.emulated[6],
			left - right.emulated[7],
			left - right.emulated[8],
			left - right.emulated[9],
			left - right.emulated[10],
			left - right.emulated[11],
			left - right.emulated[12],
			left - right.emulated[13],
			left - right.emulated[14],
			left - right.emulated[15]
		);
	#endif
}
inline U8x16 operator-(const U8x16& left, uint8_t right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(SUB_U8_SIMD(left.v, LOAD_SCALAR_U8_SIMD(right)));
	#else
		return U8x16(
			left.emulated[0] - right,
			left.emulated[1] - right,
			left.emulated[2] - right,
			left.emulated[3] - right,
			left.emulated[4] - right,
			left.emulated[5] - right,
			left.emulated[6] - right,
			left.emulated[7] - right,
			left.emulated[8] - right,
			left.emulated[9] - right,
			left.emulated[10] - right,
			left.emulated[11] - right,
			left.emulated[12] - right,
			left.emulated[13] - right,
			left.emulated[14] - right,
			left.emulated[15] - right
		);
	#endif
}
inline uint8_t saturateToU8(uint32_t x) {
	// No need to check lower bound for unsigned input
	return x > 255 ? 255 : x;
}
inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
	#ifdef USE_BASIC_SIMD
		return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
	#else
		return U8x16(
			saturateToU8((uint32_t)left.emulated[0] + (uint32_t)right.emulated[0]),
			saturateToU8((uint32_t)left.emulated[1] + (uint32_t)right.emulated[1]),
			saturateToU8((uint32_t)left.emulated[2] + (uint32_t)right.emulated[2]),
			saturateToU8((uint32_t)left.emulated[3] + (uint32_t)right.emulated[3]),
			saturateToU8((uint32_t)left.emulated[4] + (uint32_t)right.emulated[4]),
			saturateToU8((uint32_t)left.emulated[5] + (uint32_t)right.emulated[5]),
			saturateToU8((uint32_t)left.emulated[6] + (uint32_t)right.emulated[6]),
			saturateToU8((uint32_t)left.emulated[7] + (uint32_t)right.emulated[7]),
			saturateToU8((uint32_t)left.emulated[8] + (uint32_t)right.emulated[8]),
			saturateToU8((uint32_t)left.emulated[9] + (uint32_t)right.emulated[9]),
			saturateToU8((uint32_t)left.emulated[10] + (uint32_t)right.emulated[10]),
			saturateToU8((uint32_t)left.emulated[11] + (uint32_t)right.emulated[11]),
			saturateToU8((uint32_t)left.emulated[12] + (uint32_t)right.emulated[12]),
			saturateToU8((uint32_t)left.emulated[13] + (uint32_t)right.emulated[13]),
			saturateToU8((uint32_t)left.emulated[14] + (uint32_t)right.emulated[14]),
			saturateToU8((uint32_t)left.emulated[15] + (uint32_t)right.emulated[15])
		);
	#endif
}
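// Illustrative comparison (not part of the header's API): operator+ performs modular addition while saturatedAddition
// clamps, assuming ADD_U8_SIMD maps to a wrapping addition such as _mm_add_epi8 or vaddq_u8.
//   U8x16 a(uint8_t(200));
//   U8x16 b(uint8_t(100));
//   U8x16 wrapped = a + b;                    // every lane becomes 44 (300 mod 256)
//   U8x16 clamped = saturatedAddition(a, b);  // every lane becomes 255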
// TODO: Use overloading to only name the target type
inline I32x4 truncateToI32(const F32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return I32x4(F32_TO_I32_SIMD(vector.v));
	#else
		return I32x4((int32_t)vector.emulated[0], (int32_t)vector.emulated[1], (int32_t)vector.emulated[2], (int32_t)vector.emulated[3]);
	#endif
}
inline U32x4 truncateToU32(const F32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(F32_TO_U32_SIMD(vector.v));
	#else
		return U32x4((uint32_t)vector.emulated[0], (uint32_t)vector.emulated[1], (uint32_t)vector.emulated[2], (uint32_t)vector.emulated[3]);
	#endif
}
inline F32x4 floatFromI32(const I32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return F32x4(I32_TO_F32_SIMD(vector.v));
	#else
		return F32x4((float)vector.emulated[0], (float)vector.emulated[1], (float)vector.emulated[2], (float)vector.emulated[3]);
	#endif
}
inline F32x4 floatFromU32(const U32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return F32x4(U32_TO_F32_SIMD(vector.v));
	#else
		return F32x4((float)vector.emulated[0], (float)vector.emulated[1], (float)vector.emulated[2], (float)vector.emulated[3]);
	#endif
}
inline I32x4 I32FromU32(const U32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
	#else
		return I32x4((int32_t)vector.emulated[0], (int32_t)vector.emulated[1], (int32_t)vector.emulated[2], (int32_t)vector.emulated[3]);
	#endif
}
inline U32x4 U32FromI32(const I32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
	#else
		return U32x4((uint32_t)vector.emulated[0], (uint32_t)vector.emulated[1], (uint32_t)vector.emulated[2], (uint32_t)vector.emulated[3]);
	#endif
}
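// Note (informal clarification): I32FromU32 and U32FromI32 reinterpret the 32-bit pattern of each lane rather than
// clamping, so a lane holding -1 becomes 4294967295 and vice versa, whereas floatFromI32 and floatFromU32 perform a
// numeric conversion to float.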
// Warning! Behaviour depends on endianness.
inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
	#ifdef USE_BASIC_SIMD
		return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
	#else
		const uint8_t *source = (const uint8_t*)vector.emulated;
		return U8x16(
			source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
			source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
		);
	#endif
}
// Warning! Behaviour depends on endianness.
inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
	#else
		const uint32_t *source = (const uint32_t*)vector.emulated;
		return U32x4(source[0], source[1], source[2], source[3]);
	#endif
}
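// Illustrative example of the endianness dependency (not a guarantee of this header): on a little-endian target,
// a U32x4 lane holding 0x04030201 reinterprets to the bytes 1, 2, 3, 4 in the corresponding part of the U8x16,
// while a big-endian target would produce 4, 3, 2, 1. Only use these reinterpret casts on data with a known byte order.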
// Unpacking to larger integers
inline U32x4 lowerToU32(const U16x8& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
	#else
		return U32x4(vector.emulated[0], vector.emulated[1], vector.emulated[2], vector.emulated[3]);
	#endif
}
inline U32x4 higherToU32(const U16x8& vector) {
	#ifdef USE_BASIC_SIMD
		return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
	#else
		return U32x4(vector.emulated[4], vector.emulated[5], vector.emulated[6], vector.emulated[7]);
	#endif
}
inline U16x8 lowerToU16(const U8x16& vector) {
	#ifdef USE_BASIC_SIMD
		return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
	#else
		return U16x8(
			vector.emulated[0], vector.emulated[1], vector.emulated[2], vector.emulated[3],
			vector.emulated[4], vector.emulated[5], vector.emulated[6], vector.emulated[7]
		);
	#endif
}
inline U16x8 higherToU16(const U8x16& vector) {
	#ifdef USE_BASIC_SIMD
		return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
	#else
		return U16x8(
			vector.emulated[8], vector.emulated[9], vector.emulated[10], vector.emulated[11],
			vector.emulated[12], vector.emulated[13], vector.emulated[14], vector.emulated[15]
		);
	#endif
}
// Saturated packing
inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
	#ifdef USE_BASIC_SIMD
		return U8x16(PACK_SAT_U16_TO_U8(lower.v, upper.v));
	#else
		return U8x16(
			saturateToU8(lower.emulated[0]),
			saturateToU8(lower.emulated[1]),
			saturateToU8(lower.emulated[2]),
			saturateToU8(lower.emulated[3]),
			saturateToU8(lower.emulated[4]),
			saturateToU8(lower.emulated[5]),
			saturateToU8(lower.emulated[6]),
			saturateToU8(lower.emulated[7]),
			saturateToU8(upper.emulated[0]),
			saturateToU8(upper.emulated[1]),
			saturateToU8(upper.emulated[2]),
			saturateToU8(upper.emulated[3]),
			saturateToU8(upper.emulated[4]),
			saturateToU8(upper.emulated[5]),
			saturateToU8(upper.emulated[6]),
			saturateToU8(upper.emulated[7])
		);
	#endif
}
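// Illustrative round trip (not part of the header's API): widen to 16 bits for headroom, compute, then pack back
// with saturation, using only functions defined in this file.
//   U8x16 bright(const U8x16& pixels) {
//       U16x8 low = lowerToU16(pixels);          // lanes 0..7 widened to 16 bits
//       U16x8 high = higherToU16(pixels);        // lanes 8..15 widened to 16 bits
//       low = low * uint16_t(2);                 // doubling may exceed 255, but fits in 16 bits
//       high = high * uint16_t(2);
//       return saturateToU8(low, high);          // clamp each 16-bit lane back into 0..255
//   }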
#endif