// zlib open source license
//
// Copyright (c) 2017 to 2022 David Forsgren Piuva
//
// This software is provided 'as-is', without any express or implied
// warranty. In no event will the authors be held liable for any damages
// arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it
// freely, subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented; you must not
// claim that you wrote the original software. If you use this software
// in a product, an acknowledgment in the product documentation would be
// appreciated but is not required.
//
// 2. Altered source versions must be plainly marked as such, and must not be
// misrepresented as being the original software.
//
// 3. This notice may not be removed or altered from any source
// distribution.

// Hardware abstraction layer for portable SIMD math.
// Covers a small intersection of SSE2 and NEON in order to reduce the number
// of bugs from multiple implementations when nothing advanced is required.
#ifndef DFPSR_SIMD
#define DFPSR_SIMD
#include <stdint.h>
#include <cassert>
#include <cmath> // For sqrt and fabs used by the scalar fallbacks and comparisons
#include "SafePointer.h"
#include "../math/FVector.h"
#include "../math/IVector.h"
#include "../math/UVector.h"
#define ALIGN16 __attribute__((aligned(16)))
// To allow turning off SIMD intrinsics for testing
#ifdef __SSE2__
// Comment out this line to test without SSE2
#define USE_SSE2
#elif __ARM_NEON
// Comment out this line to test without NEON
#define USE_NEON 1 // Needs a value, because it is tested with #elif expressions below
#endif
// Everything declared in here handles things specific for SSE.
// Direct use of the macros will not provide portability to all hardware.
#ifdef USE_SSE2
#define USE_BASIC_SIMD
#define USE_DIRECT_SIMD_MEMORY_ACCESS
#include <emmintrin.h> // SSE2
#ifdef __SSSE3__
#include <tmmintrin.h> // SSSE3
// Comment out this line to test without SSSE3
#define USE_SSSE3
#endif
#ifdef __AVX2__
#include <immintrin.h> // AVX2
// Comment out this line to test without AVX2
#define USE_AVX2
#endif
// Vector types
#define SIMD_F32x4 __m128
#define SIMD_U8x16 __m128i
#define SIMD_U16x8 __m128i
#define SIMD_U32x4 __m128i
#define SIMD_I32x4 __m128i
// Vector uploads in address order
#define LOAD_VECTOR_F32_SIMD(A, B, C, D) _mm_set_ps(D, C, B, A)
#define LOAD_SCALAR_F32_SIMD(A) _mm_set1_ps(A)
#define LOAD_VECTOR_U8_SIMD(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) _mm_set_epi8(P, O, N, M, L, K, J, I, H, G, F, E, D, C, B, A)
#define LOAD_SCALAR_U8_SIMD(A) _mm_set1_epi8(A)
#define LOAD_VECTOR_U16_SIMD(A, B, C, D, E, F, G, H) _mm_set_epi16(H, G, F, E, D, C, B, A)
#define LOAD_SCALAR_U16_SIMD(A) _mm_set1_epi16(A)
#define LOAD_VECTOR_U32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
#define LOAD_SCALAR_U32_SIMD(A) _mm_set1_epi32(A)
#define LOAD_VECTOR_I32_SIMD(A, B, C, D) _mm_set_epi32(D, C, B, A)
#define LOAD_SCALAR_I32_SIMD(A) _mm_set1_epi32(A)
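// Example (illustrative, not part of the original file): because the LOAD_VECTOR macros take
// their arguments in address order, LOAD_VECTOR_U32_SIMD(1, 2, 3, 4) stores 1 at the lowest
// address, which is why the arguments are passed to _mm_set_epi32 in reverse above.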
// Conversions
#define F32_TO_I32_SIMD(A) _mm_cvttps_epi32(A)
#define F32_TO_U32_SIMD(A) _mm_cvttps_epi32(A)
#define I32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
#define U32_TO_F32_SIMD(A) _mm_cvtepi32_ps(A)
// Unpacking conversions
#define U8_LOW_TO_U16_SIMD(A) _mm_unpacklo_epi8(A, _mm_set1_epi8(0))
#define U8_HIGH_TO_U16_SIMD(A) _mm_unpackhi_epi8(A, _mm_set1_epi8(0))
#define U16_LOW_TO_U32_SIMD(A) _mm_unpacklo_epi16(A, _mm_set1_epi16(0))
#define U16_HIGH_TO_U32_SIMD(A) _mm_unpackhi_epi16(A, _mm_set1_epi16(0))
// Saturated packing
// Credit: Using ideas from Victoria Zhislina's NEON_2_SSE.h header from the Intel corporation, but not trying to emulate NEON
inline SIMD_U8x16 PACK_SAT_U16_TO_U8(const SIMD_U16x8& a, const SIMD_U16x8& b) {
    // _mm_packus_epi16 treats its inputs as signed 16-bit values, so lanes at or above
    // 0x8000 must first be clamped to 0x7FFF to keep them from being read as negative.
    SIMD_U16x8 mask, a2, b2;
    mask = _mm_set1_epi16(0x7fff);
    a2 = _mm_and_si128(a, mask);
    // If clearing the sign bit changed the value, the lane was >= 0x8000, so force it to 0x7FFF.
    a2 = _mm_or_si128(a2, _mm_and_si128(_mm_cmpgt_epi16(a2, a), mask));
    b2 = _mm_and_si128(b, mask);
    b2 = _mm_or_si128(b2, _mm_and_si128(_mm_cmpgt_epi16(b2, b), mask));
    return _mm_packus_epi16(a2, b2);
}
// Reinterpret casting
#define REINTERPRET_U32_TO_U8_SIMD(A) (A)
#define REINTERPRET_U32_TO_U16_SIMD(A) (A)
#define REINTERPRET_U8_TO_U32_SIMD(A) (A)
#define REINTERPRET_U16_TO_U32_SIMD(A) (A)
#define REINTERPRET_U32_TO_I32_SIMD(A) (A)
#define REINTERPRET_I32_TO_U32_SIMD(A) (A)
// Vector float operations returning SIMD_F32x4
#define ADD_F32_SIMD(A, B) _mm_add_ps(A, B)
#define SUB_F32_SIMD(A, B) _mm_sub_ps(A, B)
#define MUL_F32_SIMD(A, B) _mm_mul_ps(A, B)
// Vector integer operations returning SIMD_I32x4
#define ADD_I32_SIMD(A, B) _mm_add_epi32(A, B)
#define SUB_I32_SIMD(A, B) _mm_sub_epi32(A, B)
// 32-bit integer multiplications are not available on SSE2.
// Vector integer operations returning SIMD_U32x4
#define ADD_U32_SIMD(A, B) _mm_add_epi32(A, B)
#define SUB_U32_SIMD(A, B) _mm_sub_epi32(A, B)
// 32-bit integer multiplications are not available on SSE2.
// Vector integer operations returning SIMD_U16x8
#define ADD_U16_SIMD(A, B) _mm_add_epi16(A, B)
#define SUB_U16_SIMD(A, B) _mm_sub_epi16(A, B)
#define MUL_U16_SIMD(A, B) _mm_mullo_epi16(A, B)
// Vector integer operations returning SIMD_U8x16
#define ADD_U8_SIMD(A, B) _mm_add_epi8(A, B)
#define ADD_SAT_U8_SIMD(A, B) _mm_adds_epu8(A, B) // Saturated addition
#define SUB_U8_SIMD(A, B) _mm_sub_epi8(A, B)
#define SUB_SAT_U8_SIMD(A, B) _mm_subs_epu8(A, B) // Saturated subtraction
// No 8-bit multiplications
// Statistics
#define MIN_F32_SIMD(A, B) _mm_min_ps(A, B)
#define MAX_F32_SIMD(A, B) _mm_max_ps(A, B)
// Bitwise
#define BITWISE_AND_U32_SIMD(A, B) _mm_and_si128(A, B)
#define BITWISE_OR_U32_SIMD(A, B) _mm_or_si128(A, B)
#define BITWISE_XOR_U32_SIMD(A, B) _mm_xor_si128(A, B)
#endif
// Everything declared in here handles things specific for NEON.
// Direct use of the macros will not provide portability to all hardware.
#ifdef USE_NEON
#define USE_BASIC_SIMD
#include <arm_neon.h> // NEON
// Vector types
#define SIMD_F32x4 float32x4_t
#define SIMD_U8x16 uint8x16_t
#define SIMD_U16x8 uint16x8_t
#define SIMD_U32x4 uint32x4_t
#define SIMD_I32x4 int32x4_t
// Vector uploads in address order
inline SIMD_F32x4 LOAD_VECTOR_F32_SIMD(float a, float b, float c, float d) {
    float data[4] ALIGN16 = {a, b, c, d};
    return vld1q_f32(data);
}
inline SIMD_F32x4 LOAD_SCALAR_F32_SIMD(float a) {
    return vdupq_n_f32(a);
}
inline SIMD_U8x16 LOAD_VECTOR_U8_SIMD(uint8_t a, uint8_t b, uint8_t c, uint8_t d, uint8_t e, uint8_t f, uint8_t g, uint8_t h,
                                      uint8_t i, uint8_t j, uint8_t k, uint8_t l, uint8_t m, uint8_t n, uint8_t o, uint8_t p) {
    uint8_t data[16] ALIGN16 = {a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p};
    return vld1q_u8(data);
}
inline SIMD_U8x16 LOAD_SCALAR_U8_SIMD(uint8_t a) {
    return vdupq_n_u8(a);
}
inline SIMD_U16x8 LOAD_VECTOR_U16_SIMD(uint16_t a, uint16_t b, uint16_t c, uint16_t d, uint16_t e, uint16_t f, uint16_t g, uint16_t h) {
    uint16_t data[8] ALIGN16 = {a, b, c, d, e, f, g, h};
    return vld1q_u16(data);
}
inline SIMD_U16x8 LOAD_SCALAR_U16_SIMD(uint16_t a) {
    return vdupq_n_u16(a);
}
inline SIMD_U32x4 LOAD_VECTOR_U32_SIMD(uint32_t a, uint32_t b, uint32_t c, uint32_t d) {
    uint32_t data[4] ALIGN16 = {a, b, c, d};
    return vld1q_u32(data);
}
inline SIMD_U32x4 LOAD_SCALAR_U32_SIMD(uint32_t a) {
    return vdupq_n_u32(a);
}
inline SIMD_I32x4 LOAD_VECTOR_I32_SIMD(int32_t a, int32_t b, int32_t c, int32_t d) {
    int32_t data[4] ALIGN16 = {a, b, c, d};
    return vld1q_s32(data);
}
inline SIMD_I32x4 LOAD_SCALAR_I32_SIMD(int32_t a) {
    return vdupq_n_s32(a);
}
// Conversions
#define F32_TO_I32_SIMD(A) vcvtq_s32_f32(A)
#define F32_TO_U32_SIMD(A) vcvtq_u32_f32(A)
#define I32_TO_F32_SIMD(A) vcvtq_f32_s32(A)
#define U32_TO_F32_SIMD(A) vcvtq_f32_u32(A)
// Unpacking conversions
#define U8_LOW_TO_U16_SIMD(A) vmovl_u8(vget_low_u8(A))
#define U8_HIGH_TO_U16_SIMD(A) vmovl_u8(vget_high_u8(A))
#define U16_LOW_TO_U32_SIMD(A) vmovl_u16(vget_low_u16(A))
#define U16_HIGH_TO_U32_SIMD(A) vmovl_u16(vget_high_u16(A))
// Saturated packing
#define PACK_SAT_U16_TO_U8(A, B) vcombine_u8(vqmovn_u16(A), vqmovn_u16(B))
// Reinterpret casting
#define REINTERPRET_U32_TO_U8_SIMD(A) vreinterpretq_u8_u32(A)
#define REINTERPRET_U32_TO_U16_SIMD(A) vreinterpretq_u16_u32(A)
#define REINTERPRET_U8_TO_U32_SIMD(A) vreinterpretq_u32_u8(A)
#define REINTERPRET_U16_TO_U32_SIMD(A) vreinterpretq_u32_u16(A)
#define REINTERPRET_U32_TO_I32_SIMD(A) vreinterpretq_s32_u32(A)
#define REINTERPRET_I32_TO_U32_SIMD(A) vreinterpretq_u32_s32(A)
// Vector float operations returning SIMD_F32x4
#define ADD_F32_SIMD(A, B) vaddq_f32(A, B)
#define SUB_F32_SIMD(A, B) vsubq_f32(A, B)
#define MUL_F32_SIMD(A, B) vmulq_f32(A, B)
// Vector integer operations returning SIMD_I32x4
#define ADD_I32_SIMD(A, B) vaddq_s32(A, B)
#define SUB_I32_SIMD(A, B) vsubq_s32(A, B)
#define MUL_I32_NEON(A, B) vmulq_s32(A, B)
// Vector integer operations returning SIMD_U32x4
#define ADD_U32_SIMD(A, B) vaddq_u32(A, B)
#define SUB_U32_SIMD(A, B) vsubq_u32(A, B)
#define MUL_U32_NEON(A, B) vmulq_u32(A, B)
// Vector integer operations returning SIMD_U16x8
#define ADD_U16_SIMD(A, B) vaddq_u16(A, B)
#define SUB_U16_SIMD(A, B) vsubq_u16(A, B)
#define MUL_U16_SIMD(A, B) vmulq_u16(A, B)
// Vector integer operations returning SIMD_U8x16
#define ADD_U8_SIMD(A, B) vaddq_u8(A, B)
#define ADD_SAT_U8_SIMD(A, B) vqaddq_u8(A, B) // Saturated addition
#define SUB_U8_SIMD(A, B) vsubq_u8(A, B)
#define SUB_SAT_U8_SIMD(A, B) vqsubq_u8(A, B) // Saturated subtraction
// No 8-bit multiplications
// Statistics
#define MIN_F32_SIMD(A, B) vminq_f32(A, B)
#define MAX_F32_SIMD(A, B) vmaxq_f32(A, B)
// Bitwise
#define BITWISE_AND_U32_SIMD(A, B) vandq_u32(A, B)
#define BITWISE_OR_U32_SIMD(A, B) vorrq_u32(A, B)
#define BITWISE_XOR_U32_SIMD(A, B) veorq_u32(A, B)
#endif
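// Example (illustrative, not part of the original file): the backend macros above can be used
// directly when both SSE2 and NEON define the same name, but such code must be guarded, because
// scalar builds define neither the macros nor the SIMD_* types. The helper name is hypothetical.
#ifdef USE_BASIC_SIMD
inline SIMD_U8x16 example_addSaturated_U8(const SIMD_U8x16 &a, const SIMD_U8x16 &b) {
    // Per-byte addition that clamps to 255 instead of wrapping around.
    return ADD_SAT_U8_SIMD(a, b);
}
#endif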
/*
	The vector types (F32x4, I32x4, U32x4, U16x8) below are supposed to be portable across different CPU architectures.
	When this abstraction layer is mixed with handwritten SIMD intrinsics:
		Use "USE_SSE2" instead of "__SSE2__"
		Use "USE_AVX2" instead of "__AVX2__"
		Use "USE_NEON" instead of "__ARM_NEON"
	Portability exceptions:
	* The "v" variable is the native backend, which is only defined when SIMD is supported by hardware.
		Only use when USE_BASIC_SIMD is defined.
		Will not work on scalar emulation.
	* The "shared_memory" array is only defined for targets with direct access to SIMD registers. (SSE)
		Only use when USE_DIRECT_SIMD_MEMORY_ACCESS is defined.
		Will not work on NEON or scalar emulation.
	* The "emulated" array is only defined when SIMD is turned off.
		Cannot be used when USE_BASIC_SIMD is defined.
		Will not work when either SSE or NEON is enabled.
*/
union F32x4 {
#ifdef USE_BASIC_SIMD
public:
#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
    // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
    // Direct access cannot be done on NEON!
    float shared_memory[4];
#endif
    // The SIMD vector of undefined type
    // Not accessible while emulating!
    SIMD_F32x4 v;
    // Construct a portable vector from a native SIMD vector
    explicit F32x4(const SIMD_F32x4& v) : v(v) {}
    // Construct a portable vector from a set of scalars
    F32x4(float a1, float a2, float a3, float a4) : v(LOAD_VECTOR_F32_SIMD(a1, a2, a3, a4)) {}
    // Construct a portable vector from a single duplicated scalar
    explicit F32x4(float scalar) : v(LOAD_SCALAR_F32_SIMD(scalar)) {}
#else
public:
    // Emulate a SIMD vector as an array of scalars without hardware support
    // Only accessible while emulating!
    float emulated[4];
    // Construct a portable vector from a set of scalars
    F32x4(float a1, float a2, float a3, float a4) {
        this->emulated[0] = a1;
        this->emulated[1] = a2;
        this->emulated[2] = a3;
        this->emulated[3] = a4;
    }
    // Construct a portable vector from a single duplicated scalar
    explicit F32x4(float scalar) {
        this->emulated[0] = scalar;
        this->emulated[1] = scalar;
        this->emulated[2] = scalar;
        this->emulated[3] = scalar;
    }
#endif
    // Construct a portable SIMD vector from a pointer to aligned data
    // data must be aligned with at least 8 bytes, but preferably 16 bytes
    static inline F32x4 readAlignedUnsafe(const float* data) {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
        return F32x4(_mm_load_ps(data));
#elif USE_NEON
        return F32x4(vld1q_f32(data));
#endif
#else
        return F32x4(data[0], data[1], data[2], data[3]);
#endif
    }
    // Write to aligned memory from the existing vector
    // data must be aligned with at least 8 bytes, but preferably 16 bytes
    inline void writeAlignedUnsafe(float* data) const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
        _mm_store_ps(data, this->v);
#elif USE_NEON
        vst1q_f32(data, this->v);
#endif
#else
        data[0] = this->emulated[0];
        data[1] = this->emulated[1];
        data[2] = this->emulated[2];
        data[3] = this->emulated[3];
#endif
    }
#ifdef DFPSR_GEOMETRY_FVECTOR
    dsr::FVector4D get() const {
        float data[4] ALIGN16;
        this->writeAlignedUnsafe(data);
        return dsr::FVector4D(data[0], data[1], data[2], data[3]);
    }
#endif
    // Bound and alignment checked reading
    static inline F32x4 readAligned(const dsr::SafePointer<float> data, const char* methodName) {
        const float* pointer = data.getUnsafe();
        assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
        data.assertInside(methodName, pointer, 16);
#endif
        return F32x4::readAlignedUnsafe(pointer);
    }
    // Bound and alignment checked writing
    inline void writeAligned(dsr::SafePointer<float> data, const char* methodName) const {
        float* pointer = data.getUnsafe();
        assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
        data.assertInside(methodName, pointer, 16);
#endif
        this->writeAlignedUnsafe(pointer);
    }
    // 1 / x
    // Useful for multiple divisions with the same denominator
    // Useless if the denominator is a constant
    F32x4 reciprocal() const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
        // Approximate
        SIMD_F32x4 lowQ = _mm_rcp_ps(this->v);
        // Refine
        return F32x4(SUB_F32_SIMD(ADD_F32_SIMD(lowQ, lowQ), MUL_F32_SIMD(this->v, MUL_F32_SIMD(lowQ, lowQ))));
#elif USE_NEON
        // Approximate
        SIMD_F32x4 result = vrecpeq_f32(this->v);
        // Refine
        result = MUL_F32_SIMD(vrecpsq_f32(this->v, result), result);
        return F32x4(MUL_F32_SIMD(vrecpsq_f32(this->v, result), result));
#else
        assert(false);
        return F32x4(0);
#endif
#else
        return F32x4(1.0f / this->emulated[0], 1.0f / this->emulated[1], 1.0f / this->emulated[2], 1.0f / this->emulated[3]);
#endif
    }
    // 1 / sqrt(x)
    // Useful for normalizing vectors
    F32x4 reciprocalSquareRoot() const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
        // Approximate
        SIMD_F32x4 reciRoot = _mm_rsqrt_ps(this->v);
        // Refine with one Newton-Raphson step
        SIMD_F32x4 mul = MUL_F32_SIMD(MUL_F32_SIMD(this->v, reciRoot), reciRoot);
        reciRoot = MUL_F32_SIMD(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(0.5f), reciRoot), SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(3.0f), mul));
        return F32x4(reciRoot);
#elif USE_NEON
        // Approximate
        SIMD_F32x4 reciRoot = vrsqrteq_f32(this->v);
        // Refine
        reciRoot = MUL_F32_SIMD(vrsqrtsq_f32(MUL_F32_SIMD(this->v, reciRoot), reciRoot), reciRoot);
        return F32x4(reciRoot);
#else
        assert(false);
        return F32x4(0);
#endif
#else
        return F32x4(1.0f / sqrt(this->emulated[0]), 1.0f / sqrt(this->emulated[1]), 1.0f / sqrt(this->emulated[2]), 1.0f / sqrt(this->emulated[3]));
#endif
    }
    // sqrt(x)
    // Useful for getting lengths of vectors
    F32x4 squareRoot() const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
        SIMD_F32x4 half = LOAD_SCALAR_F32_SIMD(0.5f);
        // Approximate
        SIMD_F32x4 root = _mm_sqrt_ps(this->v);
        // Refine
        root = _mm_mul_ps(_mm_add_ps(root, _mm_div_ps(this->v, root)), half);
        return F32x4(root);
#elif USE_NEON
        return F32x4(MUL_F32_SIMD(this->v, this->reciprocalSquareRoot().v));
#else
        assert(false);
        return F32x4(0);
#endif
#else
        return F32x4(sqrt(this->emulated[0]), sqrt(this->emulated[1]), sqrt(this->emulated[2]), sqrt(this->emulated[3]));
#endif
    }
    F32x4 clamp(float min, float max) const {
#ifdef USE_BASIC_SIMD
        return F32x4(MIN_F32_SIMD(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(min)), LOAD_SCALAR_F32_SIMD(max)));
#else
        float val0 = this->emulated[0];
        float val1 = this->emulated[1];
        float val2 = this->emulated[2];
        float val3 = this->emulated[3];
        if (min > val0) { val0 = min; }
        if (max < val0) { val0 = max; }
        if (min > val1) { val1 = min; }
        if (max < val1) { val1 = max; }
        if (min > val2) { val2 = min; }
        if (max < val2) { val2 = max; }
        if (min > val3) { val3 = min; }
        if (max < val3) { val3 = max; }
        return F32x4(val0, val1, val2, val3);
#endif
    }
    F32x4 clampLower(float min) const {
#ifdef USE_BASIC_SIMD
        return F32x4(MAX_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(min)));
#else
        float val0 = this->emulated[0];
        float val1 = this->emulated[1];
        float val2 = this->emulated[2];
        float val3 = this->emulated[3];
        if (min > val0) { val0 = min; }
        if (min > val1) { val1 = min; }
        if (min > val2) { val2 = min; }
        if (min > val3) { val3 = min; }
        return F32x4(val0, val1, val2, val3);
#endif
    }
    F32x4 clampUpper(float max) const {
#ifdef USE_BASIC_SIMD
        return F32x4(MIN_F32_SIMD(this->v, LOAD_SCALAR_F32_SIMD(max)));
#else
        float val0 = this->emulated[0];
        float val1 = this->emulated[1];
        float val2 = this->emulated[2];
        float val3 = this->emulated[3];
        if (max < val0) { val0 = max; }
        if (max < val1) { val1 = max; }
        if (max < val2) { val2 = max; }
        if (max < val3) { val3 = max; }
        return F32x4(val0, val1, val2, val3);
#endif
    }
};
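// Example (illustrative, not part of the original file): mixing a hand-written intrinsic with
// the portable F32x4 type, gated on USE_SSE2 / USE_NEON as the portability comment above
// recommends, with the "emulated" array as the scalar fallback. The helper name is hypothetical
// and not part of the library's interface.
inline F32x4 example_negate(const F32x4 &value) {
#ifdef USE_SSE2
    // Flip the sign bit of each lane through the native "v" member.
    return F32x4(_mm_xor_ps(value.v, _mm_set1_ps(-0.0f)));
#elif defined(USE_NEON)
    return F32x4(vnegq_f32(value.v));
#else
    return F32x4(-value.emulated[0], -value.emulated[1], -value.emulated[2], -value.emulated[3]);
#endif
}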
inline dsr::String& string_toStreamIndented(dsr::String& target, const F32x4& source, const dsr::ReadableString& indentation) {
    string_append(target, indentation, source.get());
    return target;
}
inline bool operator==(const F32x4& left, const F32x4& right) {
    float a[4] ALIGN16;
    float b[4] ALIGN16;
    left.writeAlignedUnsafe(a);
    right.writeAlignedUnsafe(b);
    return fabs(a[0] - b[0]) < 0.0001f && fabs(a[1] - b[1]) < 0.0001f && fabs(a[2] - b[2]) < 0.0001f && fabs(a[3] - b[3]) < 0.0001f;
}
inline bool operator!=(const F32x4& left, const F32x4& right) {
    return !(left == right);
}
inline F32x4 operator+(const F32x4& left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
    return F32x4(ADD_F32_SIMD(left.v, right.v));
#else
    return F32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
#endif
}
inline F32x4 operator+(float left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
    return F32x4(ADD_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
#else
    return F32x4(left + right.emulated[0], left + right.emulated[1], left + right.emulated[2], left + right.emulated[3]);
#endif
}
inline F32x4 operator+(const F32x4& left, float right) {
#ifdef USE_BASIC_SIMD
    return F32x4(ADD_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
#else
    return F32x4(left.emulated[0] + right, left.emulated[1] + right, left.emulated[2] + right, left.emulated[3] + right);
#endif
}
inline F32x4 operator-(const F32x4& left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
    return F32x4(SUB_F32_SIMD(left.v, right.v));
#else
    return F32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
#endif
}
inline F32x4 operator-(float left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
    return F32x4(SUB_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
#else
    return F32x4(left - right.emulated[0], left - right.emulated[1], left - right.emulated[2], left - right.emulated[3]);
#endif
}
inline F32x4 operator-(const F32x4& left, float right) {
#ifdef USE_BASIC_SIMD
    return F32x4(SUB_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
#else
    return F32x4(left.emulated[0] - right, left.emulated[1] - right, left.emulated[2] - right, left.emulated[3] - right);
#endif
}
inline F32x4 operator*(const F32x4& left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
    return F32x4(MUL_F32_SIMD(left.v, right.v));
#else
    return F32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
#endif
}
inline F32x4 operator*(float left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
    return F32x4(MUL_F32_SIMD(LOAD_SCALAR_F32_SIMD(left), right.v));
#else
    return F32x4(left * right.emulated[0], left * right.emulated[1], left * right.emulated[2], left * right.emulated[3]);
#endif
}
inline F32x4 operator*(const F32x4& left, float right) {
#ifdef USE_BASIC_SIMD
    return F32x4(MUL_F32_SIMD(left.v, LOAD_SCALAR_F32_SIMD(right)));
#else
    return F32x4(left.emulated[0] * right, left.emulated[1] * right, left.emulated[2] * right, left.emulated[3] * right);
#endif
}
inline F32x4 min(const F32x4& left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
    return F32x4(MIN_F32_SIMD(left.v, right.v));
#else
    float v0 = left.emulated[0];
    float v1 = left.emulated[1];
    float v2 = left.emulated[2];
    float v3 = left.emulated[3];
    float r0 = right.emulated[0];
    float r1 = right.emulated[1];
    float r2 = right.emulated[2];
    float r3 = right.emulated[3];
    if (r0 < v0) { v0 = r0; }
    if (r1 < v1) { v1 = r1; }
    if (r2 < v2) { v2 = r2; }
    if (r3 < v3) { v3 = r3; }
    return F32x4(v0, v1, v2, v3);
#endif
}
inline F32x4 max(const F32x4& left, const F32x4& right) {
#ifdef USE_BASIC_SIMD
    return F32x4(MAX_F32_SIMD(left.v, right.v));
#else
    float v0 = left.emulated[0];
    float v1 = left.emulated[1];
    float v2 = left.emulated[2];
    float v3 = left.emulated[3];
    float r0 = right.emulated[0];
    float r1 = right.emulated[1];
    float r2 = right.emulated[2];
    float r3 = right.emulated[3];
    if (r0 > v0) { v0 = r0; }
    if (r1 > v1) { v1 = r1; }
    if (r2 > v2) { v2 = r2; }
    if (r3 > v3) { v3 = r3; }
    return F32x4(v0, v1, v2, v3);
#endif
}
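// Example (illustrative, not part of the original file): linear interpolation written only with
// the portable F32x4 operators above, so the same code runs on SSE2, NEON and the scalar
// fallback. The helper name is hypothetical.
inline F32x4 example_lerp(const F32x4 &from, const F32x4 &to, float ratio) {
    // from + (to - from) * ratio moves from "from" at ratio 0 to "to" at ratio 1.
    return from + (to - from) * ratio;
}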
union I32x4 {
#ifdef USE_BASIC_SIMD
public:
#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
    // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
    // Direct access cannot be done on NEON!
    int32_t shared_memory[4];
#endif
    // The SIMD vector of undefined type
    // Not accessible while emulating!
    SIMD_I32x4 v;
    // Construct a portable vector from a native SIMD vector
    explicit I32x4(const SIMD_I32x4& v) : v(v) {}
    // Construct a portable vector from a set of scalars
    I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) : v(LOAD_VECTOR_I32_SIMD(a1, a2, a3, a4)) {}
    // Construct a portable vector from a single duplicated scalar
    explicit I32x4(int32_t scalar) : v(LOAD_SCALAR_I32_SIMD(scalar)) {}
#else
public:
    // Emulate a SIMD vector as an array of scalars without hardware support
    // Only accessible while emulating!
    int32_t emulated[4];
    // Construct a portable vector from a set of scalars
    I32x4(int32_t a1, int32_t a2, int32_t a3, int32_t a4) {
        this->emulated[0] = a1;
        this->emulated[1] = a2;
        this->emulated[2] = a3;
        this->emulated[3] = a4;
    }
    // Construct a portable vector from a single duplicated scalar
    explicit I32x4(int32_t scalar) {
        this->emulated[0] = scalar;
        this->emulated[1] = scalar;
        this->emulated[2] = scalar;
        this->emulated[3] = scalar;
    }
#endif
    // Construct a portable SIMD vector from a pointer to aligned data
    // data must be aligned with at least 8 bytes, but preferably 16 bytes
    static inline I32x4 readAlignedUnsafe(const int32_t* data) {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
        return I32x4(_mm_load_si128((const __m128i*)data));
#elif USE_NEON
        return I32x4(vld1q_s32(data));
#endif
#else
        return I32x4(data[0], data[1], data[2], data[3]);
#endif
    }
    // Write to aligned memory from the existing vector
    // data must be aligned with at least 8 bytes, but preferably 16 bytes
    inline void writeAlignedUnsafe(int32_t* data) const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
        _mm_store_si128((__m128i*)data, this->v);
#elif USE_NEON
        vst1q_s32(data, this->v);
#endif
#else
        data[0] = this->emulated[0];
        data[1] = this->emulated[1];
        data[2] = this->emulated[2];
        data[3] = this->emulated[3];
#endif
    }
#ifdef DFPSR_GEOMETRY_IVECTOR
    dsr::IVector4D get() const {
        int32_t data[4] ALIGN16;
        this->writeAlignedUnsafe(data);
        return dsr::IVector4D(data[0], data[1], data[2], data[3]);
    }
#endif
    // Bound and alignment checked reading
    static inline I32x4 readAligned(const dsr::SafePointer<int32_t> data, const char* methodName) {
        const int32_t* pointer = data.getUnsafe();
        assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
        data.assertInside(methodName, pointer, 16);
#endif
        return I32x4::readAlignedUnsafe(pointer);
    }
    // Bound and alignment checked writing
    inline void writeAligned(dsr::SafePointer<int32_t> data, const char* methodName) const {
        int32_t* pointer = data.getUnsafe();
        assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
        data.assertInside(methodName, pointer, 16);
#endif
        this->writeAlignedUnsafe(pointer);
    }
};
inline dsr::String& string_toStreamIndented(dsr::String& target, const I32x4& source, const dsr::ReadableString& indentation) {
    string_append(target, indentation, source.get());
    return target;
}
inline bool operator==(const I32x4& left, const I32x4& right) {
    int32_t a[4] ALIGN16;
    int32_t b[4] ALIGN16;
    left.writeAlignedUnsafe(a);
    right.writeAlignedUnsafe(b);
    return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
}
inline bool operator!=(const I32x4& left, const I32x4& right) {
    return !(left == right);
}
inline I32x4 operator+(const I32x4& left, const I32x4& right) {
#ifdef USE_BASIC_SIMD
    return I32x4(ADD_I32_SIMD(left.v, right.v));
#else
    return I32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
#endif
}
inline I32x4 operator+(int32_t left, const I32x4& right) {
    return I32x4(left) + right;
}
inline I32x4 operator+(const I32x4& left, int32_t right) {
    return left + I32x4(right);
}
inline I32x4 operator-(const I32x4& left, const I32x4& right) {
#ifdef USE_BASIC_SIMD
    return I32x4(SUB_I32_SIMD(left.v, right.v));
#else
    return I32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
#endif
}
inline I32x4 operator-(int32_t left, const I32x4& right) {
    return I32x4(left) - right;
}
inline I32x4 operator-(const I32x4& left, int32_t right) {
    return left - I32x4(right);
}
inline I32x4 operator*(const I32x4& left, const I32x4& right) {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
    // Emulate a NEON instruction
    return I32x4(left.shared_memory[0] * right.shared_memory[0], left.shared_memory[1] * right.shared_memory[1], left.shared_memory[2] * right.shared_memory[2], left.shared_memory[3] * right.shared_memory[3]);
#elif USE_NEON
    return I32x4(MUL_I32_NEON(left.v, right.v));
#endif
#else
    return I32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
#endif
}
inline I32x4 operator*(int32_t left, const I32x4& right) {
    return I32x4(left) * right;
}
inline I32x4 operator*(const I32x4& left, int32_t right) {
    return left * I32x4(right);
}
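// Example (illustrative, not part of the original file): reading four 32-bit integers from
// 16-byte aligned memory, offsetting them, and writing them back with the portable I32x4
// API above. The helper name is hypothetical.
inline void example_offsetFour(int32_t* alignedData, int32_t offset) {
    I32x4 values = I32x4::readAlignedUnsafe(alignedData);
    // The scalar overload of operator+ broadcasts the offset to all four lanes.
    (values + offset).writeAlignedUnsafe(alignedData);
}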
union U32x4 {
#ifdef USE_BASIC_SIMD
public:
#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
    // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
    // Direct access cannot be done on NEON!
    uint32_t shared_memory[4];
#endif
    // The SIMD vector of undefined type
    // Not accessible while emulating!
    SIMD_U32x4 v;
    // Construct a portable vector from a native SIMD vector
    explicit U32x4(const SIMD_U32x4& v) : v(v) {}
    // Construct a portable vector from a set of scalars
    U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) : v(LOAD_VECTOR_U32_SIMD(a1, a2, a3, a4)) {}
    // Construct a portable vector from a single duplicated scalar
    explicit U32x4(uint32_t scalar) : v(LOAD_SCALAR_U32_SIMD(scalar)) {}
#else
public:
    // Emulate a SIMD vector as an array of scalars without hardware support
    // Only accessible while emulating!
    uint32_t emulated[4];
    // Construct a portable vector from a set of scalars
    U32x4(uint32_t a1, uint32_t a2, uint32_t a3, uint32_t a4) {
        this->emulated[0] = a1;
        this->emulated[1] = a2;
        this->emulated[2] = a3;
        this->emulated[3] = a4;
    }
    // Construct a portable vector from a single duplicated scalar
    explicit U32x4(uint32_t scalar) {
        this->emulated[0] = scalar;
        this->emulated[1] = scalar;
        this->emulated[2] = scalar;
        this->emulated[3] = scalar;
    }
#endif
    // Construct a portable SIMD vector from a pointer to aligned data
    // data must be aligned with at least 8 bytes, but preferably 16 bytes
    static inline U32x4 readAlignedUnsafe(const uint32_t* data) {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
        return U32x4(_mm_load_si128((const __m128i*)data));
#elif USE_NEON
        return U32x4(vld1q_u32(data));
#endif
#else
        return U32x4(data[0], data[1], data[2], data[3]);
#endif
    }
    // Write to aligned memory from the existing vector
    // data must be aligned with at least 8 bytes, but preferably 16 bytes
    inline void writeAlignedUnsafe(uint32_t* data) const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
        _mm_store_si128((__m128i*)data, this->v);
#elif USE_NEON
        vst1q_u32(data, this->v);
#endif
#else
        data[0] = this->emulated[0];
        data[1] = this->emulated[1];
        data[2] = this->emulated[2];
        data[3] = this->emulated[3];
#endif
    }
#ifdef DFPSR_GEOMETRY_UVECTOR
    dsr::UVector4D get() const {
        uint32_t data[4] ALIGN16;
        this->writeAlignedUnsafe(data);
        return dsr::UVector4D(data[0], data[1], data[2], data[3]);
    }
#endif
    // Bound and alignment checked reading
    static inline U32x4 readAligned(const dsr::SafePointer<uint32_t> data, const char* methodName) {
        const uint32_t* pointer = data.getUnsafe();
        assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
        data.assertInside(methodName, pointer, 16);
#endif
        return U32x4::readAlignedUnsafe(pointer);
    }
    // Bound and alignment checked writing
    inline void writeAligned(dsr::SafePointer<uint32_t> data, const char* methodName) const {
        uint32_t* pointer = data.getUnsafe();
        assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
        data.assertInside(methodName, pointer, 16);
#endif
        this->writeAlignedUnsafe(pointer);
    }
};
inline dsr::String& string_toStreamIndented(dsr::String& target, const U32x4& source, const dsr::ReadableString& indentation) {
    string_append(target, indentation, source.get());
    return target;
}
inline bool operator==(const U32x4& left, const U32x4& right) {
    uint32_t a[4] ALIGN16;
    uint32_t b[4] ALIGN16;
    left.writeAlignedUnsafe(a);
    right.writeAlignedUnsafe(b);
    return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3];
}
inline bool operator!=(const U32x4& left, const U32x4& right) {
    return !(left == right);
}
inline U32x4 operator+(const U32x4& left, const U32x4& right) {
#ifdef USE_BASIC_SIMD
    return U32x4(ADD_U32_SIMD(left.v, right.v));
#else
    return U32x4(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3]);
#endif
}
inline U32x4 operator+(uint32_t left, const U32x4& right) {
    return U32x4(left) + right;
}
inline U32x4 operator+(const U32x4& left, uint32_t right) {
    return left + U32x4(right);
}
inline U32x4 operator-(const U32x4& left, const U32x4& right) {
#ifdef USE_BASIC_SIMD
    return U32x4(SUB_U32_SIMD(left.v, right.v));
#else
    return U32x4(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3]);
#endif
}
inline U32x4 operator-(uint32_t left, const U32x4& right) {
    return U32x4(left) - right;
}
inline U32x4 operator-(const U32x4& left, uint32_t right) {
    return left - U32x4(right);
}
inline U32x4 operator*(const U32x4& left, const U32x4& right) {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
    // Emulate a NEON instruction on SSE2 registers
    return U32x4(left.shared_memory[0] * right.shared_memory[0], left.shared_memory[1] * right.shared_memory[1], left.shared_memory[2] * right.shared_memory[2], left.shared_memory[3] * right.shared_memory[3]);
#else // NEON
    return U32x4(MUL_U32_NEON(left.v, right.v));
#endif
#else
    return U32x4(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3]);
#endif
}
inline U32x4 operator*(uint32_t left, const U32x4& right) {
    return U32x4(left) * right;
}
inline U32x4 operator*(const U32x4& left, uint32_t right) {
    return left * U32x4(right);
}
inline U32x4 operator&(const U32x4& left, const U32x4& right) {
#ifdef USE_BASIC_SIMD
    return U32x4(BITWISE_AND_U32_SIMD(left.v, right.v));
#else
    return U32x4(left.emulated[0] & right.emulated[0], left.emulated[1] & right.emulated[1], left.emulated[2] & right.emulated[2], left.emulated[3] & right.emulated[3]);
#endif
}
inline U32x4 operator&(const U32x4& left, uint32_t mask) {
#ifdef USE_BASIC_SIMD
    return U32x4(BITWISE_AND_U32_SIMD(left.v, LOAD_SCALAR_U32_SIMD(mask)));
#else
    return U32x4(left.emulated[0] & mask, left.emulated[1] & mask, left.emulated[2] & mask, left.emulated[3] & mask);
#endif
}
inline U32x4 operator|(const U32x4& left, const U32x4& right) {
#ifdef USE_BASIC_SIMD
    return U32x4(BITWISE_OR_U32_SIMD(left.v, right.v));
#else
    return U32x4(left.emulated[0] | right.emulated[0], left.emulated[1] | right.emulated[1], left.emulated[2] | right.emulated[2], left.emulated[3] | right.emulated[3]);
#endif
}
inline U32x4 operator|(const U32x4& left, uint32_t mask) {
#ifdef USE_BASIC_SIMD
    return U32x4(BITWISE_OR_U32_SIMD(left.v, LOAD_SCALAR_U32_SIMD(mask)));
#else
    return U32x4(left.emulated[0] | mask, left.emulated[1] | mask, left.emulated[2] | mask, left.emulated[3] | mask);
#endif
}
inline U32x4 operator^(const U32x4& left, const U32x4& right) {
#ifdef USE_BASIC_SIMD
    return U32x4(BITWISE_XOR_U32_SIMD(left.v, right.v));
#else
    return U32x4(left.emulated[0] ^ right.emulated[0], left.emulated[1] ^ right.emulated[1], left.emulated[2] ^ right.emulated[2], left.emulated[3] ^ right.emulated[3]);
#endif
}
inline U32x4 operator^(const U32x4& left, uint32_t mask) {
#ifdef USE_BASIC_SIMD
    return U32x4(BITWISE_XOR_U32_SIMD(left.v, LOAD_SCALAR_U32_SIMD(mask)));
#else
    return U32x4(left.emulated[0] ^ mask, left.emulated[1] ^ mask, left.emulated[2] ^ mask, left.emulated[3] ^ mask);
#endif
}
inline U32x4 operator<<(const U32x4& left, uint32_t bitOffset) {
#ifdef USE_SSE2
    return U32x4(_mm_slli_epi32(left.v, bitOffset));
#else
#ifdef USE_NEON
    return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(bitOffset)));
#else
    return U32x4(left.emulated[0] << bitOffset, left.emulated[1] << bitOffset, left.emulated[2] << bitOffset, left.emulated[3] << bitOffset);
#endif
#endif
}
inline U32x4 operator>>(const U32x4& left, uint32_t bitOffset) {
#ifdef USE_SSE2
    return U32x4(_mm_srli_epi32(left.v, bitOffset));
#else
#ifdef USE_NEON
    return U32x4(vshlq_u32(left.v, LOAD_SCALAR_I32_SIMD(-bitOffset)));
#else
    return U32x4(left.emulated[0] >> bitOffset, left.emulated[1] >> bitOffset, left.emulated[2] >> bitOffset, left.emulated[3] >> bitOffset);
#endif
#endif
}
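// Example (illustrative, not part of the original file): isolating the 8-bit green channel of
// four packed 32-bit RGBA colors with the portable shift and mask operators above. The helper
// name and the assumed channel layout are hypothetical.
inline U32x4 example_extractGreen(const U32x4 &packedColors) {
    // Shift the green channel down to the lowest byte, then mask away the other channels.
    return (packedColors >> 8) & 255u;
}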
union U16x8 {
#ifdef USE_BASIC_SIMD
public:
#ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
    // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
    // Direct access cannot be done on NEON!
    uint16_t shared_memory[8];
#endif
    // The SIMD vector of undefined type
    // Not accessible while emulating!
    SIMD_U16x8 v;
    // Construct a portable vector from a native SIMD vector
    explicit U16x8(const SIMD_U16x8& v) : v(v) {}
    // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
    // Reinterpret casting is used
    explicit U16x8(const U32x4& vector) : v(REINTERPRET_U32_TO_U16_SIMD(vector.v)) {}
    // Construct a portable vector from a set of scalars
    U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) : v(LOAD_VECTOR_U16_SIMD(a1, a2, a3, a4, a5, a6, a7, a8)) {}
    // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
    // Reinterpret casting is used
    // TODO: Remove all reinterprets from constructors to improve readability
    explicit U16x8(uint32_t scalar) : v(REINTERPRET_U32_TO_U16_SIMD(LOAD_SCALAR_U32_SIMD(scalar))) {}
    // Construct a portable vector from a single duplicated scalar
    explicit U16x8(uint16_t scalar) : v(LOAD_SCALAR_U16_SIMD(scalar)) {}
    // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
    U32x4 get_U32() const {
        return U32x4(REINTERPRET_U16_TO_U32_SIMD(this->v));
    }
#else
public:
    // Emulate a SIMD vector as an array of scalars without hardware support
    // Only accessible while emulating!
    uint16_t emulated[8];
    // Construct a vector of 8 x 16-bit unsigned integers from a vector of 4 x 32-bit unsigned integers
    // Reinterpret casting is used
    explicit U16x8(const U32x4& vector) {
        uint64_t *target = (uint64_t*)this->emulated;
        uint64_t *source = (uint64_t*)vector.emulated;
        target[0] = source[0];
        target[1] = source[1];
    }
    // Construct a portable vector from a set of scalars
    U16x8(uint16_t a1, uint16_t a2, uint16_t a3, uint16_t a4, uint16_t a5, uint16_t a6, uint16_t a7, uint16_t a8) {
        this->emulated[0] = a1;
        this->emulated[1] = a2;
        this->emulated[2] = a3;
        this->emulated[3] = a4;
        this->emulated[4] = a5;
        this->emulated[5] = a6;
        this->emulated[6] = a7;
        this->emulated[7] = a8;
    }
    // Construct a vector of 8 x 16-bit unsigned integers from a single duplicated 32-bit unsigned integer
    // Reinterpret casting is used
    explicit U16x8(uint32_t scalar) {
        uint32_t *target = (uint32_t*)this->emulated;
        target[0] = scalar;
        target[1] = scalar;
        target[2] = scalar;
        target[3] = scalar;
    }
    // Construct a portable vector from a single duplicated scalar
    explicit U16x8(uint16_t scalar) {
        this->emulated[0] = scalar;
        this->emulated[1] = scalar;
        this->emulated[2] = scalar;
        this->emulated[3] = scalar;
        this->emulated[4] = scalar;
        this->emulated[5] = scalar;
        this->emulated[6] = scalar;
        this->emulated[7] = scalar;
    }
    // Reinterpret cast to a vector of 4 x 32-bit unsigned integers
    U32x4 get_U32() const {
        U32x4 result(0);
        uint64_t *target = (uint64_t*)result.emulated;
        uint64_t *source = (uint64_t*)this->emulated;
        target[0] = source[0];
        target[1] = source[1];
        return result;
    }
#endif
    // data must be aligned with at least 8 bytes, but preferably 16 bytes
    //static inline U16x8 readSlow(uint16_t* data) {
    //	return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
    //}
    static inline U16x8 readAlignedUnsafe(const uint16_t* data) {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
        return U16x8(_mm_load_si128((const __m128i*)data));
#elif USE_NEON
        return U16x8(vld1q_u16(data));
#endif
#else
        return U16x8(data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7]);
#endif
    }
    // data must be aligned with at least 8 bytes, but preferably 16 bytes
    inline void writeAlignedUnsafe(uint16_t* data) const {
#ifdef USE_BASIC_SIMD
#ifdef USE_SSE2
        _mm_store_si128((__m128i*)data, this->v);
#elif USE_NEON
        vst1q_u16(data, this->v);
#endif
#else
        data[0] = this->emulated[0];
        data[1] = this->emulated[1];
        data[2] = this->emulated[2];
        data[3] = this->emulated[3];
        data[4] = this->emulated[4];
        data[5] = this->emulated[5];
        data[6] = this->emulated[6];
        data[7] = this->emulated[7];
#endif
    }
    // Bound and alignment checked reading
    static inline U16x8 readAligned(const dsr::SafePointer<uint16_t> data, const char* methodName) {
        const uint16_t* pointer = data.getUnsafe();
        assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
        data.assertInside(methodName, pointer, 16);
#endif
        return U16x8::readAlignedUnsafe(pointer);
    }
    // Bound and alignment checked writing
    inline void writeAligned(dsr::SafePointer<uint16_t> data, const char* methodName) const {
        uint16_t* pointer = data.getUnsafe();
        assert(((uintptr_t)pointer & 15) == 0);
#ifdef SAFE_POINTER_CHECKS
        data.assertInside(methodName, pointer, 16);
#endif
        this->writeAlignedUnsafe(pointer);
    }
};
  1072. inline dsr::String& string_toStreamIndented(dsr::String& target, const U16x8& source, const dsr::ReadableString& indentation) {
  1073. ALIGN16 uint16_t data[8];
  1074. source.writeAlignedUnsafe(data);
  1075. string_append(target, indentation, "(", data[0], ", ", data[1], ", ", data[2], ", ", data[3], ", ", data[4], ", ", data[5], ", ", data[6], ", ", data[7], ")");
  1076. return target;
  1077. }
  1078. inline bool operator==(const U16x8& left, const U16x8& right) {
  1079. ALIGN16 uint16_t a[8];
  1080. ALIGN16 uint16_t b[8];
  1081. left.writeAlignedUnsafe(a);
  1082. right.writeAlignedUnsafe(b);
  1083. return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3] && a[4] == b[4] && a[5] == b[5] && a[6] == b[6] && a[7] == b[7];
  1084. }
  1085. inline bool operator!=(const U16x8& left, const U16x8& right) {
  1086. return !(left == right);
  1087. }
  1088. inline U16x8 operator+(const U16x8& left, const U16x8& right) {
  1089. #ifdef USE_BASIC_SIMD
  1090. return U16x8(ADD_U16_SIMD(left.v, right.v));
  1091. #else
  1092. return U16x8(left.emulated[0] + right.emulated[0], left.emulated[1] + right.emulated[1], left.emulated[2] + right.emulated[2], left.emulated[3] + right.emulated[3],
  1093. left.emulated[4] + right.emulated[4], left.emulated[5] + right.emulated[5], left.emulated[6] + right.emulated[6], left.emulated[7] + right.emulated[7]);
  1094. #endif
  1095. }
  1096. inline U16x8 operator+(uint16_t left, const U16x8& right) {
  1097. #ifdef USE_BASIC_SIMD
  1098. return U16x8(ADD_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
  1099. #else
  1100. return U16x8(left + right.emulated[0], left + right.emulated[1], left + right.emulated[2], left + right.emulated[3],
  1101. left + right.emulated[4], left + right.emulated[5], left + right.emulated[6], left + right.emulated[7]);
  1102. #endif
  1103. }
  1104. inline U16x8 operator+(const U16x8& left, uint16_t right) {
  1105. #ifdef USE_BASIC_SIMD
  1106. return U16x8(ADD_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
  1107. #else
  1108. return U16x8(left.emulated[0] + right, left.emulated[1] + right, left.emulated[2] + right, left.emulated[3] + right,
  1109. left.emulated[4] + right, left.emulated[5] + right, left.emulated[6] + right, left.emulated[7] + right);
  1110. #endif
  1111. }
  1112. inline U16x8 operator-(const U16x8& left, const U16x8& right) {
  1113. #ifdef USE_BASIC_SIMD
  1114. return U16x8(SUB_U16_SIMD(left.v, right.v));
  1115. #else
  1116. return U16x8(left.emulated[0] - right.emulated[0], left.emulated[1] - right.emulated[1], left.emulated[2] - right.emulated[2], left.emulated[3] - right.emulated[3],
  1117. left.emulated[4] - right.emulated[4], left.emulated[5] - right.emulated[5], left.emulated[6] - right.emulated[6], left.emulated[7] - right.emulated[7]);
  1118. #endif
  1119. }
  1120. inline U16x8 operator-(uint16_t left, const U16x8& right) {
  1121. #ifdef USE_BASIC_SIMD
  1122. return U16x8(SUB_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
  1123. #else
  1124. return U16x8(left - right.emulated[0], left - right.emulated[1], left - right.emulated[2], left - right.emulated[3],
  1125. left - right.emulated[4], left - right.emulated[5], left - right.emulated[6], left - right.emulated[7]);
  1126. #endif
  1127. }
  1128. inline U16x8 operator-(const U16x8& left, uint16_t right) {
  1129. #ifdef USE_BASIC_SIMD
  1130. return U16x8(SUB_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
  1131. #else
  1132. return U16x8(left.emulated[0] - right, left.emulated[1] - right, left.emulated[2] - right, left.emulated[3] - right,
  1133. left.emulated[4] - right, left.emulated[5] - right, left.emulated[6] - right, left.emulated[7] - right);
  1134. #endif
  1135. }
  1136. inline U16x8 operator*(const U16x8& left, const U16x8& right) {
  1137. #ifdef USE_BASIC_SIMD
  1138. return U16x8(MUL_U16_SIMD(left.v, right.v));
  1139. #else
  1140. return U16x8(left.emulated[0] * right.emulated[0], left.emulated[1] * right.emulated[1], left.emulated[2] * right.emulated[2], left.emulated[3] * right.emulated[3],
  1141. left.emulated[4] * right.emulated[4], left.emulated[5] * right.emulated[5], left.emulated[6] * right.emulated[6], left.emulated[7] * right.emulated[7]);
  1142. #endif
  1143. }
  1144. inline U16x8 operator*(uint16_t left, const U16x8& right) {
  1145. #ifdef USE_BASIC_SIMD
  1146. return U16x8(MUL_U16_SIMD(LOAD_SCALAR_U16_SIMD(left), right.v));
  1147. #else
  1148. return U16x8(left * right.emulated[0], left * right.emulated[1], left * right.emulated[2], left * right.emulated[3],
  1149. left * right.emulated[4], left * right.emulated[5], left * right.emulated[6], left * right.emulated[7]);
  1150. #endif
  1151. }
  1152. inline U16x8 operator*(const U16x8& left, uint16_t right) {
  1153. #ifdef USE_BASIC_SIMD
  1154. return U16x8(MUL_U16_SIMD(left.v, LOAD_SCALAR_U16_SIMD(right)));
  1155. #else
  1156. return U16x8(
  1157. left.emulated[0] * right, left.emulated[1] * right, left.emulated[2] * right, left.emulated[3] * right,
  1158. left.emulated[4] * right, left.emulated[5] * right, left.emulated[6] * right, left.emulated[7] * right
  1159. );
  1160. #endif
  1161. }
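// Usage sketch (illustrative only, not part of the original API): the scalar overloads above
// broadcast a single uint16_t to all eight lanes, so element-wise expressions read like ordinary
// scalar arithmetic. Both the multiplication and the addition wrap around modulo 65536, just like uint16_t.
static inline U16x8 example_scaleAndBias_U16x8(const U16x8 &values, uint16_t scale, uint16_t bias) {
	return values * scale + bias;
}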
  1162. union U8x16 {
  1163. #ifdef USE_BASIC_SIMD
  1164. public:
  1165. #ifdef USE_DIRECT_SIMD_MEMORY_ACCESS
  1166. // Only use if USE_DIRECT_SIMD_MEMORY_ACCESS is defined!
  1167. // Direct access cannot be done on NEON!
  1168. uint8_t shared_memory[16];
  1169. #endif
  1170. // The SIMD vector of undefined type
  1171. // Not accessible while emulating!
  1172. SIMD_U8x16 v;
  1173. // Construct a portable vector from a native SIMD vector
  1174. explicit U8x16(const SIMD_U8x16& v) : v(v) {}
  1175. // Construct a portable vector from a set of scalars
  1176. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1177. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16)
  1178. : v(LOAD_VECTOR_U8_SIMD(a1, a2, a3, a4, a5, a6, a7, a8, a9, a10, a11, a12, a13, a14, a15, a16)) {}
  1179. // Construct a portable vector from a single duplicated scalar
  1180. explicit U8x16(uint8_t scalar) : v(LOAD_SCALAR_U8_SIMD(scalar)) {}
  1181. #else
  1182. public:
  1183. // Emulate a SIMD vector as an array of scalars without hardware support
  1184. // Only accessible while emulating!
  1185. uint8_t emulated[16];
  1186. // Construct a portable vector from a set of scalars
  1187. U8x16(uint8_t a1, uint8_t a2, uint8_t a3, uint8_t a4, uint8_t a5, uint8_t a6, uint8_t a7, uint8_t a8,
  1188. uint8_t a9, uint8_t a10, uint8_t a11, uint8_t a12, uint8_t a13, uint8_t a14, uint8_t a15, uint8_t a16) {
  1189. this->emulated[0] = a1;
  1190. this->emulated[1] = a2;
  1191. this->emulated[2] = a3;
  1192. this->emulated[3] = a4;
  1193. this->emulated[4] = a5;
  1194. this->emulated[5] = a6;
  1195. this->emulated[6] = a7;
  1196. this->emulated[7] = a8;
  1197. this->emulated[8] = a9;
  1198. this->emulated[9] = a10;
  1199. this->emulated[10] = a11;
  1200. this->emulated[11] = a12;
  1201. this->emulated[12] = a13;
  1202. this->emulated[13] = a14;
  1203. this->emulated[14] = a15;
  1204. this->emulated[15] = a16;
  1205. }
  1206. // Construct a portable vector from a single duplicated scalar
  1207. explicit U8x16(uint8_t scalar) {
  1208. this->emulated[0] = scalar;
  1209. this->emulated[1] = scalar;
  1210. this->emulated[2] = scalar;
  1211. this->emulated[3] = scalar;
  1212. this->emulated[4] = scalar;
  1213. this->emulated[5] = scalar;
  1214. this->emulated[6] = scalar;
  1215. this->emulated[7] = scalar;
  1216. this->emulated[8] = scalar;
  1217. this->emulated[9] = scalar;
  1218. this->emulated[10] = scalar;
  1219. this->emulated[11] = scalar;
  1220. this->emulated[12] = scalar;
  1221. this->emulated[13] = scalar;
  1222. this->emulated[14] = scalar;
  1223. this->emulated[15] = scalar;
  1224. }
  1225. #endif
  1226. static inline U8x16 readAlignedUnsafe(const uint8_t* data) {
  1227. #ifdef USE_BASIC_SIMD
  1228. #ifdef USE_SSE2
  1229. return U8x16(_mm_load_si128((const __m128i*)data));
  1230. #elif USE_NEON
  1231. return U8x16(vld1q_u8(data));
  1232. #endif
  1233. #else
  1234. return U8x16(
  1235. data[0], data[1], data[2], data[3], data[4], data[5], data[6], data[7],
  1236. data[8], data[9], data[10], data[11], data[12], data[13], data[14], data[15]
  1237. );
  1238. #endif
  1239. }
1240. // data must be aligned to at least 8 bytes, but preferably 16 bytes (16 is required when SSE2 is used)
  1241. inline void writeAlignedUnsafe(uint8_t* data) const {
  1242. #ifdef USE_BASIC_SIMD
  1243. #ifdef USE_SSE2
  1244. _mm_store_si128((__m128i*)data, this->v);
  1245. #elif USE_NEON
  1246. vst1q_u8(data, this->v);
  1247. #endif
  1248. #else
  1249. data[0] = this->emulated[0];
  1250. data[1] = this->emulated[1];
  1251. data[2] = this->emulated[2];
  1252. data[3] = this->emulated[3];
  1253. data[4] = this->emulated[4];
  1254. data[5] = this->emulated[5];
  1255. data[6] = this->emulated[6];
  1256. data[7] = this->emulated[7];
  1257. data[8] = this->emulated[8];
  1258. data[9] = this->emulated[9];
  1259. data[10] = this->emulated[10];
  1260. data[11] = this->emulated[11];
  1261. data[12] = this->emulated[12];
  1262. data[13] = this->emulated[13];
  1263. data[14] = this->emulated[14];
  1264. data[15] = this->emulated[15];
  1265. #endif
  1266. }
1267. // Bounds and alignment checked reading
  1268. static inline U8x16 readAligned(const dsr::SafePointer<uint8_t> data, const char* methodName) {
  1269. const uint8_t* pointer = data.getUnsafe();
  1270. assert(((uintptr_t)pointer & 15) == 0);
  1271. #ifdef SAFE_POINTER_CHECKS
  1272. data.assertInside(methodName, pointer, 16);
  1273. #endif
  1274. return U8x16::readAlignedUnsafe(pointer);
  1275. }
1276. // Bounds and alignment checked writing
  1277. inline void writeAligned(dsr::SafePointer<uint8_t> data, const char* methodName) const {
  1278. uint8_t* pointer = data.getUnsafe();
  1279. assert(((uintptr_t)pointer & 15) == 0);
  1280. #ifdef SAFE_POINTER_CHECKS
  1281. data.assertInside(methodName, pointer, 16);
  1282. #endif
  1283. this->writeAlignedUnsafe(pointer);
  1284. }
  1285. };
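// Usage sketch (illustrative only, not part of the original API): round-tripping sixteen bytes
// through a portable vector with the bounds- and alignment-checked accessors above.
static inline void example_copyBlock_U8x16(const dsr::SafePointer<uint8_t> source, dsr::SafePointer<uint8_t> target) {
	U8x16 block = U8x16::readAligned(source, "example_copyBlock_U8x16 (read)");
	block.writeAligned(target, "example_copyBlock_U8x16 (write)");
}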
  1286. inline dsr::String& string_toStreamIndented(dsr::String& target, const U8x16& source, const dsr::ReadableString& indentation) {
  1287. ALIGN16 uint8_t data[16];
  1288. source.writeAlignedUnsafe(data);
  1289. string_append(target, indentation,
  1290. "(", data[0], ", ", data[1], ", ", data[2], ", ", data[3], ", ", data[4], ", ", data[5], ", ", data[6], ", ", data[7],
  1291. ", ", data[8], ", ", data[9], ", ", data[10], ", ", data[11], ", ", data[12], ", ", data[13], ", ", data[14], ", ", data[15], ")"
  1292. );
  1293. return target;
  1294. }
  1295. inline bool operator==(const U8x16& left, const U8x16& right) {
  1296. ALIGN16 uint8_t a[16];
  1297. ALIGN16 uint8_t b[16];
  1298. left.writeAlignedUnsafe(a);
  1299. right.writeAlignedUnsafe(b);
  1300. return a[0] == b[0] && a[1] == b[1] && a[2] == b[2] && a[3] == b[3] && a[4] == b[4] && a[5] == b[5] && a[6] == b[6] && a[7] == b[7]
  1301. && a[8] == b[8] && a[9] == b[9] && a[10] == b[10] && a[11] == b[11] && a[12] == b[12] && a[13] == b[13] && a[14] == b[14] && a[15] == b[15];
  1302. }
  1303. inline bool operator!=(const U8x16& left, const U8x16& right) {
  1304. return !(left == right);
  1305. }
  1306. inline U8x16 operator+(const U8x16& left, const U8x16& right) {
  1307. #ifdef USE_BASIC_SIMD
  1308. return U8x16(ADD_U8_SIMD(left.v, right.v));
  1309. #else
  1310. return U8x16(
  1311. left.emulated[0] + right.emulated[0],
  1312. left.emulated[1] + right.emulated[1],
  1313. left.emulated[2] + right.emulated[2],
  1314. left.emulated[3] + right.emulated[3],
  1315. left.emulated[4] + right.emulated[4],
  1316. left.emulated[5] + right.emulated[5],
  1317. left.emulated[6] + right.emulated[6],
  1318. left.emulated[7] + right.emulated[7],
  1319. left.emulated[8] + right.emulated[8],
  1320. left.emulated[9] + right.emulated[9],
  1321. left.emulated[10] + right.emulated[10],
  1322. left.emulated[11] + right.emulated[11],
  1323. left.emulated[12] + right.emulated[12],
  1324. left.emulated[13] + right.emulated[13],
  1325. left.emulated[14] + right.emulated[14],
  1326. left.emulated[15] + right.emulated[15]
  1327. );
  1328. #endif
  1329. }
  1330. inline U8x16 operator+(uint8_t left, const U8x16& right) {
  1331. #ifdef USE_BASIC_SIMD
  1332. return U8x16(ADD_U8_SIMD(LOAD_SCALAR_U8_SIMD(left), right.v));
  1333. #else
  1334. return U8x16(
  1335. left + right.emulated[0],
  1336. left + right.emulated[1],
  1337. left + right.emulated[2],
  1338. left + right.emulated[3],
  1339. left + right.emulated[4],
  1340. left + right.emulated[5],
  1341. left + right.emulated[6],
  1342. left + right.emulated[7],
  1343. left + right.emulated[8],
  1344. left + right.emulated[9],
  1345. left + right.emulated[10],
  1346. left + right.emulated[11],
  1347. left + right.emulated[12],
  1348. left + right.emulated[13],
  1349. left + right.emulated[14],
  1350. left + right.emulated[15]
  1351. );
  1352. #endif
  1353. }
  1354. inline U8x16 operator+(const U8x16& left, uint8_t right) {
  1355. #ifdef USE_BASIC_SIMD
  1356. return U8x16(ADD_U8_SIMD(left.v, LOAD_SCALAR_U8_SIMD(right)));
  1357. #else
  1358. return U8x16(
  1359. left.emulated[0] + right,
  1360. left.emulated[1] + right,
  1361. left.emulated[2] + right,
  1362. left.emulated[3] + right,
  1363. left.emulated[4] + right,
  1364. left.emulated[5] + right,
  1365. left.emulated[6] + right,
  1366. left.emulated[7] + right,
  1367. left.emulated[8] + right,
  1368. left.emulated[9] + right,
  1369. left.emulated[10] + right,
  1370. left.emulated[11] + right,
  1371. left.emulated[12] + right,
  1372. left.emulated[13] + right,
  1373. left.emulated[14] + right,
  1374. left.emulated[15] + right
  1375. );
  1376. #endif
  1377. }
  1378. inline U8x16 operator-(const U8x16& left, const U8x16& right) {
  1379. #ifdef USE_BASIC_SIMD
  1380. return U8x16(SUB_U8_SIMD(left.v, right.v));
  1381. #else
  1382. return U8x16(
  1383. left.emulated[0] - right.emulated[0],
  1384. left.emulated[1] - right.emulated[1],
  1385. left.emulated[2] - right.emulated[2],
  1386. left.emulated[3] - right.emulated[3],
  1387. left.emulated[4] - right.emulated[4],
  1388. left.emulated[5] - right.emulated[5],
  1389. left.emulated[6] - right.emulated[6],
  1390. left.emulated[7] - right.emulated[7],
  1391. left.emulated[8] - right.emulated[8],
  1392. left.emulated[9] - right.emulated[9],
  1393. left.emulated[10] - right.emulated[10],
  1394. left.emulated[11] - right.emulated[11],
  1395. left.emulated[12] - right.emulated[12],
  1396. left.emulated[13] - right.emulated[13],
  1397. left.emulated[14] - right.emulated[14],
  1398. left.emulated[15] - right.emulated[15]
  1399. );
  1400. #endif
  1401. }
  1402. inline U8x16 operator-(uint8_t left, const U8x16& right) {
  1403. #ifdef USE_BASIC_SIMD
  1404. return U8x16(SUB_U8_SIMD(LOAD_SCALAR_U8_SIMD(left), right.v));
  1405. #else
  1406. return U8x16(
  1407. left - right.emulated[0],
  1408. left - right.emulated[1],
  1409. left - right.emulated[2],
  1410. left - right.emulated[3],
  1411. left - right.emulated[4],
  1412. left - right.emulated[5],
  1413. left - right.emulated[6],
  1414. left - right.emulated[7],
  1415. left - right.emulated[8],
  1416. left - right.emulated[9],
  1417. left - right.emulated[10],
  1418. left - right.emulated[11],
  1419. left - right.emulated[12],
  1420. left - right.emulated[13],
  1421. left - right.emulated[14],
  1422. left - right.emulated[15]
  1423. );
  1424. #endif
  1425. }
  1426. inline U8x16 operator-(const U8x16& left, uint8_t right) {
  1427. #ifdef USE_BASIC_SIMD
  1428. return U8x16(SUB_U8_SIMD(left.v, LOAD_SCALAR_U8_SIMD(right)));
  1429. #else
  1430. return U8x16(
  1431. left.emulated[0] - right,
  1432. left.emulated[1] - right,
  1433. left.emulated[2] - right,
  1434. left.emulated[3] - right,
  1435. left.emulated[4] - right,
  1436. left.emulated[5] - right,
  1437. left.emulated[6] - right,
  1438. left.emulated[7] - right,
  1439. left.emulated[8] - right,
  1440. left.emulated[9] - right,
  1441. left.emulated[10] - right,
  1442. left.emulated[11] - right,
  1443. left.emulated[12] - right,
  1444. left.emulated[13] - right,
  1445. left.emulated[14] - right,
  1446. left.emulated[15] - right
  1447. );
  1448. #endif
  1449. }
1450. inline uint8_t impl_limit0(int32_t x) { return x < 0 ? 0 : x; } // Clamp negative values to 0 before narrowing to uint8_t.
1451. inline uint8_t impl_limit255(uint32_t x) { return x > 255 ? 255 : x; } // Clamp values above 255 before narrowing to uint8_t.
  1452. inline U8x16 saturatedAddition(const U8x16& left, const U8x16& right) {
  1453. #ifdef USE_BASIC_SIMD
  1454. return U8x16(ADD_SAT_U8_SIMD(left.v, right.v));
  1455. #else
  1456. return U8x16(
  1457. impl_limit255((uint32_t)left.emulated[0] + (uint32_t)right.emulated[0]),
  1458. impl_limit255((uint32_t)left.emulated[1] + (uint32_t)right.emulated[1]),
  1459. impl_limit255((uint32_t)left.emulated[2] + (uint32_t)right.emulated[2]),
  1460. impl_limit255((uint32_t)left.emulated[3] + (uint32_t)right.emulated[3]),
  1461. impl_limit255((uint32_t)left.emulated[4] + (uint32_t)right.emulated[4]),
  1462. impl_limit255((uint32_t)left.emulated[5] + (uint32_t)right.emulated[5]),
  1463. impl_limit255((uint32_t)left.emulated[6] + (uint32_t)right.emulated[6]),
  1464. impl_limit255((uint32_t)left.emulated[7] + (uint32_t)right.emulated[7]),
  1465. impl_limit255((uint32_t)left.emulated[8] + (uint32_t)right.emulated[8]),
  1466. impl_limit255((uint32_t)left.emulated[9] + (uint32_t)right.emulated[9]),
  1467. impl_limit255((uint32_t)left.emulated[10] + (uint32_t)right.emulated[10]),
  1468. impl_limit255((uint32_t)left.emulated[11] + (uint32_t)right.emulated[11]),
  1469. impl_limit255((uint32_t)left.emulated[12] + (uint32_t)right.emulated[12]),
  1470. impl_limit255((uint32_t)left.emulated[13] + (uint32_t)right.emulated[13]),
  1471. impl_limit255((uint32_t)left.emulated[14] + (uint32_t)right.emulated[14]),
  1472. impl_limit255((uint32_t)left.emulated[15] + (uint32_t)right.emulated[15])
  1473. );
  1474. #endif
  1475. }
  1476. inline U8x16 saturatedSubtraction(const U8x16& left, const U8x16& right) {
  1477. #ifdef USE_BASIC_SIMD
  1478. return U8x16(SUB_SAT_U8_SIMD(left.v, right.v));
  1479. #else
  1480. return U8x16(
  1481. impl_limit0((int32_t)left.emulated[0] - (int32_t)right.emulated[0]),
  1482. impl_limit0((int32_t)left.emulated[1] - (int32_t)right.emulated[1]),
  1483. impl_limit0((int32_t)left.emulated[2] - (int32_t)right.emulated[2]),
  1484. impl_limit0((int32_t)left.emulated[3] - (int32_t)right.emulated[3]),
  1485. impl_limit0((int32_t)left.emulated[4] - (int32_t)right.emulated[4]),
  1486. impl_limit0((int32_t)left.emulated[5] - (int32_t)right.emulated[5]),
  1487. impl_limit0((int32_t)left.emulated[6] - (int32_t)right.emulated[6]),
  1488. impl_limit0((int32_t)left.emulated[7] - (int32_t)right.emulated[7]),
  1489. impl_limit0((int32_t)left.emulated[8] - (int32_t)right.emulated[8]),
  1490. impl_limit0((int32_t)left.emulated[9] - (int32_t)right.emulated[9]),
  1491. impl_limit0((int32_t)left.emulated[10] - (int32_t)right.emulated[10]),
  1492. impl_limit0((int32_t)left.emulated[11] - (int32_t)right.emulated[11]),
  1493. impl_limit0((int32_t)left.emulated[12] - (int32_t)right.emulated[12]),
  1494. impl_limit0((int32_t)left.emulated[13] - (int32_t)right.emulated[13]),
  1495. impl_limit0((int32_t)left.emulated[14] - (int32_t)right.emulated[14]),
  1496. impl_limit0((int32_t)left.emulated[15] - (int32_t)right.emulated[15])
  1497. );
  1498. #endif
  1499. }
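// Sketch of the difference from the wrapping operators (illustrative only): with plain operator+,
// 200 + 100 wraps to 44 in every uint8_t lane, while saturatedAddition clamps the sum to 255.
// Likewise, 100 - 200 wraps to 156, but saturatedSubtraction clamps the result to 0.
static inline U8x16 example_clampedBrighten(const U8x16 &pixels, uint8_t amount) {
	// Broadcasting the scalar and clamping keeps bright pixels at white instead of wrapping to black.
	return saturatedAddition(pixels, U8x16(amount));
}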
  1500. inline I32x4 truncateToI32(const F32x4& vector) {
  1501. #ifdef USE_BASIC_SIMD
  1502. return I32x4(F32_TO_I32_SIMD(vector.v));
  1503. #else
  1504. return I32x4((int32_t)vector.emulated[0], (int32_t)vector.emulated[1], (int32_t)vector.emulated[2], (int32_t)vector.emulated[3]);
  1505. #endif
  1506. }
  1507. inline U32x4 truncateToU32(const F32x4& vector) {
  1508. #ifdef USE_BASIC_SIMD
  1509. return U32x4(F32_TO_U32_SIMD(vector.v));
  1510. #else
  1511. return U32x4((uint32_t)vector.emulated[0], (uint32_t)vector.emulated[1], (uint32_t)vector.emulated[2], (uint32_t)vector.emulated[3]);
  1512. #endif
  1513. }
  1514. inline F32x4 floatFromI32(const I32x4& vector) {
  1515. #ifdef USE_BASIC_SIMD
  1516. return F32x4(I32_TO_F32_SIMD(vector.v));
  1517. #else
  1518. return F32x4((float)vector.emulated[0], (float)vector.emulated[1], (float)vector.emulated[2], (float)vector.emulated[3]);
  1519. #endif
  1520. }
  1521. inline F32x4 floatFromU32(const U32x4& vector) {
  1522. #ifdef USE_BASIC_SIMD
  1523. return F32x4(U32_TO_F32_SIMD(vector.v));
  1524. #else
  1525. return F32x4((float)vector.emulated[0], (float)vector.emulated[1], (float)vector.emulated[2], (float)vector.emulated[3]);
  1526. #endif
  1527. }
  1528. inline I32x4 I32FromU32(const U32x4& vector) {
  1529. #ifdef USE_BASIC_SIMD
  1530. return I32x4(REINTERPRET_U32_TO_I32_SIMD(vector.v));
  1531. #else
  1532. return I32x4((int32_t)vector.emulated[0], (int32_t)vector.emulated[1], (int32_t)vector.emulated[2], (int32_t)vector.emulated[3]);
  1533. #endif
  1534. }
  1535. inline U32x4 U32FromI32(const I32x4& vector) {
  1536. #ifdef USE_BASIC_SIMD
  1537. return U32x4(REINTERPRET_I32_TO_U32_SIMD(vector.v));
  1538. #else
  1539. return U32x4((uint32_t)vector.emulated[0], (uint32_t)vector.emulated[1], (uint32_t)vector.emulated[2], (uint32_t)vector.emulated[3]);
  1540. #endif
  1541. }
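// Sketch (illustrative only): truncateToI32 discards the fraction by rounding toward zero, while
// I32FromU32 and U32FromI32 only reinterpret the same 32 bits as another integer type.
// Converting integers to float and truncating back is lossless as long as the values fit exactly
// in a 32-bit float (magnitudes up to 2^24).
static inline I32x4 example_floatRoundTrip_I32x4(const I32x4 &values) {
	return truncateToI32(floatFromI32(values));
}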
  1542. // Warning! Behaviour depends on endianness.
  1543. inline U8x16 reinterpret_U8FromU32(const U32x4& vector) {
  1544. #ifdef USE_BASIC_SIMD
  1545. return U8x16(REINTERPRET_U32_TO_U8_SIMD(vector.v));
  1546. #else
  1547. const uint8_t *source = (const uint8_t*)vector.emulated;
  1548. return U8x16(
  1549. source[0], source[1], source[2], source[3], source[4], source[5], source[6], source[7],
  1550. source[8], source[9], source[10], source[11], source[12], source[13], source[14], source[15]
  1551. );
  1552. #endif
  1553. }
  1554. // Warning! Behaviour depends on endianness.
  1555. inline U32x4 reinterpret_U32FromU8(const U8x16& vector) {
  1556. #ifdef USE_BASIC_SIMD
  1557. return U32x4(REINTERPRET_U8_TO_U32_SIMD(vector.v));
  1558. #else
  1559. const uint32_t *source = (const uint32_t*)vector.emulated;
  1560. return U32x4(source[0], source[1], source[2], source[3]);
  1561. #endif
  1562. }
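// Sketch of the endianness warning above (illustrative only): reinterpreting to bytes and back
// returns the original values on any target, but the byte order seen in between differs.
// On a little-endian machine, a lane holding 0x04030201 appears as the bytes 1, 2, 3, 4.
static inline U32x4 example_reinterpretRoundTrip_U32x4(const U32x4 &values) {
	return reinterpret_U32FromU8(reinterpret_U8FromU32(values));
}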
  1563. // Unpacking to larger integers
  1564. inline U32x4 lowerToU32(const U16x8& vector) {
  1565. #ifdef USE_BASIC_SIMD
  1566. return U32x4(U16_LOW_TO_U32_SIMD(vector.v));
  1567. #else
  1568. return U32x4(vector.emulated[0], vector.emulated[1], vector.emulated[2], vector.emulated[3]);
  1569. #endif
  1570. }
  1571. inline U32x4 higherToU32(const U16x8& vector) {
  1572. #ifdef USE_BASIC_SIMD
  1573. return U32x4(U16_HIGH_TO_U32_SIMD(vector.v));
  1574. #else
  1575. return U32x4(vector.emulated[4], vector.emulated[5], vector.emulated[6], vector.emulated[7]);
  1576. #endif
  1577. }
  1578. inline U16x8 lowerToU16(const U8x16& vector) {
  1579. #ifdef USE_BASIC_SIMD
  1580. return U16x8(U8_LOW_TO_U16_SIMD(vector.v));
  1581. #else
  1582. return U16x8(
  1583. vector.emulated[0], vector.emulated[1], vector.emulated[2], vector.emulated[3],
  1584. vector.emulated[4], vector.emulated[5], vector.emulated[6], vector.emulated[7]
  1585. );
  1586. #endif
  1587. }
  1588. inline U16x8 higherToU16(const U8x16& vector) {
  1589. #ifdef USE_BASIC_SIMD
  1590. return U16x8(U8_HIGH_TO_U16_SIMD(vector.v));
  1591. #else
  1592. return U16x8(
  1593. vector.emulated[8], vector.emulated[9], vector.emulated[10], vector.emulated[11],
  1594. vector.emulated[12], vector.emulated[13], vector.emulated[14], vector.emulated[15]
  1595. );
  1596. #endif
  1597. }
  1598. // Saturated packing
  1599. inline U8x16 saturateToU8(const U16x8& lower, const U16x8& upper) {
  1600. #ifdef USE_BASIC_SIMD
  1601. return U8x16(PACK_SAT_U16_TO_U8(lower.v, upper.v));
  1602. #else
  1603. return U8x16(
  1604. impl_limit255(lower.emulated[0]),
  1605. impl_limit255(lower.emulated[1]),
  1606. impl_limit255(lower.emulated[2]),
  1607. impl_limit255(lower.emulated[3]),
  1608. impl_limit255(lower.emulated[4]),
  1609. impl_limit255(lower.emulated[5]),
  1610. impl_limit255(lower.emulated[6]),
  1611. impl_limit255(lower.emulated[7]),
  1612. impl_limit255(upper.emulated[0]),
  1613. impl_limit255(upper.emulated[1]),
  1614. impl_limit255(upper.emulated[2]),
  1615. impl_limit255(upper.emulated[3]),
  1616. impl_limit255(upper.emulated[4]),
  1617. impl_limit255(upper.emulated[5]),
  1618. impl_limit255(upper.emulated[6]),
  1619. impl_limit255(upper.emulated[7])
  1620. );
  1621. #endif
  1622. }
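// Usage sketch (illustrative only): widening to 16 bits before the arithmetic and packing back
// with saturation avoids both overflow during the addition and wrap-around in the final result.
// The same pattern works when extra intermediate math is needed at 16-bit precision.
static inline U8x16 example_addOffsetSaturated_U8x16(const U8x16 &pixels, uint8_t offset) {
	U16x8 low = lowerToU16(pixels) + (uint16_t)offset;
	U16x8 high = higherToU16(pixels) + (uint16_t)offset;
	return saturateToU8(low, high);
}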
  1623. // Unary negation for convenience and code readability.
1624. // Before using unary negation, always check whether:
1625. // * The addition can be turned into a subtraction instead:
1626. //     x = -a + b
1627. //     x = b - a
1628. // * A multiplying constant or scalar can be negated instead:
1629. //     x = -b * 2
1630. //     x = b * -2
  1631. inline F32x4 operator-(const F32x4& value) {
  1632. #ifdef USE_BASIC_SIMD
  1633. return F32x4(0) - value;
  1634. #else
  1635. return F32x4(-value.emulated[0], -value.emulated[1], -value.emulated[2], -value.emulated[3]);
  1636. #endif
  1637. }
  1638. inline I32x4 operator-(const I32x4& value) {
  1639. #ifdef USE_BASIC_SIMD
1640. return I32x4(0) - value;
  1641. #else
  1642. return I32x4(-value.emulated[0], -value.emulated[1], -value.emulated[2], -value.emulated[3]);
  1643. #endif
  1644. }
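// Sketch of the rewrite suggested above (illustrative only, assuming the F32x4 addition and
// subtraction operators defined earlier in this header): both functions return the same result,
// but the second one avoids materializing a separate negated vector.
static inline F32x4 example_negatedAddition(const F32x4 &a, const F32x4 &b) { return -a + b; }
static inline F32x4 example_subtractionInstead(const F32x4 &a, const F32x4 &b) { return b - a; }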
  1645. // Helper macros for generating the vector extract functions.
  1646. // Having one function for each type and offset makes sure that the compiler gets an immediate integer within the valid range.
  1647. #ifdef USE_BASIC_SIMD
  1648. #ifdef USE_SSE2
  1649. #ifdef USE_SSSE3
  1650. #define _MM_ALIGNR_EPI8(A, B, OFFSET) _mm_alignr_epi8(A, B, OFFSET)
  1651. #else
  1652. // If SSSE3 is not used, emulate it using stack memory and unaligned reading of data.
  1653. static inline SIMD_U8x16 _MM_ALIGNR_EPI8(SIMD_U8x16 a, SIMD_U8x16 b, int offset) {
  1654. ALIGN16 uint8_t vectorBuffer[32];
  1655. _mm_store_si128((SIMD_U8x16*)(vectorBuffer), b);
  1656. _mm_store_si128((SIMD_U8x16*)(vectorBuffer + 16), a);
  1657. return _mm_loadu_si128((SIMD_U8x16*)(vectorBuffer + offset));
  1658. }
  1659. #endif
  1660. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET));
  1661. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 2));
  1662. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
  1663. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(_MM_ALIGNR_EPI8(b.v, a.v, OFFSET * 4));
  1664. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(SIMD_F32x4(_MM_ALIGNR_EPI8(SIMD_U32x4(b.v), SIMD_U32x4(a.v), OFFSET * 4)));
  1665. #elif USE_NEON
  1666. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return U8x16(vextq_u8(a.v, b.v, OFFSET));
  1667. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return U16x8(vextq_u16(a.v, b.v, OFFSET));
  1668. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return U32x4(vextq_u32(a.v, b.v, OFFSET));
  1669. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return I32x4(vextq_s32(a.v, b.v, OFFSET));
  1670. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return F32x4(vextq_f32(a.v, b.v, OFFSET));
  1671. #endif
  1672. #else
  1673. #define VECTOR_EXTRACT_GENERATOR_U8(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  1674. #define VECTOR_EXTRACT_GENERATOR_U16(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  1675. #define VECTOR_EXTRACT_GENERATOR_U32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  1676. #define VECTOR_EXTRACT_GENERATOR_I32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  1677. #define VECTOR_EXTRACT_GENERATOR_F32(OFFSET, FALLBACK_RESULT) return FALLBACK_RESULT;
  1678. #endif
1679. // Vector extraction concatenates two input vectors and reads a vector spanning them at the given element offset.
1680. // The first and last offsets simply return one of the inputs; they exist for readability and are inlined away by the compiler.
  1681. // To get elements from the right side, combine the center vector with the right vector and shift one element to the left using vectorExtract_1 for the given type.
  1682. // To get elements from the left side, combine the left vector with the center vector and shift one element to the right using vectorExtract_15 for 16 lanes, vectorExtract_7 for 8 lanes, or vectorExtract_3 for 4 lanes.
  1683. U8x16 inline vectorExtract_0(const U8x16 &a, const U8x16 &b) { return a; }
  1684. U8x16 inline vectorExtract_1(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(1, U8x16(a.emulated[1], a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0])) }
  1685. U8x16 inline vectorExtract_2(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(2, U8x16(a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1])) }
  1686. U8x16 inline vectorExtract_3(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(3, U8x16(a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2])) }
  1687. U8x16 inline vectorExtract_4(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(4, U8x16(a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3])) }
  1688. U8x16 inline vectorExtract_5(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(5, U8x16(a.emulated[5], a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4])) }
  1689. U8x16 inline vectorExtract_6(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(6, U8x16(a.emulated[6], a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5])) }
  1690. U8x16 inline vectorExtract_7(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(7, U8x16(a.emulated[7], a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6])) }
  1691. U8x16 inline vectorExtract_8(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(8, U8x16(a.emulated[8], a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7])) }
  1692. U8x16 inline vectorExtract_9(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(9, U8x16(a.emulated[9], a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8])) }
  1693. U8x16 inline vectorExtract_10(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(10, U8x16(a.emulated[10], a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9])) }
  1694. U8x16 inline vectorExtract_11(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(11, U8x16(a.emulated[11], a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10])) }
  1695. U8x16 inline vectorExtract_12(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(12, U8x16(a.emulated[12], a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11])) }
  1696. U8x16 inline vectorExtract_13(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(13, U8x16(a.emulated[13], a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12])) }
  1697. U8x16 inline vectorExtract_14(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(14, U8x16(a.emulated[14], a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12], b.emulated[13])) }
  1698. U8x16 inline vectorExtract_15(const U8x16 &a, const U8x16 &b) { VECTOR_EXTRACT_GENERATOR_U8(15, U8x16(a.emulated[15], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6], b.emulated[7], b.emulated[8], b.emulated[9], b.emulated[10], b.emulated[11], b.emulated[12], b.emulated[13], b.emulated[14])) }
  1699. U8x16 inline vectorExtract_16(const U8x16 &a, const U8x16 &b) { return b; }
  1700. U16x8 inline vectorExtract_0(const U16x8 &a, const U16x8 &b) { return a; }
  1701. U16x8 inline vectorExtract_1(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(1, U16x8(a.emulated[1], a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0])) }
  1702. U16x8 inline vectorExtract_2(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(2, U16x8(a.emulated[2], a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1])) }
  1703. U16x8 inline vectorExtract_3(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(3, U16x8(a.emulated[3], a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2])) }
  1704. U16x8 inline vectorExtract_4(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(4, U16x8(a.emulated[4], a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3])) }
  1705. U16x8 inline vectorExtract_5(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(5, U16x8(a.emulated[5], a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4])) }
  1706. U16x8 inline vectorExtract_6(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(6, U16x8(a.emulated[6], a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5])) }
  1707. U16x8 inline vectorExtract_7(const U16x8 &a, const U16x8 &b) { VECTOR_EXTRACT_GENERATOR_U16(7, U16x8(a.emulated[7], b.emulated[0], b.emulated[1], b.emulated[2], b.emulated[3], b.emulated[4], b.emulated[5], b.emulated[6])) }
  1708. U16x8 inline vectorExtract_8(const U16x8 &a, const U16x8 &b) { return b; }
  1709. U32x4 inline vectorExtract_0(const U32x4 &a, const U32x4 &b) { return a; }
  1710. U32x4 inline vectorExtract_1(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(1, U32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
  1711. U32x4 inline vectorExtract_2(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(2, U32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
  1712. U32x4 inline vectorExtract_3(const U32x4 &a, const U32x4 &b) { VECTOR_EXTRACT_GENERATOR_U32(3, U32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
  1713. U32x4 inline vectorExtract_4(const U32x4 &a, const U32x4 &b) { return b; }
  1714. I32x4 inline vectorExtract_0(const I32x4 &a, const I32x4 &b) { return a; }
  1715. I32x4 inline vectorExtract_1(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(1, I32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
  1716. I32x4 inline vectorExtract_2(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(2, I32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
  1717. I32x4 inline vectorExtract_3(const I32x4 &a, const I32x4 &b) { VECTOR_EXTRACT_GENERATOR_I32(3, I32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
  1718. I32x4 inline vectorExtract_4(const I32x4 &a, const I32x4 &b) { return b; }
  1719. F32x4 inline vectorExtract_0(const F32x4 &a, const F32x4 &b) { return a; }
  1720. F32x4 inline vectorExtract_1(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(1, F32x4(a.emulated[1], a.emulated[2], a.emulated[3], b.emulated[0])) }
  1721. F32x4 inline vectorExtract_2(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(2, F32x4(a.emulated[2], a.emulated[3], b.emulated[0], b.emulated[1])) }
  1722. F32x4 inline vectorExtract_3(const F32x4 &a, const F32x4 &b) { VECTOR_EXTRACT_GENERATOR_F32(3, F32x4(a.emulated[3], b.emulated[0], b.emulated[1], b.emulated[2])) }
  1723. F32x4 inline vectorExtract_4(const F32x4 &a, const F32x4 &b) { return b; }
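// Usage sketch (illustrative only, assuming the U32x4 addition operator defined earlier in this
// header): a three-lane sliding window built from left, center and right blocks, following the
// recipe in the comment above the vectorExtract functions.
static inline U32x4 example_slidingSum_U32x4(const U32x4 &leftBlock, const U32x4 &centerBlock, const U32x4 &rightBlock) {
	U32x4 previous = vectorExtract_3(leftBlock, centerBlock); // center shifted one lane to the right
	U32x4 next = vectorExtract_1(centerBlock, rightBlock);    // center shifted one lane to the left
	return previous + centerBlock + next;
}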
  1724. // Gather instructions load memory from a pointer at multiple index offsets at the same time.
1725. // The given pointers should be aligned to 4 bytes, so that the fallback solution works on machines with strict alignment requirements.
  1726. #ifdef USE_AVX2
  1727. #define GATHER_I32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  1728. #define GATHER_U32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_epi32((const int32_t*)(SOURCE), FOUR_OFFSETS, SCALE)
  1729. #define GATHER_F32_AVX2(SOURCE, FOUR_OFFSETS, SCALE) _mm_i32gather_ps((const float*)(SOURCE), FOUR_OFFSETS, SCALE)
  1730. #endif
  1731. static inline U32x4 gather(const dsr::SafePointer<uint32_t> data, const U32x4 &elementOffset) {
  1732. #ifdef USE_AVX2
  1733. return U32x4(GATHER_U32_AVX2(data.getUnsafe(), elementOffset.v, 4));
  1734. #else
  1735. dsr::UVector4D elementOffsetS = elementOffset.get();
  1736. return U32x4(
  1737. *(data + elementOffsetS.x),
  1738. *(data + elementOffsetS.y),
  1739. *(data + elementOffsetS.z),
  1740. *(data + elementOffsetS.w)
  1741. );
  1742. #endif
  1743. }
  1744. static inline I32x4 gather(const dsr::SafePointer<int32_t> data, const U32x4 &elementOffset) {
  1745. #ifdef USE_AVX2
  1746. return I32x4(GATHER_U32_AVX2(data.getUnsafe(), elementOffset.v, 4));
  1747. #else
  1748. dsr::UVector4D elementOffsetS = elementOffset.get();
  1749. return I32x4(
  1750. *(data + elementOffsetS.x),
  1751. *(data + elementOffsetS.y),
  1752. *(data + elementOffsetS.z),
  1753. *(data + elementOffsetS.w)
  1754. );
  1755. #endif
  1756. }
  1757. static inline F32x4 gather(const dsr::SafePointer<float> data, const U32x4 &elementOffset) {
  1758. #ifdef USE_AVX2
  1759. return F32x4(GATHER_F32_AVX2(data.getUnsafe(), elementOffset.v, 4));
  1760. #else
  1761. dsr::UVector4D elementOffsetS = elementOffset.get();
  1762. return F32x4(
  1763. *(data + elementOffsetS.x),
  1764. *(data + elementOffsetS.y),
  1765. *(data + elementOffsetS.z),
  1766. *(data + elementOffsetS.w)
  1767. );
  1768. #endif
  1769. }
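// Usage sketch (illustrative only): each lane of the offset vector selects one element from the
// source, which is useful for table lookups and for sampling pixels at computed coordinates.
static inline F32x4 example_tableLookup_F32x4(const dsr::SafePointer<float> table, const U32x4 &indices) {
	return gather(table, indices);
}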
  1770. #endif