simd.odin 68 KB


  1. /*
  2. Cross-platform `SIMD` support types and procedures.
  3. SIMD (Single Instruction, Multiple Data) is a CPU hardware feature that
  4. introduces special registers and instructions which operate on multiple units
  5. of data at the same time, which enables faster data processing for
  6. applications with heavy computational workloads.
  7. In Odin, SIMD is exposed via a special kind of array, called *SIMD
  8. vectors*. The type of a SIMD vector is written as `#simd [N]T`, where `N` is a
  9. power of two, and T could be any basic type (integers, floats, etc.). The
  10. documentation of this package will call *SIMD vectors* just *vectors*.
  11. SIMD vectors consist of elements, called *scalar values*, or
  12. *scalars*, each occupying a *lane* of the SIMD vector. In the type declaration,
  13. `N` specifies the amount of lanes, or values, that a vector stores.
  14. This package implements procedures for working with vectors.
  15. */
  16. package simd
  17. import "base:builtin"
  18. import "base:intrinsics"
  19. import "base:runtime"
  20. /*
  21. Check if the target platform has hardware SIMD support.
  22. This value is `true`, when the compile-time target has the hardware support for
  23. at least 128-bit (or wider) SIMD. If the compile-time target lacks the hardware support
  24. for 128-bit SIMD, this value is `false`, and all SIMD operations will likely be
  25. emulated.
  26. */
  27. HAS_HARDWARE_SIMD :: runtime.HAS_HARDWARE_SIMD
  28. /*
  29. Vector of 16 `u8` lanes (128 bits).
  30. */
  31. u8x16 :: #simd[16]u8
  32. /*
  33. Vector of 16 `i8` lanes (128 bits).
  34. */
  35. i8x16 :: #simd[16]i8
  36. /*
  37. Vector of 8 `u16` lanes (128 bits).
  38. */
  39. u16x8 :: #simd[8]u16
  40. /*
  41. Vector of 8 `i16` lanes (128 bits).
  42. */
  43. i16x8 :: #simd[8]i16
  44. /*
  45. Vector of 4 `u32` lanes (128 bits).
  46. */
  47. u32x4 :: #simd[4]u32
  48. /*
  49. Vector of 4 `i32` lanes (128 bits).
  50. */
  51. i32x4 :: #simd[4]i32
  52. /*
  53. Vector of 2 `u64` lanes (128 bits).
  54. */
  55. u64x2 :: #simd[2]u64
  56. /*
  57. Vector of 2 `i64` lanes (128 bits).
  58. */
  59. i64x2 :: #simd[2]i64
  60. /*
  61. Vector of 4 `f32` lanes (128 bits).
  62. */
  63. f32x4 :: #simd[4]f32
  64. /*
  65. Vector of 2 `f64` lanes (128 bits).
  66. */
  67. f64x2 :: #simd[2]f64
  68. /*
  69. Vector of 16 `bool` lanes (128 bits).
  70. */
  71. boolx16 :: #simd[16]bool
  72. /*
  73. Vector of 16 `b8` lanes (128 bits).
  74. */
  75. b8x16 :: #simd[16]b8
  76. /*
  77. Vector of 8 `b16` lanes (128 bits).
  78. */
  79. b16x8 :: #simd[8]b16
  80. /*
  81. Vector of 4 `b32` lanes (128 bits).
  82. */
  83. b32x4 :: #simd[4]b32
  84. /*
  85. Vector of 2 `b64` lanes (128 bits).
  86. */
  87. b64x2 :: #simd[2]b64
  88. /*
  89. Vector of 32 `u8` lanes (256 bits).
  90. */
  91. u8x32 :: #simd[32]u8
  92. /*
  93. Vector of 32 `i8` lanes (256 bits).
  94. */
  95. i8x32 :: #simd[32]i8
  96. /*
  97. Vector of 16 `u16` lanes (256 bits).
  98. */
  99. u16x16 :: #simd[16]u16
  100. /*
  101. Vector of 16 `i16` lanes (256 bits).
  102. */
  103. i16x16 :: #simd[16]i16
  104. /*
  105. Vector of 8 `u32` lanes (256 bits).
  106. */
  107. u32x8 :: #simd[8]u32
  108. /*
  109. Vector of 8 `i32` lanes (256 bits).
  110. */
  111. i32x8 :: #simd[8]i32
  112. /*
  113. Vector of 4 `u64` lanes (256 bits).
  114. */
  115. u64x4 :: #simd[4]u64
  116. /*
  117. Vector of 4 `i64` lanes (256 bits).
  118. */
  119. i64x4 :: #simd[4]i64
  120. /*
  121. Vector of 8 `f32` lanes (256 bits).
  122. */
  123. f32x8 :: #simd[8]f32
  124. /*
  125. Vector of 4 `f64` lanes (256 bits).
  126. */
  127. f64x4 :: #simd[4]f64
  128. /*
  129. Vector of 32 `bool` lanes (256 bits).
  130. */
  131. boolx32 :: #simd[32]bool
  132. /*
  133. Vector of 32 `b8` lanes (256 bits).
  134. */
  135. b8x32 :: #simd[32]b8
  136. /*
  137. Vector of 16 `b16` lanes (256 bits).
  138. */
  139. b16x16 :: #simd[16]b16
  140. /*
  141. Vector of 8 `b32` lanes (256 bits).
  142. */
  143. b32x8 :: #simd[8]b32
  144. /*
  145. Vector of 4 `b64` lanes (256 bits).
  146. */
  147. b64x4 :: #simd[4]b64
  148. /*
  149. Vector of 64 `u8` lanes (512 bits).
  150. */
  151. u8x64 :: #simd[64]u8
  152. /*
  153. Vector of 64 `i8` lanes (512 bits).
  154. */
  155. i8x64 :: #simd[64]i8
  156. /*
  157. Vector of 32 `u16` lanes (512 bits).
  158. */
  159. u16x32 :: #simd[32]u16
  160. /*
  161. Vector of 32 `i16` lanes (512 bits).
  162. */
  163. i16x32 :: #simd[32]i16
  164. /*
  165. Vector of 16 `u32` lanes (512 bits).
  166. */
  167. u32x16 :: #simd[16]u32
  168. /*
  169. Vector of 16 `i32` lanes (512 bits).
  170. */
  171. i32x16 :: #simd[16]i32
  172. /*
  173. Vector of 8 `u64` lanes (512 bits).
  174. */
  175. u64x8 :: #simd[8]u64
  176. /*
  177. Vector of 8 `i64` lanes (512 bits).
  178. */
  179. i64x8 :: #simd[8]i64
  180. /*
  181. Vector of 16 `f32` lanes (512 bits).
  182. */
  183. f32x16 :: #simd[16]f32
  184. /*
  185. Vector of 8 `f64` lanes (512 bits).
  186. */
  187. f64x8 :: #simd[8]f64
  188. /*
  189. Vector of 64 `bool` lanes (512 bits).
  190. */
  191. boolx64 :: #simd[64]bool
  192. /*
  193. Vector of 64 `b8` lanes (512 bits).
  194. */
  195. b8x64 :: #simd[64]b8
  196. /*
  197. Vector of 32 `b16` lanes (512 bits).
  198. */
  199. b16x32 :: #simd[32]b16
  200. /*
  201. Vector of 16 `b32` lanes (512 bits).
  202. */
  203. b32x16 :: #simd[16]b32
  204. /*
  205. Vector of 8 `b64` lanes (512 bits).
  206. */
  207. b64x8 :: #simd[8]b64
  208. /*
  209. Add SIMD vectors.
  210. This procedure returns a vector, where each lane holds the sum of the
  211. corresponding `a` and `b` vectors' lanes.
  212. Inputs:
  213. - `a`: An integer or a float vector.
  214. - `b`: An integer or a float vector.
  215. Returns:
  216. - A vector that is the sum of two input vectors.
  217. Operation:
  218. for i in 0 ..< len(res) {
  219. res[i] = a[i] + b[i]
  220. }
  221. return res
  222. Example:
  223. +-----+-----+-----+-----+
  224. a: | 0 | 1 | 2 | 3 |
  225. +-----+-----+-----+-----+
  226. +-----+-----+-----+-----+
  227. b: | 0 | 1 | 2 | -1 |
  228. +-----+-----+-----+-----+
  229. res:
  230. +-----+-----+-----+-----+
  231. | 0 | 2 | 4 | 2 |
  232. +-----+-----+-----+-----+
  233. */
  234. add :: intrinsics.simd_add
  235. /*
  236. Subtract SIMD vectors.
  237. This procedure returns a vector, where each lane holds the difference between
  238. the corresponding lanes of the vectors `a` and `b`. The lanes from the vector
  239. `b` are subtracted from the corresponding lanes of the vector `a`.
  240. Inputs:
  241. - `a`: An integer or a float vector to subtract from.
  242. - `b`: An integer or a float vector.
  243. Returns:
  244. - A vector that is the difference of two vectors, `a` - `b`.
  245. Operation:
  246. for i in 0 ..< len(res) {
  247. res[i] = a[i] - b[i]
  248. }
  249. return res
  250. Example:
  251. +-----+-----+-----+-----+
  252. a: | 2 | 2 | 2 | 2 |
  253. +-----+-----+-----+-----+
  254. +-----+-----+-----+-----+
  255. b: | 0 | 1 | 2 | 3 |
  256. +-----+-----+-----+-----+
  257. res:
  258. +-----+-----+-----+-----+
  259. | 2 | 1 | 0 | -1 |
  260. +-----+-----+-----+-----+
  261. */
  262. sub :: intrinsics.simd_sub
  263. /*
  264. Multiply (component-wise) SIMD vectors.
  265. This procedure returns a vector, where each lane holds the product of the
  266. corresponding lanes of the vectors `a` and `b`.
  267. Inputs:
  268. - `a`: An integer or a float vector.
  269. - `b`: An integer or a float vector.
  270. Returns:
  271. - A vector that is the product of two vectors.
  272. Operation:
  273. for i in 0 ..< len(res) {
  274. res[i] = a[i] * b[i]
  275. }
  276. return res
  277. Example:
  278. +-----+-----+-----+-----+
  279. a: | 2 | 2 | 2 | 2 |
  280. +-----+-----+-----+-----+
  281. +-----+-----+-----+-----+
  282. b: | 0 | -1 | 2 | -3 |
  283. +-----+-----+-----+-----+
  284. res:
  285. +-----+-----+-----+-----+
  286. | 0 | -2 | 4 | -6 |
  287. +-----+-----+-----+-----+
  288. */
  289. mul :: intrinsics.simd_mul
  290. /*
  291. Divide SIMD vectors.
  292. This procedure returns a vector, where each lane holds the quotient (result
  293. of division) between the corresponding lanes of the vectors `a` and `b`. Each
  294. lane of the vector `a` is divided by the corresponding lane of the vector `b`.
  295. This operation performs a standard floating-point division for each lane.
  296. Inputs:
  297. - `a`: A float vector.
  298. - `b`: A float vector to divide by.
  299. Returns:
  300. - A vector that is the quotient of two vectors, `a` / `b`.
  301. Operation:
  302. for i in 0 ..< len(res) {
  303. res[i] = a[i] / b[i]
  304. }
  305. return res
  306. Example:
  307. +-----+-----+-----+-----+
  308. a: | 2 | 2 | 2 | 2 |
  309. +-----+-----+-----+-----+
  310. +-----+-----+-----+-----+
  311. b: | 0 | -1 | 2 | -3 |
  312. +-----+-----+-----+-----+
  313. res:
  314. +-----+-----+-----+------+
  315. | +∞ | -2 | 1 | -2/3 |
  316. +-----+-----+-----+------+
  317. */
  318. div :: intrinsics.simd_div
  319. /*
  320. Shift left lanes of a vector.
  321. This procedure returns a vector, such that each lane holds the result of a
  322. shift-left (aka shift-up) operation of the corresponding lane from vector `a` by the shift
  323. amount from the corresponding lane of the vector `b`.
  324. If the shift amount is greater than or equal to the bit-width of a lane, the result is `0`
  325. in the corresponding positions of the result.
  326. Inputs:
  327. - `a`: An integer vector of values to shift.
  328. - `b`: An unsigned integer vector of the shift amounts.
  329. Result:
  330. - A vector, where each lane is the lane from `a` shifted left by the amount
  331. specified in the corresponding lane of the vector `b`.
  332. Operation:
  333. for i in 0 ..< len(res) {
  334. if b[i] < 8*size_of(a[i]) {
  335. res[i] = a[i] << b[i]
  336. } else {
  337. res[i] = 0
  338. }
  339. }
  340. return res
  341. Example:
  342. // An example for a 4-lane 8-bit signed integer vector `a`.
  343. +-------+-------+-------+-------+
  344. a: | 0x11 | 0x55 | 0x03 | 0xff |
  345. +-------+-------+-------+-------+
  346. +-------+-------+-------+-------+
  347. b: | 2 | 1 | 33 | 1 |
  348. +-------+-------+-------+-------+
  349. res:
  350. +-------+-------+-------+--------+
  351. | 0x44 | 0xaa | 0 | 0xfe |
  352. +-------+-------+-------+--------+
  353. */
  354. shl :: intrinsics.simd_shl
  355. /*
  356. Shift right lanes of a vector.
  357. This procedure returns a vector, such that each lane holds the result of a
  358. shift-right (aka shift-down) operation, of lane from the vector `a` by the shift
  359. amount from the corresponding lane of the vector `b`.
  360. If the shift amount is greater than or equal to the bit-width of a lane, the result is `0`
  361. in the corresponding positions of the result.
  362. If the first vector is a vector of signed integers, the arithmetic shift
  363. operation is performed. Otherwise, if the first vector is a vector of unsigned
  364. integers, a logical shift is performed.
  365. Inputs:
  366. - `a`: An integer vector of values to shift.
  367. - `b`: An unsigned integer vector of the shift amounts.
  368. Result:
  369. - A vector, where each lane is the lane from `a` shifted right by the amount
  370. specified in the corresponding lane of the vector `b`.
  371. Operation:
  372. for i in 0 ..< len(res) {
  373. if b[i] < 8*size_of(a[i]) {
  374. res[i] = a[i] >> b[i]
  375. } else {
  376. res[i] = 0
  377. }
  378. }
  379. return res
  380. Example:
  381. // An example for a 4-lane 8-bit signed integer vector `a`.
  382. +-------+-------+-------+-------+
  383. a: | 0x11 | 0x55 | 0x03 | 0xff |
  384. +-------+-------+-------+-------+
  385. +-------+-------+-------+-------+
  386. b: | 2 | 1 | 33 | 1 |
  387. +-------+-------+-------+-------+
  388. res:
  389. +-------+-------+-------+--------+
  390. | 0x04 | 0x2a | 0 | 0xff |
  391. +-------+-------+-------+--------+
  392. */
  393. shr :: intrinsics.simd_shr
  394. /*
  395. Shift left lanes of a vector (masked).
  396. This procedure returns a vector, such that each lane holds the result of a
  397. shift-left (aka shift-up) operation, of lane from the vector `a` by the shift
  398. amount from the corresponding lane of the vector `b`.
  399. The shift amount is wrapped (masked) to the bit-width of the lane.
  400. Inputs:
  401. - `a`: An integer vector of values to shift.
  402. - `b`: An unsigned integer vector of the shift amounts.
  403. Result:
  404. - A vector, where each lane is the lane from `a` shifted left by the amount
  405. specified in the corresponding lane of the vector `b`.
  406. Operation:
  407. for i in 0 ..< len(res) {
  408. mask := 8*size_of(a[i]) - 1
  409. res[i] = a[i] << (b[i] & mask)
  410. }
  411. return res
  412. Example:
  413. // An example for a 4-lane vector `a` of 8-bit signed integers.
  414. +-------+-------+-------+-------+
  415. a: | 0x11 | 0x55 | 0x03 | 0xff |
  416. +-------+-------+-------+-------+
  417. +-------+-------+-------+-------+
  418. b: | 2 | 1 | 33 | 1 |
  419. +-------+-------+-------+-------+
  420. res:
  421. +-------+-------+-------+--------+
  422. | 0x44 | 0xaa | 0x06 | 0xfe |
  423. +-------+-------+-------+--------+
  424. */
  425. shl_masked :: intrinsics.simd_shl_masked
  426. /*
  427. Shift right lanes of a vector (masked).
  428. This procedure returns a vector, such that each lane holds the result of a
  429. shift-right (aka shift-down) operation, of lane from the vector `a` by the shift
  430. amount from the corresponding lane of the vector `b`.
  431. The shift amount is wrapped (masked) to the bit-width of the lane.
  432. If the first vector is a vector of signed integers, the arithmetic shift
  433. operation is performed. Otherwise, if the first vector is a vector of unsigned
  434. integers, a logical shift is performed.
  435. Inputs:
  436. - `a`: An integer vector of values to shift.
  437. - `b`: An unsigned integer vector of the shift amounts.
  438. Result:
  439. - A vector, where each lane is the lane from `a` shifted right by the amount
  440. specified in the corresponding lane of the vector `b`.
  441. Operation:
  442. for i in 0 ..< len(res) {
  443. mask := 8*size_of(a[i]) - 1
  444. res[i] = a[i] >> (b[i] & mask)
  445. }
  446. return res
  447. Example:
  448. // An example for a 4-lane vector `a` of 8-bit signed integers.
  449. +-------+-------+-------+-------+
  450. a: | 0x11 | 0x55 | 0x03 | 0xff |
  451. +-------+-------+-------+-------+
  452. +-------+-------+-------+-------+
  453. b: | 2 | 1 | 33 | 1 |
  454. +-------+-------+-------+-------+
  455. res:
  456. +-------+-------+-------+--------+
  457. | 0x04 | 0x2a | 0x01 | 0xff |
  458. +-------+-------+-------+--------+
  459. */
  460. shr_masked :: intrinsics.simd_shr_masked
  461. /*
  462. Saturated addition of SIMD vectors.
  463. The *saturated sum* is just like a normal sum, except the treatment of the
  464. result upon overflow or underflow is different. In saturated operations, the
  465. result is not wrapped to the bit-width of the lane, and instead is kept clamped
  466. between the minimum and the maximum values of the lane type.
  467. This procedure returns a vector where each lane is the saturated sum of the
  468. corresponding lanes of vectors `a` and `b`.
  469. Inputs:
  470. - `a`: An integer vector.
  471. - `b`: An integer vector.
  472. Returns:
  473. - The saturated sum of the two vectors.
  474. Operation:
  475. for i in 0 ..< len(res) {
  476. switch {
  477. case b[i] >= max(type_of(a[i])) - a[i]: // (overflow of a[i])
  478. res[i] = max(type_of(a[i]))
  479. case b[i] <= min(type_of(a[i])) - a[i]: // (underflow of a[i])
  480. res[i] = min(type_of(a[i]))
  481. case:
  482. res[i] = a[i] + b[i]
  483. }
  484. }
  485. return res
  486. Example:
  487. // An example for a 4-lane vector `a` of 8-bit signed integers.
  488. +-----+-----+-----+-----+
  489. a: | 0 | 255 | 2 | 3 |
  490. +-----+-----+-----+-----+
  491. +-----+-----+-----+-----+
  492. b: | 1 | 3 | 2 | -1 |
  493. +-----+-----+-----+-----+
  494. res:
  495. +-----+-----+-----+-----+
  496. | 1 | 255 | 4 | 2 |
  497. +-----+-----+-----+-----+
  498. */
  499. saturating_add :: intrinsics.simd_saturating_add
  500. /*
  501. Saturated subtraction of SIMD vectors.
  502. The *saturated difference* is just like a normal difference, except the treatment of the
  503. result upon overflow or underflow is different. In saturated operations, the
  504. result is not wrapped to the bit-width of the lane, and instead is kept clamped
  505. between the minimum and the maximum values of the lane type.
  506. This procedure returns a vector where each lane is the saturated difference of
  507. the corresponding lanes of vectors `a` and `b`.
  508. Inputs:
  509. - `a`: An integer vector to subtract from.
  510. - `b`: An integer vector.
  511. Returns:
  512. - The saturated difference of the two vectors.
  513. Operation:
  514. for i in 0 ..< len(res) {
  515. switch {
  516. case b[i] <= a[i] - max(type_of(a[i])): // (a[i] - b[i] overflows)
  517. res[i] = max(type_of(a[i]))
  518. case b[i] >= a[i] - min(type_of(a[i])): // (a[i] - b[i] underflows)
  519. res[i] = min(type_of(a[i]))
  520. case:
  521. res[i] = a[i] - b[i]
  522. }
  523. }
  524. return res
  525. Example:
  526. // An example for a 4-lane vector `a` of 8-bit signed integers.
  527. +-----+-----+-----+-----+
  528. a: | 0 | 255 | 2 | 3 |
  529. +-----+-----+-----+-----+
  530. +-----+-----+-----+-----+
  531. b: | 3 | 3 | 2 | -1 |
  532. +-----+-----+-----+-----+
  533. res:
  534. +-----+-----+-----+-----+
  535. | 0 | 252 | 0 | 4 |
  536. +-----+-----+-----+-----+
  537. */
  538. saturating_sub :: intrinsics.simd_saturating_sub
  539. /*
  540. Bitwise AND of vectors.
  541. This procedure returns a vector, such that each lane has the result of a bitwise
  542. AND operation between the corresponding lanes of the vectors `a` and `b`.
  543. Inputs:
  544. - `a`: An integer or a boolean vector.
  545. - `b`: An integer or a boolean vector.
  546. Returns:
  547. - A vector that is the result of the bitwise AND operation between two vectors.
  548. Operation:
  549. for i in 0 ..< len(res) {
  550. res[i] = a[i] & b[i]
  551. }
  552. return res
  553. Example:
  554. +------+------+------+------+
  555. a: | 0x11 | 0x33 | 0x55 | 0xaa |
  556. +------+------+------+------+
  557. +------+------+------+------+
  558. b: | 0xff | 0xf0 | 0x0f | 0x00 |
  559. +------+------+------+------+
  560. res:
  561. +------+------+------+------+
  562. | 0x11 | 0x30 | 0x05 | 0x00 |
  563. +------+------+------+------+
  564. */
  565. bit_and :: intrinsics.simd_bit_and
  566. /*
  567. Bitwise OR of vectors.
  568. This procedure returns a vector, such that each lane has the result of a bitwise
  569. OR operation between the corresponding lanes of the vectors `a` and `b`.
  570. Inputs:
  571. - `a`: An integer or a boolean vector.
  572. - `b`: An integer or a boolean vector.
  573. Returns:
  574. - A vector that is the result of the bitwise OR operation between two vectors.
  575. Operation:
  576. for i in 0 ..< len(res) {
  577. res[i] = a[i] | b[i]
  578. }
  579. return res
  580. Example:
  581. +------+------+------+------+
  582. a: | 0x11 | 0x33 | 0x55 | 0xaa |
  583. +------+------+------+------+
  584. +------+------+------+------+
  585. b: | 0xff | 0xf0 | 0x0f | 0x00 |
  586. +------+------+------+------+
  587. res:
  588. +------+------+------+------+
  589. | 0xff | 0xf3 | 0x5f | 0xaa |
  590. +------+------+------+------+
  591. */
  592. bit_or :: intrinsics.simd_bit_or
  593. /*
  594. Bitwise XOR of vectors.
  595. This procedure returns a vector, such that each lane has the result of a bitwise
  596. XOR operation between the corresponding lanes of the vectors `a` and `b`.
  597. Inputs:
  598. - `a`: An integer or a boolean vector.
  599. - `b`: An integer or a boolean vector.
  600. Returns:
  601. - A vector that is the result of the bitwise XOR operation between two vectors.
  602. Operation:
  603. for i in 0 ..< len(res) {
  604. res[i] = a[i] ~ b[i]
  605. }
  606. return res
  607. Example:
  608. +------+------+------+------+
  609. a: | 0x11 | 0x33 | 0x55 | 0xaa |
  610. +------+------+------+------+
  611. +------+------+------+------+
  612. b: | 0xff | 0xf0 | 0x0f | 0x00 |
  613. +------+------+------+------+
  614. res:
  615. +------+------+------+------+
  616. | 0xee | 0xc3 | 0x5a | 0xaa |
  617. +------+------+------+------+
  618. */
  619. bit_xor :: intrinsics.simd_bit_xor
  620. /*
  621. Bitwise AND NOT of vectors.
  622. This procedure returns a vector, such that each lane has the result of a bitwise
  623. AND NOT operation between the corresponding lanes of the vectors `a` and `b`.
  624. Inputs:
  625. - `a`: An integer or a boolean vector.
  626. - `b`: An integer or a boolean vector.
  627. Returns:
  628. - A vector that is the result of the bitwise AND NOT operation between two vectors.
  629. Operation:
  630. for i in 0 ..< len(res) {
  631. res[i] = a[i] &~ b[i]
  632. }
  633. return res
  634. Example:
  635. +------+------+------+------+
  636. a: | 0x11 | 0x33 | 0x55 | 0xaa |
  637. +------+------+------+------+
  638. +------+------+------+------+
  639. b: | 0xff | 0xf0 | 0x0f | 0x00 |
  640. +------+------+------+------+
  641. res:
  642. +------+------+------+------+
  643. | 0x00 | 0x03 | 0x50 | 0xaa |
  644. +------+------+------+------+
  645. */
  646. bit_and_not :: intrinsics.simd_bit_and_not
  647. /*
  648. Negation of a SIMD vector.
  649. This procedure returns a vector where each lane is the negation of the
  650. corresponding lane in the vector `a`.
  651. Inputs:
  652. - `a`: An integer or a float vector to negate.
  653. Returns:
  654. - The negated version of the vector `a`.
  655. Operation:
  656. for i in 0 ..< len(res) {
  657. res[i] = -a[i]
  658. }
  659. return res
  660. Example:
  661. +------+------+------+------+
  662. a: | 0 | 1 | 2 | 3 |
  663. +------+------+------+------+
  664. res:
  665. +------+------+------+------+
  666. | 0 | -1 | -2 | -3 |
  667. +------+------+------+------+
  668. */
  669. neg :: intrinsics.simd_neg
  670. /*
  671. Absolute value of a SIMD vector.
  672. This procedure returns a vector where each lane has the absolute value of the
  673. corresponding lane in the vector `a`.
  674. Inputs:
  675. - `a`: An integer or a float vector to take the absolute value of.
  676. Returns:
  677. - The absolute value of a vector.
  678. Operation:
  679. for i in 0 ..< len(res) {
  680. switch {
  681. case a[i] < 0: res[i] = -a[i]
  682. case a[i] > 0: res[i] = a[i]
  683. case a[i] == 0: res[i] = 0
  684. }
  685. }
  686. return res
  687. Example:
  688. +------+------+------+------+
  689. a: | 0 | -1 | 2 | -3 |
  690. +------+------+------+------+
  691. res:
  692. +------+------+------+------+
  693. | 0 | 1 | 2 | 3 |
  694. +------+------+------+------+
  695. */
  696. abs :: intrinsics.simd_abs
  697. /*
  698. Minimum of each lane of vectors.
  699. This procedure returns a vector, such that each lane has the minimum value
  700. between the corresponding lanes in vectors `a` and `b`.
  701. Inputs:
  702. - `a`: An integer or a float vector.
  703. - `b`: An integer or a float vector.
  704. Returns:
  705. - A vector containing the minimum values from corresponding lanes of `a` and `b`.
  706. Operation:
  707. for i in 0 ..< len(res) {
  708. if a[i] < b[i] {
  709. res[i] = a[i]
  710. } else {
  711. res[i] = b[i]
  712. }
  713. }
  714. return res
  715. Example:
  716. +-----+-----+-----+-----+
  717. a: | 0 | 1 | 2 | 3 |
  718. +-----+-----+-----+-----+
  719. +-----+-----+-----+-----+
  720. b: | 0 | 2 | 1 | -1 |
  721. +-----+-----+-----+-----+
  722. res:
  723. +-----+-----+-----+-----+
  724. | 0 | 1 | 1 | -1 |
  725. +-----+-----+-----+-----+
  726. */
  727. min :: intrinsics.simd_min
  728. /*
  729. Maximum of each lane of vectors.
  730. This procedure returns a vector, such that each lane has the maximum value
  731. between the corresponding lanes in vectors `a` and `b`.
  732. Inputs:
  733. - `a`: An integer or a float vector.
  734. - `b`: An integer or a float vector.
  735. Returns:
  736. - A vector containing the maximum values from corresponding lanes of `a` and `b`.
  737. Operation:
  738. for i in 0 ..< len(res) {
  739. if a[i] > b[i] {
  740. res[i] = a[i]
  741. } else {
  742. res[i] = b[i]
  743. }
  744. }
  745. return res
  746. Example:
  747. +-----+-----+-----+-----+
  748. a: | 0 | 1 | 2 | 3 |
  749. +-----+-----+-----+-----+
  750. +-----+-----+-----+-----+
  751. b: | 0 | 2 | 1 | -1 |
  752. +-----+-----+-----+-----+
  753. res:
  754. +-----+-----+-----+-----+
  755. | 0 | 2 | 2 | 3 |
  756. +-----+-----+-----+-----+
  757. */
  758. max :: intrinsics.simd_max
  759. /*
  760. Clamp lanes of vector.
  761. This procedure returns a vector, where each lane is the result of the
  762. clamping of the lane from the vector `v` between the values in the corresponding
  763. lanes of vectors `min` and `max`.
  764. Inputs:
  765. - `v`: An integer or a float vector with values to be clamped.
  766. - `min`: An integer or a float vector with minimum bounds.
  767. - `max`: An integer or a float vector with maximum bounds.
  768. Returns:
  769. - A vector containing clamped values in each lane.
  770. Operation:
  771. for i in 0 ..< len(res) {
  772. val := v[i]
  773. switch {
  774. case val < min[i]: val = min[i]
  775. case val > max[i]: val = max[i]
  776. }
  777. res[i] = val
  778. }
  779. return res
  780. Example:
  781. +-------+-------+-------+-------+
  782. v: | -1 | 0.3 | 1.2 | 1 |
  783. +-------+-------+-------+-------+
  784. +-------+-------+-------+-------+
  785. min: | 0 | 0 | 0 | 0 |
  786. +-------+-------+-------+-------+
  787. +-------+-------+-------+-------+
  788. max: | 1 | 1 | 1 | 1 |
  789. +-------+-------+-------+-------+
  790. res:
  791. +-------+-------+-------+-------+
  792. | 0 | 0.3 | 1 | 1 |
  793. +-------+-------+-------+-------+
  794. */
  795. clamp :: intrinsics.simd_clamp
  796. /*
  797. Check if lanes of vectors are equal.
  798. This procedure checks each pair of lanes from vectors `a` and `b` for whether
  799. they are equal, and if they are, the corresponding lane of the result vector
  800. will have a value with all bits set (`0xff..ff`). Otherwise the lane of the
  801. result vector will have the value `0`.
  802. Inputs:
  803. - `a`: An integer, a float or a boolean vector.
  804. - `b`: An integer, a float or a boolean vector.
  805. Returns:
  806. - A vector of unsigned integers of the same size as the input vector's lanes,
  807. containing the comparison results for each lane.
  808. Operation:
  809. for i in 0 ..< len(res) {
  810. if a[i] == b[i] {
  811. res[i] = max(T)
  812. } else {
  813. res[i] = 0
  814. }
  815. }
  816. return res
  817. Example:
  818. +-------+-------+-------+-------+
  819. a: | 0 | 1 | 2 | 3 |
  820. +-------+-------+-------+-------+
  821. +-------+-------+-------+-------+
  822. b: | 0 | 2 | 2 | 2 |
  823. +-------+-------+-------+-------+
  824. res:
  825. +-------+-------+-------+-------+
  826. | 0xff | 0x00 | 0xff | 0x00 |
  827. +-------+-------+-------+-------+
  828. */
  829. lanes_eq :: intrinsics.simd_lanes_eq
  830. /*
  831. Check if lanes of vectors are not equal.
  832. This procedure checks each pair of lanes from vectors `a` and `b` for whether
  833. they are not equal, and if so, the corresponding lane of the result
  834. vector will have a value with all bits set (`0xff..ff`). Otherwise the lane of
  835. the result vector will have the value `0`.
  836. Inputs:
  837. - `a`: An integer, a float or a boolean vector.
  838. - `b`: An integer, a float or a boolean vector.
  839. Returns:
  840. - A vector of unsigned integers of the same size as the input vector's lanes,
  841. containing the comparison results for each lane.
  842. Operation:
  843. for i in 0 ..< len(res) {
  844. if a[i] != b[i] {
  845. res[i] = unsigned(-1)
  846. } else {
  847. res[i] = 0
  848. }
  849. }
  850. return res
  851. Example:
  852. +-------+-------+-------+-------+
  853. a: | 0 | 1 | 2 | 3 |
  854. +-------+-------+-------+-------+
  855. +-------+-------+-------+-------+
  856. b: | 0 | 2 | 2 | 2 |
  857. +-------+-------+-------+-------+
  858. res:
  859. +-------+-------+-------+-------+
  860. | 0x00 | 0xff | 0x00 | 0xff |
  861. +-------+-------+-------+-------+
  862. */
  863. lanes_ne :: intrinsics.simd_lanes_ne
  864. /*
  865. Check if lanes of a vector are less than another.
  866. This procedure checks each pair of lanes from vectors `a` and `b` for whether
  867. the lane of `a` is less than the lane of `b`, and if so, the corresponding lane
  868. of the result vector will have a value with all bits set (`0xff..ff`). Otherwise
  869. the lane of the result vector will have the value `0`.
  870. Inputs:
  871. - `a`: An integer or a float vector.
  872. - `b`: An integer or a float vector.
  873. Returns:
  874. - A vector of unsigned integers of the same size as the input vector's lanes,
  875. containing the comparison results for each lane.
  876. Operation:
  877. for i in 0 ..< len(res) {
  878. if a[i] < b[i] {
  879. res[i] = unsigned(-1)
  880. } else {
  881. res[i] = 0
  882. }
  883. }
  884. return res
  885. Example:
  886. +-------+-------+-------+-------+
  887. a: | 0 | 1 | 2 | 3 |
  888. +-------+-------+-------+-------+
  889. +-------+-------+-------+-------+
  890. b: | 0 | 2 | 2 | 2 |
  891. +-------+-------+-------+-------+
  892. res:
  893. +-------+-------+-------+-------+
  894. | 0x00 | 0xff | 0x00 | 0x00 |
  895. +-------+-------+-------+-------+
  896. */
  897. lanes_lt :: intrinsics.simd_lanes_lt
  898. /*
  899. Check if lanes of a vector are less than or equal to the lanes of another
  900. SIMD vector.
  901. This procedure checks each pair of lanes from vectors `a` and `b` for whether the
  902. lane of `a` is less than or equal to the lane of `b`, and if so, the
  903. corresponding lane of the result vector will have a value with all bits set
  904. (`0xff..ff`). Otherwise the lane of the result vector will have the value `0`.
  905. Inputs:
  906. - `a`: An integer or a float vector.
  907. - `b`: An integer or a float vector.
  908. Returns:
  909. - A vector of unsigned integers of the same size as the input vector's lanes,
  910. containing the comparison results for each lane.
  911. Operation:
  912. for i in 0 ..< len(res) {
  913. if a[i] <= b[i] {
  914. res[i] = unsigned(-1)
  915. } else {
  916. res[i] = 0
  917. }
  918. }
  919. return res
  920. Example:
  921. +-------+-------+-------+-------+
  922. a: | 0 | 1 | 2 | 3 |
  923. +-------+-------+-------+-------+
  924. +-------+-------+-------+-------+
  925. b: | 0 | 2 | 2 | 2 |
  926. +-------+-------+-------+-------+
  927. res:
  928. +-------+-------+-------+-------+
  929. | 0xff | 0xff | 0xff | 0x00 |
  930. +-------+-------+-------+-------+
  931. */
  932. lanes_le :: intrinsics.simd_lanes_le
  933. /*
  934. Check if lanes of a vector are greater than the lanes of another SIMD
  935. vector.
  936. This procedure checks each pair of lanes from vectors `a` and `b` for whether the
  937. lane of `a` is greater than the lane of `b`, and if so, the corresponding
  938. lane of the result vector will have a value with all bits set (`0xff..ff`).
  939. Otherwise the lane of the result vector will have the value `0`.
  940. Inputs:
  941. - `a`: An integer or a float vector.
  942. - `b`: An integer or a float vector.
  943. Returns:
  944. - A vector of unsigned integers of the same size as the input vector's lanes,
  945. containing the comparison results for each lane.
  946. Operation:
  947. for i in 0 ..< len(res) {
  948. if a[i] > b[i] {
  949. res[i] = unsigned(-1)
  950. } else {
  951. res[i] = 0
  952. }
  953. }
  954. return res
  955. Example:
  956. +-------+-------+-------+-------+
  957. a: | 0 | 1 | 2 | 3 |
  958. +-------+-------+-------+-------+
  959. +-------+-------+-------+-------+
  960. b: | 0 | 2 | 2 | 2 |
  961. +-------+-------+-------+-------+
  962. res:
  963. +-------+-------+-------+-------+
  964. | 0x00 | 0x00 | 0x00 | 0xff |
  965. +-------+-------+-------+-------+
  966. */
  967. lanes_gt :: intrinsics.simd_lanes_gt
  968. /*
  969. Check if lanes of a vector are greater than or equal to the lanes of another
  970. vector.
  971. This procedure checks each pair of lanes from vectors `a` and `b` for whether the
  972. lane of `a` is greater than or equal to the lane of `b`, and if so, the
  973. corresponding lane of the result vector will have a value with all bits set
  974. (`0xff..ff`). Otherwise the lane of the result vector will have the value `0`.
  975. Inputs:
  976. - `a`: An integer or a float vector.
  977. - `b`: An integer or a float vector.
  978. Returns:
  979. - A vector of unsigned integers, each of the same size as the input vector's lanes,
  980. containing the comparison results for each lane.
  981. Operation:
  982. for i in 0 ..< len(res) {
  983. if a[i] >= b[i] {
  984. res[i] = unsigned(-1)
  985. } else {
  986. res[i] = 0
  987. }
  988. }
  989. return res
  990. Example:
  991. +-------+-------+-------+-------+
  992. a: | 0 | 1 | 2 | 3 |
  993. +-------+-------+-------+-------+
  994. +-------+-------+-------+-------+
  995. b: | 0 | 2 | 2 | 2 |
  996. +-------+-------+-------+-------+
  997. res:
  998. +-------+-------+-------+-------+
  999. | 0xff | 0x00 | 0xff | 0xff |
  1000. +-------+-------+-------+-------+
  1001. */
  1002. lanes_ge :: intrinsics.simd_lanes_ge
  1003. /*
  1004. Perform a gather load into a vector.
  1005. A *gather* operation is a memory load operation that loads values from a vector
  1006. of addresses into a single value vector. This can be used to achieve the
  1007. following results:
  1008. - Accessing every N'th element of an array (strided access).
  1009. - Access of elements according to some computed offsets (indexed access).
  1010. - Access of elements in a different order (shuffling access).
  1011. It can be used alongside other SIMD procedures, which compute the offsets
  1012. for the `ptr` and `mask` parameters.
  1013. Inputs:
  1014. - `ptr`: A vector of memory locations. Each pointer points to a single value,
  1015. of a SIMD vector's lane type that will be loaded into the vector. A pointer
  1016. in this vector can be `nil` or any other invalid value, if the corresponding
  1017. value in the `mask` parameter is zero.
  1018. - `val`: A vector of values that will be used at corresponding positions
  1019. of the result vector, if the corresponding memory location has been
  1020. masked out.
  1021. - `mask`: A vector of booleans or unsigned integers that determines which memory
  1022. locations to read from. If the value at an index has the value true
  1023. (lowest bit set), the value at that index will be loaded into the result
  1024. vector from the corresponding memory location in the `ptr` vector. Otherwise
  1025. the value will be loaded from the `val` vector.
  1026. Returns:
  1027. - A vector in which the lanes whose mask value is set are
  1028. loaded through the pointer vector `ptr`, and the remaining (masked-out) lanes are taken
  1029. from the value vector `val`.
  1030. Operation:
  1031. for i in 0 ..< len(res) {
  1032. if mask[i]&1 == 1 {
  1033. res[i] = ptr[i]^
  1034. } else {
  1035. res[i] = val[i]
  1036. }
  1037. }
  1038. return res
  1039. Example:
  1040. // Example below loads 2 lanes of values from two float arrays, `v1` and
  1041. // `v2`. From each of these arrays we're loading the second value, into the first
  1042. // and the third position of the result vector.
  1043. // Therefore the `ptrs` argument is initialized such that the first and the third
  1044. // value are the addresses of the values that we want to load into the result
  1045. // vector, and we'll fill in `nil` for the rest of them. To prevent CPU from
  1046. // dereferencing those `nil` addresses we provide the mask that only allows us
  1047. // to load valid positions of the `ptrs` vector, and the vector of defaults which
  1048. // will have `127` in each position as the default value.
  1049. import "core:fmt"
  1050. import "core:simd"
  1051. simd_gather_example :: proc() {
  1052. v1 := [4] f32 {1, 2, 3, 4};
  1053. v2 := [4] f32 {9, 10,11,12};
  1054. ptrs := #simd [4]rawptr { &v1[1], nil, &v2[1], nil }
  1055. mask := #simd [4]bool { true, false, true, false }
  1056. defaults := #simd [4]f32 { 0x7f, 0x7f, 0x7f, 0x7f }
  1057. res := simd.gather(ptrs, defaults, mask)
  1058. fmt.println(res)
  1059. }
  1060. Output:
  1061. <2, 127, 10, 127>
  1062. The first and the third lanes came from the `ptrs` vector, and the other
  1063. 2 lanes came from the default vector. The graphic below shows how the values of
  1064. the result are decided based on the mask:
  1065. +-------------------------------+
  1066. mask: | 1 | 0 | 1 | 0 |
  1067. +-------------------------------+
  1068. | | | `----------------------------.
  1069. | | | |
  1070. | `---- | ------------------------. |
  1071. v v v v
  1072. +-------------------------------+ +-------------------+
  1073. ptrs: | &m0 | nil | &m2 | nil | vals: | d0 | d1 | d2 | d3 |
  1074. +-------------------------------+ +-------------------+
  1075. | | | |
  1076. | .--- | -------------------------' |
  1077. | | | ,-------------------------'
  1078. v v v v
  1079. +-------------------------------+
  1080. result: | m0 | d1 | m2 | d3 |
  1081. +-------------------------------+
  1082. */
  1083. gather :: intrinsics.simd_gather
  1084. /*
  1085. Perform a scatter store from a vector.
  1086. A *scatter* operation is a memory store operation that stores values from a
  1087. vector into multiple memory locations. This operation is effectively the
  1088. opposite of the *gather* operation.
  1089. Inputs:
  1090. - `ptr`: A vector of memory locations. Each location whose mask value is set will be written
  1091. to with the value from the corresponding lane of the `val` vector. Pointers in this vector can be `nil`
  1092. or any other invalid value if the corresponding value in the `mask`
  1093. parameter is zero.
  1094. - `val`: A vector of values to write to the memory locations.
  1095. - `mask`: A vector of booleans or unsigned integers that decides which lanes
  1096. get written to memory. If the value of the mask is `true` (the lowest bit
  1097. set), the corresponding lane is written into memory. Otherwise it's not
  1098. written into memory.
  1099. Operation:
  1100. for i in 0 ..< len(ptr) {
  1101. if mask[i]&1 == 1 {
  1102. ptr[i]^ = val[i]
  1103. }
  1104. }
  1105. Example:
  1106. // Example below writes value `127` to the second element of two different
  1107. // arrays. The addresses of store destinations are written to the first and the
  1108. // third lane of the `ptrs` vector, and the `mask` is set accordingly.
  1109. import "core:fmt"
  1110. import "core:simd"
  1111. simd_scatter_example :: proc() {
  1112. v1 := [4] f32 {1, 2, 3, 4};
  1113. v2 := [4] f32 {5, 6, 7, 8};
  1114. ptrs := #simd [4]rawptr { &v1[1], nil, &v2[1], nil }
  1115. mask := #simd [4]bool { true, false, true, false }
  1116. vals := #simd [4]f32 { 0x7f, 0x7f, 0x7f, 0x7f }
  1117. simd.scatter(ptrs, vals, mask)
  1118. fmt.println(v1)
  1119. fmt.println(v2)
  1120. }
  1121. Output:
  1122. [1, 127, 3, 4]
  1123. [5, 127, 7, 8]
  1124. The graphic below shows how the data gets written into memory.
  1125. +-------------------+
  1126. mask: | 1 | 0 | 1 | 0 |
  1127. +-------------------+
  1128. | | | |
  1129. v X v X
  1130. +-------------------+
  1131. vals: | d0 | d1 | d2 | d3 |
  1132. +-------------------+
  1133. | \
  1134. v v
  1135. +-----------------------+
  1136. ptrs: | &m0 | nil | &m2 | nil |
  1137. +-----------------------+
  1138. */
  1139. scatter :: intrinsics.simd_scatter
  1140. /*
  1141. Perform a masked load into the vector.
  1142. This procedure performs a masked load from memory, into the vector. The `ptr`
  1143. argument specifies the base address from which the values of the vector
  1144. will be loaded. The mask selects the source for the result vector's lanes. If
  1145. the mask for the corresponding lane has the value `true` (lowest bit set), the
  1146. result lane is loaded from memory. Otherwise the result lane is loaded from the
  1147. corresponding lane of the `val` vector.
  1148. Inputs:
  1149. - `ptr`: The address of the vector values to load. Masked-off values are not
  1150. accessed.
  1151. - `val`: The vector of values that will be loaded into the masked-off lanes of the
  1152. result vector.
  1153. - `mask`: The mask that selects where to load the values from.
  1154. Returns:
  1155. - The loaded vector. The lanes for which the mask was set are loaded from
  1156. memory, and the other lanes are loaded from the `val` vector.
  1157. Operation:
  1158. for i in 0 ..< len(res) {
  1159. if mask[i]&1 == 1 {
  1160. res[i] = ptr[i]
  1161. } else {
  1162. res[i] = val[i]
  1163. }
  1164. }
  1165. return res
  1166. Example:
  1167. // The following code loads two values from the `src` array, the first and the
  1168. // third value (selected by the mask). The masked-off values are given the value
  1169. // of 127 (`0x7f`).
  1170. import "core:fmt"
  1171. import "core:simd"
  1172. simd_masked_load_example :: proc() {
  1173. src := [4] f32 {1, 2, 3, 4};
  1174. mask := #simd [4]bool { true, false, true, false }
  1175. vals := #simd [4]f32 { 0x7f, 0x7f, 0x7f, 0x7f }
  1176. res := simd.masked_load(&src, vals, mask)
  1177. fmt.println(res)
  1178. }
  1179. Output:
  1180. <1, 127, 3, 127>
  1181. The graphic below demonstrates the flow of lanes.
  1182. +-------------------------------+
  1183. mask: | 1 | 0 | 1 | 0 |
  1184. +-------------------------------+
  1185. | | | `----------------------------.
  1186. | | | |
  1187. | `---- | ------------------------. |
  1188. ptr v v v v
  1189. +---->+-------------------------------+ +-------------------+
  1190. | v1 | v2 | v3 | v4 | vals: | d0 | d1 | d2 | d3 |
  1191. +-------------------------------+ +-------------------+
  1192. | | | |
  1193. | .--- | -------------------------' |
  1194. | | | ,-------------------------'
  1195. v v v v
  1196. +-------------------------------+
  1197. result: | v1 | d1 | v3 | d3 |
  1198. +-------------------------------+
  1199. */
  1200. masked_load :: intrinsics.simd_masked_load
  1201. /*
  1202. Perform a masked store to memory.
  1203. This procedure performs a masked store from a vector `val`, into memory at
  1204. address `ptr`, with the `mask` deciding which lanes are going to be stored,
  1205. and which aren't. If the mask at a corresponding lane has the value `true`
  1206. (lowest bit set), the lane is stored into memory. Otherwise the lane is not
  1207. stored into memory.
  1208. Inputs:
  1209. - `ptr`: The base address of the store.
  1210. - `val`: The vector to store.
  1211. - `mask`: The mask, selecting which lanes of the vector to store into memory.
  1212. Operation:
  1213. for i in 0 ..< len(val) {
  1214. if mask[i]&1 == 1 {
  1215. ptr[i] = val[i]
  1216. }
  1217. }
  1218. Example:
  1219. // Example below stores the value 127 into the first and the third slot of the
  1220. // array `v`.
  1221. import "core:fmt"
  1222. import "core:simd"
  1223. simd_masked_store_example :: proc() {
  1224. v := [4] f32 {1, 2, 3, 4};
  1225. mask := #simd [4]bool { true, false, true, false }
  1226. vals := #simd [4]f32 { 0x7f, 0x7f, 0x7f, 0x7f }
  1227. simd.masked_store(&v, vals, mask)
  1228. fmt.println(v)
  1229. }
  1230. Output:
  1231. [127, 2, 127, 4]
  1232. The graphic below shows the flow of lanes:
  1233. +-------------------+
  1234. mask: | 1 | 0 | 1 | 0 |
  1235. +-------------------+
  1236. | | | |
  1237. v X v X
  1238. +-------------------+
  1239. vals: | v0 | v1 | v2 | v3 |
  1240. +-------------------+
  1241. | \
  1242. ptr v v
  1243. +--->+-----------------------+
  1244. | v0 | ... | v2 | ... |
  1245. +-----------------------+
  1246. */
  1247. masked_store :: intrinsics.simd_masked_store
  1248. /*
  1249. Load consecutive scalar values and expand into a vector.
  1250. This procedure loads a number of consecutive scalar values from an address,
  1251. specified by the `ptr` parameter, and stores them in a result vector, according
  1252. to the mask. The number of values read from memory is the number of set bits
  1253. in the mask. The lanes for which the mask has the value `true` get the next
  1254. consecutive value from memory, otherwise if the mask is `false` for the
  1255. lane, its value is filled from the corresponding lane of the `val` parameter.
  1256. This procedure acts like `masked_load`, except the values from memory are
  1257. read consecutively, and not according to the lanes. The memory values are read
  1258. and assigned to the result vector's masked lanes in order of increasing
  1259. addresses.
  1260. Inputs:
  1261. - `ptr`: The pointer to the memory to read from.
  1262. - `val`: The default values for masked-off entries.
  1263. - `mask`: The mask that determines which lanes get consecutive memory values.
  1264. Returns:
  1265. - The result vector, holding memory values in the lanes selected by the mask and default values in the masked-off lanes.
  1266. Operation:
  1267. mem_idx := 0
  1268. for i in 0 ..< len(mask) {
  1269. if mask[i]&1 == 1 {
  1270. res[i] = ptr[mem_idx]
  1271. mem_idx += 1
  1272. } else {
  1273. res[i] = val[i]
  1274. }
  1275. }
  1276. return res
  1277. Example:
  1278. // The example below loads two values from memory of the array `v`. Two values in
  1279. // the mask are set to `true`, meaning only two memory items will be loaded into
  1280. // the result vector. The mask is set to `true` in the first and the third
  1281. // position, which specifies that the first memory item will be read into the
  1282. // first lane of the result vector, and the second memory item will be read into
  1283. // the third lane of the result vector. All the other lanes of the result vector
  1284. // will be initialized to the default value `127`.
  1285. import "core:fmt"
  1286. import "core:simd"
  1287. simd_masked_expand_load_example :: proc() {
  1288. v := [2] f64 {1, 2};
  1289. mask := #simd [4]bool { true, false, true, false }
  1290. vals := #simd [4]f64 { 0x7f, 0x7f, 0x7f, 0x7f }
  1291. res := simd.masked_expand_load(&v, vals, mask)
  1292. fmt.println(res)
  1293. }
  1294. Output:
  1295. <1, 127, 2, 127>
  1296. Graphical representation of the operation:
  1297. ptr --->+-----------+-----
  1298. | m0 | m1 | ...
  1299. +-----------+-----
  1300. | `--.
  1301. v v
  1302. +-------------------+ +-------------------+
  1303. mask: | 1 | 0 | 1 | 0 | vals: | v0 | v1 | v2 | v3 |
  1304. +-------------------+ +-------------------+
  1305. | | | |
  1306. | .-- | -----------------------' |
  1307. | | | ,----------------------------'
  1308. v v v v
  1309. +-------------------+
  1310. result: | m0 | v1 | m1 | v3 |
  1311. +-------------------+
  1312. */
  1313. masked_expand_load :: intrinsics.simd_masked_expand_load
  1314. /*
  1315. Store masked values to consecutive memory locations.
  1316. This procedure stores values from masked lanes of a vector `val` consecutively
  1317. into memory. This operation is the opposite of `masked_expand_load`. The number
  1318. of items stored into memory is the number of set bits in the mask. If the value
  1319. in a lane of a mask is `true`, that lane is stored into memory. Otherwise
  1320. nothing is stored.
  1321. Inputs:
  1322. - `ptr`: The pointer to the memory to store into.
  1323. - `val`: The vector to store into memory.
  1324. - `mask`: The mask that selects which values to store into memory.
  1325. Operation:
  1326. mem_idx := 0
  1327. for i in 0 ..< len(mask) {
  1328. if mask[i]&1 == 1 {
  1329. ptr[mem_idx] = val[i]
  1330. mem_idx += 1
  1331. }
  1332. }
  1333. Example:
  1334. // The code below fills the array `v` with two values from a 4-element SIMD
  1335. // vector, the first and the third value. The items in the mask are set to `true`
  1336. // in those lanes.
  1337. import "core:fmt"
  1338. import "core:simd"
  1339. simd_masked_compress_store_example :: proc() {
  1340. v := [2] f64 { };
  1341. mask := #simd [4]bool { true, false, true, false }
  1342. vals := #simd [4]f64 { 1, 2, 3, 4 }
  1343. simd.masked_compress_store(&v, vals, mask)
  1344. fmt.println(v)
  1345. }
  1346. Output:
  1347. [1, 3]
  1348. Graphical representation of the operation:
  1349. +-------------------+
  1350. mask: | 1 | 0 | 1 | 0 |
  1351. +-------------------+
  1352. | |
  1353. v v
  1354. +-------------------+
  1355. vals: | v0 | v1 | v2 | v3 |
  1356. +-------------------+
  1357. | ,--'
  1358. ptr v v
  1359. +--->+-----------------
  1360. | v0 | v2 | ...
  1361. +-----------------
  1362. */
  1363. masked_compress_store :: intrinsics.simd_masked_compress_store
  1364. /*
  1365. Extract a scalar from a vector's lane.
  1366. This procedure returns the scalar stored in the lane at the specified index of the
  1367. vector.
  1368. Inputs:
  1369. - `a`: The vector to extract from.
  1370. - `idx`: The lane index.
  1371. Returns:
  1372. - The value of the lane at the specified index.
  1373. Operation:
  1374. return a[idx]
  1375. */
  1376. extract :: intrinsics.simd_extract
  1377. /*
  1378. Replace the value in a vector's lane.
  1379. This procedure places a scalar value at the lane corresponding to the given index of
  1380. the vector, and returns the resulting vector.
  1381. Inputs:
  1382. - `a`: The vector to replace a lane in.
  1383. - `idx`: The lane index.
  1384. - `elem`: The scalar to place.
  1385. Returns:
  1386. - Vector with the specified lane replaced.
  1387. Operation:
  1388. a[idx] = elem; return a
  1389. */
  1390. replace :: intrinsics.simd_replace
  1391. /*
  1392. Reduce a vector to a scalar by adding up all the lanes in a bisecting fashion.
  1393. This procedure returns a scalar that is the sum of all lanes, calculated by
  1394. bisecting the vector into two parts, where the first contains lanes [0, N/2)
  1395. and the second contains lanes [N/2, N), and adding the two halves element-wise
  1396. to produce N/2 values. This is repeated until only a single element remains.
  1397. This order may be faster to compute than the ordered sum for floats, as it can
  1398. often be better parallelized.
  1399. The order of the sum may be important for accounting for precision errors in
  1400. floating-point computation, as floating-point addition is not associative, that
  1401. is `(a+b)+c` may not be equal to `a+(b+c)`.
  1402. Inputs:
  1403. - `v`: The vector to reduce.
  1404. Returns:
  1405. - Sum of all lanes, as a scalar.
  1406. Operation:
  1407. for n > 1 { // n starts as the number of lanes, N
  1408. n = n / 2
  1409. for i in 0 ..< n {
  1410. v[i] += v[i+n]
  1411. }
  1412. }
  1413. res := v[0]
  1414. Graphical representation of the operation for N=4:
  1415. +-----------------------+
  1416. | v0 | v1 | v2 | v3 |
  1417. +-----------------------+
  1418. | | | |
  1419. [+]<-- | ---' |
  1420. | [+]<--------'
  1421. | |
  1422. `>[+]<'
  1423. |
  1424. v
  1425. +-----+
  1426. result: | y0 |
  1427. +-----+
  1428. */
  1429. reduce_add_bisect :: intrinsics.simd_reduce_add_bisect
  1430. /*
  1431. Reduce a vector to a scalar by multiplying all the lanes in a bisecting fashion.
  1432. This procedure returns a scalar that is the product of all lanes, calculated by
  1433. bisecting the vector into two parts, where the first contains lanes [0, N/2)
  1434. and the second contains lanes [N/2, N), and multiplying the two halves
  1435. together element-wise to produce N/2 values. This is repeated until only a
  1436. single element remains. This order may be faster to compute than the ordered
  1437. product for floats, as it can often be better parallelized.
  1438. The order of the product may be important for accounting for precision errors
  1439. in floating-point computation, as floating-point multiplication is not
  1440. associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
  1441. Inputs:
  1442. - `v`: The vector to reduce.
  1443. Returns:
  1444. - Product of all lanes, as a scalar.
  1445. Operation:
  1446. for n > 1 { // n starts as the number of lanes, N
  1447. n = n / 2
  1448. for i in 0 ..< n {
  1449. v[i] *= v[i+n]
  1450. }
  1451. }
  1452. res := v[0]
  1453. Graphical representation of the operation for N=4:
  1454. +-----------------------+
  1455. | v0 | v1 | v2 | v3 |
  1456. +-----------------------+
  1457. | | | |
  1458. [x]<-- | ---' |
  1459. | [x]<--------'
  1460. | |
  1461. `>[x]<'
  1462. |
  1463. v
  1464. +-----+
  1465. result: | y0 |
  1466. +-----+
  1467. */
  1468. reduce_mul_bisect :: intrinsics.simd_reduce_mul_bisect
  1469. /*
  1470. Reduce a vector to a scalar by adding up all the lanes in an ordered fashion.
  1471. This procedure returns a scalar that is the ordered sum of all lanes, added
  1472. one by one from the first lane to the last. The ordered sum may be important for accounting for precision errors in
  1473. floating-point computation, as floating-point addition is not associative,
  1474. that is `(a+b)+c` may not be equal to `a+(b+c)`.
  1475. Inputs:
  1476. - `a`: The vector to reduce.
  1477. Returns:
  1478. - Sum of all lanes, as a scalar.
  1479. Operation:
  1480. res := 0
  1481. for i in 0 ..< len(a) {
  1482. res += a[i]
  1483. }
  1484. */
  1485. reduce_add_ordered :: intrinsics.simd_reduce_add_ordered
  1486. /*
  1487. Reduce a vector to a scalar by multiplying all the lanes in an ordered fashion.
  1488. This procedure returns a scalar that is the ordered product of all lanes, multiplied one by one from the first lane to the last.
  1489. The ordered product may be important for accounting for precision errors in
  1490. floating-point computation, as floating-point multiplication is not associative,
  1491. that is `(a*b)*c` may not be equal to `a*(b*c)`.
  1492. Inputs:
  1493. - `a`: The vector to reduce.
  1494. Returns:
  1495. - Product of all lanes, as a scalar.
  1496. Operation:
  1497. res := 1
  1498. for i in 0 ..< len(a) {
  1499. res *= a[i]
  1500. }
  1501. */
  1502. reduce_mul_ordered :: intrinsics.simd_reduce_mul_ordered
  1503. /*
  1504. Reduce a vector to a scalar by adding up all the lanes in a pairwise fashion.
  1505. This procedure returns a scalar that is the sum of all lanes, calculated by
  1506. adding each even-indexed element with the following odd-indexed element to
  1507. produce N/2 values. This is repeated until only a single element remains. This
  1508. order is supported by hardware instructions for some types/architectures (e.g.
  1509. i16/i32/f32/f64 on x86 SSE, i8/i16/i32/f32 on ARM NEON).
  1510. The order of the sum may be important for accounting for precision errors in
  1511. floating-point computation, as floating-point addition is not associative, that
  1512. is `(a+b)+c` may not be equal to `a+(b+c)`.
  1513. Inputs:
  1514. - `v`: The vector to reduce.
  1515. Returns:
  1516. - Sum of all lanes, as a scalar.
  1517. Operation:
  1518. for n > 1 { // n starts as the number of lanes, N
  1519. n = n / 2
  1520. for i in 0 ..< n {
  1521. v[i] = v[2*i+0] + v[2*i+1]
  1522. }
  1523. }
  1524. res := v[0]
  1525. Graphical representation of the operation for N=4:
  1526. +-----------------------+
  1527. v: | v0 | v1 | v2 | v3 |
  1528. +-----------------------+
  1529. | | | |
  1530. `>[+]<' `>[+]<'
  1531. | |
  1532. `--->[+]<--'
  1533. |
  1534. v
  1535. +-----+
  1536. result: | y0 |
  1537. +-----+
  1538. */
  1539. reduce_add_pairs :: intrinsics.simd_reduce_add_pairs
  1540. /*
  1541. Reduce a vector to a scalar by multiplying all the lanes in a pairwise fashion.
  1542. This procedure returns a scalar that is the product of
  1543. all lanes, calculated by multiplying each even-indexed
  1544. element with the following odd-indexed element to
  1545. produce N/2 values. This is repeated until only a
  1546. single element remains. This order may be faster to
  1547. compute than the ordered product for floats, as it can
  1548. often be better parallelized.
  1549. The order of the product may be important for accounting for precision errors
  1550. in floating-point computation, as floating-point multiplication is not
  1551. associative, that is `(a*b)*c` may not be equal to `a*(b*c)`.
  1552. Inputs:
  1553. - `v`: The vector to reduce.
  1554. Returns:
  1555. - Product of all lanes, as a scalar.
  1556. Operation:
  1557. for n > 1 { // n starts as the number of lanes, N
  1558. n = n / 2
  1559. for i in 0 ..< n {
  1560. v[i] = v[2*i+0] * v[2*i+1]
  1561. }
  1562. }
  1563. res := v[0]
  1564. Graphical representation of the operation for N=4:
  1565. +-----------------------+
  1566. v: | v0 | v1 | v2 | v3 |
  1567. +-----------------------+
  1568. | | | |
  1569. `>[x]<' `>[x]<'
  1570. | |
  1571. `--->[x]<--'
  1572. |
  1573. v
  1574. +-----+
  1575. result: | y0 |
  1576. +-----+
  1577. */
  1578. reduce_mul_pairs :: intrinsics.simd_reduce_mul_pairs
  1579. /*
  1580. Reduce a vector to a scalar by finding the minimum value between all of the lanes.
  1581. This procedure returns a scalar that is the minimum value of all the lanes
  1582. in a vector.
  1583. Inputs:
  1584. - `a`: The vector to reduce.
  1585. Returns:
  1586. - Minimum value of all lanes, as a scalar.
  1587. Operation:
  1588. res := a[0]
  1589. for i in 0 ..< len(a) {
  1590. res = min(res, a[i])
  1591. }
  1592. */
  1593. reduce_min :: intrinsics.simd_reduce_min
  1594. /*
  1595. Reduce a vector to a scalar by finding the maximum value between all of the lanes.
  1596. This procedure returns a scalar that is the maximum value of all the lanes
  1597. in a vector.
  1598. Inputs:
  1599. - `a`: The vector to reduce.
  1600. Returns:
  1601. - Maximum value of all lanes, as a scalar.
  1602. Operation:
  1603. res := a[0]
  1604. for i in 0 ..< len(a) {
  1605. res = max(res, a[i])
  1606. }
  1607. */
  1608. reduce_max :: intrinsics.simd_reduce_max
  1609. /*
  1610. Reduce a vector to a scalar by performing bitwise AND of all of the lanes.
  1611. This procedure returns a scalar that is the result of the bitwise AND operation
  1612. between all of the lanes in a vector.
  1613. Inputs:
  1614. - `a`: The vector to reduce.
  1615. Returns:
  1616. - Bitwise AND of all lanes, as a scalar.
  1617. Operation:
  1618. res := a[0]
  1619. for i in 0 ..< len(a) {
  1620. res &= a[i]
  1621. }
  1622. */
  1623. reduce_and :: intrinsics.simd_reduce_and
  1624. /*
  1625. Reduce a vector to a scalar by performing bitwise OR of all of the lanes.
  1626. This procedure returns a scalar that is the result of the bitwise OR operation
  1627. between all of the lanes in a vector. A bit of the result is set if it is set in at least one lane.
  1628. Inputs:
  1629. - `a`: The vector to reduce.
  1630. Returns:
  1631. - Bitwise OR of all lanes, as a scalar.
  1632. Operation:
  1633. res := 0
  1634. for i in 0 ..< len(a) {
  1635. res |= a[i]
  1636. }
  1637. */
  1638. reduce_or :: intrinsics.simd_reduce_or
  1639. /*
  1640. Reduce a vector to a scalar by performing bitwise XOR of all of the lanes.
  1641. This procedure returns a scalar that is the result of the bitwise XOR operation
  1642. between all of the lanes in a vector. A bit of the result is set if it is set in an odd number of lanes.
  1643. Inputs:
  1644. - `a`: The vector to reduce.
  1645. Returns:
  1646. - Bitwise XOR of all lanes, as a scalar.
  1647. Operation:
  1648. res := 0
  1649. for i in 0 ..< len(a) {
  1650. res ~= a[i]
  1651. }
  1652. */
  1653. reduce_xor :: intrinsics.simd_reduce_xor
  1654. /*
  1655. Reduce a vector to a scalar by performing bitwise OR of all of the lanes.
  1656. This procedure returns a scalar that is the result of the bitwise OR operation
  1657. between all of the lanes in a vector. NOTE(review): this text duplicates the documentation of `reduce_or`; the name suggests an "is any lane set" test, so confirm the accepted lane types and the result type against `intrinsics.simd_reduce_any`.
  1658. Inputs:
  1659. - `a`: The vector to reduce.
  1660. Returns:
  1661. - Bitwise OR of all lanes, as a scalar.
  1662. Operation:
  1663. res := 0
  1664. for i in 0 ..< len(a) {
  1665. res |= a[i]
  1666. }
  1667. */
  1668. reduce_any :: intrinsics.simd_reduce_any
  1669. /*
  1670. Reduce a vector to a scalar by performing bitwise AND of all of the lanes.
  1671. This procedure returns a scalar that is the result of the bitwise AND operation
  1672. between all of the lanes in a vector. NOTE(review): this text duplicates the documentation of `reduce_and`; the name suggests an "are all lanes set" test, so confirm the accepted lane types and the result type against `intrinsics.simd_reduce_all`.
  1673. Inputs:
  1674. - `a`: The vector to reduce.
  1675. Returns:
  1676. - Bitwise AND of all lanes, as a scalar.
  1677. Operation:
  1678. res := a[0]
  1679. for i in 0 ..< len(a) {
  1680. res &= a[i]
  1681. }
  1682. */
  1683. reduce_all :: intrinsics.simd_reduce_all
  1684. /*
  1685. Reorder the lanes of a SIMD vector.
  1686. This procedure reorders the lanes of a vector, according to the provided
  1687. indices. The number of indices corresponds to the number of lanes in the
  1688. result vector and must be the same as the number of lanes of the input vector.
  1689. Each index specifies the lane of the scalar from the input vector, which
  1690. will be written at the corresponding position of the result vector.
  1691. Inputs:
  1692. - `x`: The input vector.
  1693. - `indices`: The indices of lanes to write to the result vector.
  1694. Returns:
  1695. - Swizzled input vector.
  1696. Operation:
  1697. res = {}
  1698. for i in 0 ..< len(indices) {
  1699. res[i] = x[indices[i]]
  1700. }
  1701. return res
  1702. Example:
  1703. // The example below shows how the indices are used to determine which lanes of the
  1704. // input vector get written into the result vector.
  1705. import "core:fmt"
  1706. import "core:simd"
  1707. swizzle_example :: proc() {
  1708. x := #simd [4]f32 { 1.5, 2.5, 3.5, 4.5 }
  1709. res := simd.swizzle(x, 0, 3, 1, 1)
  1710. fmt.println(res)
  1711. }
  1712. Output:
  1713. <1.5, 4.5, 2.5, 2.5>
  1714. The graphical representation of the operation is as follows. The `idx` vector in
  1715. the picture represents the `indices` parameter:
  1716. 0 1 2 3
  1717. +-----+-----+-----+-----+
  1718. x: | 1.5 | 2.5 | 3.5 | 4.5 |
  1719. +-----+-----+-----+-----+
  1720. ^ ^ ^
  1721. | | |
  1722. | '----. |
  1723. | .---- | ---'
  1724. | | |
  1725. | | +------.
  1726. +-----+-----+-----+-----+
  1727. idx: | 0 | 3 | 1 | 1 |
  1728. +-----+-----+-----+-----+
  1729. ^ ^ ^ ^
  1730. | | | |
  1731. +-----+-----+-----+-----+
  1732. res: | 1.5 | 4.5 | 2.5 | 2.5 |
  1733. +-----+-----+-----+-----+
  1734. */
  1735. swizzle :: builtin.swizzle
  1736. /*
  1737. Extract the set of most-significant bits of a SIMD vector.
  1738. This procedure checks the most-significant bit (MSB) for each lane of the vector
  1739. and returns the set of indices of the lanes with the most-significant bit set. This procedure
  1740. can be used in conjunction with `lanes_eq` (and other similar procedures) to
  1741. count the number of matched lanes by computing the cardinality of the resulting
  1742. set.
  1743. Inputs:
  1744. - `a`: An input vector.
  1745. Returns:
  1746. - A bitset of integers, corresponding to the indexes of the lanes, whose MSBs
  1747. are set.
  1748. Operation:
  1749. bits_per_lane = 8*size_of(a[0])
  1750. res = bit_set {}
  1751. for i in 0 ..< len(a) {
  1752. if a[i] & 1<<(bits_per_lane-1) != 0 {
  1753. res |= {i}
  1754. }
  1755. }
  1756. return res
  1757. Example:
  1758. // Since lanes 0, 1, 4, 7 contain negative numbers, the most significant
  1759. // bits for them will be set.
  1760. import "core:fmt"
  1761. import "core:simd"
  1762. simd_extract_msbs_example :: proc() {
  1763. v := #simd [8]i32 { -1, -2, +3, +4, -5, +6, +7, -8 }
  1764. fmt.println(simd.extract_msbs(v))
  1765. }
  1766. Output:
  1767. bit_set[0..=7]{0, 1, 4, 7}
  1768. */
  1769. extract_msbs :: intrinsics.simd_extract_msbs
/*
Extract the set of least-significant bits of a SIMD vector.

This procedure checks the least-significant bit (LSB) of each lane of the
vector and returns the indices of the lanes whose least-significant bit is set.
This procedure can be used in conjunction with `lanes_eq` (and other similar
procedures) to count the number of matched lanes by computing the cardinality
of the resulting set.

Inputs:
- `a`: An input vector.

Result:
- A bitset of integers, corresponding to the indices of the lanes whose LSBs
  are set.

Operation:

	res = bit_set {}
	for i in 0 ..< len(a) {
		if a[i] & 1 != 0 {
			res |= {i}
		}
	}
	return res

Example:

	// Since lanes 0, 2, 4, 6 contain odd integers, the least significant bits
	// for these lanes are set.
	import "core:fmt"
	import "core:simd"

	simd_extract_lsbs_example :: proc() {
		v := #simd [8]i32 { -1, -2, +3, +4, -5, +6, +7, -8 }
		fmt.println(simd.extract_lsbs(v))
	}

Output:

	bit_set[0..=7]{0, 2, 4, 6}
*/
extract_lsbs :: intrinsics.simd_extract_lsbs
/*
Reorder the lanes of two SIMD vectors.

This procedure returns a vector containing the scalars from the lanes of two
vectors, according to the provided indices. Each index specifies the lane of
the scalar, from one of the two input vectors, which will be written at the
corresponding position of the result vector. If the index is within the bounds
`0 ..< len(a)`, it selects a lane of the first input vector. Otherwise the
index selects a lane of the second input vector, counted from `len(a)` (in the
example below, index 4 selects lane 0 of `b`).

Inputs:
- `a`: The first input vector.
- `b`: The second input vector.
- `indices`: The indices.

Result:
- Input vectors, shuffled according to the indices.

Operation:

	res = {}
	for i in 0 ..< len(indices) {
		idx = indices[i]
		if idx < len(a) {
			res[i] = a[idx]
		} else {
			res[i] = b[idx - len(a)]
		}
	}
	return res

Example:

	// The example below shows how the indices are used to determine lanes of the
	// input vectors that are shuffled into the result vector.
	import "core:fmt"
	import "core:simd"

	simd_shuffle_example :: proc() {
		a := #simd [4]f32 { 1, 2, 3, 4 }
		b := #simd [4]f32 { 5, 6, 7, 8 }
		res := simd.shuffle(a, b, 0, 4, 2, 5)
		fmt.println(res)
	}

Output:

	<1, 5, 3, 6>

The graphical representation of the operation is as follows. The `idx` vector in
the picture represents the `indices` parameter:

	          0     1     2     3                4     5     6     7
	       +-----+-----+-----+-----+          +-----+-----+-----+-----+
	    a: |  1  |  2  |  3  |  4  |       b: |  5  |  6  |  7  |  8  |
	       +-----+-----+-----+-----+          +-----+-----+-----+-----+
	          ^           ^                      ^     ^
	          |           |                      |     |
	          |           |                      |     |
	          |     .---  |  --------------------'     |
	          |     |     |     .----------------------'
	       +-----+-----+-----+-----+
	  idx: |  0  |  4  |  2  |  5  |
	       +-----+-----+-----+-----+
	          ^     ^     ^     ^
	          |     |     |     |
	       +-----+-----+-----+-----+
	  res: |  1  |  5  |  3  |  6  |
	       +-----+-----+-----+-----+
*/
shuffle :: intrinsics.simd_shuffle
/*
Select values from one of the two vectors.

This procedure returns a vector which has, on each lane, a value from the
corresponding lane of one of the two input vectors, chosen by the `cond`
parameter. On each lane, if the value of the `cond` parameter is `true` (or
non-zero), the result lane will have the value from the `true` input vector,
otherwise the result lane will have the value from the `false` input vector.

Inputs:
- `cond`: The condition vector.
- `true`: The first input vector.
- `false`: The second input vector.

Result:
- The result of selecting values from the two input vectors.

Operation:

	res = {}
	for i in 0 ..< len(cond) {
		if cond[i] {
			res[i] = true[i]
		} else {
			res[i] = false[i]
		}
	}
	return res

Example:

	// The following example selects values from the two input vectors, `a` and `b`
	// into a single vector.
	import "core:fmt"
	import "core:simd"

	simd_select_example :: proc() {
		a := #simd [4] f64 { 1,2,3,4 }
		b := #simd [4] f64 { 5,6,7,8 }
		cond := #simd[4] int { 1, 0, 1, 0 }
		fmt.println(simd.select(cond,a,b))
	}

Output:

	<1, 6, 3, 8>

Graphically, the operation looks as follows. The `t` and `f` represent the
`true` and `false` vectors respectively:

	          0     1     2     3                0     1     2     3
	       +-----+-----+-----+-----+          +-----+-----+-----+-----+
	    t: |  1  |  2  |  3  |  4  |       f: |  5  |  6  |  7  |  8  |
	       +-----+-----+-----+-----+          +-----+-----+-----+-----+
	          ^           ^                            ^           ^
	          |           |                            |           |
	          |           |                            |           |
	          |     .---  |  --------------------------'           |
	          |     |     |     .----------------------------------'
	       +-----+-----+-----+-----+
	 cond: |  1  |  0  |  1  |  0  |
	       +-----+-----+-----+-----+
	          ^     ^     ^     ^
	          |     |     |     |
	       +-----+-----+-----+-----+
	  res: |  1  |  6  |  3  |  8  |
	       +-----+-----+-----+-----+
*/
select :: intrinsics.simd_select
/*
Runtime equivalent of `swizzle`.

Performs element-wise table lookups using runtime indices.
Each element in the indices vector selects an element from the table vector.
The indices are automatically masked to prevent out-of-bounds access.

This operation is hardware-accelerated on most platforms when using 8-bit
integer vectors. For other element types or unsupported vector sizes, it
falls back to software emulation.

Inputs:
- `table`: The lookup table vector (should be power-of-2 size for correct masking).
- `indices`: The indices vector (automatically masked to valid range).

Returns:
- A vector where `result[i] = table[indices[i] & (table_size-1)]`.

Operation:

	for i in 0 ..< len(indices) {
		masked_index := indices[i] & (len(table) - 1)
		result[i] = table[masked_index]
	}
	return result

Implementation:

	| Platform    | Lane Size                                 | Implementation      |
	|-------------|-------------------------------------------|---------------------|
	| x86-64      | pshufb (16B), vpshufb (32B), AVX512 (64B) | Single vector       |
	| ARM64       | tbl1 (16B), tbl2 (32B), tbl4 (64B)        | Automatic splitting |
	| ARM32       | vtbl1 (8B), vtbl2 (16B), vtbl4 (32B)      | Automatic splitting |
	| WebAssembly | i8x16.swizzle (16B), Emulation (>16B)     | Mixed               |
	| Other       | Emulation                                 | Software            |

Example:

	import "core:simd"
	import "core:fmt"

	runtime_swizzle_example :: proc() {
		table := simd.u8x16{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}
		indices := simd.u8x16{15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}
		result := simd.runtime_swizzle(table, indices)
		fmt.println(result) // Expected: <15, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14>
	}
*/
runtime_swizzle :: intrinsics.simd_runtime_swizzle
/*
Compute the square root of each lane in a SIMD vector.

This is an alias of `intrinsics.sqrt`.

Operation:

	for i in 0 ..< len(res) {
		res[i] = sqrt(a[i])
	}
	return res
*/
sqrt :: intrinsics.sqrt
/*
Compute the ceiling of each lane in a SIMD vector.

This is an alias of `intrinsics.simd_ceil`.

Operation:

	for i in 0 ..< len(res) {
		res[i] = ceil(a[i])
	}
	return res
*/
ceil :: intrinsics.simd_ceil
/*
Compute the floor of each lane in a SIMD vector.

This is an alias of `intrinsics.simd_floor`.

Operation:

	for i in 0 ..< len(res) {
		res[i] = floor(a[i])
	}
	return res
*/
floor :: intrinsics.simd_floor
/*
Truncate each lane in a SIMD vector.

This is an alias of `intrinsics.simd_trunc`.

Operation:

	for i in 0 ..< len(res) {
		res[i] = trunc(a[i])
	}
	return res
*/
trunc :: intrinsics.simd_trunc
/*
Compute the nearest integer of each lane in a SIMD vector.

This is an alias of `intrinsics.simd_nearest`.

NOTE(review): how exact ties (e.g. 0.5) are rounded is not specified here;
confirm against the intrinsic before relying on a particular tie-breaking rule.
*/
nearest :: intrinsics.simd_nearest
/*
Transmute a SIMD vector into an integer vector.

This is an alias of `intrinsics.simd_to_bits`. Within this file it is applied
to float vectors, and the resulting bits are combined with bitwise operations
and `transmute`d back to the original vector type (see `copysign`).
*/
to_bits :: intrinsics.simd_to_bits
/*
Reverse the lanes of a SIMD vector.

This procedure reverses the lanes of a vector, putting the last lane in the
first spot, the second-to-last lane in the second spot, etc. For 4-element
vectors, this procedure is equivalent to the following call:

	swizzle(a, 3, 2, 1, 0)
*/
lanes_reverse :: intrinsics.simd_lanes_reverse
/*
Rotate the lanes of a SIMD vector left.

This procedure rotates the lanes of a vector, putting the first lane in the
last spot, the second lane in the first spot, the third lane in the second
spot, etc. For 4-element vectors, this procedure is equivalent to the
following:

	swizzle(a, 1, 2, 3, 0)

NOTE(review): the description above corresponds to a rotation by one lane; the
underlying intrinsic presumably also takes the rotation amount as an argument —
confirm against the `base:intrinsics` documentation.
*/
lanes_rotate_left :: intrinsics.simd_lanes_rotate_left
/*
Rotate the lanes of a SIMD vector right.

This procedure rotates the lanes of a SIMD vector, putting the first lane in
the second spot, the second lane in the third spot, etc., with the last lane
wrapping around to the first spot. For 4-element vectors, this procedure is
equivalent to the following:

	swizzle(a, 3, 0, 1, 2)

NOTE(review): the description above corresponds to a rotation by one lane; the
underlying intrinsic presumably also takes the rotation amount as an argument —
confirm against the `base:intrinsics` documentation.
*/
lanes_rotate_right :: intrinsics.simd_lanes_rotate_right
/*
Count the number of set bits in each lane of a SIMD vector.

This is an alias of `intrinsics.count_ones`.
*/
count_ones :: intrinsics.count_ones
/*
Count the number of unset bits in each lane of a SIMD vector.

This is an alias of `intrinsics.count_zeros`.
*/
count_zeros :: intrinsics.count_zeros
/*
Count the number of trailing unset bits in each lane of a SIMD vector.

This is an alias of `intrinsics.count_trailing_zeros`.
*/
count_trailing_zeros :: intrinsics.count_trailing_zeros
/*
Count the number of leading unset bits in each lane of a SIMD vector.

This is an alias of `intrinsics.count_leading_zeros`.
*/
count_leading_zeros :: intrinsics.count_leading_zeros
/*
Reverse the bit pattern of a SIMD vector.

This is an alias of `intrinsics.reverse_bits`.

NOTE(review): presumably the bits are reversed within each lane independently
(like the per-lane bit-counting procedures above), not across the whole
vector — confirm.
*/
reverse_bits :: intrinsics.reverse_bits
/*
Perform a FMA (fused multiply-add) operation on each lane of SIMD vectors.

A fused multiply-add is a ternary operation that, for three operands `a`, `b`
and `c`, performs the operation `a*b+c`. This operation is a hardware feature
that minimizes floating-point error and allows for faster computation.

This procedure performs a FMA operation on each lane of the SIMD vectors.

Inputs:
- `a`: The multiplier.
- `b`: The multiplicand.
- `c`: The addend.

Returns:
- `a*b+c`

Operation:

	res = {}
	for i in 0 ..< len(a) {
		res[i] = fma(a[i], b[i], c[i])
	}
	return res
*/
fused_mul_add :: intrinsics.fused_mul_add
/*
Perform a FMA (fused multiply-add) operation on each lane of SIMD vectors.

This is a shorter name for the same intrinsic that `fused_mul_add` is bound to.

A fused multiply-add is a ternary operation that, for three operands `a`, `b`
and `c`, performs the operation `a*b+c`. This operation is a hardware feature
that minimizes floating-point error and allows for faster computation.

This procedure performs a FMA operation on each lane of the SIMD vectors.

Inputs:
- `a`: The multiplier.
- `b`: The multiplicand.
- `c`: The addend.

Returns:
- `a*b+c`

Operation:

	res = {}
	for i in 0 ..< len(a) {
		res[i] = fma(a[i], b[i], c[i])
	}
	return res
*/
fma :: intrinsics.fused_mul_add
  2066. /*
  2067. Convert pointer to SIMD vector to an array pointer.
  2068. */
  2069. to_array_ptr :: #force_inline proc "contextless" (v: ^#simd[$LANES]$E) -> ^[LANES]E {
  2070. return (^[LANES]E)(v)
  2071. }
  2072. /*
  2073. Convert SIMD vector to an array.
  2074. */
  2075. to_array :: #force_inline proc "contextless" (v: #simd[$LANES]$E) -> [LANES]E {
  2076. return transmute([LANES]E)(v)
  2077. }
  2078. /*
  2079. Convert array to SIMD vector.
  2080. */
  2081. from_array :: #force_inline proc "contextless" (v: $A/[$LANES]$E) -> #simd[LANES]E {
  2082. return transmute(#simd[LANES]E)v
  2083. }
  2084. /*
  2085. Convert slice to SIMD vector.
  2086. */
  2087. from_slice :: proc($T: typeid/#simd[$LANES]$E, slice: []E) -> T {
  2088. assert(len(slice) >= LANES, "slice length must be a least the number of lanes")
  2089. array: [LANES]E
  2090. #no_bounds_check for i in 0..<LANES {
  2091. array[i] = slice[i]
  2092. }
  2093. return transmute(T)array
  2094. }
  2095. /*
  2096. Perform binary not operation on a SIMD vector.
  2097. This procedure returns a vector where each lane is the result of the binary
  2098. NOT operation of the corresponding lane in the vector `a`.
  2099. Operation:
  2100. for i in 0 ..< len(res) {
  2101. res[i] = ~a[i]
  2102. }
  2103. return res
  2104. Example:
  2105. +------+------+------+------+
  2106. a: | 0x00 | 0x50 | 0x80 | 0xff |
  2107. +------+------+------+------+
  2108. res:
  2109. +------+------+------+------+
  2110. | 0xff | 0xaf | 0x7f | 0x00 |
  2111. +------+------+------+------+
  2112. */
  2113. bit_not :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_integer(E) {
  2114. return bit_xor(v, T(~E(0)))
  2115. }
  2116. /*
  2117. Copy the signs from lanes of one SIMD vector into another SIMD vector.
  2118. */
  2119. copysign :: #force_inline proc "contextless" (v, sign: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
  2120. neg_zero := to_bits(T(-0.0))
  2121. sign_bit := to_bits(sign) & neg_zero
  2122. magnitude := to_bits(v) &~ neg_zero
  2123. return transmute(T)(sign_bit|magnitude)
  2124. }
  2125. /*
  2126. Return signs of SIMD lanes.
  2127. This procedure returns a vector, each lane of which contains either +1.0 or
  2128. -1.0 depending on the sign of the value in the corresponding lane of the
  2129. input vector. If the lane of the input vector has NaN, then the result vector
  2130. will contain this NaN value as-is.
  2131. */
  2132. signum :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
  2133. is_nan := lanes_ne(v, v)
  2134. return select(is_nan, v, copysign(T(1), v))
  2135. }
  2136. /*
  2137. Calculate reciprocals of SIMD lanes.
  2138. This procedure returns a vector where each lane is the reciprocal of the
  2139. corresponding lane in the vector `a`.
  2140. Inputs:
  2141. - `a`: An integer or a float vector to negate.
  2142. Returns:
  2143. - Negated vector.
  2144. Operation:
  2145. for i in 0 ..< len(res) {
  2146. res[i] = 1.0 / a[i]
  2147. }
  2148. return res
  2149. Example:
  2150. +------+------+------+------+
  2151. a: | 2 | 1 | 3 | 5 |
  2152. +------+------+------+------+
  2153. res:
  2154. +------+------+------+------+
  2155. | 0.5 | 1 | 0.33 | 0.2 |
  2156. +------+------+------+------+
  2157. */
  2158. recip :: #force_inline proc "contextless" (v: $T/#simd[$LANES]$E) -> T where intrinsics.type_is_float(E) {
  2159. return T(1) / v
  2160. }
/*
Create a vector where each lane contains the index of that lane.

This is an alias of `intrinsics.simd_indices`.

Inputs:
- `V`: The type of the vector to create.

Result:
- A vector of the given type, where each lane contains the index of that lane.

Operation:

	// N is the number of lanes of the vector type `V`.
	for i in 0 ..< N {
		res[i] = i
	}
	return res
*/
indices :: intrinsics.simd_indices