//+build i386, amd64
package simd_x86

import "base:intrinsics"
import "core:simd"
// _MM_SHUFFLE(z, y, x, w) -> (z<<6 | y<<4 | x<<2 | w)
_MM_SHUFFLE :: intrinsics.simd_x86__MM_SHUFFLE

// Locality hints for _mm_prefetch; bit 2 (the ET variants) selects write intent.
_MM_HINT_T0  :: 3
_MM_HINT_T1  :: 2
_MM_HINT_T2  :: 1
_MM_HINT_NTA :: 0
_MM_HINT_ET0 :: 7
_MM_HINT_ET1 :: 6

// MXCSR exception-state flag bits (sticky status flags).
_MM_EXCEPT_INVALID   :: 0x0001
_MM_EXCEPT_DENORM    :: 0x0002
_MM_EXCEPT_DIV_ZERO  :: 0x0004
_MM_EXCEPT_OVERFLOW  :: 0x0008
_MM_EXCEPT_UNDERFLOW :: 0x0010
_MM_EXCEPT_INEXACT   :: 0x0020
_MM_EXCEPT_MASK      :: 0x003f

// MXCSR exception-mask bits (a set bit suppresses the corresponding trap).
_MM_MASK_INVALID   :: 0x0080
_MM_MASK_DENORM    :: 0x0100
_MM_MASK_DIV_ZERO  :: 0x0200
_MM_MASK_OVERFLOW  :: 0x0400
_MM_MASK_UNDERFLOW :: 0x0800
_MM_MASK_INEXACT   :: 0x1000
_MM_MASK_MASK      :: 0x1f80

// MXCSR rounding-control field.
_MM_ROUND_NEAREST     :: 0x0000
_MM_ROUND_DOWN        :: 0x2000
_MM_ROUND_UP          :: 0x4000
_MM_ROUND_TOWARD_ZERO :: 0x6000
_MM_ROUND_MASK        :: 0x6000

// MXCSR flush-to-zero control bit.
_MM_FLUSH_ZERO_MASK :: 0x8000
_MM_FLUSH_ZERO_ON   :: 0x8000
_MM_FLUSH_ZERO_OFF  :: 0x0000
// Adds the lowest f32 lanes of a and b; upper three lanes are copied from a.
@(require_results, enable_target_feature="sse")
_mm_add_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return addss(a, b)
}
// Lane-wise addition of all four f32 lanes.
@(require_results, enable_target_feature="sse")
_mm_add_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.add(a, b)
}
// Subtracts the lowest f32 lane of b from a; upper three lanes are copied from a.
@(require_results, enable_target_feature="sse")
_mm_sub_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return subss(a, b)
}
// Lane-wise subtraction of all four f32 lanes.
@(require_results, enable_target_feature="sse")
_mm_sub_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.sub(a, b)
}
// Multiplies the lowest f32 lanes; upper three lanes are copied from a.
@(require_results, enable_target_feature="sse")
_mm_mul_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return mulss(a, b)
}
// Lane-wise multiplication of all four f32 lanes.
@(require_results, enable_target_feature="sse")
_mm_mul_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.mul(a, b)
}
// Divides the lowest f32 lane of a by b's; upper three lanes are copied from a.
@(require_results, enable_target_feature="sse")
_mm_div_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return divss(a, b)
}
// Lane-wise division of all four f32 lanes.
@(require_results, enable_target_feature="sse")
_mm_div_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.div(a, b)
}
  67. @(require_results, enable_target_feature="sse")
  68. _mm_sqrt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
  69. return sqrtss(a)
  70. }
// Lane-wise square root of all four f32 lanes.
@(require_results, enable_target_feature="sse")
_mm_sqrt_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
	return sqrtps(a)
}
// Approximate reciprocal of the lowest f32 lane; upper lanes copied from a.
@(require_results, enable_target_feature="sse")
_mm_rcp_ss :: #force_inline proc "c" (a: __m128) -> __m128 {
	return rcpss(a)
}
// Lane-wise approximate reciprocal.
@(require_results, enable_target_feature="sse")
_mm_rcp_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
	return rcpps(a)
}
// Approximate reciprocal square root of the lowest f32 lane.
@(require_results, enable_target_feature="sse")
_mm_rsqrt_ss :: #force_inline proc "c" (a: __m128) -> __m128 {
	return rsqrtss(a)
}
// Lane-wise approximate reciprocal square root.
@(require_results, enable_target_feature="sse")
_mm_rsqrt_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
	return rsqrtps(a)
}
// Minimum of the lowest f32 lanes; upper three lanes copied from a.
@(require_results, enable_target_feature="sse")
_mm_min_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return minss(a, b)
}
// Lane-wise minimum of all four f32 lanes.
@(require_results, enable_target_feature="sse")
_mm_min_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return minps(a, b)
}
// Maximum of the lowest f32 lanes; upper three lanes copied from a.
@(require_results, enable_target_feature="sse")
_mm_max_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return maxss(a, b)
}
// Lane-wise maximum of all four f32 lanes.
@(require_results, enable_target_feature="sse")
_mm_max_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return maxps(a, b)
}
// Bitwise AND of a and b (performed on the integer view of the bits).
@(require_results, enable_target_feature="sse")
_mm_and_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return transmute(__m128)simd.bit_and(transmute(__m128i)a, transmute(__m128i)b)
}
  111. @(require_results, enable_target_feature="sse")
  112. _mm_andnot_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
  113. return transmute(__m128)simd.bit_and_not(transmute(__m128i)a, transmute(__m128i)b)
  114. }
// Bitwise OR of a and b (performed on the integer view of the bits).
@(require_results, enable_target_feature="sse")
_mm_or_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return transmute(__m128)simd.bit_or(transmute(__m128i)a, transmute(__m128i)b)
}
// Bitwise XOR of a and b (performed on the integer view of the bits).
@(require_results, enable_target_feature="sse")
_mm_xor_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return transmute(__m128)simd.bit_xor(transmute(__m128i)a, transmute(__m128i)b)
}
// Scalar compares. The cmpss immediate selects the predicate:
//   0=eq, 1=lt, 2=le, 3=unord, 4=neq, 5=nlt, 6=nle, 7=ord.
// Lane 0 holds all-ones/all-zeros; upper three lanes are copied from a.
// gt/ge/ngt/nge have no direct predicate, so the operands are swapped and
// simd.shuffle re-inserts a's upper lanes (index 4 = lane 0 of the result).
@(require_results, enable_target_feature="sse")
_mm_cmpeq_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpss(a, b, 0)
}
@(require_results, enable_target_feature="sse")
_mm_cmplt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpss(a, b, 1)
}
@(require_results, enable_target_feature="sse")
_mm_cmple_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpss(a, b, 2)
}
// a > b  ==  b < a, then restore a's upper three lanes.
@(require_results, enable_target_feature="sse")
_mm_cmpgt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.shuffle(a, cmpss(b, a, 1), 4, 1, 2, 3)
}
// a >= b  ==  b <= a, then restore a's upper three lanes.
@(require_results, enable_target_feature="sse")
_mm_cmpge_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.shuffle(a, cmpss(b, a, 2), 4, 1, 2, 3)
}
@(require_results, enable_target_feature="sse")
_mm_cmpneq_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpss(a, b, 4)
}
@(require_results, enable_target_feature="sse")
_mm_cmpnlt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpss(a, b, 5)
}
@(require_results, enable_target_feature="sse")
_mm_cmpnle_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpss(a, b, 6)
}
// !(a > b)  ==  !(b < a), then restore a's upper three lanes.
@(require_results, enable_target_feature="sse")
_mm_cmpngt_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.shuffle(a, cmpss(b, a, 5), 4, 1, 2, 3)
}
// !(a >= b)  ==  !(b <= a), then restore a's upper three lanes.
@(require_results, enable_target_feature="sse")
_mm_cmpnge_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.shuffle(a, cmpss(b, a, 6), 4, 1, 2, 3)
}
// True when neither lane-0 operand is NaN.
@(require_results, enable_target_feature="sse")
_mm_cmpord_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpss(a, b, 7)
}
// True when either lane-0 operand is NaN.
@(require_results, enable_target_feature="sse")
_mm_cmpunord_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpss(a, b, 3)
}
// Packed compares over all four lanes; each lane becomes all-ones or
// all-zeros. cmpps predicates: 0=eq, 1=lt, 2=le, 3=unord, 4=neq, 5=nlt,
// 6=nle, 7=ord. gt/ge/ngt/nge swap operands to reuse lt/le/nlt/nle;
// ord/unord are symmetric so the operand order there is irrelevant.
@(require_results, enable_target_feature="sse")
_mm_cmpeq_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpps(a, b, 0)
}
@(require_results, enable_target_feature="sse")
_mm_cmplt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpps(a, b, 1)
}
@(require_results, enable_target_feature="sse")
_mm_cmple_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpps(a, b, 2)
}
// a > b  ==  b < a.
@(require_results, enable_target_feature="sse")
_mm_cmpgt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpps(b, a, 1)
}
// a >= b  ==  b <= a.
@(require_results, enable_target_feature="sse")
_mm_cmpge_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpps(b, a, 2)
}
@(require_results, enable_target_feature="sse")
_mm_cmpneq_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpps(a, b, 4)
}
@(require_results, enable_target_feature="sse")
_mm_cmpnlt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpps(a, b, 5)
}
@(require_results, enable_target_feature="sse")
_mm_cmpnle_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpps(a, b, 6)
}
// !(a > b)  ==  !(b < a).
@(require_results, enable_target_feature="sse")
_mm_cmpngt_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpps(b, a, 5)
}
// !(a >= b)  ==  !(b <= a).
@(require_results, enable_target_feature="sse")
_mm_cmpnge_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpps(b, a, 6)
}
// Per lane: true when neither operand is NaN (symmetric predicate).
@(require_results, enable_target_feature="sse")
_mm_cmpord_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpps(b, a, 7)
}
// Per lane: true when either operand is NaN (symmetric predicate).
@(require_results, enable_target_feature="sse")
_mm_cmpunord_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return cmpps(b, a, 3)
}
// Ordered (comi*) and unordered (ucomi*) scalar comparisons of lane 0,
// returning a boolean instead of a mask. The ucomi* variants differ in
// their FP-exception signalling behavior for QNaN inputs.
@(require_results, enable_target_feature="sse")
_mm_comieq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
	return comieq_ss(a, b)
}
@(require_results, enable_target_feature="sse")
_mm_comilt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
	return comilt_ss(a, b)
}
@(require_results, enable_target_feature="sse")
_mm_comile_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
	return comile_ss(a, b)
}
@(require_results, enable_target_feature="sse")
_mm_comigt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
	return comigt_ss(a, b)
}
@(require_results, enable_target_feature="sse")
_mm_comige_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
	return comige_ss(a, b)
}
@(require_results, enable_target_feature="sse")
_mm_comineq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
	return comineq_ss(a, b)
}
@(require_results, enable_target_feature="sse")
_mm_ucomieq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
	return ucomieq_ss(a, b)
}
@(require_results, enable_target_feature="sse")
_mm_ucomilt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
	return ucomilt_ss(a, b)
}
@(require_results, enable_target_feature="sse")
_mm_ucomile_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
	return ucomile_ss(a, b)
}
@(require_results, enable_target_feature="sse")
_mm_ucomigt_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
	return ucomigt_ss(a, b)
}
@(require_results, enable_target_feature="sse")
_mm_ucomige_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
	return ucomige_ss(a, b)
}
@(require_results, enable_target_feature="sse")
_mm_ucomineq_ss :: #force_inline proc "c" (a, b: __m128) -> b32 {
	return ucomineq_ss(a, b)
}
  267. @(require_results, enable_target_feature="sse")
  268. _mm_cvtss_si32 :: #force_inline proc "c" (a: __m128) -> i32 {
  269. return cvtss2si(a)
  270. }
  271. _mm_cvt_ss2si :: _mm_cvtss_si32
  272. _mm_cvttss_si32 :: _mm_cvtss_si32
// Extracts the lowest f32 lane of a.
@(require_results, enable_target_feature="sse")
_mm_cvtss_f32 :: #force_inline proc "c" (a: __m128) -> f32 {
	return simd.extract(a, 0)
}
// Converts b to f32 and inserts it into lane 0 of a; upper lanes unchanged.
@(require_results, enable_target_feature="sse")
_mm_cvtsi32_ss :: #force_inline proc "c" (a: __m128, b: i32) -> __m128 {
	return cvtsi2ss(a, b)
}
_mm_cvt_si2ss :: _mm_cvtsi32_ss
// Returns {a, 0, 0, 0}.
@(require_results, enable_target_feature="sse")
_mm_set_ss :: #force_inline proc "c" (a: f32) -> __m128 {
	return __m128{a, 0, 0, 0}
}
// Broadcasts a into all four lanes.
@(require_results, enable_target_feature="sse")
_mm_set1_ps :: #force_inline proc "c" (a: f32) -> __m128 {
	return __m128(a)
}
_mm_set_ps1 :: _mm_set1_ps
// Arguments are given highest-lane first, so the lane order is {d, c, b, a}.
@(require_results, enable_target_feature="sse")
_mm_set_ps :: #force_inline proc "c" (a, b, c, d: f32) -> __m128 {
	return __m128{d, c, b, a}
}
// Arguments in lane order: {a, b, c, d}.
@(require_results, enable_target_feature="sse")
_mm_setr_ps :: #force_inline proc "c" (a, b, c, d: f32) -> __m128 {
	return __m128{a, b, c, d}
}
// Returns a vector with all lanes zero.
@(require_results, enable_target_feature="sse")
_mm_setzero_ps :: #force_inline proc "c" () -> __m128 {
	return __m128{0, 0, 0, 0}
}
// Selects lanes per an _MM_SHUFFLE-style MASK: the two low lanes come from
// a, the two high lanes from b (shuffle indices 4..7 address b's lanes).
@(require_results, enable_target_feature="sse")
_mm_shuffle_ps :: #force_inline proc "c" (a, b: __m128, $MASK: u32) -> __m128 {
	return simd.shuffle(
		a, b,
		u32(MASK) & 0b11,
		(u32(MASK)>>2) & 0b11,
		((u32(MASK)>>4) & 0b11)+4,
		((u32(MASK)>>6) & 0b11)+4)
}
// Interleaves the high halves: {a2, b2, a3, b3}.
@(require_results, enable_target_feature="sse")
_mm_unpackhi_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.shuffle(a, b, 2, 6, 3, 7)
}
// Interleaves the low halves: {a0, b0, a1, b1}.
@(require_results, enable_target_feature="sse")
_mm_unpacklo_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.shuffle(a, b, 0, 4, 1, 5)
}
// Returns {b2, b3, a2, a3} (high half of b into the low half of the result).
@(require_results, enable_target_feature="sse")
_mm_movehl_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.shuffle(a, b, 6, 7, 2, 3)
}
// Returns {a0, a1, b0, b1} (low half of b into the high half of the result).
@(require_results, enable_target_feature="sse")
_mm_movelh_ps :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.shuffle(a, b, 0, 1, 4, 5)
}
// Packs the sign bit of each lane into the low 4 bits of the result.
@(require_results, enable_target_feature="sse")
_mm_movemask_ps :: #force_inline proc "c" (a: __m128) -> u32 {
	return movmskps(a)
}
// Loads a single f32 into lane 0; the remaining lanes are zeroed.
@(require_results, enable_target_feature="sse")
_mm_load_ss :: #force_inline proc "c" (p: ^f32) -> __m128 {
	return __m128{p^, 0, 0, 0}
}
// Loads a single f32 and broadcasts it into all four lanes.
@(require_results, enable_target_feature="sse")
_mm_load1_ps :: #force_inline proc "c" (p: ^f32) -> __m128 {
	a := p^
	return __m128(a)
}
_mm_load_ps1 :: _mm_load1_ps
// Aligned 16-byte load; p must be suitably aligned for __m128.
@(require_results, enable_target_feature="sse")
_mm_load_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 {
	return (^__m128)(p)^
}
// Unaligned 16-byte load, implemented as a memcpy into a local.
@(require_results, enable_target_feature="sse")
_mm_loadu_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 {
	dst := _mm_undefined_ps()
	intrinsics.mem_copy_non_overlapping(&dst, p, size_of(__m128))
	return dst
}
// Aligned load with the four lanes reversed: {p[3], p[2], p[1], p[0]}.
@(require_results, enable_target_feature="sse")
_mm_loadr_ps :: #force_inline proc "c" (p: [^]f32) -> __m128 {
	return simd.lanes_reverse(_mm_load_ps(p))
}
// Unaligned 64-bit load into the low half of an __m128i; high half zeroed.
@(require_results, enable_target_feature="sse")
_mm_loadu_si64 :: #force_inline proc "c" (mem_addr: rawptr) -> __m128i {
	a := intrinsics.unaligned_load((^i64)(mem_addr))
	return __m128i{a, 0}
}
// Stores lane 0 of a to p.
@(enable_target_feature="sse")
_mm_store_ss :: #force_inline proc "c" (p: ^f32, a: __m128) {
	p^ = simd.extract(a, 0)
}
// Broadcasts lane 0 into all four lanes and stores 16 bytes; p must be
// suitably aligned for __m128.
@(enable_target_feature="sse")
_mm_store1_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) {
	b := simd.swizzle(a, 0, 0, 0, 0)
	(^__m128)(p)^ = b
}
_mm_store_ps1 :: _mm_store1_ps
// Aligned 16-byte store; p must be suitably aligned for __m128.
@(enable_target_feature="sse")
_mm_store_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) {
	(^__m128)(p)^ = a
}
// Unaligned 16-byte store, implemented as a memcpy from a local copy.
@(enable_target_feature="sse")
_mm_storeu_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) {
	b := a
	intrinsics.mem_copy_non_overlapping(p, &b, size_of(__m128))
}
// Aligned store with the four lanes reversed.
@(enable_target_feature="sse")
_mm_storer_ps :: #force_inline proc "c" (p: [^]f32, a: __m128) {
	(^__m128)(p)^ = simd.lanes_reverse(a)
}
// Returns {b0, a1, a2, a3}: lane 0 of b moved into a.
@(require_results, enable_target_feature="sse")
_mm_move_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return simd.shuffle(a, b, 4, 1, 2, 3)
}
// Store fence: orders all preceding stores before any following stores.
@(enable_target_feature="sse")
_mm_sfence :: #force_inline proc "c" () {
	sfence()
}
// Reads the MXCSR control/status register.
@(require_results, enable_target_feature="sse")
_mm_getcsr :: #force_inline proc "c" () -> (result: u32) {
	stmxcsr(&result)
	return result
}
// Writes the MXCSR control/status register.
@(enable_target_feature="sse")
_mm_setcsr :: #force_inline proc "c" (val: u32) {
	val := val
	ldmxcsr(&val)
}
// The _MM_GET_*/_MM_SET_* helpers read-modify-write individual MXCSR
// fields: `&~ FIELD_MASK` clears the field, then the new bits are ORed in.
@(require_results, enable_target_feature="sse")
_MM_GET_EXCEPTION_MASK :: #force_inline proc "c" () -> u32 {
	return _mm_getcsr() & _MM_MASK_MASK
}
@(require_results, enable_target_feature="sse")
_MM_GET_EXCEPTION_STATE :: #force_inline proc "c" () -> u32 {
	return _mm_getcsr() & _MM_EXCEPT_MASK
}
@(require_results, enable_target_feature="sse")
_MM_GET_FLUSH_ZERO_MODE :: #force_inline proc "c" () -> u32 {
	return _mm_getcsr() & _MM_FLUSH_ZERO_MASK
}
@(require_results, enable_target_feature="sse")
_MM_GET_ROUNDING_MODE :: #force_inline proc "c" () -> u32 {
	return _mm_getcsr() & _MM_ROUND_MASK
}
@(enable_target_feature="sse")
_MM_SET_EXCEPTION_MASK :: #force_inline proc "c" (x: u32) {
	_mm_setcsr((_mm_getcsr() &~ _MM_MASK_MASK) | x)
}
@(enable_target_feature="sse")
_MM_SET_EXCEPTION_STATE :: #force_inline proc "c" (x: u32) {
	_mm_setcsr((_mm_getcsr() &~ _MM_EXCEPT_MASK) | x)
}
@(enable_target_feature="sse")
_MM_SET_FLUSH_ZERO_MODE :: #force_inline proc "c" (x: u32) {
	_mm_setcsr((_mm_getcsr() &~ _MM_FLUSH_ZERO_MASK) | x)
}
@(enable_target_feature="sse")
_MM_SET_ROUNDING_MODE :: #force_inline proc "c" (x: u32) {
	_mm_setcsr((_mm_getcsr() &~ _MM_ROUND_MASK) | x)
}
// Prefetches the cache line at p. STRATEGY is one of the _MM_HINT_*
// constants: bits 0..1 give the locality level, bit 2 selects write intent.
@(enable_target_feature="sse")
_mm_prefetch :: #force_inline proc "c" (p: rawptr, $STRATEGY: u32) {
	// llvm.prefetch(ptr, rw, locality, cache_type); 1 = data cache.
	prefetch(p, (STRATEGY>>2)&1, STRATEGY&3, 1)
}
// Nominally "undefined" contents; implemented here as all-zeros.
@(require_results, enable_target_feature="sse")
_mm_undefined_ps :: #force_inline proc "c" () -> __m128 {
	return _mm_set1_ps(0)
}
  442. @(enable_target_feature="sse")
  443. _MM_TRANSPOSE4_PS :: #force_inline proc "c" (row0, row1, row2, row3: ^__m128) {
  444. tmp0 := _mm_unpacklo_ps(row0^, row1^)
  445. tmp1 := _mm_unpacklo_ps(row2^, row3^)
  446. tmp2 := _mm_unpackhi_ps(row0^, row1^)
  447. tmp3 := _mm_unpackhi_ps(row2^, row3^)
  448. row0^ = _mm_movelh_ps(tmp0, tmp2)
  449. row1^ = _mm_movelh_ps(tmp2, tmp0)
  450. row2^ = _mm_movelh_ps(tmp1, tmp3)
  451. row3^ = _mm_movelh_ps(tmp3, tmp1)
  452. }
// Non-temporal 16-byte store (bypasses the cache hierarchy); addr must be
// suitably aligned for __m128.
@(enable_target_feature="sse")
_mm_stream_ps :: #force_inline proc "c" (addr: [^]f32, a: __m128) {
	intrinsics.non_temporal_store((^__m128)(addr), a)
}
// 64-bit scalar conversions; the underlying instructions only exist in
// 64-bit mode, so these are gated to amd64.
when ODIN_ARCH == .amd64 {
	// Lowest f32 lane -> i64, rounding per MXCSR.
	@(require_results, enable_target_feature="sse")
	_mm_cvtss_si64 :: #force_inline proc "c"(a: __m128) -> i64 {
		return cvtss2si64(a)
	}
	// Lowest f32 lane -> i64, truncating toward zero.
	@(require_results, enable_target_feature="sse")
	_mm_cvttss_si64 :: #force_inline proc "c"(a: __m128) -> i64 {
		return cvttss2si64(a)
	}
	// Converts b to f32 and inserts it into lane 0 of a.
	@(require_results, enable_target_feature="sse")
	_mm_cvtsi64_ss :: #force_inline proc "c"(a: __m128, b: i64) -> __m128 {
		return cvtsi642ss(a, b)
	}
}
// Raw bindings to the LLVM x86 SSE intrinsics backing the wrappers above.
// Private to this package; callers go through the _mm_* procedures, which
// add @(enable_target_feature) and calling-convention handling.
@(private, default_calling_convention="none")
foreign _ {
	@(link_name="llvm.x86.sse.add.ss")
	addss :: proc(a, b: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.sub.ss")
	subss :: proc(a, b: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.mul.ss")
	mulss :: proc(a, b: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.div.ss")
	divss :: proc(a, b: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.sqrt.ss")
	sqrtss :: proc(a: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.sqrt.ps")
	sqrtps :: proc(a: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.rcp.ss")
	rcpss :: proc(a: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.rcp.ps")
	rcpps :: proc(a: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.rsqrt.ss")
	rsqrtss :: proc(a: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.rsqrt.ps")
	rsqrtps :: proc(a: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.min.ss")
	minss :: proc(a, b: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.min.ps")
	minps :: proc(a, b: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.max.ss")
	maxss :: proc(a, b: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.max.ps")
	maxps :: proc(a, b: __m128) -> __m128 ---
	@(link_name="llvm.x86.sse.movmsk.ps")
	movmskps :: proc(a: __m128) -> u32 ---
	@(link_name="llvm.x86.sse.cmp.ps")
	cmpps :: proc(a, b: __m128, #const imm8: u8) -> __m128 ---
	@(link_name="llvm.x86.sse.comieq.ss")
	comieq_ss :: proc(a, b: __m128) -> b32 ---
	@(link_name="llvm.x86.sse.comilt.ss")
	comilt_ss :: proc(a, b: __m128) -> b32 ---
	@(link_name="llvm.x86.sse.comile.ss")
	comile_ss :: proc(a, b: __m128) -> b32 ---
	@(link_name="llvm.x86.sse.comigt.ss")
	comigt_ss :: proc(a, b: __m128) -> b32 ---
	@(link_name="llvm.x86.sse.comige.ss")
	comige_ss :: proc(a, b: __m128) -> b32 ---
	@(link_name="llvm.x86.sse.comineq.ss")
	comineq_ss :: proc(a, b: __m128) -> b32 ---
	@(link_name="llvm.x86.sse.ucomieq.ss")
	ucomieq_ss :: proc(a, b: __m128) -> b32 ---
	@(link_name="llvm.x86.sse.ucomilt.ss")
	ucomilt_ss :: proc(a, b: __m128) -> b32 ---
	@(link_name="llvm.x86.sse.ucomile.ss")
	ucomile_ss :: proc(a, b: __m128) -> b32 ---
	@(link_name="llvm.x86.sse.ucomigt.ss")
	ucomigt_ss :: proc(a, b: __m128) -> b32 ---
	@(link_name="llvm.x86.sse.ucomige.ss")
	ucomige_ss :: proc(a, b: __m128) -> b32 ---
	@(link_name="llvm.x86.sse.ucomineq.ss")
	ucomineq_ss :: proc(a, b: __m128) -> b32 ---
	@(link_name="llvm.x86.sse.cvtss2si")
	cvtss2si :: proc(a: __m128) -> i32 ---
	@(link_name="llvm.x86.sse.cvttss2si")
	cvttss2si :: proc(a: __m128) -> i32 ---
	@(link_name="llvm.x86.sse.cvtsi2ss")
	cvtsi2ss :: proc(a: __m128, b: i32) -> __m128 ---
	@(link_name="llvm.x86.sse.sfence")
	sfence :: proc() ---
	@(link_name="llvm.x86.sse.stmxcsr")
	stmxcsr :: proc(p: rawptr) ---
	@(link_name="llvm.x86.sse.ldmxcsr")
	ldmxcsr :: proc(p: rawptr) ---
	@(link_name="llvm.prefetch")
	prefetch :: proc(p: rawptr, #const rw, loc, ty: u32) ---
	@(link_name="llvm.x86.sse.cmp.ss")
	cmpss :: proc(a, b: __m128, #const imm8: u8) -> __m128 ---
	// amd64 only
	@(link_name="llvm.x86.sse.cvtss2si64")
	cvtss2si64 :: proc(a: __m128) -> i64 ---
	@(link_name="llvm.x86.sse.cvttss2si64")
	cvttss2si64 :: proc(a: __m128) -> i64 ---
	@(link_name="llvm.x86.sse.cvtsi642ss")
	cvtsi642ss :: proc(a: __m128, b: i64) -> __m128 ---
}