  1. #+build i386, amd64
  2. package simd_x86
  3. import "core:simd"
  4. // SSE4 rounding constants
// Rounding-mode bits for the ROUNDING immediate of the _mm_round_* /
// _mm_floor_* / _mm_ceil_* procedures below.
_MM_FROUND_TO_NEAREST_INT :: 0x00 // round to nearest integer
_MM_FROUND_TO_NEG_INF     :: 0x01 // round toward negative infinity (floor)
_MM_FROUND_TO_POS_INF     :: 0x02 // round toward positive infinity (ceil)
_MM_FROUND_TO_ZERO        :: 0x03 // round toward zero (truncate)
_MM_FROUND_CUR_DIRECTION  :: 0x04 // use the currently configured rounding direction
// Exception-control bit, OR-ed together with one of the modes above.
_MM_FROUND_RAISE_EXC :: 0x00 // allow precision exceptions
_MM_FROUND_NO_EXC    :: 0x08 // suppress precision exceptions
// Convenience combinations of mode + exception bits.
_MM_FROUND_NINT      :: 0x00
_MM_FROUND_FLOOR     :: _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF
_MM_FROUND_CEIL      :: _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF
_MM_FROUND_TRUNC     :: _MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO
_MM_FROUND_RINT      :: _MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION
_MM_FROUND_NEARBYINT :: _MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION
// Variable per-byte blend of a and b under mask.
// Forwards to PBLENDVB (llvm.x86.sse41.pblendvb).
@(require_results, enable_target_feature="sse4.1")
_mm_blendv_epi8 :: #force_inline proc "c" (a, b, mask: __m128i) -> __m128i {
	return transmute(__m128i)pblendvb(transmute(i8x16)a, transmute(i8x16)b, transmute(i8x16)mask)
}
// Blend of 16-bit lanes selected by the compile-time immediate IMM8.
// Forwards to PBLENDW (llvm.x86.sse41.pblendw).
@(require_results, enable_target_feature="sse4.1")
_mm_blend_epi16 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i {
	return transmute(__m128i)pblendw(transmute(i16x8)a, transmute(i16x8)b, IMM8)
}
// Variable blend of f64 lanes under mask. Forwards to BLENDVPD.
@(require_results, enable_target_feature="sse4.1")
_mm_blendv_pd :: #force_inline proc "c" (a, b, mask: __m128d) -> __m128d {
	return blendvpd(a, b, mask)
}
// Variable blend of f32 lanes under mask. Forwards to BLENDVPS.
@(require_results, enable_target_feature="sse4.1")
_mm_blendv_ps :: #force_inline proc "c" (a, b, mask: __m128) -> __m128 {
	return blendvps(a, b, mask)
}
// Blend of f64 lanes selected by the 2-bit immediate IMM2. Forwards to BLENDPD.
@(require_results, enable_target_feature="sse4.1")
_mm_blend_pd :: #force_inline proc "c" (a, b: __m128d, $IMM2: u8) -> __m128d {
	return blendpd(a, b, IMM2)
}
// Blend of f32 lanes selected by the 4-bit immediate IMM4. Forwards to BLENDPS.
@(require_results, enable_target_feature="sse4.1")
_mm_blend_ps :: #force_inline proc "c" (a, b: __m128, $IMM4: u8) -> __m128 {
	return blendps(a, b, IMM4)
}
// Extracts f32 lane IMM8 of a and reinterprets its bits as an i32.
@(require_results, enable_target_feature="sse4.1")
_mm_extract_ps :: #force_inline proc "c" (a: __m128, $IMM8: u32) -> i32 {
	return transmute(i32)simd.extract(a, IMM8)
}
// Extracts byte lane IMM8; lanes are viewed as u8, so the value is
// zero-extended into the i32 result.
@(require_results, enable_target_feature="sse4.1")
_mm_extract_epi8 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 {
	return i32(simd.extract(transmute(u8x16)a, IMM8))
}
// Extracts 32-bit lane IMM8 as a signed integer.
@(require_results, enable_target_feature="sse4.1")
_mm_extract_epi32 :: #force_inline proc "c" (a: __m128i, $IMM8: u32) -> i32 {
	return simd.extract(transmute(i32x4)a, IMM8)
}
// Insert/zero/copy of f32 lanes controlled by the immediate IMM8.
// Forwards to INSERTPS (llvm.x86.sse41.insertps).
@(require_results, enable_target_feature="sse4.1")
_mm_insert_ps :: #force_inline proc "c" (a, b: __m128, $IMM8: u8) -> __m128 {
	return insertps(a, b, IMM8)
}
// Returns a with byte lane IMM8 replaced by the low 8 bits of i.
@(require_results, enable_target_feature="sse4.1")
_mm_insert_epi8 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i {
	return transmute(__m128i)simd.replace(transmute(i8x16)a, IMM8, i8(i))
}
// Returns a with 32-bit lane IMM8 replaced by i.
@(require_results, enable_target_feature="sse4.1")
_mm_insert_epi32 :: #force_inline proc "c" (a: __m128i, i: i32, $IMM8: u32) -> __m128i {
	return transmute(__m128i)simd.replace(transmute(i32x4)a, IMM8, i)
}
// Lane-wise maximum of signed 8-bit lanes (PMAXSB).
@(require_results, enable_target_feature="sse4.1")
_mm_max_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return transmute(__m128i)pmaxsb(transmute(i8x16)a, transmute(i8x16)b)
}
// Lane-wise maximum of unsigned 16-bit lanes (PMAXUW).
@(require_results, enable_target_feature="sse4.1")
_mm_max_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return transmute(__m128i)pmaxuw(transmute(u16x8)a, transmute(u16x8)b)
}
// Lane-wise maximum of signed 32-bit lanes (PMAXSD).
@(require_results, enable_target_feature="sse4.1")
_mm_max_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return transmute(__m128i)pmaxsd(transmute(i32x4)a, transmute(i32x4)b)
}
// Lane-wise maximum of unsigned 32-bit lanes (PMAXUD).
@(require_results, enable_target_feature="sse4.1")
_mm_max_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return transmute(__m128i)pmaxud(transmute(u32x4)a, transmute(u32x4)b)
}
// Lane-wise minimum of signed 8-bit lanes (PMINSB).
@(require_results, enable_target_feature="sse4.1")
_mm_min_epi8 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return transmute(__m128i)pminsb(transmute(i8x16)a, transmute(i8x16)b)
}
// Lane-wise minimum of unsigned 16-bit lanes (PMINUW).
@(require_results, enable_target_feature="sse4.1")
_mm_min_epu16 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return transmute(__m128i)pminuw(transmute(u16x8)a, transmute(u16x8)b)
}
// Lane-wise minimum of signed 32-bit lanes (PMINSD).
@(require_results, enable_target_feature="sse4.1")
_mm_min_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return transmute(__m128i)pminsd(transmute(i32x4)a, transmute(i32x4)b)
}
// Lane-wise minimum of unsigned 32-bit lanes (PMINUD).
@(require_results, enable_target_feature="sse4.1")
_mm_min_epu32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return transmute(__m128i)pminud(transmute(u32x4)a, transmute(u32x4)b)
}
// Packs the signed 32-bit lanes of a and b into unsigned 16-bit lanes
// with saturation. Forwards to PACKUSDW (llvm.x86.sse41.packusdw).
@(require_results, enable_target_feature="sse4.1")
_mm_packus_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return transmute(__m128i)packusdw(transmute(i32x4)a, transmute(i32x4)b)
}
// Lane-wise 64-bit equality compare; each result lane is all-ones when
// the corresponding lanes are equal and zero otherwise.
@(require_results, enable_target_feature="sse4.1")
_mm_cmpeq_epi64 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return transmute(__m128i)simd.lanes_eq(a, b)
}
// Widening conversions: each procedure keeps the low lanes of a (via a
// shuffle) and lane-wise casts them to the wider element type. The cast
// sign-extends for signed sources (i8/i16/i32) and zero-extends for
// unsigned sources (u8/u16/u32). The epi64 variants return the i64x2
// result directly, since __m128i is the 2x64-bit vector type.

// Sign-extends the low 8 i8 lanes of a to i16.
@(require_results, enable_target_feature="sse4.1")
_mm_cvtepi8_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	x := transmute(i8x16)a
	y := simd.shuffle(x, x, 0, 1, 2, 3, 4, 5, 6, 7)
	return transmute(__m128i)i16x8(y)
}
// Sign-extends the low 4 i8 lanes of a to i32.
@(require_results, enable_target_feature="sse4.1")
_mm_cvtepi8_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	x := transmute(i8x16)a
	y := simd.shuffle(x, x, 0, 1, 2, 3)
	return transmute(__m128i)i32x4(y)
}
// Sign-extends the low 2 i8 lanes of a to i64.
@(require_results, enable_target_feature="sse4.1")
_mm_cvtepi8_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	x := transmute(i8x16)a
	y := simd.shuffle(x, x, 0, 1)
	return i64x2(y)
}
// Sign-extends the low 4 i16 lanes of a to i32.
@(require_results, enable_target_feature="sse4.1")
_mm_cvtepi16_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	x := transmute(i16x8)a
	y := simd.shuffle(x, x, 0, 1, 2, 3)
	return transmute(__m128i)i32x4(y)
}
// Sign-extends the low 2 i16 lanes of a to i64.
@(require_results, enable_target_feature="sse4.1")
_mm_cvtepi16_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	x := transmute(i16x8)a
	y := simd.shuffle(x, x, 0, 1)
	return i64x2(y)
}
// Sign-extends the low 2 i32 lanes of a to i64.
@(require_results, enable_target_feature="sse4.1")
_mm_cvtepi32_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	x := transmute(i32x4)a
	y := simd.shuffle(x, x, 0, 1)
	return i64x2(y)
}
// Zero-extends the low 8 u8 lanes of a to 16-bit lanes.
@(require_results, enable_target_feature="sse4.1")
_mm_cvtepu8_epi16 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	x := transmute(u8x16)a
	y := simd.shuffle(x, x, 0, 1, 2, 3, 4, 5, 6, 7)
	return transmute(__m128i)i16x8(y)
}
// Zero-extends the low 4 u8 lanes of a to 32-bit lanes.
@(require_results, enable_target_feature="sse4.1")
_mm_cvtepu8_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	x := transmute(u8x16)a
	y := simd.shuffle(x, x, 0, 1, 2, 3)
	return transmute(__m128i)i32x4(y)
}
// Zero-extends the low 2 u8 lanes of a to 64-bit lanes.
@(require_results, enable_target_feature="sse4.1")
_mm_cvtepu8_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	x := transmute(u8x16)a
	y := simd.shuffle(x, x, 0, 1)
	return i64x2(y)
}
// Zero-extends the low 4 u16 lanes of a to 32-bit lanes.
@(require_results, enable_target_feature="sse4.1")
_mm_cvtepu16_epi32 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	x := transmute(u16x8)a
	y := simd.shuffle(x, x, 0, 1, 2, 3)
	return transmute(__m128i)i32x4(y)
}
// Zero-extends the low 2 u16 lanes of a to 64-bit lanes.
@(require_results, enable_target_feature="sse4.1")
_mm_cvtepu16_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	x := transmute(u16x8)a
	y := simd.shuffle(x, x, 0, 1)
	return i64x2(y)
}
// Zero-extends the low 2 u32 lanes of a to 64-bit lanes.
@(require_results, enable_target_feature="sse4.1")
_mm_cvtepu32_epi64 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	x := transmute(u32x4)a
	y := simd.shuffle(x, x, 0, 1)
	return i64x2(y)
}
// Conditional dot product of f64 lanes, controlled by the immediate IMM8.
// Forwards to DPPD (llvm.x86.sse41.dppd).
@(require_results, enable_target_feature="sse4.1")
_mm_dp_pd :: #force_inline proc "c" (a, b: __m128d, $IMM8: u8) -> __m128d {
	return dppd(a, b, IMM8)
}
// Conditional dot product of f32 lanes, controlled by the immediate IMM8.
// Forwards to DPPS (llvm.x86.sse41.dpps).
@(require_results, enable_target_feature="sse4.1")
_mm_dp_ps :: #force_inline proc "c" (a, b: __m128, $IMM8: u8) -> __m128 {
	return dpps(a, b, IMM8)
}
// Lane-wise floor of f64 lanes.
@(require_results, enable_target_feature="sse4.1")
_mm_floor_pd :: #force_inline proc "c" (a: __m128d) -> __m128d {
	return simd.floor(a)
}
// Lane-wise floor of f32 lanes.
@(require_results, enable_target_feature="sse4.1")
_mm_floor_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
	return simd.floor(a)
}
// Scalar floor: forwards to ROUNDSD with the _MM_FROUND_FLOOR immediate.
@(require_results, enable_target_feature="sse4.1")
_mm_floor_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
	return roundsd(a, b, _MM_FROUND_FLOOR)
}
// Scalar floor: forwards to ROUNDSS with the _MM_FROUND_FLOOR immediate.
@(require_results, enable_target_feature="sse4.1")
_mm_floor_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return roundss(a, b, _MM_FROUND_FLOOR)
}
// Lane-wise ceiling of f64 lanes.
@(require_results, enable_target_feature="sse4.1")
_mm_ceil_pd :: #force_inline proc "c" (a: __m128d) -> __m128d {
	return simd.ceil(a)
}
// Lane-wise ceiling of f32 lanes.
@(require_results, enable_target_feature="sse4.1")
_mm_ceil_ps :: #force_inline proc "c" (a: __m128) -> __m128 {
	return simd.ceil(a)
}
// Scalar ceiling: forwards to ROUNDSD with the _MM_FROUND_CEIL immediate.
@(require_results, enable_target_feature="sse4.1")
_mm_ceil_sd :: #force_inline proc "c" (a, b: __m128d) -> __m128d {
	return roundsd(a, b, _MM_FROUND_CEIL)
}
// Scalar ceiling: forwards to ROUNDSS with the _MM_FROUND_CEIL immediate.
@(require_results, enable_target_feature="sse4.1")
_mm_ceil_ss :: #force_inline proc "c" (a, b: __m128) -> __m128 {
	return roundss(a, b, _MM_FROUND_CEIL)
}
// Rounds f64 lanes using the compile-time _MM_FROUND_* mode in ROUNDING.
@(require_results, enable_target_feature="sse4.1")
_mm_round_pd :: #force_inline proc "c" (a: __m128d, $ROUNDING: i32) -> __m128d {
	return roundpd(a, ROUNDING)
}
// Rounds f32 lanes using the compile-time _MM_FROUND_* mode in ROUNDING.
@(require_results, enable_target_feature="sse4.1")
_mm_round_ps :: #force_inline proc "c" (a: __m128, $ROUNDING: i32) -> __m128 {
	return roundps(a, ROUNDING)
}
// Scalar round: forwards to ROUNDSD with the _MM_FROUND_* mode in ROUNDING.
@(require_results, enable_target_feature="sse4.1")
_mm_round_sd :: #force_inline proc "c" (a, b: __m128d, $ROUNDING: i32) -> __m128d {
	return roundsd(a, b, ROUNDING)
}
// Scalar round: forwards to ROUNDSS with the _MM_FROUND_* mode in ROUNDING.
@(require_results, enable_target_feature="sse4.1")
_mm_round_ss :: #force_inline proc "c" (a, b: __m128, $ROUNDING: i32) -> __m128 {
	return roundss(a, b, ROUNDING)
}
// Horizontal minimum over the unsigned 16-bit lanes of a.
// Forwards to PHMINPOSUW (llvm.x86.sse41.phminposuw).
@(require_results, enable_target_feature="sse4.1")
_mm_minpos_epu16 :: #force_inline proc "c" (a: __m128i) -> __m128i {
	return transmute(__m128i)phminposuw(transmute(u16x8)a)
}
// Signed widening multiply of 32-bit lanes producing two 64-bit products.
// Forwards to PMULDQ (llvm.x86.sse41.pmuldq); which lanes are multiplied
// is defined by the instruction — see the Intel intrinsics guide.
@(require_results, enable_target_feature="sse4.1")
_mm_mul_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return pmuldq(transmute(i32x4)a, transmute(i32x4)b)
}
// Lane-wise 32-bit multiply, keeping the low 32 bits of each product.
@(require_results, enable_target_feature="sse4.1")
_mm_mullo_epi32 :: #force_inline proc "c" (a, b: __m128i) -> __m128i {
	return transmute(__m128i)simd.mul(transmute(i32x4)a, transmute(i32x4)b)
}
// Multiple sums of absolute byte differences, offsets selected by IMM8.
// Forwards to MPSADBW (llvm.x86.sse41.mpsadbw).
@(require_results, enable_target_feature="sse4.1")
_mm_mpsadbw_epu8 :: #force_inline proc "c" (a, b: __m128i, $IMM8: u8) -> __m128i {
	return transmute(__m128i)mpsadbw(transmute(u8x16)a, transmute(u8x16)b, IMM8)
}
// PTEST zero-flag result: forwards to llvm.x86.sse41.ptestz on (a, mask).
@(require_results, enable_target_feature="sse4.1")
_mm_testz_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
	return ptestz(a, mask)
}
// PTEST carry-flag result: forwards to llvm.x86.sse41.ptestc on (a, mask).
@(require_results, enable_target_feature="sse4.1")
_mm_testc_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
	return ptestc(a, mask)
}
// PTEST "neither zero nor carry" result: forwards to llvm.x86.sse41.ptestnzc.
@(require_results, enable_target_feature="sse4.1")
_mm_testnzc_si128 :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
	return ptestnzc(a, mask)
}
// Alias for _mm_testz_si128 (all masked bits of a are zero).
@(require_results, enable_target_feature="sse4.1")
_mm_test_all_zeros :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
	return _mm_testz_si128(a, mask)
}
// Tests whether every bit of a is set: compares a with itself to build an
// all-ones vector, then checks the carry result of PTEST against it.
// Needs sse2 as well for _mm_cmpeq_epi32 (defined elsewhere in this package).
@(require_results, enable_target_feature="sse2,sse4.1")
_mm_test_all_ones :: #force_inline proc "c" (a: __m128i) -> i32 {
	return _mm_testc_si128(a, _mm_cmpeq_epi32(a, a))
}
// Alias for _mm_testnzc_si128 (a has both ones and zeros under mask).
@(require_results, enable_target_feature="sse4.1")
_mm_test_mix_ones_zeros :: #force_inline proc "c" (a: __m128i, mask: __m128i) -> i32 {
	return _mm_testnzc_si128(a, mask)
}
// 64-bit lane access is only available on amd64.
when ODIN_ARCH == .amd64 {
	// Extracts 64-bit lane IMM1 (0 or 1) of a as a signed integer.
	@(require_results, enable_target_feature="sse4.1")
	_mm_extract_epi64 :: #force_inline proc "c" (a: __m128i, $IMM1: u32) -> i64 {
		return simd.extract(transmute(i64x2)a, IMM1)
	}
	// Returns a with 64-bit lane IMM1 (0 or 1) replaced by i.
	@(require_results, enable_target_feature="sse4.1")
	_mm_insert_epi64 :: #force_inline proc "c" (a: __m128i, i: i64, $IMM1: u32) -> __m128i {
		return transmute(__m128i)simd.replace(transmute(i64x2)a, IMM1, i)
	}
}
// Raw LLVM intrinsic bindings backing the SSE4.1 wrappers in this file.
// Private to the package; call the _mm_* procedures instead, which carry
// the required @(enable_target_feature) annotations.
@(private, default_calling_convention="none")
foreign _ {
	@(link_name = "llvm.x86.sse41.pblendvb")
	pblendvb :: proc(a, b: i8x16, mask: i8x16) -> i8x16 ---
	@(link_name = "llvm.x86.sse41.blendvpd")
	blendvpd :: proc(a, b, mask: __m128d) -> __m128d ---
	@(link_name = "llvm.x86.sse41.blendvps")
	blendvps :: proc(a, b, mask: __m128) -> __m128 ---
	@(link_name = "llvm.x86.sse41.blendpd")
	blendpd :: proc(a, b: __m128d, #const imm2: u8) -> __m128d ---
	@(link_name = "llvm.x86.sse41.blendps")
	blendps :: proc(a, b: __m128, #const imm4: u8) -> __m128 ---
	@(link_name = "llvm.x86.sse41.pblendw")
	pblendw :: proc(a: i16x8, b: i16x8, #const imm8: u8) -> i16x8 ---
	@(link_name = "llvm.x86.sse41.insertps")
	insertps :: proc(a, b: __m128, #const imm8: u8) -> __m128 ---
	@(link_name = "llvm.x86.sse41.pmaxsb")
	pmaxsb :: proc(a, b: i8x16) -> i8x16 ---
	@(link_name = "llvm.x86.sse41.pmaxuw")
	pmaxuw :: proc(a, b: u16x8) -> u16x8 ---
	@(link_name = "llvm.x86.sse41.pmaxsd")
	pmaxsd :: proc(a, b: i32x4) -> i32x4 ---
	@(link_name = "llvm.x86.sse41.pmaxud")
	pmaxud :: proc(a, b: u32x4) -> u32x4 ---
	@(link_name = "llvm.x86.sse41.pminsb")
	pminsb :: proc(a, b: i8x16) -> i8x16 ---
	@(link_name = "llvm.x86.sse41.pminuw")
	pminuw :: proc(a, b: u16x8) -> u16x8 ---
	@(link_name = "llvm.x86.sse41.pminsd")
	pminsd :: proc(a, b: i32x4) -> i32x4 ---
	@(link_name = "llvm.x86.sse41.pminud")
	pminud :: proc(a, b: u32x4) -> u32x4 ---
	@(link_name = "llvm.x86.sse41.packusdw")
	packusdw :: proc(a, b: i32x4) -> u16x8 ---
	@(link_name = "llvm.x86.sse41.dppd")
	dppd :: proc(a, b: __m128d, #const imm8: u8) -> __m128d ---
	@(link_name = "llvm.x86.sse41.dpps")
	dpps :: proc(a, b: __m128, #const imm8: u8) -> __m128 ---
	@(link_name = "llvm.x86.sse41.round.pd")
	roundpd :: proc(a: __m128d, rounding: i32) -> __m128d ---
	@(link_name = "llvm.x86.sse41.round.ps")
	roundps :: proc(a: __m128, rounding: i32) -> __m128 ---
	@(link_name = "llvm.x86.sse41.round.sd")
	roundsd :: proc(a, b: __m128d, rounding: i32) -> __m128d ---
	@(link_name = "llvm.x86.sse41.round.ss")
	roundss :: proc(a, b: __m128, rounding: i32) -> __m128 ---
	@(link_name = "llvm.x86.sse41.phminposuw")
	phminposuw :: proc(a: u16x8) -> u16x8 ---
	@(link_name = "llvm.x86.sse41.pmuldq")
	pmuldq :: proc(a, b: i32x4) -> i64x2 ---
	@(link_name = "llvm.x86.sse41.mpsadbw")
	mpsadbw :: proc(a, b: u8x16, #const imm8: u8) -> u16x8 ---
	@(link_name = "llvm.x86.sse41.ptestz")
	ptestz :: proc(a, mask: i64x2) -> i32 ---
	@(link_name = "llvm.x86.sse41.ptestc")
	ptestc :: proc(a, mask: i64x2) -> i32 ---
	@(link_name = "llvm.x86.sse41.ptestnzc")
	ptestnzc :: proc(a, mask: i64x2) -> i32 ---
}