/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------===
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *
 *===-----------------------------------------------------------------------===
 */

#ifndef __EMMINTRIN_H
#define __EMMINTRIN_H

#ifndef __SSE2__
#error "SSE2 instruction set not enabled"
#else

#include <xmmintrin.h>

typedef double __m128d __attribute__((__vector_size__(16)));
typedef long long __m128i __attribute__((__vector_size__(16)));

/* Type defines. */
typedef double __v2df __attribute__ ((__vector_size__ (16)));
typedef long long __v2di __attribute__ ((__vector_size__ (16)));
typedef short __v8hi __attribute__((__vector_size__(16)));
typedef char __v16qi __attribute__((__vector_size__(16)));

/* Define the default attributes for the functions in this file. */
#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__))
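
/* Packed and scalar double-precision arithmetic (add, subtract, multiply,
   divide, square root, minimum, maximum) and bitwise logic on __m128d. */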
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_sd(__m128d __a, __m128d __b)
{
  __a[0] += __b[0];
  return __a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_add_pd(__m128d __a, __m128d __b)
{
  return __a + __b;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_sd(__m128d __a, __m128d __b)
{
  __a[0] -= __b[0];
  return __a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sub_pd(__m128d __a, __m128d __b)
{
  return __a - __b;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_sd(__m128d __a, __m128d __b)
{
  __a[0] *= __b[0];
  return __a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_mul_pd(__m128d __a, __m128d __b)
{
  return __a * __b;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_sd(__m128d __a, __m128d __b)
{
  __a[0] /= __b[0];
  return __a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_div_pd(__m128d __a, __m128d __b)
{
  return __a / __b;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_sqrtsd(__b);
  return (__m128d) { __c[0], __a[1] };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_sqrt_pd(__m128d __a)
{
  return __builtin_ia32_sqrtpd(__a);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_minsd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_min_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_minpd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_maxsd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_max_pd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_maxpd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_and_pd(__m128d __a, __m128d __b)
{
  return (__m128d)((__v4si)__a & (__v4si)__b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_andnot_pd(__m128d __a, __m128d __b)
{
  return (__m128d)(~(__v4si)__a & (__v4si)__b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_or_pd(__m128d __a, __m128d __b)
{
  return (__m128d)((__v4si)__a | (__v4si)__b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_xor_pd(__m128d __a, __m128d __b)
{
  return (__m128d)((__v4si)__a ^ (__v4si)__b);
}
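
/* Packed and scalar double-precision comparisons.  The _pd forms compare both
   elements and produce all-ones/all-zeros masks; the _sd forms compare the low
   elements and keep the upper element of the first operand. */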
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpeqpd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpltpd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmplepd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpltpd(__b, __a);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmplepd(__b, __a);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpordpd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpunordpd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpneqpd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpnltpd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpnlepd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpnltpd(__b, __a);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_pd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpnlepd(__b, __a);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpeq_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpeqsd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmplt_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpltsd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmple_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmplesd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpgt_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_cmpltsd(__b, __a);
  return (__m128d) { __c[0], __a[1] };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpge_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_cmplesd(__b, __a);
  return (__m128d) { __c[0], __a[1] };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpord_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpordsd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpunord_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpunordsd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpneq_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpneqsd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnlt_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpnltsd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnle_sd(__m128d __a, __m128d __b)
{
  return (__m128d)__builtin_ia32_cmpnlesd(__a, __b);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpngt_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_cmpnltsd(__b, __a);
  return (__m128d) { __c[0], __a[1] };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cmpnge_sd(__m128d __a, __m128d __b)
{
  __m128d __c = __builtin_ia32_cmpnlesd(__b, __a);
  return (__m128d) { __c[0], __a[1] };
}
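
/* Scalar double-precision comparisons that return an int (COMISD/UCOMISD).
   The ucomi forms do not raise the invalid exception for quiet NaN operands. */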
static __inline__ int __DEFAULT_FN_ATTRS
_mm_comieq_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdeq(__a, __b);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_comilt_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdlt(__a, __b);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_comile_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdle(__a, __b);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_comigt_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdgt(__a, __b);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_comige_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdge(__a, __b);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_comineq_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_comisdneq(__a, __b);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomieq_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdeq(__a, __b);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomilt_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdlt(__a, __b);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomile_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdle(__a, __b);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomigt_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdgt(__a, __b);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomige_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdge(__a, __b);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_ucomineq_sd(__m128d __a, __m128d __b)
{
  return __builtin_ia32_ucomisdneq(__a, __b);
}
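
/* Conversions between double-precision, single-precision, and integer
   vectors, and between the low scalar element and ordinary C types. */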
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtpd_ps(__m128d __a)
{
  return __builtin_ia32_cvtpd2ps(__a);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtps_pd(__m128 __a)
{
  return __builtin_ia32_cvtps2pd(__a);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtepi32_pd(__m128i __a)
{
  return __builtin_ia32_cvtdq2pd((__v4si)__a);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtpd_epi32(__m128d __a)
{
  return __builtin_ia32_cvtpd2dq(__a);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtsd_si32(__m128d __a)
{
  return __builtin_ia32_cvtsd2si(__a);
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtsd_ss(__m128 __a, __m128d __b)
{
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi32_sd(__m128d __a, int __b)
{
  __a[0] = __b;
  return __a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtss_sd(__m128d __a, __m128 __b)
{
  __a[0] = __b[0];
  return __a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttpd_epi32(__m128d __a)
{
  return (__m128i)__builtin_ia32_cvttpd2dq(__a);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvttsd_si32(__m128d __a)
{
  return __a[0];
}

static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_cvtpd_pi32(__m128d __a)
{
  return (__m64)__builtin_ia32_cvtpd2pi(__a);
}

static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_cvttpd_pi32(__m128d __a)
{
  return (__m64)__builtin_ia32_cvttpd2pi(__a);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtpi32_pd(__m64 __a)
{
  return __builtin_ia32_cvtpi2pd((__v2si)__a);
}

static __inline__ double __DEFAULT_FN_ATTRS
_mm_cvtsd_f64(__m128d __a)
{
  return __a[0];
}
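
/* Loads of double-precision values.  The unaligned forms go through packed,
   __may_alias__ wrapper structs so the compiler emits unaligned accesses. */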
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_pd(double const *__dp)
{
  return *(__m128d*)__dp;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load1_pd(double const *__dp)
{
  struct __mm_load1_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_load1_pd_struct*)__dp)->__u;
  return (__m128d){ __u, __u };
}

#define _mm_load_pd1(dp) _mm_load1_pd(dp)

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadr_pd(double const *__dp)
{
  __m128d __u = *(__m128d*)__dp;
  return __builtin_shufflevector(__u, __u, 1, 0);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadu_pd(double const *__dp)
{
  struct __loadu_pd {
    __m128d __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_pd*)__dp)->__v;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_load_sd(double const *__dp)
{
  struct __mm_load_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_load_sd_struct*)__dp)->__u;
  return (__m128d){ __u, 0 };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadh_pd(__m128d __a, double const *__dp)
{
  struct __mm_loadh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_loadh_pd_struct*)__dp)->__u;
  return (__m128d){ __a[0], __u };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_loadl_pd(__m128d __a, double const *__dp)
{
  struct __mm_loadl_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  double __u = ((struct __mm_loadl_pd_struct*)__dp)->__u;
  return (__m128d){ __u, __a[1] };
}
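
/* Initialization: build a vector from scalar arguments.  Note that _mm_set_pd
   takes its arguments high element first, while _mm_setr_pd is low-to-high. */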
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_sd(double __w)
{
  return (__m128d){ __w, 0 };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set1_pd(double __w)
{
  return (__m128d){ __w, __w };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_set_pd(double __w, double __x)
{
  return (__m128d){ __x, __w };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setr_pd(double __w, double __x)
{
  return (__m128d){ __w, __x };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_setzero_pd(void)
{
  return (__m128d){ 0, 0 };
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_move_sd(__m128d __a, __m128d __b)
{
  return (__m128d){ __b[0], __a[1] };
}
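
/* Stores of double-precision values. */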
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_sd(double *__dp, __m128d __a)
{
  struct __mm_store_sd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store_sd_struct*)__dp)->__u = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_store1_pd(double *__dp, __m128d __a)
{
  struct __mm_store1_pd_struct {
    double __u[2];
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_store1_pd_struct*)__dp)->__u[0] = __a[0];
  ((struct __mm_store1_pd_struct*)__dp)->__u[1] = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_pd(double *__dp, __m128d __a)
{
  *(__m128d *)__dp = __a;
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_pd(double *__dp, __m128d __a)
{
  __builtin_ia32_storeupd(__dp, __a);
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_storer_pd(double *__dp, __m128d __a)
{
  __a = __builtin_shufflevector(__a, __a, 1, 0);
  *(__m128d *)__dp = __a;
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeh_pd(double *__dp, __m128d __a)
{
  struct __mm_storeh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1];
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_pd(double *__dp, __m128d __a)
{
  struct __mm_storeh_pd_struct {
    double __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0];
}
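
/* Packed integer arithmetic (plus the 64-bit MMX counterparts _mm_add_si64,
   _mm_sub_si64, and _mm_mul_su32): add, subtract, saturating add/subtract,
   average, multiply, multiply-add, sum of absolute differences, min, max. */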
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)((__v16qi)__a + (__v16qi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a + (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)((__v4si)__a + (__v4si)__b);
}

static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_add_si64(__m64 __a, __m64 __b)
{
  return __a + __b;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_add_epi64(__m128i __a, __m128i __b)
{
  return __a + __b;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_adds_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_avg_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_madd_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_max_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_min_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mulhi_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mullo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a * (__v8hi)__b);
}

static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_mul_su32(__m64 __a, __m64 __b)
{
  return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_mul_epu32(__m128i __a, __m128i __b)
{
  return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sad_epu8(__m128i __a, __m128i __b)
{
  return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)((__v16qi)__a - (__v16qi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a - (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)((__v4si)__a - (__v4si)__b);
}

static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_sub_si64(__m64 __a, __m64 __b)
{
  return __a - __b;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sub_epi64(__m128i __a, __m128i __b)
{
  return __a - __b;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_subs_epu16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b);
}
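
/* 128-bit bitwise logic on integer vectors. */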
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_and_si128(__m128i __a, __m128i __b)
{
  return __a & __b;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_andnot_si128(__m128i __a, __m128i __b)
{
  return ~__a & __b;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_or_si128(__m128i __a, __m128i __b)
{
  return __a | __b;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_xor_si128(__m128i __a, __m128i __b)
{
  return __a ^ __b;
}
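
/* Shifts.  _mm_slli_si128/_mm_srli_si128 shift the whole 128-bit value by imm
   bytes (e.g. _mm_slli_si128(a, 4) shifts in four zero bytes at the low end);
   the remaining intrinsics shift each element by a bit count. */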
#define _mm_slli_si128(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v16qi)_mm_setzero_si128(), \
                                   (__v16qi)(__m128i)(a), \
                                   ((imm)&0xF0) ? 0 : 16 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 17 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 18 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 19 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 20 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 21 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 22 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 23 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 24 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 25 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 26 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 27 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 28 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 29 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 30 - ((imm)&0xF), \
                                   ((imm)&0xF0) ? 0 : 31 - ((imm)&0xF)); })

#define _mm_bslli_si128(a, imm) \
  _mm_slli_si128((a), (imm))

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi16(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi16(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi32(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi32(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_slli_epi64(__m128i __a, int __count)
{
  return __builtin_ia32_psllqi128(__a, __count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sll_epi64(__m128i __a, __m128i __count)
{
  return __builtin_ia32_psllq128(__a, __count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi16(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi16(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srai_epi32(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_sra_epi32(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count);
}

#define _mm_srli_si128(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v16qi)(__m128i)(a), \
                                   (__v16qi)_mm_setzero_si128(), \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 0, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 1, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 2, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 3, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 4, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 5, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 6, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 7, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 8, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 9, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 10, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 11, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 12, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 13, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 14, \
                                   ((imm)&0xF0) ? 16 : ((imm)&0xF) + 15); })

#define _mm_bsrli_si128(a, imm) \
  _mm_srli_si128((a), (imm))

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi16(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi16(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi32(__m128i __a, int __count)
{
  return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi32(__m128i __a, __m128i __count)
{
  return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srli_epi64(__m128i __a, int __count)
{
  return __builtin_ia32_psrlqi128(__a, __count);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_srl_epi64(__m128i __a, __m128i __count)
{
  return __builtin_ia32_psrlq128(__a, __count);
}
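
/* Packed integer comparisons, yielding all-ones or all-zeros per element. */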
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)((__v16qi)__a == (__v16qi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a == (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpeq_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)((__v4si)__a == (__v4si)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi8(__m128i __a, __m128i __b)
{
  /* This function always performs a signed comparison, but __v16qi is a char
     which may be signed or unsigned. */
  typedef signed char __v16qs __attribute__((__vector_size__(16)));
  return (__m128i)((__v16qs)__a > (__v16qs)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)((__v8hi)__a > (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmpgt_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)((__v4si)__a > (__v4si)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi8(__m128i __a, __m128i __b)
{
  return _mm_cmpgt_epi8(__b, __a);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi16(__m128i __a, __m128i __b)
{
  return _mm_cmpgt_epi16(__b, __a);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cmplt_epi32(__m128i __a, __m128i __b)
{
  return _mm_cmpgt_epi32(__b, __a);
}
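
/* Scalar conversions involving 64-bit integers (x86-64 only), conversions
   between integer and single-precision vectors, and moves between __m128i
   and 32/64-bit integers. */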
#ifdef __x86_64__
static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_cvtsi64_sd(__m128d __a, long long __b)
{
  __a[0] = __b;
  return __a;
}

static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtsd_si64(__m128d __a)
{
  return __builtin_ia32_cvtsd2si64(__a);
}

static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvttsd_si64(__m128d __a)
{
  return __a[0];
}
#endif

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_cvtepi32_ps(__m128i __a)
{
  return __builtin_ia32_cvtdq2ps((__v4si)__a);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtps_epi32(__m128 __a)
{
  return (__m128i)__builtin_ia32_cvtps2dq(__a);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvttps_epi32(__m128 __a)
{
  return (__m128i)__builtin_ia32_cvttps2dq(__a);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi32_si128(int __a)
{
  return (__m128i)(__v4si){ __a, 0, 0, 0 };
}

#ifdef __x86_64__
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_cvtsi64_si128(long long __a)
{
  return (__m128i){ __a, 0 };
}
#endif

static __inline__ int __DEFAULT_FN_ATTRS
_mm_cvtsi128_si32(__m128i __a)
{
  __v4si __b = (__v4si)__a;
  return __b[0];
}

#ifdef __x86_64__
static __inline__ long long __DEFAULT_FN_ATTRS
_mm_cvtsi128_si64(__m128i __a)
{
  return __a[0];
}
#endif
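
/* Integer vector loads and initializers (set, setr, set1, setzero). */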
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_load_si128(__m128i const *__p)
{
  return *__p;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadu_si128(__m128i const *__p)
{
  struct __loadu_si128 {
    __m128i __v;
  } __attribute__((__packed__, __may_alias__));
  return ((struct __loadu_si128*)__p)->__v;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_loadl_epi64(__m128i const *__p)
{
  struct __mm_loadl_epi64_struct {
    long long __u;
  } __attribute__((__packed__, __may_alias__));
  return (__m128i) { ((struct __mm_loadl_epi64_struct*)__p)->__u, 0};
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64x(long long q1, long long q0)
{
  return (__m128i){ q0, q1 };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi64(__m64 q1, __m64 q0)
{
  return (__m128i){ (long long)q0, (long long)q1 };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi32(int i3, int i2, int i1, int i0)
{
  return (__m128i)(__v4si){ i0, i1, i2, i3};
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi16(short w7, short w6, short w5, short w4, short w3, short w2,
              short w1, short w0)
{
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set_epi8(char b15, char b14, char b13, char b12, char b11, char b10,
             char b9, char b8, char b7, char b6, char b5, char b4, char b3,
             char b2, char b1, char b0)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7,
                             b8, b9, b10, b11, b12, b13, b14, b15 };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64x(long long __q)
{
  return (__m128i){ __q, __q };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi64(__m64 __q)
{
  return (__m128i){ (long long)__q, (long long)__q };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi32(int __i)
{
  return (__m128i)(__v4si){ __i, __i, __i, __i };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi16(short __w)
{
  return (__m128i)(__v8hi){ __w, __w, __w, __w, __w, __w, __w, __w };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_set1_epi8(char __b)
{
  return (__m128i)(__v16qi){ __b, __b, __b, __b, __b, __b, __b, __b,
                             __b, __b, __b, __b, __b, __b, __b, __b };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi64(__m64 q0, __m64 q1)
{
  return (__m128i){ (long long)q0, (long long)q1 };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi32(int i0, int i1, int i2, int i3)
{
  return (__m128i)(__v4si){ i0, i1, i2, i3};
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5,
               short w6, short w7)
{
  return (__m128i)(__v8hi){ w0, w1, w2, w3, w4, w5, w6, w7 };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setr_epi8(char b0, char b1, char b2, char b3, char b4, char b5, char b6,
              char b7, char b8, char b9, char b10, char b11, char b12,
              char b13, char b14, char b15)
{
  return (__m128i)(__v16qi){ b0, b1, b2, b3, b4, b5, b6, b7,
                             b8, b9, b10, b11, b12, b13, b14, b15 };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_setzero_si128(void)
{
  return (__m128i){ 0LL, 0LL };
}
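
/* Integer vector stores, the byte-masked store, non-temporal (streaming)
   stores, cache-line flush, and load/store fences. */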
static __inline__ void __DEFAULT_FN_ATTRS
_mm_store_si128(__m128i *__p, __m128i __b)
{
  *__p = __b;
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_storeu_si128(__m128i *__p, __m128i __b)
{
  __builtin_ia32_storedqu((char *)__p, (__v16qi)__b);
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p)
{
  __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p);
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_storel_epi64(__m128i *__p, __m128i __a)
{
  struct __mm_storel_epi64_struct {
    long long __u;
  } __attribute__((__packed__, __may_alias__));
  ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0];
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_pd(double *__p, __m128d __a)
{
  __builtin_ia32_movntpd(__p, __a);
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si128(__m128i *__p, __m128i __a)
{
  __builtin_ia32_movntdq(__p, __a);
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si32(int *__p, int __a)
{
  __builtin_ia32_movnti(__p, __a);
}

#ifdef __x86_64__
static __inline__ void __DEFAULT_FN_ATTRS
_mm_stream_si64(long long *__p, long long __a)
{
  __builtin_ia32_movnti64(__p, __a);
}
#endif

static __inline__ void __DEFAULT_FN_ATTRS
_mm_clflush(void const *__p)
{
  __builtin_ia32_clflush(__p);
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_lfence(void)
{
  __builtin_ia32_lfence();
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_mfence(void)
{
  __builtin_ia32_mfence();
}
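
/* Saturating packs, 16-bit element extract/insert, movemasks, immediate
   shuffles, and interleaves (unpacks) for integer and double-precision
   vectors. */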
static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packs_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_packus_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_extract_epi16(__m128i __a, int __imm)
{
  __v8hi __b = (__v8hi)__a;
  return (unsigned short)__b[__imm & 7];
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_insert_epi16(__m128i __a, int __b, int __imm)
{
  __v8hi __c = (__v8hi)__a;
  __c[__imm & 7] = __b;
  return (__m128i)__c;
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_epi8(__m128i __a)
{
  return __builtin_ia32_pmovmskb128((__v16qi)__a);
}

#define _mm_shuffle_epi32(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v4si)(__m128i)(a), \
                                   (__v4si)_mm_set1_epi32(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6); })

#define _mm_shufflelo_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_set1_epi16(0), \
                                   (imm) & 0x3, ((imm) & 0xc) >> 2, \
                                   ((imm) & 0x30) >> 4, ((imm) & 0xc0) >> 6, \
                                   4, 5, 6, 7); })

#define _mm_shufflehi_epi16(a, imm) __extension__ ({ \
  (__m128i)__builtin_shufflevector((__v8hi)(__m128i)(a), \
                                   (__v8hi)_mm_set1_epi16(0), \
                                   0, 1, 2, 3, \
                                   4 + (((imm) & 0x03) >> 0), \
                                   4 + (((imm) & 0x0c) >> 2), \
                                   4 + (((imm) & 0x30) >> 4), \
                                   4 + (((imm) & 0xc0) >> 6)); })

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b,
                                          8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11,
                                          12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b,
                                          4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpackhi_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector(__a, __b, 1, 2+1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi8(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b,
                                          0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3,
                                          4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi16(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b,
                                          0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi32(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1);
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_unpacklo_epi64(__m128i __a, __m128i __b)
{
  return (__m128i)__builtin_shufflevector(__a, __b, 0, 2+0);
}

static __inline__ __m64 __DEFAULT_FN_ATTRS
_mm_movepi64_pi64(__m128i __a)
{
  return (__m64)__a[0];
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_movpi64_epi64(__m64 __a)
{
  return (__m128i){ (long long)__a, 0 };
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_move_epi64(__m128i __a)
{
  return __builtin_shufflevector(__a, (__m128i){ 0 }, 0, 2);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpackhi_pd(__m128d __a, __m128d __b)
{
  return __builtin_shufflevector(__a, __b, 1, 2+1);
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_unpacklo_pd(__m128d __a, __m128d __b)
{
  return __builtin_shufflevector(__a, __b, 0, 2+0);
}

static __inline__ int __DEFAULT_FN_ATTRS
_mm_movemask_pd(__m128d __a)
{
  return __builtin_ia32_movmskpd(__a);
}

#define _mm_shuffle_pd(a, b, i) __extension__ ({ \
  __builtin_shufflevector((__m128d)(a), (__m128d)(b), \
                          (i) & 1, (((i) & 2) >> 1) + 2); })
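
/* Bit casts between the 128-bit vector types (reinterpretation only, no
   instructions are generated), plus the PAUSE spin-wait hint. */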
static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castpd_ps(__m128d __a)
{
  return (__m128)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castpd_si128(__m128d __a)
{
  return (__m128i)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castps_pd(__m128 __a)
{
  return (__m128d)__a;
}

static __inline__ __m128i __DEFAULT_FN_ATTRS
_mm_castps_si128(__m128 __a)
{
  return (__m128i)__a;
}

static __inline__ __m128 __DEFAULT_FN_ATTRS
_mm_castsi128_ps(__m128i __a)
{
  return (__m128)__a;
}

static __inline__ __m128d __DEFAULT_FN_ATTRS
_mm_castsi128_pd(__m128i __a)
{
  return (__m128d)__a;
}

static __inline__ void __DEFAULT_FN_ATTRS
_mm_pause(void)
{
  __asm__ volatile ("pause");
}

#undef __DEFAULT_FN_ATTRS

#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y))

#endif /* __SSE2__ */
#endif /* __EMMINTRIN_H */