emulation.h

// Copyright 2009-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

/* Make precision match SSE, at the cost of some performance */
#if !defined(__aarch64__)
#  define SSE2NEON_PRECISE_DIV 1
#  define SSE2NEON_PRECISE_SQRT 1
#endif

#include "sse2neon.h"
/* Lane-wise absolute value */
__forceinline __m128 _mm_abs_ps(__m128 a) { return vabsq_f32(a); }

/* FMA wrappers following the x86 sign conventions:
   fmadd = a*b+c, fnmadd = c-a*b, fnmsub = -(a*b)-c, fmsub = a*b-c */
__forceinline __m128 _mm_fmadd_ps (__m128 a, __m128 b, __m128 c) { return vfmaq_f32(c, a, b); }
__forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { return vfmsq_f32(c, a, b); }
__forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { return vnegq_f32(vfmaq_f32(c, a, b)); }
__forceinline __m128 _mm_fmsub_ps (__m128 a, __m128 b, __m128 c) { return vnegq_f32(vfmsq_f32(c, a, b)); }

/* Broadcast a single float to all four lanes */
__forceinline __m128 _mm_broadcast_ss (float const * mem_addr)
{
  return vdupq_n_f32(*mem_addr);
}
// AVX2 emulation leverages Intel FMA defs above. Include after them.
#include "avx2neon.h"
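
// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header): the wrappers
// above map the x86 FMA intrinsics onto NEON fused multiply-add/subtract.
// A minimal check of the sign conventions, assuming sse2neon's _mm_set1_ps
// and _mm_cvtss_f32 behave like their SSE counterparts. The function name is
// hypothetical and the block is disabled so it does not affect the header.
#if 0
static inline void emulation_fma_sketch(void)
{
  __m128 a = _mm_set1_ps(2.0f);
  __m128 b = _mm_set1_ps(3.0f);
  __m128 c = _mm_set1_ps(4.0f);
  float fmadd  = _mm_cvtss_f32(_mm_fmadd_ps (a, b, c)); // expect  2*3+4 =  10
  float fnmadd = _mm_cvtss_f32(_mm_fnmadd_ps(a, b, c)); // expect -2*3+4 =  -2
  float fmsub  = _mm_cvtss_f32(_mm_fmsub_ps (a, b, c)); // expect  2*3-4 =   2
  float fnmsub = _mm_cvtss_f32(_mm_fnmsub_ps(a, b, c)); // expect -2*3-4 = -10
  (void)fmadd; (void)fnmadd; (void)fmsub; (void)fnmsub;
}
#endif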
/* Dummy defines for floating point control */
#define _MM_MASK_MASK 0x1f80
#define _MM_MASK_DIV_ZERO 0x200
// #define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_MASK_DENORM 0x100
#define _MM_SET_EXCEPTION_MASK(x)
// #define _MM_SET_FLUSH_ZERO_MODE(x)
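
// Illustrative note (not part of the original header): x86 callers commonly
// mask FP exceptions with a call like the one below; under this emulation
// layer the macro expands to nothing, so such calls compile away. The
// function name is hypothetical and the block is disabled.
#if 0
static inline void emulation_fpcontrol_sketch(void)
{
  _MM_SET_EXCEPTION_MASK(_MM_MASK_MASK); // no-op on ARM
}
#endif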
/*
__forceinline int _mm_getcsr()
{
  return 0;
}

__forceinline void _mm_mfence()
{
  __sync_synchronize();
}
*/
/* Load 4 uint8 values and zero-extend each to a 32-bit integer lane
   (the underlying vld1_u8 reads 8 bytes; only the low 4 are used) */
__forceinline __m128i _mm_load4epu8_epi32(__m128i *ptr)
{
  uint8x8_t  t0 = vld1_u8((uint8_t*)ptr);
  uint16x8_t t1 = vmovl_u8(t0);
  uint32x4_t t2 = vmovl_u16(vget_low_u16(t1));
  return vreinterpretq_s32_u32(t2);
}
/* Load 4 uint16 values and zero-extend each to a 32-bit integer lane
   (the underlying vld1q_u16 reads 16 bytes; only the low 4 lanes are used) */
__forceinline __m128i _mm_load4epu16_epi32(__m128i *ptr)
{
  uint16x8_t t0 = vld1q_u16((uint16_t*)ptr);
  uint32x4_t t1 = vmovl_u16(vget_low_u16(t0));
  return vreinterpretq_s32_u32(t1);
}
/* Load 4 int8 values, sign-extend to 32 bit and convert to float;
   the float result is returned bit-cast to __m128i */
__forceinline __m128i _mm_load4epi8_f32(__m128i *ptr)
{
  int8x8_t    t0 = vld1_s8((int8_t*)ptr);
  int16x8_t   t1 = vmovl_s8(t0);
  int32x4_t   t2 = vmovl_s16(vget_low_s16(t1));
  float32x4_t t3 = vcvtq_f32_s32(t2);
  return vreinterpretq_s32_f32(t3);
}
/* Load 4 uint8 values and zero-extend each to a 32-bit integer lane
   (note: despite the _f32 suffix there is no float conversion here) */
__forceinline __m128i _mm_load4epu8_f32(__m128i *ptr)
{
  uint8x8_t  t0 = vld1_u8((uint8_t*)ptr);
  uint16x8_t t1 = vmovl_u8(t0);
  uint32x4_t t2 = vmovl_u16(vget_low_u16(t1));
  return vreinterpretq_s32_u32(t2);
}
/* Load 4 int16 values, sign-extend to 32 bit and convert to float;
   the float result is returned bit-cast to __m128i */
__forceinline __m128i _mm_load4epi16_f32(__m128i *ptr)
{
  int16x8_t   t0 = vld1q_s16((int16_t*)ptr);
  int32x4_t   t1 = vmovl_s16(vget_low_s16(t0));
  float32x4_t t2 = vcvtq_f32_s32(t1);
  return vreinterpretq_s32_f32(t2);
}
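
// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header): the helpers
// above widen 4 packed 8/16-bit values into one 32-bit lane each. A minimal
// example for a caller holding 4 packed uint8 color channels; the function
// name and the rgba parameter are hypothetical, and the buffer must be at
// least 8 readable bytes because vld1_u8 loads 8 bytes (only 4 are used).
#if 0
static inline __m128 emulation_load4epu8_to_float_sketch(const uint8_t* rgba)
{
  // Zero-extend the 4 bytes to 32-bit integer lanes...
  __m128i lanes = _mm_load4epu8_epi32((__m128i*)rgba);
  // ...then convert to float and scale to [0,1].
  return _mm_mul_ps(_mm_cvtepi32_ps(lanes), _mm_set1_ps(1.0f / 255.0f));
}
#endif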