simdExtra.h 3.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384
  1. // zlib open source license
  2. //
  3. // Copyright (c) 2019 David Forsgren Piuva
  4. //
  5. // This software is provided 'as-is', without any express or implied
  6. // warranty. In no event will the authors be held liable for any damages
  7. // arising from the use of this software.
  8. //
  9. // Permission is granted to anyone to use this software for any purpose,
  10. // including commercial applications, and to alter it and redistribute it
  11. // freely, subject to the following restrictions:
  12. //
  13. // 1. The origin of this software must not be misrepresented; you must not
  14. // claim that you wrote the original software. If you use this software
  15. // in a product, an acknowledgment in the product documentation would be
  16. // appreciated but is not required.
  17. //
  18. // 2. Altered source versions must be plainly marked as such, and must not be
  19. // misrepresented as being the original software.
  20. //
  21. // 3. This notice may not be removed or altered from any source
  22. // distribution.
  23. // An advanced high performance extension to the simpler simd.h
// The caller is expected to write the reference implementation separately for unhandled target machines,
// because the code is not as clean as when using infix math operations from simd.h.
// You will need to write a separate scalar version anyway to document the behaviour.
  27. // This module can only be used when the USE_SIMD_EXTRA macro is defined.
// This allows USE_SIMD_EXTRA to be more picky about which SIMD instruction sets to use
  29. // in order to get access to a larger intersection between the platforms.
  30. // It also keeps simd.h easy to port and emulate.
  31. // Works directly with simd vectors using aliases, instead of the wrappers.
  32. // This makes it easier to mix directly with SIMD intrinsics for a specific target.
  33. #ifndef DFPSR_SIMD_EXTRA
  34. #define DFPSR_SIMD_EXTRA
  35. #include "simd.h"
  36. #ifdef USE_SSE2
  37. #define USE_SIMD_EXTRA
  38. //struct SIMD_F32x4x2 {
  39. // SIMD_F32x4 val[2];
  40. //};
  41. //struct SIMD_U16x8x2 {
  42. // SIMD_U16x8 val[2];
  43. //};
  44. struct SIMD_U32x4x2 {
  45. SIMD_U32x4 val[2];
  46. };
  47. //struct SIMD_I32x4x2 {
  48. // SIMD_I32x4 val[2];
  49. //};
  50. static inline SIMD_U32x4x2 ZIP_U32_SIMD(SIMD_U32x4 lower, SIMD_U32x4 higher) {
  51. ALIGN16 SIMD_U32x4x2 result;
  52. result.val[0] = _mm_unpacklo_epi32(lower, higher);
  53. result.val[1] = _mm_unpackhi_epi32(lower, higher);
  54. return result;
  55. }
  56. static inline SIMD_U32x4 ZIP_LOW_U32_SIMD(SIMD_U32x4 lower, SIMD_U32x4 higher) {
  57. return _mm_unpacklo_epi32(lower, higher);
  58. }
  59. static inline SIMD_U32x4 ZIP_HIGH_U32_SIMD(SIMD_U32x4 lower, SIMD_U32x4 higher) {
  60. return _mm_unpackhi_epi32(lower, higher);
  61. }
  62. #elif USE_NEON
  63. #define USE_SIMD_EXTRA
  64. // TODO: Write regression tests and try simdExtra.h with NEON activated
  65. //#define SIMD_F32x4x2 float32x4x2_t
  66. //#define SIMD_U16x8x2 uint16x8x2_t
  67. #define SIMD_U32x4x2 uint32x4x2_t
  68. //#define SIMD_I32x4x2 int32x4x2_t
  69. static inline SIMD_U32x4x2 ZIP_U32_SIMD(SIMD_U32x4 lower, SIMD_U32x4 higher) {
  70. return vzipq_u32(lower, higher);
  71. }
  72. static inline SIMD_U32x4 ZIP_LOW_U32_SIMD(SIMD_U32x4 lower, SIMD_U32x4 higher) {
  73. //return vzipq_u32(lower, higher).val[0];
  74. return float32x2x2_t vzip_u32(vget_low_u32(lower), vget_low_u32(higher));
  75. }
  76. static inline SIMD_U32x4 ZIP_HIGH_U32_SIMD(SIMD_U32x4 lower, SIMD_U32x4 higher) {
  77. //return vzipq_u32(lower, higher).val[1];
  78. return float32x2x2_t vzip_u32(vget_high_u32(lower), vget_high_u32(higher));
  79. }
  80. #endif
  81. #endif