// mMathSSE.cc
  1. //-----------------------------------------------------------------------------
  2. // Copyright (c) 2013 GarageGames, LLC
  3. //
  4. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5. // of this software and associated documentation files (the "Software"), to
  6. // deal in the Software without restriction, including without limitation the
  7. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  8. // sell copies of the Software, and to permit persons to whom the Software is
  9. // furnished to do so, subject to the following conditions:
  10. //
  11. // The above copyright notice and this permission notice shall be included in
  12. // all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  20. // IN THE SOFTWARE.
  21. //-----------------------------------------------------------------------------
  22. #include "math/mMathFn.h"
  23. #include "math/mPlane.h"
  24. #include "math/mMatrix.h"
  25. // if we set our flag, we always try to build the inlined asm.
  26. // EXCEPT if we're in an old version of Codewarrior that can't handle SSE code.
  27. #if defined(TORQUE_SUPPORTS_NASM)
  28. #define ADD_SSE_FN
  29. extern "C"
  30. {
  31. void SSE_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result);
  32. }
  33. #elif defined(TORQUE_SUPPORTS_VC_INLINE_X86_ASM)
  34. #define ADD_SSE_FN
  35. // inlined version here.
/// 4x4 single-precision matrix multiply written in MSVC x86 inline assembly
/// using SSE instructions.
///
/// Each argument is accessed as 16 consecutive F32 values (byte offsets
/// 00h..3Ch). Viewing every run of four consecutive floats as one group, the
/// code computes, for i, j, k in 0..3:
///
///    result[4*i + j] = sum over k of matB[4*i + k] * matA[4*k + j]
///
/// NOTE(review): read as row-major storage this is matB x matA (equivalently
/// matA x matB for column-major storage) -- confirm that this operand order is
/// what callers going through m_matF_x_matF expect.
///
/// @param matA   Source matrix; its four 4-float groups are held in
///               xmm1, xmm3, xmm4 and xmm5 for the whole routine.
/// @param matB   Source matrix; its elements are loaded one at a time and
///               broadcast across a register.
/// @param result Destination for the 16 output floats.
///
/// Memory is only touched with movss / movups / movlps / movhps, none of which
/// require 16-byte alignment. matA is completely loaded before the first store
/// to result, and each 4-float group of matB is completely read before the
/// matching group of result is stored.
///
/// Registers used: eax, ecx, edx, xmm0-xmm7.
///
/// Comment notation below: A0..A3 are the four groups of matA, B[n] is
/// matB[n] replicated into all four lanes. The instructions for consecutive
/// output groups are interleaved, so the group headings are approximate.
void SSE_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result)
{
   __asm
   {
      mov ecx, matA               // ecx -> matA
      mov edx, matB               // edx -> matB
      mov eax, result             // eax -> result

      // ---- result group 0 (offsets 00h..0Fh) ----
      movss xmm0, [edx]           // matB[0]
      movups xmm1, [ecx]          // xmm1 = A0 (kept for the whole routine)
      shufps xmm0, xmm0, 0        // broadcast lane 0 -> B[0]
      movss xmm2, [edx+4]         // matB[1]
      mulps xmm0, xmm1            // B[0] * A0
      shufps xmm2, xmm2, 0        // B[1]
      movups xmm3, [ecx+10h]      // xmm3 = A1 (kept)
      movss xmm7, [edx+8]         // matB[2]
      mulps xmm2, xmm3            // B[1] * A1
      shufps xmm7, xmm7, 0        // B[2]
      addps xmm0, xmm2            // accumulate
      movups xmm4, [ecx+20h]      // xmm4 = A2 (kept)
      movss xmm2, [edx+0Ch]       // matB[3]
      mulps xmm7, xmm4            // B[2] * A2
      shufps xmm2, xmm2, 0        // B[3]
      addps xmm0, xmm7            // accumulate
      movups xmm5, [ecx+30h]      // xmm5 = A3 (kept); last read from matA
      movss xmm6, [edx+10h]       // matB[4]  (start of group 1)
      mulps xmm2, xmm5            // B[3] * A3
      movss xmm7, [edx+14h]       // matB[5]
      shufps xmm6, xmm6, 0        // B[4]
      addps xmm0, xmm2            // xmm0 = finished group 0
      shufps xmm7, xmm7, 0        // B[5]
      movlps [eax], xmm0          // store low two floats of group 0
      movhps [eax+8], xmm0        // store high two floats of group 0

      // ---- result group 1 (offsets 10h..1Fh) ----
      mulps xmm7, xmm3            // B[5] * A1
      movss xmm0, [edx+18h]       // matB[6]
      mulps xmm6, xmm1            // B[4] * A0
      shufps xmm0, xmm0, 0        // B[6]
      addps xmm6, xmm7            // accumulate
      mulps xmm0, xmm4            // B[6] * A2
      movss xmm2, [edx+24h]       // matB[9]  (early load for group 2)
      addps xmm6, xmm0            // accumulate
      movss xmm0, [edx+1Ch]       // matB[7]
      movss xmm7, [edx+20h]       // matB[8]  (early load for group 2)
      shufps xmm0, xmm0, 0        // B[7]
      shufps xmm7, xmm7, 0        // B[8]
      mulps xmm0, xmm5            // B[7] * A3
      mulps xmm7, xmm1            // B[8] * A0
      addps xmm6, xmm0            // xmm6 = finished group 1
      shufps xmm2, xmm2, 0        // B[9]
      movlps [eax+10h], xmm6      // store low two floats of group 1
      movhps [eax+18h], xmm6      // store high two floats of group 1

      // ---- result group 2 (offsets 20h..2Fh) ----
      mulps xmm2, xmm3            // B[9] * A1
      movss xmm6, [edx+28h]       // matB[10]
      addps xmm7, xmm2            // accumulate
      shufps xmm6, xmm6, 0        // B[10]
      movss xmm2, [edx+2Ch]       // matB[11]
      mulps xmm6, xmm4            // B[10] * A2
      shufps xmm2, xmm2, 0        // B[11]
      addps xmm7, xmm6            // accumulate
      mulps xmm2, xmm5            // B[11] * A3
      movss xmm0, [edx+34h]       // matB[13] (early load for group 3)
      addps xmm7, xmm2            // xmm7 = finished group 2
      shufps xmm0, xmm0, 0        // B[13]
      movlps [eax+20h], xmm7      // store low two floats of group 2
      movss xmm2, [edx+30h]       // matB[12]
      movhps [eax+28h], xmm7      // store high two floats of group 2

      // ---- result group 3 (offsets 30h..3Fh) ----
      mulps xmm0, xmm3            // B[13] * A1
      shufps xmm2, xmm2, 0        // B[12]
      movss xmm6, [edx+38h]       // matB[14]
      mulps xmm2, xmm1            // B[12] * A0
      shufps xmm6, xmm6, 0        // B[14]
      addps xmm2, xmm0            // accumulate
      mulps xmm6, xmm4            // B[14] * A2
      movss xmm7, [edx+3Ch]       // matB[15]
      shufps xmm7, xmm7, 0        // B[15]
      addps xmm2, xmm6            // accumulate
      mulps xmm7, xmm5            // B[15] * A3
      addps xmm2, xmm7            // xmm2 = finished group 3
      movups [eax+30h], xmm2      // store all four floats of group 3
   }
}
  116. #endif
  117. void mInstall_Library_SSE()
  118. {
  119. #if defined(ADD_SSE_FN)
  120. m_matF_x_matF = SSE_MatrixF_x_MatrixF;
  121. // m_matF_x_point3F = Athlon_MatrixF_x_Point3F;
  122. // m_matF_x_vectorF = Athlon_MatrixF_x_VectorF;
  123. #endif
  124. }