mMathAMD.cc 12 KB


  1. //-----------------------------------------------------------------------------
  2. // Copyright (c) 2013 GarageGames, LLC
  3. //
  4. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5. // of this software and associated documentation files (the "Software"), to
  6. // deal in the Software without restriction, including without limitation the
  7. // rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  8. // sell copies of the Software, and to permit persons to whom the Software is
  9. // furnished to do so, subject to the following conditions:
  10. //
  11. // The above copyright notice and this permission notice shall be included in
  12. // all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. // FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  20. // IN THE SOFTWARE.
  21. //-----------------------------------------------------------------------------
  22. #include "math/mMathFn.h"
  23. #include "math/mPlane.h"
  24. #include "math/mMatrix.h"
  25. // extern void (*m_matF_x_point3F)(const F32 *m, const F32 *p, F32 *presult);
  26. // extern void (*m_matF_x_vectorF)(const F32 *m, const F32 *v, F32 *vresult);
  27. /* not currently implemented.
  28. void Athlon_MatrixF_x_Point3F(const F32 *m, const F32 *p, F32 *presult)
  29. {
  30. m;
  31. p;
  32. presult;
  33. }
  34. */
  35. //============================================================
  36. // Here's the C code for MatF_x_MatF:
  37. // note that the code below does it in a different order (optimal asm, after all!)
  38. //
  39. // r[0] = a[0]*b[0] + a[1]*b[4] + a[2]*b[8] + a[3]*b[12];
  40. // r[1] = a[0]*b[1] + a[1]*b[5] + a[2]*b[9] + a[3]*b[13];
  41. // r[2] = a[0]*b[2] + a[1]*b[6] + a[2]*b[10] + a[3]*b[14];
  42. // r[3] = a[0]*b[3] + a[1]*b[7] + a[2]*b[11] + a[3]*b[15];
  43. //
  44. // r[4] = a[4]*b[0] + a[5]*b[4] + a[6]*b[8] + a[7]*b[12];
  45. // r[5] = a[4]*b[1] + a[5]*b[5] + a[6]*b[9] + a[7]*b[13];
  46. // r[6] = a[4]*b[2] + a[5]*b[6] + a[6]*b[10] + a[7]*b[14];
  47. // r[7] = a[4]*b[3] + a[5]*b[7] + a[6]*b[11] + a[7]*b[15];
  48. //
  49. // r[8] = a[8]*b[0] + a[9]*b[4] + a[10]*b[8] + a[11]*b[12];
  50. // r[9] = a[8]*b[1] + a[9]*b[5] + a[10]*b[9] + a[11]*b[13];
  51. // r[10]= a[8]*b[2] + a[9]*b[6] + a[10]*b[10]+ a[11]*b[14];
  52. // r[11]= a[8]*b[3] + a[9]*b[7] + a[10]*b[11]+ a[11]*b[15];
  53. //
  54. // r[12]= a[12]*b[0]+ a[13]*b[4]+ a[14]*b[8] + a[15]*b[12];
  55. // r[13]= a[12]*b[1]+ a[13]*b[5]+ a[14]*b[9] + a[15]*b[13];
  56. // r[14]= a[12]*b[2]+ a[13]*b[6]+ a[14]*b[10]+ a[15]*b[14];
  57. // r[15]= a[12]*b[3]+ a[13]*b[7]+ a[14]*b[11]+ a[15]*b[15];
  58. //============================================================
  // Select how the 3DNow! matrix multiply is provided. Both branches define
  // ADD_3DNOW_FUNCS, which mInstall_AMD_Math() tests before installing the routine.
  // If neither macro is defined, no Athlon routine is declared or defined here.
  59. #if defined(TORQUE_SUPPORTS_NASM)
  60. #define ADD_3DNOW_FUNCS
  // Only the declaration is visible in this branch; the definition is not in this file
  // (presumably it comes from a NASM-assembled source -- TODO confirm).
  61. extern "C"
  62. {
  63. void Athlon_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result);
  64. }
  65. #elif defined(TORQUE_SUPPORTS_VC_INLINE_X86_ASM)
  66. #define ADD_3DNOW_FUNCS
  67. // inlined version here.
  //
  // Athlon_MatrixF_x_MatrixF -- 4x4 F32 matrix product using AMD 3DNow! instructions.
  //
  // Produces the same sums as the C reference in the comment block above:
  //   result[4*i + j] = sum over k of matA[4*i + k] * matB[4*k + j]
  //
  // matA   - left operand, 16 floats (addressed through ecx)
  // matB   - right operand, 16 floats (addressed through edx)
  // result - receives the 16-float product (addressed through eax)
  //
  // The body is the same schedule repeated four times, once per group of four
  // consecutive floats of matA / result (byte offsets 0, 16, 32 and 48). Within a
  // group each matA element is duplicated into both halves of an MMX register,
  // multiplied against two floats of matB at a time, and the products are summed.
  // The loads for the next group are interleaved with the last adds and stores of the
  // previous one, so do not reorder instructions without re-checking register lifetimes.
  //
  // Each group's results are stored before matB is read again for the next group, so
  // result must not overlap matB.
  //
  // Comment notation: "hi | lo" are the upper and lower 32-bit halves of an MMX
  // register. xCR is the 1-based element in column C, row R, i.e. x[4*(R-1) + (C-1)],
  // so a21 is a[1] and b12 is b[4]. XXX marks a half that holds stale data at that point.
  68. void Athlon_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result)
  69. {
  70. __asm
  71. {
  // femms is AMD's fast EMMS: it marks the MMX registers empty. It is issued on entry
  // and again before leaving the block.
  72. femms
  73. mov ecx, matA
  74. mov edx, matB
  75. mov eax, result
  76. prefetch [ecx+32] ;// These may help -
  77. prefetch [edx+32] ;// and probably don't hurt
  // ---- group 1: result[0..3] from matA[0..3] ----
  78. movq mm0,[ecx] ;// a21 | a11
  79. movq mm1,[ecx+8] ;// a41 | a31
  80. movq mm4,[edx] ;// b21 | b11
  81. punpckhdq mm2,mm0 ;// a21 | XXX
  82. movq mm5,[edx+16] ;// b22 | b12
  83. punpckhdq mm3,mm1 ;// a41 | XXX
  84. movq mm6,[edx+32] ;// b23 | b13
  85. punpckldq mm0,mm0 ;// a11 | a11
  86. punpckldq mm1,mm1 ;// a31 | a31
  87. pfmul mm4,mm0 ;// a11*b21 | a11*b11
  88. punpckhdq mm2,mm2 ;// a21 | a21
  89. pfmul mm0,[edx+8] ;// a11*b41 | a11*b31
  90. movq mm7,[edx+48] ;// b24 | b14
  91. pfmul mm5,mm2 ;// a21*b22 | a21*b12
  92. punpckhdq mm3,mm3 ;// a41 | a41
  93. pfmul mm2,[edx+24] ;// a21*b42 | a21*b32
  94. pfmul mm6,mm1 ;// a31*b23 | a31*b13
  95. pfadd mm5,mm4 ;// a21*b22 + a11*b21 | a21*b12 + a11*b11
  96. pfmul mm1,[edx+40] ;// a31*b43 | a31*b33
  97. pfadd mm2,mm0 ;// a21*b42 + a11*b41 | a21*b32 + a11*b31
  98. pfmul mm7,mm3 ;// a41*b24 | a41*b14
  99. pfadd mm6,mm5 ;// a21*b22 + a11*b21 + a31*b23 | a21*b12 + a11*b11 + a31*b13
  100. pfmul mm3,[edx+56] ;// a41*b44 | a41*b34
  101. pfadd mm2,mm1 ;// a21*b42 + a11*b41 + a31*b43 | a21*b32 + a11*b31 + a31*b33
  102. pfadd mm7,mm6 ;// a41*b24 + a21*b22 + a11*b21 + a31*b23 | a41*b14 + a21*b12 + a11*b11 + a31*b13
  // ---- group 2: result[4..7] from matA[4..7] (loads start before group 1 is stored) ----
  103. movq mm0,[ecx+16] ;// a22 | a12
  104. pfadd mm3,mm2 ;// a41*b44 + a21*b42 + a11*b41 + a31*b43 | a41*b34 + a21*b32 + a11*b31 + a31*b33
  105. movq mm1,[ecx+24] ;// a42 | a32
  106. movq [eax],mm7 ;// r21 | r11
  107. movq mm4,[edx] ;// b21 | b11
  108. movq [eax+8],mm3 ;// r41 | r31
  109. punpckhdq mm2,mm0 ;// a22 | XXX
  110. movq mm5,[edx+16] ;// b22 | b12
  111. punpckhdq mm3,mm1 ;// a42 | XXX
  112. movq mm6,[edx+32] ;// b23 | b13
  113. punpckldq mm0,mm0 ;// a12 | a12
  114. punpckldq mm1,mm1 ;// a32 | a32
  115. pfmul mm4,mm0 ;// a12*b21 | a12*b11
  116. punpckhdq mm2,mm2 ;// a22 | a22
  117. pfmul mm0,[edx+8] ;// a12*b41 | a12*b31
  118. movq mm7,[edx+48] ;// b24 | b14
  119. pfmul mm5,mm2 ;// a22*b22 | a22*b12
  120. punpckhdq mm3,mm3 ;// a42 | a42
  121. pfmul mm2,[edx+24] ;// a22*b42 | a22*b32
  122. pfmul mm6,mm1 ;// a32*b23 | a32*b13
  123. pfadd mm5,mm4 ;// a12*b21 + a22*b22 | a12*b11 + a22*b12
  124. pfmul mm1,[edx+40] ;// a32*b43 | a32*b33
  125. pfadd mm2,mm0 ;// a12*b41 + a22*b42 | a12*b31 + a22*b32
  126. pfmul mm7,mm3 ;// a42*b24 | a42*b14
  127. pfadd mm6,mm5 ;// a32*b23 + a12*b21 + a22*b22 | a32*b13 + a12*b11 + a22*b12
  128. pfmul mm3,[edx+56] ;// a42*b44 | a42*b34
  129. pfadd mm2,mm1 ;// a32*b43 + a12*b41 + a22*b42 | a32*b33 + a12*b31 + a22*b32
  130. pfadd mm7,mm6 ;// a42*b24 + a32*b23 + a12*b21 + a22*b22 | a42*b14 + a32*b13 + a12*b11 + a22*b12
  // ---- group 3: result[8..11] from matA[8..11] ----
  131. movq mm0,[ecx+32] ;// a23 | a13
  132. pfadd mm3,mm2 ;// a42*b44 + a32*b43 + a12*b41 + a22*b42 | a42*b34 + a32*b33 + a12*b31 + a22*b32
  133. movq mm1,[ecx+40] ;// a43 | a33
  134. movq [eax+16],mm7 ;// r22 | r12
  135. movq mm4,[edx] ;// b21 | b11
  136. movq [eax+24],mm3 ;// r42 | r32
  137. punpckhdq mm2,mm0 ;// a23 | XXX
  138. movq mm5,[edx+16] ;// b22 | b12
  139. punpckhdq mm3,mm1 ;// a43 | XXX
  140. movq mm6,[edx+32] ;// b23 | b13
  141. punpckldq mm0,mm0 ;// a13 | a13
  142. punpckldq mm1,mm1 ;// a33 | a33
  143. pfmul mm4,mm0 ;// a13*b21 | a13*b11
  144. punpckhdq mm2,mm2 ;// a23 | a23
  145. pfmul mm0,[edx+8] ;// a13*b41 | a13*b31
  146. movq mm7,[edx+48] ;// b24 | b14
  147. pfmul mm5,mm2 ;// a23*b22 | a23*b12
  148. punpckhdq mm3,mm3 ;// a43 | a43
  149. pfmul mm2,[edx+24] ;// a23*b42 | a23*b32
  150. pfmul mm6,mm1 ;// a33*b23 | a33*b13
  151. pfadd mm5,mm4 ;// a23*b22 + a13*b21 | a23*b12 + a13*b11
  152. pfmul mm1,[edx+40] ;// a33*b43 | a33*b33
  153. pfadd mm2,mm0 ;// a13*b41 + a23*b42 | a13*b31 + a23*b32
  154. pfmul mm7,mm3 ;// a43*b24 | a43*b14
  155. pfadd mm6,mm5 ;// a33*b23 + a23*b22 + a13*b21 | a33*b13 + a23*b12 + a13*b11
  156. pfmul mm3,[edx+56] ;// a43*b44 | a43*b34
  157. pfadd mm2,mm1 ;// a33*b43 + a13*b41 + a23*b42 | a33*b33 + a13*b31 + a23*b32
  158. pfadd mm7,mm6 ;// a43*b24 + a33*b23 + a23*b22 + a13*b21 | a43*b14 + a33*b13 + a23*b12 + a13*b11
  // ---- group 4: result[12..15] from matA[12..15] ----
  159. movq mm0,[ecx+48] ;// a24 | a14
  160. pfadd mm3,mm2 ;// a43*b44 + a33*b43 + a13*b41 + a23*b42 | a43*b34 + a33*b33 + a13*b31 + a23*b32
  161. movq mm1,[ecx+56] ;// a44 | a34
  162. movq [eax+32],mm7 ;// r23 | r13
  163. movq mm4,[edx] ;// b21 | b11
  164. movq [eax+40],mm3 ;// r43 | r33
  165. punpckhdq mm2,mm0 ;// a24 | XXX
  166. movq mm5,[edx+16] ;// b22 | b12
  167. punpckhdq mm3,mm1 ;// a44 | XXX
  168. movq mm6,[edx+32] ;// b23 | b13
  169. punpckldq mm0,mm0 ;// a14 | a14
  170. punpckldq mm1,mm1 ;// a34 | a34
  171. pfmul mm4,mm0 ;// a14*b21 | a14*b11
  172. punpckhdq mm2,mm2 ;// a24 | a24
  173. pfmul mm0,[edx+8] ;// a14*b41 | a14*b31
  174. movq mm7,[edx+48] ;// b24 | b14
  175. pfmul mm5,mm2 ;// a24*b22 | a24*b12
  176. punpckhdq mm3,mm3 ;// a44 | a44
  177. pfmul mm2,[edx+24] ;// a24*b42 | a24*b32
  178. pfmul mm6,mm1 ;// a34*b23 | a34*b13
  179. pfadd mm5,mm4 ;// a14*b21 + a24*b22 | a14*b11 + a24*b12
  180. pfmul mm1,[edx+40] ;// a34*b43 | a34*b33
  181. pfadd mm2,mm0 ;// a14*b41 + a24*b42 | a14*b31 + a24*b32
  182. pfmul mm7,mm3 ;// a44*b24 | a44*b14
  183. pfadd mm6,mm5 ;// a34*b23 + a14*b21 + a24*b22 | a34*b13 + a14*b11 + a24*b12
  184. pfmul mm3,[edx+56] ;// a44*b44 | a44*b34
  185. pfadd mm2,mm1 ;// a34*b43 + a14*b41 + a24*b42 | a34*b33 + a14*b31 + a24*b32
  186. pfadd mm7,mm6 ;// a44*b24 + a34*b23 + a14*b21 + a24*b22 | a44*b14 + a34*b13 + a14*b11 + a24*b12
  187. pfadd mm3,mm2 ;// a44*b44 + a34*b43 + a14*b41 + a24*b42 | a44*b34 + a34*b33 + a14*b31 + a24*b32
  188. movq [eax+48],mm7 ;// r24 | r14
  189. movq [eax+56],mm3 ;// r44 | r34
  // Mark the MMX registers empty again before returning to compiled code.
  190. femms
  191. }
  192. }
  193. #endif
  194. #if 0
  195. /* this isn't currently used/implemented.
  196. void Athlon_MatrixF_x_VectorF(const F32 *matrix, const F32 *vector, F32 *result)
  197. {
  198. __asm {
  199. femms
  200. mov eax,result
  201. mov ecx,vector
  202. mov edx,matrix
  203. // Here's what we're doing:
  204. // result[0] = M[0] * v[0] + M[1] * v[1] + M[2] * v[2];
  205. // result[1] = M[4] * v[0] + M[5] * v[1] + M[6] * v[2];
  206. // result[2] = M[8] * v[0] + M[9] * v[1] + M[10]* v[2];
  207. movq mm0,[ecx] // y | x
  208. movd mm1,[ecx+8] // 0 | z
  209. movd mm4,[edx+8] // 0 | m_13
  210. movq mm3,mm0 // y | x
  211. movd mm2,[edx+40] // 0 | m_33 (M[10])
  212. punpckldq mm0,mm0 // x | x
  213. punpckldq mm4,[edx+20] // m_31 | m_23
  214. pfmul mm0,[edx] // x * m_12 | x * m_11
  215. punpckhdq mm3,mm3 // y | y
  216. pfmul mm2,mm1 // 0 | z * m_33
  217. punpckldq mm1,mm1 // z | z
  218. pfmul mm4,[ecx] // y * m_31 | x * m_23
  219. pfmul mm3,[edx+12] // y * m_22 | y * m_21
  220. pfmul mm1,[edx+24] // z * m_32 | z * m_32
  221. pfacc mm4,mm4 // ? | y * m_31 + x * m_23
  222. pfadd mm3,mm0 // x * m_12 + y * m_22 | x * m_11 + y * m_21
  223. pfadd mm4,mm2 // ? | y * m_31 + x * m_23 + z * m_33
  224. pfadd mm3,mm1 // x * m_12 + y * m_22 + z * m_32 | x * m_11 + y * m_21 + z * m_32
  225. movd [eax+8],mm4 // r_z
  226. movq [eax],mm3 // r_y | r_x
  227. femms
  228. }
  229. }
  230. */
  231. #endif
  // Installs the AMD-specific math routines.
  //
  // When ADD_3DNOW_FUNCS is defined (this file defines it for both the NASM and the
  // VC inline-asm build options), Athlon_MatrixF_x_MatrixF is assigned to m_matF_x_matF,
  // which is declared outside this file. Otherwise the function does nothing.
  232. void mInstall_AMD_Math()
  233. {
  234. #if defined(ADD_3DNOW_FUNCS)
  235. m_matF_x_matF = Athlon_MatrixF_x_MatrixF;
  236. #endif
  // The point3F and vectorF overrides stay commented out: the corresponding Athlon
  // routines in this file are themselves commented out / marked as not implemented.
  237. // m_matF_x_point3F = Athlon_MatrixF_x_Point3F;
  238. // m_matF_x_vectorF = Athlon_MatrixF_x_VectorF;
  239. }