mMathAMD_ASM.asm 7.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177
  1. ;-----------------------------------------------------------------------------
  2. ; Copyright (c) 2012 GarageGames, LLC
  3. ;
  4. ; Permission is hereby granted, free of charge, to any person obtaining a copy
  5. ; of this software and associated documentation files (the "Software"), to
  6. ; deal in the Software without restriction, including without limitation the
  7. ; rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  8. ; sell copies of the Software, and to permit persons to whom the Software is
  9. ; furnished to do so, subject to the following conditions:
  10. ;
  11. ; The above copyright notice and this permission notice shall be included in
  12. ; all copies or substantial portions of the Software.
  13. ;
  14. ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. ; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  20. ; IN THE SOFTWARE.
  21. ;-----------------------------------------------------------------------------
  ; ---------------------------------------------------------------------------
  ; Initialised data: three dwords, each holding zero.
  ; None of the code visible in this file refers to these labels.
  ; ---------------------------------------------------------------------------
  section .data

  matA:   dd 0
  result: dd 0
  matB:   dd 0

  ; Code follows.
  section .text
  ; ---------------------------------------------------------------------------
  ; export_fn <name>
  ;
  ; Declares <name> global and places its label at the current position.
  ; ELF object files take the name exactly as written; every other output
  ; format gets a leading underscore.
  ;
  ; NASM names the 32-bit ELF format "elf32" ("elf" is accepted as an alias),
  ; so both spellings of __OUTPUT_FORMAT__ are tested.  Checking "elf" alone
  ; would send an elf32 build down the underscore branch.
  ; ---------------------------------------------------------------------------
  %macro export_fn 1
    %ifidn __OUTPUT_FORMAT__, elf
      ; No underscore needed for ELF object files
      global %1
  %1:
    %elifidn __OUTPUT_FORMAT__, elf32
      ; Same rule under the format's canonical name
      global %1
  %1:
    %else
      global _%1
  _%1:
    %endif
  %endmacro
  ; arg(n): the dword stack slot n*4 bytes above esp.
  ; The parameter is parenthesised so that an expression argument is scaled as
  ; a whole: arg(1+1) is [esp+8], not [esp+(1+1*4)].
  %define arg(x) [esp+((x)*4)]
  38. ;void Athlon_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result)
  ;
  ; Multiplies two 4x4 float matrices using 3DNow! packed arithmetic.  In terms
  ; of flat float indices, for i, j in 0..3:
  ;
  ;     result[4*i + j] = sum over k = 0..3 of  matA[4*i + k] * matB[4*k + j]
  ;
  ; Inputs  : the three pointers are fetched from the stack through arg(1..3).
  ; Outputs : 16 floats (64 bytes) stored at result.
  ; Clobbers: eax, ecx, edx and mm0-mm7; femms is issued on entry and before ret.
  ;
  ; matB is read again for every group of four outputs, after earlier groups
  ; have already been stored, so result must not overlap matB.
  ;
  ; The body is four copies of the same pass, one per group of four matA
  ; floats.  The loads for the next pass are interleaved with the final adds
  ; and stores of the current one.
  ;
  ; Comment notation: "hi | lo" lists the upper and lower float of an MMX
  ; register.  aCR is float C of the R-th group of four in matA (both 1-based),
  ; i.e. matA[4*(R-1) + (C-1)]; bCR and rCR index matB and result the same way.
  ; XXX marks a lane holding a leftover value that is overwritten before use.
  39. export_fn Athlon_MatrixF_x_MatrixF
  40. mov ecx, arg(1) ; ecx = matA
  41. mov edx, arg(2) ; edx = matB
  42. mov eax, arg(3) ; eax = result
  43. femms ; 3DNow! fast EMMS before the mm registers are used
  44. prefetch [ecx+32] ; These may help -
  45. prefetch [edx+32] ; and probably don't hurt
  ; ---- pass 1: matA[0..3] -> result[0..3] ----
  46. movq mm0,[ecx] ; a21 | a11
  47. movq mm1,[ecx+8] ; a41 | a31
  48. movq mm4,[edx] ; b21 | b11
  49. punpckhdq mm2,mm0 ; a21 | XXX
  50. movq mm5,[edx+16] ; b22 | b12
  51. punpckhdq mm3,mm1 ; a41 | XXX
  52. movq mm6,[edx+32] ; b23 | b13
  53. punpckldq mm0,mm0 ; a11 | a11
  54. punpckldq mm1,mm1 ; a31 | a31
  55. pfmul mm4,mm0 ; a11*b21 | a11*b11
  56. punpckhdq mm2,mm2 ; a21 | a21
  57. pfmul mm0,[edx+8] ; a11*b41 | a11*b31
  58. movq mm7,[edx+48] ; b24 | b14
  59. pfmul mm5,mm2 ; a21*b22 | a21*b12
  60. punpckhdq mm3,mm3 ; a41 | a41
  61. pfmul mm2,[edx+24] ; a21*b42 | a21*b32
  62. pfmul mm6,mm1 ; a31*b23 | a31*b13
  63. pfadd mm5,mm4 ; a21*b22 + a11*b21 | a21*b12 + a11*b11
  64. pfmul mm1,[edx+40] ; a31*b43 | a31*b33
  65. pfadd mm2,mm0 ; a21*b42 + a11*b41 | a21*b32 + a11*b31
  66. pfmul mm7,mm3 ; a41*b24 | a41*b14
  67. pfadd mm6,mm5 ; a21*b22 + a11*b21 + a31*b23 | a21*b12 + a11*b11 + a31*b13
  68. pfmul mm3,[edx+56] ; a41*b44 | a41*b34
  69. pfadd mm2,mm1 ; a21*b42 + a11*b41 + a31*b43 | a21*b32 + a11*b31 + a31*b33
  70. pfadd mm7,mm6 ; a41*b24 + a21*b22 + a11*b21 + a31*b23 | a41*b14 + a21*b12 + a11*b11 + a31*b13
  ; ---- pass 2: matA[4..7] -> result[4..7]; its loads overlap the tail of pass 1 ----
  71. movq mm0,[ecx+16] ; a22 | a12
  72. pfadd mm3,mm2 ; a41*b44 + a21*b42 + a11*b41 + a31*b43 | a41*b34 + a21*b32 + a11*b31 + a31*b33
  73. movq mm1,[ecx+24] ; a42 | a32
  74. movq [eax],mm7 ; r21 | r11
  75. movq mm4,[edx] ; b21 | b11
  76. movq [eax+8],mm3 ; r41 | r31
  77. punpckhdq mm2,mm0 ; a22 | XXX
  78. movq mm5,[edx+16] ; b22 | b12
  79. punpckhdq mm3,mm1 ; a42 | XXX
  80. movq mm6,[edx+32] ; b23 | b13
  81. punpckldq mm0,mm0 ; a12 | a12
  82. punpckldq mm1,mm1 ; a32 | a32
  83. pfmul mm4,mm0 ; a12*b21 | a12*b11
  84. punpckhdq mm2,mm2 ; a22 | a22
  85. pfmul mm0,[edx+8] ; a12*b41 | a12*b31
  86. movq mm7,[edx+48] ; b24 | b14
  87. pfmul mm5,mm2 ; a22*b22 | a22*b12
  88. punpckhdq mm3,mm3 ; a42 | a42
  89. pfmul mm2,[edx+24] ; a22*b42 | a22*b32
  90. pfmul mm6,mm1 ; a32*b23 | a32*b13
  91. pfadd mm5,mm4 ; a12*b21 + a22*b22 | a12*b11 + a22*b12
  92. pfmul mm1,[edx+40] ; a32*b43 | a32*b33
  93. pfadd mm2,mm0 ; a12*b41 + a22*b42 | a12*b31 + a22*b32
  94. pfmul mm7,mm3 ; a42*b24 | a42*b14
  95. pfadd mm6,mm5 ; a32*b23 + a12*b21 + a22*b22 | a32*b13 + a12*b11 + a22*b12
  96. pfmul mm3,[edx+56] ; a42*b44 | a42*b34
  97. pfadd mm2,mm1 ; a32*b43 + a12*b41 + a22*b42 | a32*b33 + a12*b31 + a22*b32
  98. pfadd mm7,mm6 ; a42*b24 + a32*b23 + a12*b21 + a22*b22 | a42*b14 + a32*b13 + a12*b11 + a22*b12
  ; ---- pass 3: matA[8..11] -> result[8..11]; its loads overlap the tail of pass 2 ----
  99. movq mm0,[ecx+32] ; a23 | a13
  100. pfadd mm3,mm2 ; a42*b44 + a32*b43 + a12*b41 + a22*b42 | a42*b34 + a32*b33 + a12*b31 + a22*b32
  101. movq mm1,[ecx+40] ; a43 | a33
  102. movq [eax+16],mm7 ; r22 | r12
  103. movq mm4,[edx] ; b21 | b11
  104. movq [eax+24],mm3 ; r42 | r32
  105. punpckhdq mm2,mm0 ; a23 | XXX
  106. movq mm5,[edx+16] ; b22 | b12
  107. punpckhdq mm3,mm1 ; a43 | XXX
  108. movq mm6,[edx+32] ; b23 | b13
  109. punpckldq mm0,mm0 ; a13 | a13
  110. punpckldq mm1,mm1 ; a33 | a33
  111. pfmul mm4,mm0 ; a13*b21 | a13*b11
  112. punpckhdq mm2,mm2 ; a23 | a23
  113. pfmul mm0,[edx+8] ; a13*b41 | a13*b31
  114. movq mm7,[edx+48] ; b24 | b14
  115. pfmul mm5,mm2 ; a23*b22 | a23*b12
  116. punpckhdq mm3,mm3 ; a43 | a43
  117. pfmul mm2,[edx+24] ; a23*b42 | a23*b32
  118. pfmul mm6,mm1 ; a33*b23 | a33*b13
  119. pfadd mm5,mm4 ; a23*b22 + a13*b21 | a23*b12 + a13*b11
  120. pfmul mm1,[edx+40] ; a33*b43 | a33*b33
  121. pfadd mm2,mm0 ; a13*b41 + a23*b42 | a13*b31 + a23*b32
  122. pfmul mm7,mm3 ; a43*b24 | a43*b14
  123. pfadd mm6,mm5 ; a33*b23 + a23*b22 + a13*b21 | a33*b13 + a23*b12 + a13*b11
  124. pfmul mm3,[edx+56] ; a43*b44 | a43*b34
  125. pfadd mm2,mm1 ; a33*b43 + a13*b41 + a23*b42 | a33*b33 + a13*b31 + a23*b32
  126. pfadd mm7,mm6 ; a43*b24 + a33*b23 + a23*b22 + a13*b21 | a43*b14 + a33*b13 + a23*b12 + a13*b11
  ; ---- pass 4: matA[12..15] -> result[12..15]; its loads overlap the tail of pass 3 ----
  127. movq mm0,[ecx+48] ; a24 | a14
  128. pfadd mm3,mm2 ; a43*b44 + a33*b43 + a13*b41 + a23*b42 | a43*b34 + a33*b33 + a13*b31 + a23*b32
  129. movq mm1,[ecx+56] ; a44 | a34
  130. movq [eax+32],mm7 ; r23 | r13
  131. movq mm4,[edx] ; b21 | b11
  132. movq [eax+40],mm3 ; r43 | r33
  133. punpckhdq mm2,mm0 ; a24 | XXX
  134. movq mm5,[edx+16] ; b22 | b12
  135. punpckhdq mm3,mm1 ; a44 | XXX
  136. movq mm6,[edx+32] ; b23 | b13
  137. punpckldq mm0,mm0 ; a14 | a14
  138. punpckldq mm1,mm1 ; a34 | a34
  139. pfmul mm4,mm0 ; a14*b21 | a14*b11
  140. punpckhdq mm2,mm2 ; a24 | a24
  141. pfmul mm0,[edx+8] ; a14*b41 | a14*b31
  142. movq mm7,[edx+48] ; b24 | b14
  143. pfmul mm5,mm2 ; a24*b22 | a24*b12
  144. punpckhdq mm3,mm3 ; a44 | a44
  145. pfmul mm2,[edx+24] ; a24*b42 | a24*b32
  146. pfmul mm6,mm1 ; a34*b23 | a34*b13
  147. pfadd mm5,mm4 ; a14*b21 + a24*b22 | a14*b11 + a24*b12
  148. pfmul mm1,[edx+40] ; a34*b43 | a34*b33
  149. pfadd mm2,mm0 ; a14*b41 + a24*b42 | a14*b31 + a24*b32
  150. pfmul mm7,mm3 ; a44*b24 | a44*b14
  151. pfadd mm6,mm5 ; a34*b23 + a14*b21 + a24*b22 | a34*b13 + a14*b11 + a24*b12
  152. pfmul mm3,[edx+56] ; a44*b44 | a44*b34
  153. pfadd mm2,mm1 ; a34*b43 + a14*b41 + a24*b42 | a34*b33 + a14*b31 + a24*b32
  154. pfadd mm7,mm6 ; a44*b24 + a34*b23 + a14*b21 + a24*b22 | a44*b14 + a34*b13 + a14*b11 + a24*b12
  155. pfadd mm3,mm2 ; a44*b44 + a34*b43 + a14*b41 + a24*b42 | a44*b34 + a34*b33 + a14*b31 + a24*b32
  156. movq [eax+48],mm7 ; r24 | r14
  157. movq [eax+56],mm3 ; r44 | r34
  158. femms ; clear the MMX state again before returning to the caller
  159. ret