; mMathSSE_ASM.asm (3.9 KB)
  1. ;-----------------------------------------------------------------------------
  2. ; Copyright (c) 2012 GarageGames, LLC
  3. ;
  4. ; Permission is hereby granted, free of charge, to any person obtaining a copy
  5. ; of this software and associated documentation files (the "Software"), to
  6. ; deal in the Software without restriction, including without limitation the
  7. ; rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  8. ; sell copies of the Software, and to permit persons to whom the Software is
  9. ; furnished to do so, subject to the following conditions:
  10. ;
  11. ; The above copyright notice and this permission notice shall be included in
  12. ; all copies or substantial portions of the Software.
  13. ;
  14. ; THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. ; IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. ; FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17. ; AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. ; LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  19. ; FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  20. ; IN THE SOFTWARE.
  21. ;-----------------------------------------------------------------------------
  section .data

  ; Three zero-initialised dwords. No code visible in this file references
  ; them; the routine below takes its pointers from the stack instead.
  matA:   dd 0
  result: dd 0
  matB:   dd 0
  segment .text

  ;-----------------------------------------------------------------------------
  ; export_fn <name>
  ;
  ; Declares <name> as a global symbol and emits its entry label at the point
  ; of use.  ELF object files get the bare name; every other output format
  ; gets the name with a leading underscore.
  ;
  ; "elf" is only an alias for the 32-bit ELF format whose canonical name is
  ; "elf32".  When the assembler is invoked with -f elf32, __OUTPUT_FORMAT__
  ; expands to elf32, so both spellings have to be matched; testing for "elf"
  ; alone would fall through to the underscore branch and export the wrong
  ; symbol name on ELF.
  ;-----------------------------------------------------------------------------
  %macro export_fn 1
    %ifidn __OUTPUT_FORMAT__, elf
      ; No underscore needed for ELF object files
      global %1
  %1:
    %elifidn __OUTPUT_FORMAT__, elf32
      ; Same format, canonical name
      global %1
  %1:
    %else
      global _%1
  _%1:
    %endif
  %endmacro

  ; arg(n): n-th 32-bit stack argument as seen on function entry, before
  ; anything has been pushed ([esp] is the return address, so arg(1) is
  ; [esp+4]).  The parameter is parenthesised so that an expression such as
  ; arg(1+1) scales the whole index by 4, not just its last term.
  %define arg(x) [esp+((x)*4)]
  ;-----------------------------------------------------------------------------
  ; void SSE_MatrixF_x_MatrixF(const F32 *matA, const F32 *matB, F32 *result)
  ;
  ; Reads 16 floats from matA and 16 floats from matB and writes 16 floats to
  ; result.  Viewing each array as four consecutive groups ("rows") of four
  ; floats, every output row is
  ;
  ;     R[r] = ((A[4r+0]*B[0] + A[4r+1]*B[1]) + A[4r+2]*B[2]) + A[4r+3]*B[3]
  ;
  ; where A[i] is a scalar broadcast to all four lanes and B[k] is row k of
  ; matB.
  ;
  ; In:    arg(1) = matA, arg(2) = matB, arg(3) = result  (32-bit stack args)
  ; Out:   16 floats stored at result; no register return value is set up
  ; Clobb: eax, ecx, edx, xmm0-xmm7
  ; Notes: every memory access uses movss / movups / movlps / movhps, none of
  ;        which requires 16-byte alignment.  All four rows of matB are held
  ;        in xmm1, xmm3, xmm4, xmm5 from their first load onwards.  Work for
  ;        consecutive output rows is interleaved, so the instruction order
  ;        below is significant and must not be shuffled casually.
  ;-----------------------------------------------------------------------------
  export_fn SSE_MatrixF_x_MatrixF
          mov     edx, arg(1)                 ; edx -> matA
          mov     ecx, arg(2)                 ; ecx -> matB
          mov     eax, arg(3)                 ; eax -> result

          ; ---- output row 0 (accumulated in xmm0) --------------------------
          movss   xmm0, [edx]                 ; A[0]
          movups  xmm1, [ecx]                 ; xmm1 = B row 0 (kept)
          shufps  xmm0, xmm0, 0               ; broadcast A[0]
          movss   xmm2, [edx + 4*1]           ; A[1]
          mulps   xmm0, xmm1                  ; A[0] * B0
          shufps  xmm2, xmm2, 0
          movups  xmm3, [ecx + 16*1]          ; xmm3 = B row 1 (kept)
          movss   xmm7, [edx + 4*2]           ; A[2]
          mulps   xmm2, xmm3                  ; A[1] * B1
          shufps  xmm7, xmm7, 0
          addps   xmm0, xmm2
          movups  xmm4, [ecx + 16*2]          ; xmm4 = B row 2 (kept)
          movss   xmm2, [edx + 4*3]           ; A[3]
          mulps   xmm7, xmm4                  ; A[2] * B2
          shufps  xmm2, xmm2, 0
          addps   xmm0, xmm7
          movups  xmm5, [ecx + 16*3]          ; xmm5 = B row 3 (kept)
          movss   xmm6, [edx + 4*4]           ; A[4]  (row 1 starts here)
          mulps   xmm2, xmm5                  ; A[3] * B3
          movss   xmm7, [edx + 4*5]           ; A[5]
          shufps  xmm6, xmm6, 0
          addps   xmm0, xmm2                  ; xmm0 = finished row 0
          shufps  xmm7, xmm7, 0
          movlps  [eax], xmm0                 ; store row 0, low two floats
          movhps  [eax + 8], xmm0             ; store row 0, high two floats

          ; ---- output row 1 (accumulated in xmm6) --------------------------
          mulps   xmm7, xmm3                  ; A[5] * B1
          movss   xmm0, [edx + 4*6]           ; A[6]
          mulps   xmm6, xmm1                  ; A[4] * B0
          shufps  xmm0, xmm0, 0
          addps   xmm6, xmm7
          mulps   xmm0, xmm4                  ; A[6] * B2
          movss   xmm2, [edx + 4*9]           ; A[9]  (early load for row 2)
          addps   xmm6, xmm0
          movss   xmm0, [edx + 4*7]           ; A[7]
          movss   xmm7, [edx + 4*8]           ; A[8]  (row 2 accumulator)
          shufps  xmm0, xmm0, 0
          shufps  xmm7, xmm7, 0
          mulps   xmm0, xmm5                  ; A[7] * B3
          mulps   xmm7, xmm1                  ; A[8] * B0
          addps   xmm6, xmm0                  ; xmm6 = finished row 1
          shufps  xmm2, xmm2, 0
          movlps  [eax + 16*1], xmm6          ; store row 1, low two floats
          movhps  [eax + 16*1 + 8], xmm6      ; store row 1, high two floats

          ; ---- output row 2 (accumulated in xmm7) --------------------------
          mulps   xmm2, xmm3                  ; A[9] * B1
          movss   xmm6, [edx + 4*10]          ; A[10]
          addps   xmm7, xmm2
          shufps  xmm6, xmm6, 0
          movss   xmm2, [edx + 4*11]          ; A[11]
          mulps   xmm6, xmm4                  ; A[10] * B2
          shufps  xmm2, xmm2, 0
          addps   xmm7, xmm6
          mulps   xmm2, xmm5                  ; A[11] * B3
          movss   xmm0, [edx + 4*13]          ; A[13] (early load for row 3)
          addps   xmm7, xmm2                  ; xmm7 = finished row 2
          shufps  xmm0, xmm0, 0
          movlps  [eax + 16*2], xmm7          ; store row 2, low two floats
          movss   xmm2, [edx + 4*12]          ; A[12] (row 3 accumulator)
          movhps  [eax + 16*2 + 8], xmm7      ; store row 2, high two floats

          ; ---- output row 3 (accumulated in xmm2) --------------------------
          mulps   xmm0, xmm3                  ; A[13] * B1
          shufps  xmm2, xmm2, 0
          movss   xmm6, [edx + 4*14]          ; A[14]
          mulps   xmm2, xmm1                  ; A[12] * B0
          shufps  xmm6, xmm6, 0
          addps   xmm2, xmm0
          mulps   xmm6, xmm4                  ; A[14] * B2
          movss   xmm7, [edx + 4*15]          ; A[15]
          shufps  xmm7, xmm7, 0
          addps   xmm2, xmm6
          mulps   xmm7, xmm5                  ; A[15] * B3
          addps   xmm2, xmm7                  ; xmm2 = finished row 3
          movups  [eax + 16*3], xmm2          ; store row 3 in one unaligned write
          ret