# mmx_clr.as

#
# MMX surface clear routines for HERMES
# Copyright (c) 1998 Christian Nentwich ([email protected])
# This source code is licensed under the GNU LGPL
#
# Please refer to the file COPYING.LIB contained in the distribution for
# licensing conditions
#

.globl _ClearMMX_32
.globl _ClearMMX_24
.globl _ClearMMX_16
.globl _ClearMMX_8

.text

##
## --------------------------------------------------------------------------
## HermesClearInterface (ebp+..)
## 0:  char8 *dest
## 4:  int32 value
## 8:  unsigned int width (already checked to be >0!)
## 12: unsigned int height (already checked to be >0!)
## 16: int add
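##
## A rough C view of this parameter block, for reference (the struct and
## field names are taken from the offsets documented above; the exact Hermes
## declaration may differ):
##
##   struct HermesClearInterface {
##       char8 *dest;            /* 0:  start of the destination pixels    */
##       int32 value;            /* 4:  pixel value to write               */
##       unsigned int width;     /* 8:  pixels per row, guaranteed > 0     */
##       unsigned int height;    /* 12: number of rows, guaranteed > 0     */
##       int add;                /* 16: bytes added to dest after each row */
##   };
##
## Each routine below receives a pointer to one such block as its only stack
## argument (loaded by the movl 8(%ebp),%ebp at every entry point).
## --------------------------------------------------------------------------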

_ClearMMX_32:
        pushl %ebp
        movl %esp,%ebp
        movl 8(%ebp),%ebp       # ebp = parameter block

        movl 4(%ebp),%eax       # pixel value
        movd 4(%ebp),%mm0
        movl 12(%ebp),%edx      # height
        movq %mm0,%mm1
        psllq $32,%mm0
        movl (%ebp),%edi        # destination
        por %mm1,%mm0           # mm0 now holds the pixel value twice

_ClearMMX_32.L_y:
        movl 8(%ebp),%ecx       # width in pixels
        movl %ecx,%ebx
        shrl %ecx               # two pixels per 8 byte store
        jz _ClearMMX_32.L_last

_ClearMMX_32.L_x:
        movq %mm0,(%edi)
        addl $8,%edi
        decl %ecx
        jnz _ClearMMX_32.L_x

_ClearMMX_32.L_last:
        testl $1,%ebx           # odd width: one trailing pixel left
        jz _ClearMMX_32.L_endline

        movl %eax,(%edi)
        addl $4,%edi

_ClearMMX_32.L_endline:
        addl 16(%ebp),%edi      # skip 'add' bytes to the next row
        decl %edx
        jnz _ClearMMX_32.L_y

        emms
        popl %ebp
        ret
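
## ClearMMX_24 is only a stub at the moment: 24 bpp clearing is not
## implemented here, so the routine returns immediately without touching
## the destination.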
_ClearMMX_24:
        ret

_ClearMMX_16:
        pushl %ebp
        movl %esp,%ebp
        movl 8(%ebp),%ebp       # ebp = parameter block

        movl 4(%ebp),%eax       # pixel value
        movl 4(%ebp),%ebx
        movl 12(%ebp),%edx      # height
        movl (%ebp),%edi        # destination
        shll $16,%eax           # Duplicate the pixel value in both halves of eax
        andl $0x0ffff,%ebx
        orl %ebx,%eax
        movd %eax,%mm0
        movd %eax,%mm1
        psllq $32,%mm0
        por %mm1,%mm0           # mm0 now holds the pixel value four times

_ClearMMX_16.L_y:
        movl 8(%ebp),%ecx       # width in pixels
        testl $3,%edi           # Check if destination is aligned mod 4
        jz _ClearMMX_16.L_aligned

        movw %ax,(%edi)         # otherwise write one pixel first
        addl $2,%edi
        decl %ecx
        jz _ClearMMX_16.L_endline

_ClearMMX_16.L_aligned:
        movl %ecx,%ebx
        shrl $2,%ecx            # four pixels per 8 byte store
        jz _ClearMMX_16.L_last

_ClearMMX_16.L_x:
        movq %mm0,(%edi)
        addl $8,%edi
        decl %ecx
        jnz _ClearMMX_16.L_x

_ClearMMX_16.L_last:
        andl $3,%ebx            # up to three trailing pixels
        jz _ClearMMX_16.L_endline

        movw %ax,(%edi)         # Write trailing pixels
        addl $2,%edi
        decl %ebx
        jz _ClearMMX_16.L_endline

        movw %ax,(%edi)
        addl $2,%edi
        decl %ebx
        jz _ClearMMX_16.L_endline

        movw %ax,(%edi)         # Third (last possible) trailing pixel
        addl $2,%edi
        decl %ebx
        jz _ClearMMX_16.L_endline

_ClearMMX_16.L_endline:
        addl 16(%ebp),%edi      # skip 'add' bytes to the next row
        decl %edx
        jnz _ClearMMX_16.L_y

        emms
        popl %ebp
        ret

## _ClearMMX_8 isn't fully optimised yet, as it seems to be a tiny bit slower
## than the C routine.
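##
## For comparison, the C routine referred to above is essentially a per-row
## byte fill. A minimal sketch of that idea (not the actual Hermes source;
## the function name and the use of memset are assumptions) would be:
##
##   #include <string.h>
##
##   void clear8_c(struct HermesClearInterface *hci)
##   {
##       unsigned char *dst = (unsigned char *) hci->dest;
##       unsigned int y;
##
##       for (y = 0; y < hci->height; y++) {
##           memset(dst, hci->value & 0xff, hci->width);  /* fill one row   */
##           dst += hci->width + hci->add;                /* skip row pad   */
##       }
##   }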
_ClearMMX_8:
        pushl %ebp
        movl %esp,%ebp
        movl 8(%ebp),%ebp       # ebp = parameter block

        movl 4(%ebp),%eax       # pixel value
        movl 4(%ebp),%ebx
        movl 12(%ebp),%edx      # height
        andl $0x0ff,%ebx
        shll $8,%eax            # Put the byte pixel value in all four bytes of eax
        movl (%ebp),%edi        # destination
        movb %bl,%al
        movb %bl,%bh
        shll $16,%eax
        movb %bh,%ah
        movb %bl,%al
        movd %eax,%mm0
        movd %eax,%mm1
        psllq $32,%mm0
        por %mm1,%mm0           # mm0 now holds the pixel value in all eight bytes

_ClearMMX_8.L_y:
        movl 8(%ebp),%ecx       # width in pixels
        testl $3,%edi           # Align mod 4
        jz _ClearMMX_8.L_aligned

        movl %edi,%ebx
        negl %ebx
        andl $3,%ebx            # ebx = bytes needed to reach 4 byte alignment

        movb %al,(%edi)         # Unrolled (copy & paste): write single pixels
        incl %edi               # until aligned, jumping out early if the row
        decl %ecx               # is finished; faster than a loop here...
        jz _ClearMMX_8.L_endline
        decl %ebx
        jz _ClearMMX_8.L_aligned

        movb %al,(%edi)         # Second pixel
        incl %edi
        decl %ecx
        jz _ClearMMX_8.L_endline
        decl %ebx
        jz _ClearMMX_8.L_aligned

        movb %al,(%edi)         # Third (last possible) pixel before alignment
        incl %edi
        decl %ecx
        jz _ClearMMX_8.L_endline
        decl %ebx
        jz _ClearMMX_8.L_aligned

_ClearMMX_8.L_aligned:
        movl %ecx,%ebx          # Store ecx for later
        shrl $3,%ecx            # We write 8 pixels at once
        jz _ClearMMX_8.L_last

_ClearMMX_8.L_x:
        movq %mm0,(%edi)
        addl $8,%edi
        decl %ecx
        jnz _ClearMMX_8.L_x

_ClearMMX_8.L_last:
        movl %ebx,%ecx          # Clean up trailing pixels,
        andl $7,%ecx            # there could be up to 7 left
        jz _ClearMMX_8.L_endline

        testb $0b100,%cl        # If there are fewer than four left, jump
        jz _ClearMMX_8.L_lessthanfour

        movl %eax,(%edi)        # Otherwise write a dword first
        addl $4,%edi
        subl $4,%ecx

_ClearMMX_8.L_lessthanfour:
        rep stosb               # Clean up the very rest (at most 3 bytes, value in al)

_ClearMMX_8.L_endline:
        addl 16(%ebp),%edi      # skip 'add' bytes to the next row
        decl %edx
        jnz _ClearMMX_8.L_y

        emms
        popl %ebp
        ret