x86_clr.as

#
# x86 surface clear routines for HERMES
# Copyright (c) 1998 Christian Nentwich ([email protected])
# This source code is licensed under the GNU LGPL
#
# Please refer to the file COPYING.LIB contained in the distribution for
# licensing conditions
#
# (04/10/99) Modified ClearX86_8 <[email protected]>
.globl _ClearX86_32
.globl _ClearX86_24
.globl _ClearX86_16
.globl _ClearX86_8

.text

##
## --------------------------------------------------------------------------
## HermesClearInterface (ebp+..)
##    0: char8 *dest
##    4: int32 value
##    8: unsigned int width (already checked to be >0!)
##   12: unsigned int height (already checked to be >0!)
##   16: int add
.align 8
_ClearX86_32:
        pushl %ebp
        movl %esp,%ebp
        movl 8(%ebp),%ebp       # ebp -> parameter block
        movl (%ebp),%edi        # destination
        movl 4(%ebp),%eax       # pixel value
        movl 12(%ebp),%edx      # height
.align 4
_ClearX86_32.L_y:
        movl 8(%ebp),%ecx       # width (one dword per pixel)
        rep
        stosl                   # clear one scanline
        addl 16(%ebp),%edi      # skip 'add' bytes to the next scanline
        decl %edx
        jnz _ClearX86_32.L_y
        popl %ebp
        ret
_ClearX86_24:                   # 24 bit clearing is not implemented in
        ret                     # assembler; this stub returns immediately
.align 8
_ClearX86_16:
        pushl %ebp
        movl %esp,%ebp
        movl 8(%ebp),%ebp       # ebp -> parameter block
        movl (%ebp),%edi        # destination
        movl 4(%ebp),%eax       # pixel value
        movl 12(%ebp),%edx      # height
        movl %eax,%ebx
        shll $16,%eax           # Duplicate pixel value into both
        andl $0x0ffff,%ebx      # halves of eax
        orl %ebx,%eax
_ClearX86_16.L_y:
        movl 8(%ebp),%ecx       # width
        testl $3,%edi           # Check if destination is aligned mod 4
        jz _ClearX86_16.L_aligned
        movw %ax,(%edi)         # otherwise write one pixel
        addl $2,%edi
        decl %ecx
        jz _ClearX86_16.L_endline
_ClearX86_16.L_aligned:
        shrl %ecx               # two pixels per dword; odd pixel -> CF
        rep
        stosl                   # stos does not touch the flags, so CF
        jnc _ClearX86_16.L_endline # still holds the bit shifted out above
        movw %ax,(%edi)         # write the odd trailing pixel
        addl $2,%edi
_ClearX86_16.L_endline:
        addl 16(%ebp),%edi      # skip 'add' bytes to the next scanline
        decl %edx
        jnz _ClearX86_16.L_y
        popl %ebp
        ret
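##
## Worked example for the loop above (illustrative numbers, not from the
## original source): with width = 7 and the destination word aligned but
## not dword aligned, one word write aligns edi, 6 pixels remain, shrl
## gives ecx = 3 (three dwords = 6 pixels) and CF = 0, so the row is done.
## With width = 8, 7 pixels remain after alignment, shrl gives ecx = 3
## and CF = 1, and the final movw writes the odd 8th pixel.
##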
.align 8
_ClearX86_8:
        pushl %ebp
        movl %esp,%ebp
        movl 8(%ebp),%ebp       # ebp -> parameter block
        movl 4(%ebp),%eax       # pixel value
        movl 12(%ebp),%edx      # height
        movb %al,%ah
        movl (%ebp),%edi        # destination
        movl %eax,%ecx
        shll $16,%eax           # Put the byte pixel value in all four bytes
        andl $0x0ffff,%ecx      # of eax
        movl 8(%ebp),%ebx       # width
        orl %ecx,%eax
        cmpl $5,%ebx            # short rows take the simple loop below;
        jbe _ClearX86_8.L_short_y # removes need for extra checks later
.align 4
_ClearX86_8.L_y:
        testl $3,%edi
        jz _ClearX86_8.L_aligned
        movl %edi,%ecx
        negl %ecx
        andl $3,%ecx            # ecx = bytes to reach dword alignment (1..3)
        subl %ecx,%ebx
        rep
        stosb                   # write the unaligned leading pixels
_ClearX86_8.L_aligned:
        movl %ebx,%ecx
        shrl $2,%ecx            # whole dwords in the rest of the row
        andl $3,%ebx            # 0..3 trailing bytes
        rep
        stosl
        movl %ebx,%ecx
        rep
        stosb                   # write the trailing pixels
        addl 16(%ebp),%edi      # skip 'add' bytes to the next scanline
        decl %edx
        movl 8(%ebp),%ebx       # reload width for the next row
        jnz _ClearX86_8.L_y
        popl %ebp
        ret

## Short loop
.align 4
_ClearX86_8.L_short_y:
        movl %ebx,%ecx
        rep
        stosb
        addl 16(%ebp),%edi
        decl %edx
        jnz _ClearX86_8.L_short_y
        popl %ebp
        ret
## ClearX86_8 version 2.
## I'm not sure whether this is faster or not...
## too many jumps could confuse the CPU's branch prediction
.align 8
_ClearX86_8_2:
        pushl %ebp
        movl %esp,%ebp
        movl 8(%ebp),%ebp       # ebp -> parameter block
        movl 4(%ebp),%eax       # pixel value
        movl 12(%ebp),%edx      # height
        movb %al,%ah
        movl (%ebp),%edi        # destination
        movl %eax,%ecx
        shll $16,%eax           # Put the byte pixel value in all four bytes
        andl $0x0ffff,%ecx      # of eax
        movl 8(%ebp),%ebx       # width
        orl %ecx,%eax
        cmpl $5,%ebx            # removes need for extra checks in main loop
        jbe _ClearX86_8_2.L_short_y
.align 4
_ClearX86_8_2.L_y:
        testl $3,%edi
        jz _ClearX86_8_2.L_aligned
        movl %edi,%ecx
        negl %ecx
        andl $3,%ecx            # ecx = bytes to reach dword alignment (1..3)
        movb %al,(%edi)         # write the leading pixels one by one
        subl %ecx,%ebx
        incl %edi
        decl %ecx
        jz _ClearX86_8_2.L_aligned
        movb %al,(%edi)
        incl %edi
        decl %ecx
        jz _ClearX86_8_2.L_aligned
        movb %al,(%edi)
        incl %edi
_ClearX86_8_2.L_aligned:
        movl %ebx,%ecx
        shrl $2,%ecx            # whole dwords in the rest of the row
        andl $3,%ebx            # ZF set if no trailing bytes remain
        rep
        stosl                   # stos does not touch the flags from andl
        jz _ClearX86_8_2.L_endline
        movb %al,(%edi)         # Write remaining (1, 2 or 3) pixels
        incl %edi
        decl %ebx
        jz _ClearX86_8_2.L_endline
        movb %al,(%edi)
        incl %edi
        decl %ebx
        jz _ClearX86_8_2.L_endline
        movb %al,(%edi)
        incl %edi
        decl %ebx
        jz _ClearX86_8_2.L_endline
        movb %al,(%edi)
        incl %edi
_ClearX86_8_2.L_endline:
        addl 16(%ebp),%edi      # skip 'add' bytes to the next scanline
        decl %edx
        movl 8(%ebp),%ebx       # reload width for the next row
        jnz _ClearX86_8_2.L_y
        popl %ebp
        ret
## Short loop
.align 4
_ClearX86_8_2.L_short_y:
        movl %ebx,%ecx
        rep
        stosb
        addl 16(%ebp),%edi
        decl %edx
        jnz _ClearX86_8_2.L_short_y
        popl %ebp
        ret
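
##
## A hypothetical caller, assuming the usual 32 bit cdecl convention and
## the struct sketched at the top of this file (names are illustrative,
## not taken from the Hermes headers):
##
##   extern void ClearX86_32(struct HermesClearInterface *);
##
##   struct HermesClearInterface hci;
##   hci.dest   = pixels;              /* first scanline           */
##   hci.value  = 0;                   /* clear to black           */
##   hci.width  = w;                   /* pixels per row, > 0      */
##   hci.height = h;                   /* rows, > 0                */
##   hci.add    = pitch - w * 4;       /* row gap in bytes, 32 bpp */
##   ClearX86_32(&hci);
##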