#
# x86 surface clear routines for HERMES
# Copyright (c) 1998 Christian Nentwich ([email protected])
# This source code is licensed under the GNU LGPL
#
# Please refer to the file COPYING.LIB contained in the distribution for
# licensing conditions
#
# (04/10/99) Modified ClearX86_8 <[email protected]>
.globl _ClearX86_32
.globl _ClearX86_24
.globl _ClearX86_16
.globl _ClearX86_8

.text
##
## --------------------------------------------------------------------------
## HermesClearInterface (ebp+..)
##    0: char8 *dest
##    4: int32 value
##    8: unsigned int width (already checked to be >0!)
##   12: unsigned int height (already checked to be >0!)
##   16: int add
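##
## A C view of the same parameter block, as a sketch only (the field
## names are taken from the offset comments above, not from the HERMES
## headers):
##
##   struct HermesClearInterface {
##       char *dest;            /* first scanline to clear             */
##       int value;             /* pixel value, replicated per format  */
##       unsigned int width;    /* pixels per scanline, always > 0     */
##       unsigned int height;   /* scanlines, always > 0               */
##       int add;               /* bytes to skip from the end of one   */
##                              /* scanline to the start of the next   */
##   };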
.align 8
_ClearX86_32:
        pushl %ebp
        movl %esp,%ebp

        movl 8(%ebp),%ebp       # pointer to the parameter block
        movl (%ebp),%edi        # destination
        movl 4(%ebp),%eax       # pixel value
        movl 12(%ebp),%edx      # height

.align 4
_ClearX86_32.L_y:
        movl 8(%ebp),%ecx       # width, one dword per pixel
        rep
        stosl                   # clear one scanline

        addl 16(%ebp),%edi      # skip to the next scanline
        decl %edx
        jnz _ClearX86_32.L_y

        popl %ebp
        ret
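## Equivalent C for _ClearX86_32, as an illustrative sketch only
## (`hci` stands for a pointer to the parameter block shown above):
##
##   unsigned int *d = (unsigned int *)hci->dest;
##   for (unsigned int y = 0; y < hci->height; y++) {
##       for (unsigned int x = 0; x < hci->width; x++)
##           *d++ = (unsigned int)hci->value;          /* rep stosl */
##       d = (unsigned int *)((char *)d + hci->add);   /* next line */
##   }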
_ClearX86_24:                   # 24 bit clearing is not implemented;
        ret                     # this routine is an empty stub
.align 8
_ClearX86_16:
        pushl %ebp
        movl %esp,%ebp

        movl 8(%ebp),%ebp
        movl (%ebp),%edi        # destination
        movl 4(%ebp),%eax       # pixel value
        movl 12(%ebp),%edx      # height

        movl %eax,%ebx
        shll $16,%eax           # Duplicate pixel value in both halves
        andl $0x0ffff,%ebx      # of eax
        orl %ebx,%eax

_ClearX86_16.L_y:
        movl 8(%ebp),%ecx
        testl $3,%edi           # Check if destination is aligned mod 4
        jz _ClearX86_16.L_aligned

        movw %ax,(%edi)         # otherwise write one pixel
        addl $2,%edi
        decl %ecx
        jz _ClearX86_16.L_endline

_ClearX86_16.L_aligned:
        shrl %ecx               # two pixels per dword, odd pixel -> carry
        rep
        stosl                   # rep stosl does not touch the flags

        jnc _ClearX86_16.L_endline
        movw %ax,(%edi)         # write the odd trailing pixel
        addl $2,%edi

_ClearX86_16.L_endline:
        addl 16(%ebp),%edi
        decl %edx
        jnz _ClearX86_16.L_y

        popl %ebp
        ret
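## The routine above leans on two facts: shrl leaves the low bit of
## the width in the carry flag, and rep stosl preserves the flags.  A
## C sketch of one scanline under those assumptions (`d`, `n`, `pix`
## and `pix2` are hypothetical names for the uint16_t destination
## pointer, the width in pixels, the pixel value and the value
## duplicated into a dword):
##
##   if ((uintptr_t)d & 3) { *d++ = pix; n--; }   /* align, n > 0     */
##   for (i = 0; i < (n >> 1); i++) { *(uint32_t *)d = pix2; d += 2; }
##   if (n & 1) *d = pix;                         /* carry -> jnc     */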
.align 8
_ClearX86_8:
        pushl %ebp
        movl %esp,%ebp

        movl 8(%ebp),%ebp
        movl 4(%ebp),%eax       # pixel value
        movl 12(%ebp),%edx      # height
        movb %al,%ah

        movl (%ebp),%edi        # destination
        movl %eax,%ecx
        shll $16,%eax           # Put the byte pixel value in all four bytes
        andl $0x0ffff,%ecx      # of eax
        movl 8(%ebp),%ebx
        orl %ecx,%eax

        cmpl $5,%ebx            # removes need for extra checks later
        jbe _ClearX86_8.L_short_y

.align 4
_ClearX86_8.L_y:
        testl $3,%edi
        jz _ClearX86_8.L_aligned

        movl %edi,%ecx          # ecx = (-edi) mod 4, the number of
        negl %ecx               # bytes up to the next dword boundary
        andl $3,%ecx
        subl %ecx,%ebx
        rep
        stosb

_ClearX86_8.L_aligned:
        movl %ebx,%ecx
        shrl $2,%ecx            # write the bulk as dwords
        andl $3,%ebx            # 0..3 trailing bytes remain
        rep
        stosl

        movl %ebx,%ecx          # write the trailing bytes
        rep
        stosb

        addl 16(%ebp),%edi
        decl %edx
        movl 8(%ebp),%ebx       # reload the width
        jnz _ClearX86_8.L_y

        popl %ebp
        ret

## Short loop
.align 4
_ClearX86_8.L_short_y:
        movl %ebx,%ecx
        rep
        stosb

        addl 16(%ebp),%edi
        decl %edx
        jnz _ClearX86_8.L_short_y

        popl %ebp
        ret
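## Equivalent C for the fast path of _ClearX86_8, as a sketch only
## (`d`, `n`, `pix` and `pix4` are hypothetical names for the byte
## destination pointer, the width in bytes, the byte value and the
## value replicated into all four bytes of a dword):
##
##   while ((uintptr_t)d & 3) { *d++ = pix; n--; }           /* head */
##   for (i = 0; i < (n >> 2); i++) { *(uint32_t *)d = pix4; d += 4; }
##   for (i = 0; i < (n & 3); i++) *d++ = pix;               /* tail */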
## ClearX86_8 version 2,
## I am not sure whether this is faster or not...
## too many jumps could confuse the CPU branch guessing
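## (There is no .globl for _ClearX86_8_2, so this variant stays local
## to this file; presumably it is kept here to be benchmarked against
## _ClearX86_8.)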
.align 8
_ClearX86_8_2:
        pushl %ebp
        movl %esp,%ebp

        movl 8(%ebp),%ebp
        movl 4(%ebp),%eax       # pixel value
        movl 12(%ebp),%edx      # height
        movb %al,%ah

        movl (%ebp),%edi        # destination
        movl %eax,%ecx
        shll $16,%eax           # Put the byte pixel value in all four bytes
        andl $0x0ffff,%ecx      # of eax
        movl 8(%ebp),%ebx
        orl %ecx,%eax

        cmpl $5,%ebx            # removes need for extra checks in main loop
        jbe _ClearX86_8_2.L_short_y

.align 4
_ClearX86_8_2.L_y:
        testl $3,%edi
        jz _ClearX86_8_2.L_aligned

        movl %edi,%ecx          # unrolled alignment: write the
        negl %ecx               # (-edi) mod 4 leading bytes one
        andl $3,%ecx            # at a time instead of rep stosb
        movb %al,(%edi)
        subl %ecx,%ebx
        incl %edi
        decl %ecx
        jz _ClearX86_8_2.L_aligned
        movb %al,(%edi)
        incl %edi
        decl %ecx
        jz _ClearX86_8_2.L_aligned
        movb %al,(%edi)
        incl %edi

_ClearX86_8_2.L_aligned:
        movl %ebx,%ecx
        shrl $2,%ecx
        andl $3,%ebx            # sets ZF; rep stosl keeps the flags
        rep
        stosl

        jz _ClearX86_8_2.L_endline
        movb %al,(%edi)         # Write the remaining (ebx = 1, 2 or 3)
        incl %edi               # pixels one byte at a time
        decl %ebx
        jz _ClearX86_8_2.L_endline
        movb %al,(%edi)
        incl %edi
        decl %ebx
        jz _ClearX86_8_2.L_endline
        movb %al,(%edi)
        incl %edi
        decl %ebx
        jz _ClearX86_8_2.L_endline
        movb %al,(%edi)         # (never reached: ebx is at most 3 here,
        incl %edi               # so the jz above has already been taken)

_ClearX86_8_2.L_endline:
        addl 16(%ebp),%edi
        decl %edx
        movl 8(%ebp),%ebx
        jnz _ClearX86_8_2.L_y

        popl %ebp
        ret

## Short loop
.align 4
_ClearX86_8_2.L_short_y:
        movl %ebx,%ecx
        rep
        stosb

        addl 16(%ebp),%edi
        decl %edx
        jnz _ClearX86_8_2.L_short_y

        popl %ebp
        ret