123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271 |
- #
- # MMX surface clear routines for HERMES
- # Copyright (c) 1998 Christian Nentwich ([email protected])
- # This source code is licensed under the GNU LGPL
- #
- # Please refer to the file COPYING.LIB contained in the distribution for
- # licensing conditions
- #
## Code section and exported entry points.
## Syntax: GNU as, AT&T operand order, 32-bit x86 with MMX.
        .text

        .globl  _ClearMMX_8
        .globl  _ClearMMX_16
        .globl  _ClearMMX_24
        .globl  _ClearMMX_32
##
## --------------------------------------------------------------------------
## HermesClearInterface layout as read by the clear routines (byte offsets).
## Each routine receives one stack argument: a pointer to this structure.
##    0: char8 *dest          address of the first pixel to write
##    4: int32 value          pixel value to store
##    8: unsigned int width   pixels per row  (already checked to be >0!)
##   12: unsigned int height  number of rows  (already checked to be >0!)
##   16: int add              bytes added to the write pointer after each row
## --------------------------------------------------------------------------

## _ClearMMX_32 -- fill a surface of 32 bit pixels with one value.
##
## In:        4(%esp) = pointer to HermesClearInterface
## Out:       nothing
## Preserved: ebx, edi, ebp (saved on the stack and restored)
## Clobbers:  eax, ecx, edx, mm0, mm1, flags
## Executes emms before returning, so the MMX state is left empty.
##
## Register roles inside the loops:
##   eax = pixel value          mm0 = two copies of the pixel value
##   ebx = row width in pixels  ecx = pixel pairs left in the current row
##   edx = rows left            edi = write pointer
##   ebp = interface pointer
_ClearMMX_32:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    # ebx and edi belong to the caller in
        pushl   %edi                    # the 32 bit x86 C calling conventions
        movl    8(%ebp),%ebp            # ebp = interface pointer from here on

        movl    4(%ebp),%eax            # pixel value
        movd    %eax,%mm0               # mm0 = 0:value
        movl    12(%ebp),%edx           # height
        movq    %mm0,%mm1
        psllq   $32,%mm0                # mm0 = value:0
        movl    (%ebp),%edi             # destination
        por     %mm1,%mm0               # mm0 = value:value

.LClearMMX_32_row:
        movl    8(%ebp),%ecx            # width
        movl    %ecx,%ebx               # kept to test for an odd last pixel
        shrl    $1,%ecx                 # two pixels per quadword store
        jz      .LClearMMX_32_odd

.LClearMMX_32_pair:
        movq    %mm0,(%edi)
        addl    $8,%edi
        decl    %ecx
        jnz     .LClearMMX_32_pair

.LClearMMX_32_odd:
        testl   $1,%ebx                 # odd width leaves one pixel over
        jz      .LClearMMX_32_endrow
        movl    %eax,(%edi)
        addl    $4,%edi

.LClearMMX_32_endrow:
        addl    16(%ebp),%edi           # step over the gap to the next row
        decl    %edx
        jnz     .LClearMMX_32_row

        emms
        popl    %edi
        popl    %ebx
        popl    %ebp
        ret
## _ClearMMX_24 -- placeholder entry point for 24 bit surfaces.
##
## Returns immediately: the argument is not read and no memory is written.
## NOTE(review): callers presumably use another routine for 24 bit clears;
## confirm before relying on this symbol.
_ClearMMX_24:
        ret                             # nothing to do, no registers touched
## _ClearMMX_16 -- fill a surface of 16 bit pixels with one value.
##
## In:        4(%esp) = pointer to HermesClearInterface, fields used:
##              0 dest, 4 value (low 16 bits), 8 width, 12 height, 16 add
## Out:       nothing
## Preserved: ebx, edi, ebp (saved on the stack and restored)
## Clobbers:  eax, ecx, edx, mm0, mm1, flags
## Executes emms before returning, so the MMX state is left empty.
##
## Each row is written as: one leading pixel when the write pointer is not
## a multiple of four, then quadword stores of four pixels, then up to
## three trailing pixels.
_ClearMMX_16:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    # ebx and edi belong to the caller in
        pushl   %edi                    # the 32 bit x86 C calling conventions
        movl    8(%ebp),%ebp            # ebp = interface pointer from here on

        movl    4(%ebp),%eax            # pixel value
        movl    %eax,%ebx
        movl    12(%ebp),%edx           # height
        movl    (%ebp),%edi             # destination
        shll    $16,%eax                # pixel into the high half
        andl    $0xffff,%ebx
        orl     %ebx,%eax               # eax = pixel:pixel
        movd    %eax,%mm0
        movd    %eax,%mm1
        psllq   $32,%mm0
        por     %mm1,%mm0               # mm0 = four copies of the pixel

.LClearMMX_16_row:
        movl    8(%ebp),%ecx            # width
        testl   $3,%edi                 # write pointer a multiple of four?
        jz      .LClearMMX_16_aligned
        movw    %ax,(%edi)              # no: write one pixel first
        addl    $2,%edi
        decl    %ecx
        jz      .LClearMMX_16_endrow

.LClearMMX_16_aligned:
        movl    %ecx,%ebx               # kept for the trailing pixel count
        shrl    $2,%ecx                 # four pixels per quadword store
        jz      .LClearMMX_16_tail

.LClearMMX_16_quad:
        movq    %mm0,(%edi)
        addl    $8,%edi
        decl    %ecx
        jnz     .LClearMMX_16_quad

.LClearMMX_16_tail:
        andl    $3,%ebx                 # up to three pixels are left
        jz      .LClearMMX_16_endrow
        movw    %ax,(%edi)              # first trailing pixel
        addl    $2,%edi
        decl    %ebx
        jz      .LClearMMX_16_endrow
        movw    %ax,(%edi)              # second trailing pixel
        addl    $2,%edi
        decl    %ebx
        jz      .LClearMMX_16_endrow
        movw    %ax,(%edi)              # third and last trailing pixel
        addl    $2,%edi

.LClearMMX_16_endrow:
        addl    16(%ebp),%edi           # step over the gap to the next row
        decl    %edx
        jnz     .LClearMMX_16_row

        emms
        popl    %edi
        popl    %ebx
        popl    %ebp
        ret
## _ClearMMX_8 -- fill a surface of 8 bit pixels with one value.
##
## Original author note (it names Clear8_x86, presumably meaning this
## routine): not fully optimised yet, as it seems to be a tiny bit slower
## than the C routine.
##
## In:        4(%esp) = pointer to HermesClearInterface, fields used:
##              0 dest, 4 value (low 8 bits), 8 width, 12 height, 16 add
## Out:       nothing
## Preserved: ebx, edi, ebp (saved on the stack and restored)
## Clobbers:  eax, ecx, edx, mm0, mm1, flags
## Executes emms before returning, so the MMX state is left empty.
## The trailing rep stosb relies on the direction flag being clear, which
## the C calling conventions are expected to guarantee at entry.
##
## Each row is written as: single bytes up to the next multiple of four,
## then quadword stores of eight pixels, then an optional dword and up to
## three single bytes.
_ClearMMX_8:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    # ebx and edi belong to the caller in
        pushl   %edi                    # the 32 bit x86 C calling conventions
        movl    8(%ebp),%ebp            # ebp = interface pointer from here on

        movzbl  4(%ebp),%eax            # low byte of the pixel value
        imull   $0x01010101,%eax,%eax   # same byte in all four bytes of eax
        movl    12(%ebp),%edx           # height
        movl    (%ebp),%edi             # destination
        movd    %eax,%mm0
        movd    %eax,%mm1
        psllq   $32,%mm0
        por     %mm1,%mm0               # mm0 = eight copies of the byte

.LClearMMX_8_row:
        movl    8(%ebp),%ecx            # width
        movl    %edi,%ebx
        negl    %ebx
        andl    $3,%ebx                 # bytes up to the next multiple of four
        jz      .LClearMMX_8_aligned

.LClearMMX_8_lead:                      # at most three iterations
        movb    %al,(%edi)
        incl    %edi
        decl    %ecx
        jz      .LClearMMX_8_endrow     # row ended before alignment was reached
        decl    %ebx
        jnz     .LClearMMX_8_lead

.LClearMMX_8_aligned:
        movl    %ecx,%ebx               # kept for the trailing pixel count
        shrl    $3,%ecx                 # eight pixels per quadword store
        jz      .LClearMMX_8_tail

.LClearMMX_8_oct:
        movq    %mm0,(%edi)
        addl    $8,%edi
        decl    %ecx
        jnz     .LClearMMX_8_oct

.LClearMMX_8_tail:
        movl    %ebx,%ecx
        andl    $7,%ecx                 # up to seven pixels are left
        jz      .LClearMMX_8_endrow
        testb   $4,%cl                  # fewer than four: bytes only
        jz      .LClearMMX_8_bytes
        movl    %eax,(%edi)             # otherwise write a dword first
        addl    $4,%edi
        subl    $4,%ecx

.LClearMMX_8_bytes:
        rep                             # ecx may be zero here, then nothing
        stosb                           # is stored

.LClearMMX_8_endrow:
        addl    16(%ebp),%edi           # step over the gap to the next row
        decl    %edx
        jnz     .LClearMMX_8_row

        emms
        popl    %edi
        popl    %ebx
        popl    %ebp
        ret
|