#
# MMX surface clear routines for HERMES
# Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
# This source code is licensed under the GNU LGPL
#
# Please refer to the file COPYING.LIB contained in the distribution for
# licensing conditions
#

## Syntax:  GNU as, AT&T operand order (source first), 32-bit x86 + MMX.
##
## Calling sequence shared by every routine in this file:
##   - one argument, a pointer to a HermesClearInterface block, read from
##     the stack at 8(%ebp) once the frame has been set up
##   - plain "ret", so the caller removes the argument
##   - %ebx and %edi are used as scratch and are therefore saved on entry
##     and restored on exit (both are callee-saved in the 32-bit x86 C
##     calling conventions); %eax, %ecx, %edx, %mm0, %mm1 and the flags
##     are clobbered
##   - "emms" is executed before returning so the MMX state is released

.globl _ClearMMX_32
.globl _ClearMMX_24
.globl _ClearMMX_16
.globl _ClearMMX_8

.text

##
## --------------------------------------------------------------------------
## HermesClearInterface (ebp+..)
##   0: char8 *dest
##   4: int32 value
##   8: unsigned int width (already checked to be >0!)
##  12: unsigned int height (already checked to be >0!)
##  16: int add
##
## The same layout as named byte offsets:

        .equ    CLR_DEST,   0           # start of the first row
        .equ    CLR_VALUE,  4           # pixel value to store
        .equ    CLR_WIDTH,  8           # pixels per row
        .equ    CLR_HEIGHT, 12          # number of rows
        .equ    CLR_ADD,    16          # bytes added to the pointer after a row


## --------------------------------------------------------------------------
## _ClearMMX_32: fill width x height dwords with the 32-bit pixel value.
##
## Register roles inside the loops:
##   eax = pixel value            mm0 = pixel value in both dwords
##   ebx = width of this row      ecx = quadwords left in this row
##   edx = rows left              edi = write pointer
##   ebp = interface block
_ClearMMX_32:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    # callee-saved, used below
        pushl   %edi                    # callee-saved, used below

        movl    8(%ebp),%ebp            # ebp = interface block from here on

        movl    CLR_VALUE(%ebp),%eax    # pixel value
        movd    CLR_VALUE(%ebp),%mm0
        movl    CLR_HEIGHT(%ebp),%edx   # height
        movq    %mm0,%mm1
        psllq   $32,%mm0
        movl    CLR_DEST(%ebp),%edi     # destination
        por     %mm1,%mm0               # mm0 = value:value (two pixels)

_ClearMMX_32.L_y:
        movl    CLR_WIDTH(%ebp),%ecx
        movl    %ecx,%ebx               # keep the width for the odd-pixel test
        shrl    $1,%ecx                 # two pixels per quadword store
        jz      _ClearMMX_32.L_last     # width == 1: no quadword to write

_ClearMMX_32.L_x:
        movq    %mm0,(%edi)
        addl    $8,%edi
        decl    %ecx
        jnz     _ClearMMX_32.L_x

_ClearMMX_32.L_last:
        testl   $1,%ebx                 # odd width leaves one pixel over
        jz      _ClearMMX_32.L_endline
        movl    %eax,(%edi)
        addl    $4,%edi

_ClearMMX_32.L_endline:
        addl    CLR_ADD(%ebp),%edi      # step to the start of the next row
        decl    %edx
        jnz     _ClearMMX_32.L_y

        emms
        popl    %edi
        popl    %ebx
        popl    %ebp
        ret


## --------------------------------------------------------------------------
## _ClearMMX_24: placeholder. Returns immediately and writes nothing.
_ClearMMX_24:
        ret


## --------------------------------------------------------------------------
## _ClearMMX_16: fill width x height words with the low 16 bits of the value.
##
## Each row is written in three phases: an optional single pixel when the
## row pointer is not a multiple of 4, then quadwords (4 pixels each), then
## up to 3 leftover pixels.
##
## Register roles inside the loops:
##   eax = pixel in both halves   mm0 = pixel in all four words
##   ebx = pixels left after the head phase, later the tail count
##   ecx = pixels / quadwords left
##   edx = rows left              edi = write pointer
##   ebp = interface block
_ClearMMX_16:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    # callee-saved, used below
        pushl   %edi                    # callee-saved, used below

        movl    8(%ebp),%ebp            # ebp = interface block from here on

        movl    CLR_VALUE(%ebp),%eax    # pixel value
        movl    CLR_VALUE(%ebp),%ebx
        movl    CLR_HEIGHT(%ebp),%edx   # height
        movl    CLR_DEST(%ebp),%edi     # destination

        shll    $16,%eax                # Duplicate pixel value
        andl    $0x0ffff,%ebx
        orl     %ebx,%eax               # eax = pixel:pixel

        movd    %eax,%mm0
        movd    %eax,%mm1
        psllq   $32,%mm0
        por     %mm1,%mm0               # mm0 = four copies of the pixel

_ClearMMX_16.L_y:
        movl    CLR_WIDTH(%ebp),%ecx

        testl   $3,%edi                 # Check if destination is aligned mod 4
        jz      _ClearMMX_16.L_aligned

        movw    %ax,(%edi)              # otherwise write one pixel
        addl    $2,%edi
        decl    %ecx
        jz      _ClearMMX_16.L_endline  # the row was a single pixel

_ClearMMX_16.L_aligned:
        movl    %ecx,%ebx               # remember the count for the tail
        shrl    $2,%ecx                 # four pixels per quadword store
        jz      _ClearMMX_16.L_last

_ClearMMX_16.L_x:
        movq    %mm0,(%edi)
        addl    $8,%edi
        decl    %ecx
        jnz     _ClearMMX_16.L_x

_ClearMMX_16.L_last:
        andl    $3,%ebx                 # 0..3 trailing pixels
        jz      _ClearMMX_16.L_endline

_ClearMMX_16.L_tail:
        movw    %ax,(%edi)              # Write trailing pixels
        addl    $2,%edi
        decl    %ebx
        jnz     _ClearMMX_16.L_tail

_ClearMMX_16.L_endline:
        addl    CLR_ADD(%ebp),%edi      # step to the start of the next row
        decl    %edx
        jnz     _ClearMMX_16.L_y

        emms
        popl    %edi
        popl    %ebx
        popl    %ebp
        ret


## --------------------------------------------------------------------------
## _ClearMMX_8: fill width x height bytes with the low 8 bits of the value.
##
## Note kept from the original author (who called the routine Clear8_x86):
## it is not fully optimised yet, as it seemed to be a tiny bit slower than
## the C routine.
##
## Each row is written in three phases: single bytes until the pointer is a
## multiple of 4 (or the row is exhausted), then quadwords (8 pixels each),
## then up to 7 leftover bytes as an optional dword plus "rep stosb".
##
## Register roles inside the loops:
##   eax = pixel in all four bytes   mm0 = pixel in all eight bytes
##   ebx = pixels left after the head phase
##   ecx = pixels / quadwords / tail bytes left
##   edx = rows left                 edi = write pointer
##   ebp = interface block
_ClearMMX_8:
        pushl   %ebp
        movl    %esp,%ebp
        pushl   %ebx                    # callee-saved, used below
        pushl   %edi                    # callee-saved, used below

        movl    8(%ebp),%ebp            # ebp = interface block from here on

        movzbl  CLR_VALUE(%ebp),%eax    # low byte of the pixel value
        movl    CLR_HEIGHT(%ebp),%edx   # height
        imull   $0x01010101,%eax,%eax   # Put the byte pixel value in all four bytes
        movl    CLR_DEST(%ebp),%edi     # destination

        movd    %eax,%mm0
        movd    %eax,%mm1
        psllq   $32,%mm0
        por     %mm1,%mm0               # mm0 = eight copies of the pixel

_ClearMMX_8.L_y:
        movl    CLR_WIDTH(%ebp),%ecx

        testl   $3,%edi                 # Align mod 4
        jz      _ClearMMX_8.L_aligned

_ClearMMX_8.L_head:
        movb    %al,(%edi)              # at most 3 bytes are written here
        incl    %edi
        decl    %ecx
        jz      _ClearMMX_8.L_endline   # row exhausted before alignment
        testl   $3,%edi
        jnz     _ClearMMX_8.L_head

_ClearMMX_8.L_aligned:
        movl    %ecx,%ebx               # Store ecx for later
        shrl    $3,%ecx                 # We write 8 pixels at once
        jz      _ClearMMX_8.L_last

_ClearMMX_8.L_x:
        movq    %mm0,(%edi)
        addl    $8,%edi
        decl    %ecx
        jnz     _ClearMMX_8.L_x

_ClearMMX_8.L_last:
        movl    %ebx,%ecx               # Clean up trailing pixels
        andl    $7,%ecx                 # Could be up to 7 left
        jz      _ClearMMX_8.L_endline

        testb   $4,%cl                  # If theres less than four jump
        jz      _ClearMMX_8.L_lessthanfour

        movl    %eax,(%edi)             # Otherwise write a dword
        addl    $4,%edi
        subl    $4,%ecx

_ClearMMX_8.L_lessthanfour:
        rep stosb                       # Clean up the very rest (ecx may be 0;
                                        # assumes the direction flag is clear)

_ClearMMX_8.L_endline:
        addl    CLR_ADD(%ebp),%edi      # step to the start of the next row
        decl    %edx
        jnz     _ClearMMX_8.L_y

        emms
        popl    %edi
        popl    %ebx
        popl    %ebp
        ret