#
# x86 surface clear routines for HERMES
# Copyright (c) 1998 Christian Nentwich (c.nentwich@cs.ucl.ac.uk)
# This source code is licensed under the GNU LGPL
#
# Please refer to the file COPYING.LIB contained in the distribution for
# licensing conditions
#
# (04/10/99) Modified ClearX86_8

        .globl _ClearX86_32
        .globl _ClearX86_24
        .globl _ClearX86_16
        .globl _ClearX86_8

        .text

##
## --------------------------------------------------------------------------
## HermesClearInterface (ebp+..)
##   0: char8 *dest
##   4: int32 value
##   8: unsigned int width (already checked to be >0!)
##  12: unsigned int height (already checked to be >0!)
##  16: int add
##
## Note: the routines below modify %eax, %ebx, %ecx, %edx and %edi
## without saving them.
## --------------------------------------------------------------------------

        .align 8
_ClearX86_32:
        pushl %ebp
        movl %esp,%ebp
        movl 8(%ebp),%ebp

        movl (%ebp),%edi        # destination
        movl 4(%ebp),%eax       # pixel value
        movl 12(%ebp),%edx      # height

        .align 4
_ClearX86_32.L_y:
        movl 8(%ebp),%ecx       # width

        rep stosl

        addl 16(%ebp),%edi      # skip to the next row

        decl %edx
        jnz _ClearX86_32.L_y

        popl %ebp
        ret


# 24 bpp clearing is not implemented; this routine is a stub that
# returns immediately
_ClearX86_24:
        ret


        .align 8
_ClearX86_16:
        pushl %ebp
        movl %esp,%ebp
        movl 8(%ebp),%ebp

        movl (%ebp),%edi        # destination
        movl 4(%ebp),%eax       # pixel value
        movl 12(%ebp),%edx      # height

        movl %eax,%ebx
        shll $16,%eax           # Duplicate pixel value
        andl $0x0ffff,%ebx
        orl %ebx,%eax

_ClearX86_16.L_y:
        movl 8(%ebp),%ecx       # width

        testl $3,%edi           # Check if destination is aligned mod 4
        jz _ClearX86_16.L_aligned

        movw %ax,(%edi)         # otherwise write one pixel
        addl $2,%edi

        decl %ecx
        jz _ClearX86_16.L_endline

_ClearX86_16.L_aligned:
        shrl %ecx               # CF holds the odd trailing pixel, if any
        rep stosl               # rep stos does not alter the flags...

        jnc _ClearX86_16.L_endline

        movw %ax,(%edi)         # ...so write the odd pixel here
        addl $2,%edi

_ClearX86_16.L_endline:
        addl 16(%ebp),%edi

        decl %edx
        jnz _ClearX86_16.L_y

        popl %ebp
        ret


        .align 8
_ClearX86_8:
        pushl %ebp
        movl %esp,%ebp
        movl 8(%ebp),%ebp

        movl 4(%ebp),%eax       # pixel value
        movl 12(%ebp),%edx      # height
        movb %al,%ah

        movl (%ebp),%edi        # destination
        movl %eax,%ecx
        shll $16,%eax           # Put the byte pixel value in all four bytes
        andl $0x0ffff,%ecx      # of eax
        movl 8(%ebp),%ebx       # width
        orl %ecx,%eax

        cmpl $5,%ebx            # removes need for extra checks later
        jbe _ClearX86_8.L_short_y

        .align 4
_ClearX86_8.L_y:
        testl $3,%edi
        jz _ClearX86_8.L_aligned

        movl %edi,%ecx          # bytes up to the next dword boundary (1..3)
        negl %ecx
        andl $3,%ecx
        subl %ecx,%ebx
        rep stosb

_ClearX86_8.L_aligned:
        movl %ebx,%ecx
        shrl $2,%ecx            # write width/4 dwords
        andl $3,%ebx
        rep stosl

        movl %ebx,%ecx          # write the remaining 0..3 bytes
        rep stosb

        addl 16(%ebp),%edi

        decl %edx
        movl 8(%ebp),%ebx
        jnz _ClearX86_8.L_y

        popl %ebp
        ret

## Short loop for rows of five pixels or fewer

        .align 4
_ClearX86_8.L_short_y:
        movl %ebx,%ecx
        rep stosb

        addl 16(%ebp),%edi

        decl %edx
        jnz _ClearX86_8.L_short_y

        popl %ebp
        ret
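##
## --------------------------------------------------------------------------
## For reference, the parameter block documented at the top of this file
## corresponds to a C struct along the following lines. This is a sketch
## inferred from the offsets in that comment block; the authoritative
## declaration lives in the HERMES headers.
##
##   typedef struct
##   {
##       char *dest;          /*  0: first byte of the destination area   */
##       int value;           /*  4: pixel value (low 8/16/32 bits used)  */
##       unsigned int width;  /*  8: pixels per row, guaranteed > 0       */
##       unsigned int height; /* 12: rows to clear, guaranteed > 0        */
##       int add;             /* 16: bytes to skip from the end of one    */
##                            /*     row to the start of the next         */
##   } HermesClearInterface;
##
## A C caller would fill one of these in and call, e.g., ClearX86_32(&ci);
## the leading underscore on the symbols here is the C name mangling.
## --------------------------------------------------------------------------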
## ClearX86_8 version 2.
## I am not sure whether this is faster or not: too many jumps could
## confuse the CPU's branch prediction. (Not exported via .globl.)

        .align 8
_ClearX86_8_2:
        pushl %ebp
        movl %esp,%ebp
        movl 8(%ebp),%ebp

        movl 4(%ebp),%eax       # pixel value
        movl 12(%ebp),%edx      # height
        movb %al,%ah

        movl (%ebp),%edi        # destination
        movl %eax,%ecx
        shll $16,%eax           # Put the byte pixel value in all four bytes
        andl $0x0ffff,%ecx      # of eax
        movl 8(%ebp),%ebx       # width
        orl %ecx,%eax

        cmpl $5,%ebx            # removes need for extra checks in main loop
        jbe _ClearX86_8_2.L_short_y

        .align 4
_ClearX86_8_2.L_y:
        testl $3,%edi
        jz _ClearX86_8_2.L_aligned

        movl %edi,%ecx          # bytes up to the next dword boundary (1..3)
        negl %ecx
        andl $3,%ecx

        movb %al,(%edi)         # write them out unrolled
        subl %ecx,%ebx
        incl %edi

        decl %ecx
        jz _ClearX86_8_2.L_aligned

        movb %al,(%edi)
        incl %edi

        decl %ecx
        jz _ClearX86_8_2.L_aligned

        movb %al,(%edi)
        incl %edi

_ClearX86_8_2.L_aligned:
        movl %ebx,%ecx
        shrl $2,%ecx
        andl $3,%ebx            # sets ZF for the jz below
        rep stosl               # (rep stos leaves the flags alone)

        jz _ClearX86_8_2.L_endline      # ebx == 0, nothing left

        movb %al,(%edi)         # Write remaining (1, 2 or 3) pixels
        incl %edi
        decl %ebx
        jz _ClearX86_8_2.L_endline

        movb %al,(%edi)
        incl %edi
        decl %ebx
        jz _ClearX86_8_2.L_endline

        movb %al,(%edi)
        incl %edi
        decl %ebx
        jz _ClearX86_8_2.L_endline

        movb %al,(%edi)         # unreachable: ebx is at most 3 here
        incl %edi

_ClearX86_8_2.L_endline:
        addl 16(%ebp),%edi

        decl %edx
        movl 8(%ebp),%ebx
        jnz _ClearX86_8_2.L_y

        popl %ebp
        ret

## Short loop

        .align 4
_ClearX86_8_2.L_short_y:
        movl %ebx,%ecx
        rep stosb

        addl 16(%ebp),%edi

        decl %edx
        jnz _ClearX86_8_2.L_short_y

        popl %ebp
        ret
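##
## --------------------------------------------------------------------------
## In C terms, both 8 bpp routines implement roughly the loop below. This is
## an illustrative sketch (not part of HERMES): it shows the byte replication
## into a dword and the align / rep stosl / trailing-bytes split; the short
## path mirrors the cmpl $5 check above.
##
##   static void clear8_sketch(unsigned char *dest, unsigned int value,
##                             unsigned int width, unsigned int height,
##                             int add)
##   {
##       unsigned int v = value & 0xffu;
##       v |= v << 8;                       /* movb %al,%ah             */
##       v |= v << 16;                      /* shll $16 / andl / orl    */
##       while (height--) {
##           unsigned int n = width;
##           if (width <= 5) {                      /* short loop       */
##               while (n--) *dest++ = (unsigned char)v;
##           } else {
##               while ((unsigned long)dest & 3) {  /* align to dword   */
##                   *dest++ = (unsigned char)v; n--;
##               }
##               for (; n >= 4; n -= 4) {           /* rep stosl        */
##                   *(unsigned int *)dest = v; dest += 4;
##               }
##               while (n--) *dest++ = (unsigned char)v; /* 0..3 bytes  */
##           }
##           dest += add;
##       }
##   }
## --------------------------------------------------------------------------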