@ file core_asm.s
@ core asm routines
@ author cearn
@ Modified by Legolas for fpc4gba use
@
@ === NOTES ===
@ * 20050924: Lower overhead for all; reduced i-count for u16 loops.
@ * These are 16/32bit memset and memcpy. The 32bit versions were written
@   for iwram for maximum effect (in this file the .iwram section directive
@   is commented out and they are assembled into .text) and pretty much do
@   what CpuFastSet does, except that they also work for counts that are
@   not multiples of 8 words. Speed is as good as CpuFastSet, but with a
@   little less overhead.
@ * The 16bit versions call the 32bit ones if possible and/or desirable.
@   They are thumb/ROM functions, but were written in asm anyway because
@   GCC goes haywire with the use of registers, resulting in a much
@   higher overhead (i.e., detrimental for low counts).
@ * Crossover with inline while(nn--) loops (not for(ii++), which are
@   much slower):
@     memcpy32: ~4
@     memset32: ~5
@     memcpy16: ~8
@     memset16: ~8
- .file "core_asm.s"
- @ === procedure memcpy32(dest: pointer; const src: pointer; wcount: u32); ======
- @ Fast-copy by words.
- @ param dest Destination address.
- @ param src Source address.
- @ param wcount Number of words.
- @ note: src and dst must be word aligned.
- @ note: r0 and r1 return as dst + wdn and src + wdn.
- @ Reglist:
- @ r0, r1: dst, src
- @ r2: wcount, then wcount>>3
- @ r3-r10: data buffer
- @ r12: wcount&7
- .text @ ?!?!?
- @ .section .iwram,"ax", %progbits
- .align 2
- .code 32
- .global memcpy32
- memcpy32:
- and r12, r2, #7
- movs r2, r2, lsr #3
- beq .Lres_cpy32
- stmfd sp!, {r4-r10}
- @ copy 32byte chunks with 8fold xxmia
- .Lmain_cpy32:
- ldmia r1!, {r3-r10}
- stmia r0!, {r3-r10}
- subs r2, r2, #1
- bhi .Lmain_cpy32
- ldmfd sp!, {r4-r10}
- @ and the residual 0-7 words
- .Lres_cpy32:
- subs r12, r12, #1
- ldmcsia r1!, {r3}
- stmcsia r0!, {r3}
- bcs .Lres_cpy32
- bx lr
@ === procedure memset32(dest: pointer; wd: u32; wcount: u32); =================
@ Fills wcount 32-bit words at dest with the value wd. ARM-state routine.
@
@ In:   r0 = dest   (must be word aligned)
@       r1 = wd     (the fill word itself, not an address)
@       r2 = wcount (number of words; 0 writes nothing)
@ Out:  r0 is post-incremented past the filled area (dest + 4*wcount).
@       r1 is left unchanged.
@ Register roles:
@   r2      wcount on entry, then the number of 8-word chunks left
@   r3-r10  eight copies of wd (r4-r10 are saved and restored on the stack)
@   ip      wcount & 7: words left over after the chunks (ip is r12)
@ Clobbers: r2, r3, ip, flags.

    .text                           @ assembled into .text; the .iwram
@   .section .iwram,"ax", %progbits @ placement on this line is disabled
    .align  2
    .code   32
    .global memset32
memset32:
    and     ip, r2, #7              @ leftover words, handled one by one
    movs    r2, r2, lsr #3          @ whole 8-word chunks; Z set if none
    beq     .Lset32_words
    stmdb   sp!, {r4-r10}           @ free up r4-r10 as extra buffer

    @ Replicate the fill word into all eight buffer registers.
    mov     r10, r1
    mov     r9, r1
    mov     r8, r1
    mov     r7, r1
    mov     r6, r1
    mov     r5, r1
    mov     r4, r1
    mov     r3, r1

    @ Bulk part: write 32 bytes per pass with an 8-register stm.
.Lset32_chunks:
    stmia   r0!, {r3-r10}
    subs    r2, r2, #1
    bhi     .Lset32_chunks          @ loop while chunks remain
    ldmia   sp!, {r4-r10}

    @ Residual part: 0-7 single words. The subs clears carry once ip was
    @ already 0, which both skips the store and ends the loop.
.Lset32_words:
    subs    ip, ip, #1
    stmcsia r0!, {r1}
    bcs     .Lset32_words
    bx      lr
@ === procedure memcpy16(dest: pointer; const src: pointer; hwcount: u32); =====
@ Copy by halfwords. Thumb-state routine.
@ Hands the bulk of the work to memcpy32() when hwcount > 5 and dest and src
@ have the same alignment within a word; otherwise it copies halfword by
@ halfword.
@
@ In:   r0 = dest    (must be halfword aligned)
@       r1 = src     (must be halfword aligned)
@       r2 = hwcount (number of halfwords; 0 copies nothing)
@ Note: r0/r1 are advanced past whatever the alignment halfword and
@       memcpy32() copied. The halfword tail loop uses indexed addressing
@       and does not advance them.
@ Reglist:
@   r0, r1: dst, src
@   r2:     hwcount; word count for memcpy32; then byte offset in the tail
@   r3:     tmp; data buffer; call target; return address
@   r4:     bit 31 = "one halfword left after the word copy"
@           (saved on entry, restored on exit)
@ Clobbers: r2, r3, flags, plus whatever memcpy32 clobbers.

    .text
    .align  2
    .code   16
    .global memcpy16
    .thumb_func
memcpy16:
    push    {r4, lr}
    @ 5 halfwords or fewer -> plain halfword copy
    cmp     r2, #5
    bls     .Ltail_cpy16
    @ unreconcilable alignment -> plain halfword copy
    @ if (dst^src)&2 the two can never be word aligned together
    mov     r3, r0
    eor     r3, r1
    lsl     r3, r3, #31             @ bit 1 of (dst^src) into carry
    bcs     .Ltail_cpy16            @ (dst^src)&2 : must copy by halfword
    @ src and dst have the same alignment -> word align both
    lsl     r3, r0, #31             @ bit 1 of dst (== bit 1 of src) into carry
    bcc     .Lmain_cpy16            @ already word aligned
    @ aligning is necessary: copy 1 halfword and step both pointers
    ldrh    r3, [r1]
    strh    r3, [r0]
    add     r0, #2
    add     r1, #2
    sub     r2, r2, #1
    @ the real work is done by memcpy32
.Lmain_cpy16:
    lsl     r4, r2, #31             @ keep the odd-halfword bit in r4[31]
    lsr     r2, r2, #1              @ r2 = number of whole words
    ldr     r3, .Lpool_cpy16        @ r3 = address of memcpy32 (ARM code)
    @ The call must set lr: memcpy32 returns with "bx lr". bl stores the
    @ Thumb return address in lr and the veneer's "bx r3" switches to ARM
    @ state. A bare "bx r3" here would let memcpy32 return straight to our
    @ caller, skipping the tail copy and the pop of {r4, lr}.
    bl      .Lcall_cpy16
    @ NOTE: r0,r1 are altered by memcpy32, but in exactly the right
    @ way, so we can use them as is.
    lsr     r2, r4, #31             @ r2 = 0 or 1 halfword left; sets Z
    beq     .Lend_cpy16
.Ltail_cpy16:
    sub     r2, #1                  @ flag-setting in Thumb: borrow if r2 was 0
    bcc     .Lend_cpy16             @ r2 was 0, nothing to copy
    lsl     r2, r2, #1              @ byte offset of the last halfword
    @ copy backwards from the last halfword down to offset 0
.Lres_cpy16:
    ldrh    r3, [r1, r2]
    strh    r3, [r0, r2]
    sub     r2, r2, #2
    bcs     .Lres_cpy16             @ loop until the offset goes below 0
.Lend_cpy16:
    pop     {r4}
    pop     {r3}                    @ r3 = saved lr
    @ Shared "bx r3": function return when fallen into from above, and
    @ call veneer when reached through the bl.
.Lcall_cpy16:
    bx      r3
    .align  2
.Lpool_cpy16:
    .word   memcpy32
@ === procedure memset16(dest: pointer; hw: u16; hwcount: u32); ================
@ Fill by halfwords. Thumb-state routine.
@ Hands the bulk of the work to memset32() when hwcount > 5; otherwise it
@ stores halfword by halfword.
@
@ In:   r0 = dest    (must be halfword aligned)
@       r1 = hw      (the fill halfword itself, not an address; only the
@                     low 16 bits are used)
@       r2 = hwcount (number of halfwords; 0 writes nothing)
@ Note: r0 is advanced past whatever the alignment halfword and memset32()
@       wrote. The halfword tail loop uses indexed addressing and does not
@       advance it.
@ Reglist:
@   r0, r1: dst, hw (r1 temporarily holds hw doubled into a word)
@   r2:     hwcount; word count for memset32; then byte offset in the tail
@   r3:     tmp; call target; return address
@   r4:     tmp; then bit 31 = "one halfword left after the word fill"
@           (saved on entry, restored on exit)
@ Clobbers: r1, r2, r3, flags, plus whatever memset32 clobbers.

    .text
    .align  2
    .code   16
    .global memset16
    .thumb_func
memset16:
    push    {r4, lr}
    @ 5 halfwords or fewer -> plain halfword fill
    cmp     r2, #5
    bls     .Ltail_set16
    @ dst not word aligned: store 1 halfword and step the pointer
    lsl     r3, r0, #31             @ bit 1 of dst into carry
    bcc     .Lmain_set16
    strh    r1, [r0]
    add     r0, #2
    sub     r2, r2, #1
    @ memset32 does the real work
.Lmain_set16:
    @ Build the fill word hw:hw from the low halfword only, so this path
    @ writes the same value as the strh paths even if the caller left
    @ something in the upper half of r1.
    lsl     r1, r1, #16             @ r1 = hw << 16
    lsr     r4, r1, #16             @ r4 = hw
    orr     r1, r4                  @ r1 = hw:hw
    lsl     r4, r2, #31             @ keep the odd-halfword bit in r4[31]
    lsr     r2, r2, #1              @ r2 = number of whole words
    ldr     r3, .Lpool_set16        @ r3 = address of memset32 (ARM code)
    @ The call must set lr: memset32 returns with "bx lr". bl stores the
    @ Thumb return address in lr and the veneer's "bx r3" switches to ARM
    @ state. A bare "bx r3" here would let memset32 return straight to our
    @ caller, skipping the tail fill and the pop of {r4, lr}.
    bl      .Lcall_set16
    @ NOTE: r0 is altered by memset32, but in exactly the right
    @ way, so we can use it as is. r1 is now doubled though.
    lsr     r2, r4, #31             @ r2 = 0 or 1 halfword left; sets Z
    beq     .Lend_set16
    lsr     r1, #16                 @ back to a single halfword
.Ltail_set16:
    sub     r2, #1                  @ flag-setting in Thumb: borrow if r2 was 0
    bcc     .Lend_set16             @ r2 was 0, nothing to fill
    lsl     r2, r2, #1              @ byte offset of the last halfword
    @ fill backwards from the last halfword down to offset 0
.Lres_set16:
    strh    r1, [r0, r2]
    sub     r2, r2, #2
    bcs     .Lres_set16             @ loop until the offset goes below 0
.Lend_set16:
    pop     {r4}
    pop     {r3}                    @ r3 = saved lr
    @ Shared "bx r3": function return when fallen into from above, and
    @ call veneer when reached through the bl.
.Lcall_set16:
    bx      r3
    .align  2
.Lpool_set16:
    .word   memset32
|