core_asm.as 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
  1. @ file core_asm.s
  2. @ core asm routines
  3. @ author cearn
  4. @ Modified by Legolas for fpc4gba use
  5. @
  6. @ === NOTES ===
  7. @ * 20050924: Lower overhead for all; reduced i-count for u16 loops.
  8. @ * These are 16/32bit memset and memcpy. The 32bit versions are meant
  9. @ for iwram for maximum effect (note: the .iwram section directives are
  10. @ commented out in this file, so they are assembled into .text) and pretty
  11. @ much do what CpuFastSet does, except that they also work for non-multiples of 8 words. Speed is as good as CpuFastSet, but with a little less overhead.
  12. @ * The 16bit versions call the 32bit ones if possible and/or desirable.
  13. @ They are thumb/ROM functions, but were written in asm anyway because
  14. @ GCC goes haywire with its use of registers, resulting in a much
  15. @ higher overhead (i.e., detrimental for low counts).
  16. @ * Crossover with inline while(nn--) loops (not for(ii++), which are
  17. @ much slower):
  18. @ memcpy32: ~4
  19. @ memset32: ~5
  20. @ memcpy16: ~8
  21. @ memset16: ~8
  22. .file "core_asm.s"
  @ === procedure memcpy32(dest: pointer; const src: pointer; wcount: u32); ======
  @ Word-wise block copy (ARM state).
  @   dest   (r0)  destination address; must be word aligned
  @   src    (r1)  source address; must be word aligned
  @   wcount (r2)  number of 32-bit words to copy
  @ On return r0 and r1 have been advanced past the last word copied,
  @ because every transfer below uses write-back / post-increment.
  @ Register use:
  @   r2      wcount, then the number of 8-word chunks (wcount >> 3)
  @   r12     leftover word count (wcount & 7)
  @   r3-r10  transfer buffer; r4-r10 are stacked only when the chunk loop runs
          .text                           @ ?!?!?
  @       .section .iwram,"ax", %progbits
          .align  2
          .code   32
          .global memcpy32
  memcpy32:
          and     r12, r2, #7             @ r12 = words left after the 8-word chunks
          movs    r2, r2, lsr #3          @ r2 = chunk count; Z set when there are none
          beq     .Lres_cpy32
          stmdb   sp!, {r4-r10}           @ keep the caller's r4-r10 while they buffer data

  @ Bulk part: move 32 bytes per iteration with one 8-register load/store pair.
  .Lmain_cpy32:
          ldmia   r1!, {r3-r10}
          stmia   r0!, {r3-r10}
          subs    r2, r2, #1
          bne     .Lmain_cpy32            @ r2 >= 1 on entry, so "not zero" == "chunks left"
          ldmia   sp!, {r4-r10}

  @ Residual part: 0-7 single words. The subtract borrows (carry clear)
  @ once r12 was already 0, which skips the transfer and ends the loop.
  .Lres_cpy32:
          subs    r12, r12, #1
          ldrcs   r3, [r1], #4
          strcs   r3, [r0], #4
          bcs     .Lres_cpy32
          bx      lr
  @ === procedure memset32(dest: pointer; wd: u32; wcount: u32); =================
  @ Word-wise block fill (ARM state).
  @   dest   (r0)  destination address; must be word aligned
  @   wd     (r1)  the 32-bit fill value itself (not an address)
  @   wcount (r2)  number of 32-bit words to write
  @ On return r0 has been advanced past the last word written, because
  @ every store below uses write-back / post-increment.
  @ Register use:
  @   r2      wcount, then the number of 8-word chunks (wcount >> 3)
  @   r12     leftover word count (wcount & 7)
  @   r3-r10  eight copies of wd; r4-r10 are stacked only when the chunk loop runs
          .text                           @ ?!?!?
  @       .section .iwram,"ax", %progbits
          .align  2
          .code   32
          .global memset32
  memset32:
          and     r12, r2, #7             @ r12 = words left after the 8-word chunks
          movs    r2, r2, lsr #3          @ r2 = chunk count; Z set when there are none
          beq     .Lres_set32
          stmdb   sp!, {r4-r10}           @ keep the caller's r4-r10 while they hold wd

  @ Replicate the fill word so one stmia writes 32 bytes.
          mov     r10, r1
          mov     r9, r1
          mov     r8, r1
          mov     r7, r1
          mov     r6, r1
          mov     r5, r1
          mov     r4, r1
          mov     r3, r1

  @ Bulk part: one 8-register store per iteration.
  .Lmain_set32:
          stmia   r0!, {r3-r10}
          subs    r2, r2, #1
          bne     .Lmain_set32            @ r2 >= 1 on entry, so "not zero" == "chunks left"
          ldmia   sp!, {r4-r10}

  @ Residual part: 0-7 single words. The subtract borrows (carry clear)
  @ once r12 was already 0, which skips the store and ends the loop.
  .Lres_set32:
          subs    r12, r12, #1
          strcs   r1, [r0], #4
          bcs     .Lres_set32
          bx      lr
  @ === procedure memcpy16(dest: pointer; const src: pointer; hwcount: u32); =====
  @ Halfword block copy (Thumb state).
  @ Hands the bulk of the work to memcpy32() when hwcount > 5 and dest and
  @ src share the same word alignment; otherwise copies halfword by halfword.
  @   dest    (r0)  destination address; must be halfword aligned
  @   src     (r1)  source address; must be halfword aligned
  @   hwcount (r2)  number of halfwords to copy
  @ note: r0 and r1 are only advanced by the alignment step and by memcpy32;
  @       the halfword tail uses indexed accesses and leaves them where they
  @       are, so do not rely on their values after return.
  @ Register use:
  @   r0, r1  dest, src
  @   r2      hwcount, then word count for memcpy32, then byte index in the tail
  @   r3      scratch, data buffer, call target and return address
  @   r4      bit 31 = "one halfword left over after the word copy"
          .text
          .align  2
          .code   16
          .global memcpy16
          .thumb_func
  memcpy16:
          push    {r4, lr}

  @ 5 halfwords or fewer: not worth the setup, use the plain halfword loop.
          cmp     r2, #5
          bls     .Ltail_cpy16

  @ If (dest ^ src) & 2 the two pointers can never be word aligned at the
  @ same time, so the word copy is impossible.
          mov     r3, r0
          eor     r3, r1
          lsl     r3, r3, #31             @ shifts bit 1 of (dest ^ src) into carry
          bcs     .Ltail_cpy16

  @ Same alignment. If dest (and therefore src) sits on an odd halfword,
  @ copy one halfword first so both become word aligned.
          lsl     r3, r0, #31             @ shifts bit 1 of dest into carry
          bcc     .Lmain_cpy16
          ldrh    r3, [r1]
          strh    r3, [r0]
          add     r0, #2
          add     r1, #2
          sub     r2, r2, #1

  @ Word copy through memcpy32.
  .Lmain_cpy16:
          lsl     r4, r2, #31             @ remember the odd halfword (bit 0 -> bit 31)
          lsr     r2, r2, #1              @ halfwords -> words
          ldr     r3, .Lpool_cpy16
  @ memcpy32 is ARM code, so it must be entered with bx. A bare "bx r3"
  @ would leave lr pointing at OUR caller: memcpy32 would then return
  @ there directly, skipping the tail and leaving {r4, lr} on the stack.
  @ bl to the "bx r3" trampoline sets lr to the next instruction first.
          bl      .Llong_cpy16
  @ memcpy32 leaves r0/r1 just past the words it copied, which is exactly
  @ where the leftover halfword (if any) lives. It preserves r4.
          lsr     r2, r4, #31             @ r2 = 0 or 1 leftover halfwords
          beq     .Lend_cpy16

  @ Halfword loop: copies r2 halfwords, walking the byte index downwards.
  .Ltail_cpy16:
          sub     r2, #1
          bcc     .Lend_cpy16             @ r2 was 0: nothing to copy
          lsl     r2, r2, #1              @ byte offset of the last halfword
  .Lres_cpy16:
          ldrh    r3, [r1, r2]
          strh    r3, [r0, r2]
          sub     r2, r2, #2
          bcs     .Lres_cpy16             @ borrow once the index drops below 0

  .Lend_cpy16:
          pop     {r4}
          pop     {r3}                    @ saved lr; bx also restores the caller's state
  .Llong_cpy16:
          bx      r3                      @ shared by the return and the memcpy32 call
          .align  2
  .Lpool_cpy16:
          .word   memcpy32
  @ === procedure memset16(dest: pointer; hw: u16; hwcount: u32); ================
  @ Halfword block fill (Thumb state).
  @ Hands the bulk of the work to memset32() when hwcount > 5; otherwise
  @ fills halfword by halfword.
  @   dest    (r0)  destination address; must be halfword aligned
  @   hw      (r1)  the 16-bit fill value itself (not an address)
  @   hwcount (r2)  number of halfwords to write
  @ note: r0 is only advanced by the alignment step and by memset32; the
  @       halfword tail uses indexed stores and leaves it where it is, so do
  @       not rely on its value after return.
  @ Register use:
  @   r0      dest
  @   r1      hw, temporarily hw | (hw << 16) for memset32
  @   r2      hwcount, then word count for memset32, then byte index in the tail
  @   r3      scratch, call target and return address
  @   r4      scratch, then bit 31 = "one halfword left over after the word fill"
          .text
          .align  2
          .code   16
          .global memset16
          .thumb_func
  memset16:
          push    {r4, lr}

  @ 5 halfwords or fewer: not worth the setup, use the plain halfword loop.
          cmp     r2, #5
          bls     .Ltail_set16

  @ dest on an odd halfword: store one halfword so it becomes word aligned.
          lsl     r3, r0, #31             @ shifts bit 1 of dest into carry
          bcc     .Lmain_set16
          strh    r1, [r0]
          add     r0, #2
          sub     r2, r2, #1

  @ Word fill through memset32.
  .Lmain_set16:
          lsl     r4, r1, #16             @ r4 = hw << 16
          lsr     r1, r4, #16             @ drop stray bits above bit 15 so they
                                          @ cannot leak into the fill word
          orr     r1, r4                  @ r1 = hw | (hw << 16)
          lsl     r4, r2, #31             @ remember the odd halfword (bit 0 -> bit 31)
          lsr     r2, r2, #1              @ halfwords -> words
          ldr     r3, .Lpool_set16
  @ memset32 is ARM code, so it must be entered with bx. A bare "bx r3"
  @ would leave lr pointing at OUR caller: memset32 would then return
  @ there directly, skipping the tail and leaving {r4, lr} on the stack.
  @ bl to the "bx r3" trampoline sets lr to the next instruction first.
          bl      .Llong_set16
  @ memset32 leaves r0 just past the words it wrote, which is exactly where
  @ the leftover halfword (if any) goes. It preserves r4; r1 is still doubled.
          lsr     r2, r4, #31             @ r2 = 0 or 1 leftover halfwords
          beq     .Lend_set16
          lsr     r1, #16                 @ back to the plain halfword

  @ Halfword loop: writes r2 halfwords, walking the byte index downwards.
  .Ltail_set16:
          sub     r2, #1
          bcc     .Lend_set16             @ r2 was 0: nothing to write
          lsl     r2, r2, #1              @ byte offset of the last halfword
  .Lres_set16:
          strh    r1, [r0, r2]
          sub     r2, r2, #2
          bcs     .Lres_set16             @ borrow once the index drops below 0

  .Lend_set16:
          pop     {r4}
          pop     {r3}                    @ saved lr; bx also restores the caller's state
  .Llong_set16:
          bx      r3                      @ shared by the return and the memset32 call
          .align  2
  .Lpool_set16:
          .word   memset32