b2ParticleAssembly.neon.s 19 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441
  1. @
  2. @ Copyright (c) 2014 Google, Inc.
  3. @
  4. @ This software is provided 'as-is', without any express or implied
  5. @ warranty. In no event will the authors be held liable for any damages
  6. @ arising from the use of this software.
  7. @ Permission is granted to anyone to use this software for any purpose,
  8. @ including commercial applications, and to alter it and redistribute it
  9. @ freely, subject to the following restrictions:
  10. @ 1. The origin of this software must not be misrepresented; you must not
  11. @ claim that you wrote the original software. If you use this software
  12. @ in a product, an acknowledgment in the product documentation would be
  13. @ appreciated but is not required.
  14. @ 2. Altered source versions must be plainly marked as such, and must not be
  15. @ misrepresented as being the original software.
  16. @ 3. This notice may not be removed or altered from any source distribution.
  17. @
  18. .text
  19. .syntax unified
  20. .balign 4
  21. .global CalculateTags_Simd
  22. .thumb_func
  23. CalculateTags_Simd:
  24. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  25. @
  26. @ int CalculateTags_Simd(const b2Vec2* positions,
  27. @ int count,
  28. @ const float& inverseDiameter,
  29. @ uint32* outTags)
  30. @
  31. @ r0: *positions
  32. @ r1: count
  33. @ r2: &inverseDiameter
  34. @ r3: *outTags
  35. @
  36. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  37. @ q0 == x
  38. @ q1 == y
  39. @ q2 ==
  40. @ q3 ==
  41. @ q4 ==
  42. @ q5 ==
  43. @ q6 ==
  44. @ q7 ==
  45. @ q8 ==
  46. @ q9 ==
  47. @ q10 ==
  48. @ q11 ==
  49. @ q12 == inverseDiameter
  50. @ q13 == xScale
  51. @ q14 == xOffset
  52. @ q15 == yOffset
  53. @ Load constants. Literals are > 32, so must load as integers first.
  54. vld1.f32 {d24[],d25[]}, [r2] @ q12 = inverseDiameter
  55. vmov.i32 q13, #0x100 @ q13 = xScale = 1 << 8
  56. vmov.i32 q14, #0x80000 @ q14 = xOffset = (1 << 8) * (1 << 11)
  57. @ = (1 << 19) = 524288
  58. vmov.i32 q15, #0x800 @ q15 = xScale = 1 << 11 = 2048
  59. vcvt.f32.u32 q13, q13 @ convert to float
  60. vcvt.f32.u32 q14, q14
  61. vcvt.f32.u32 q15, q15
  62. @ Calculate tags four at a time, from positions.
  63. .L_CalculateTags_MainLoop:
  64. @ We consume 32-bytes per iteration, so prefetch 4 iterations ahead.
  65. @ TODO: experiment with different prefetch lengths on different
  66. @ architectures.
  67. pld [r0, #128] @ Prefetch position data
  68. @ {q0, q1} == xPosition and yPosition
  69. @ Four values in each. q0 = (x0, x1, x2, x3)
  70. vld2.f32 {q0, q1}, [r0]! @ Read in positions; increment ptr
  71. @ Calculate tags four at a time.
  72. vmul.f32 q0, q0, q12 @ q0 = x = xPosition * inverseDiameter
  73. vmul.f32 q1, q1, q12 @ q1 = y = yPosition * inverseDiameter
  74. vmul.f32 q0, q0, q13 @ q0 = x * xScale
  75. vadd.f32 q1, q1, q15 @ q1 = y + yOffset
  76. vadd.f32 q0, q0, q14 @ q0 = x * xScale + xOffset
  77. vcvt.u32.f32 q1, q1 @ q1 = (uint32)(y + yOffset)
  78. vcvt.u32.f32 q0, q0 @ q0 = (uint32)(x * xScale + xOffset)
  79. vsli.u32 q0, q1, #20 @ q0 = tag
  80. @ = ((uint32)(y + yOffset) <<yShift)
  81. @ + (uint32)(xScale * x + xOffset)
  82. @ Decrement loop counter; sets the 'gt' flag used in 'bgt' below.
  83. @ Pipelining is best if there are instructions between the 'subs' and
  84. @ 'bgt' instructions, since it takes a few cycles for the result of
  85. @ 'subs' to propegate to the flags register.
  86. subs r1, r1, #4
  87. @ Write out, ignoring index.
  88. pld [r3, #64] @ Prefetch output tag array
  89. vst1.f32 {q0}, [r3]! @ write out tags; increment ptr
  90. bgt .L_CalculateTags_MainLoop
  91. .L_CalculateTags_Return:
  92. bx lr
  93. .balign 4
  94. .thumb_func
  95. @
  96. @ Once four contacts have been found, calculate their weights and
  97. @ normals (using SIMD, so all at once).
  98. @
  99. @ Also, grab their flags from the flags buffer, and OR them together.
  100. @ This flag grabbing is slow because we access the flag buffer in a
  101. @ random order. We use prefetch instructions 'pld' to minimize the
  102. @ cost of cache misses.
  103. @
  104. FindContacts_PostProcess:
  105. @ Preload first four flag addresses into cache.
  106. @ Note: hardware only has four preload slots.
  107. ldrh r9, [r4]
  108. ldrh r10, [r4, #2]
  109. ldrh r11, [r4, #16]
  110. ldrh r12, [r4, #18]
  111. pld [r7, r9, lsl #2]
  112. pld [r7, r10, lsl #2]
  113. pld [r7, r11, lsl #2]
  114. pld [r7, r12, lsl #2]
  115. @ q0 = packedIndices -- indices output to b2ParticleContact
  116. @ q1 = distBtParticlesSq -- will be used to calculate weight
  117. @ q2 = diffX -- will be used to calculate normal
  118. @ q3 = diffY -- will be used to calculate normal
  119. add r8, r4, #32
  120. vld4.f32 {d0, d2, d4, d6}, [r4]
  121. vld4.f32 {d1, d3, d5, d7}, [r8]
  122. @ Use distSq to estimate 1 / dist.
  123. vrsqrte.f32 q8, q1 @ q8 = 1 / dist -- (rough estimate)
  124. vmul.f32 q9, q8, q1 @ q9 = 1 / dist * distSq -- (appr 'dist')
  125. vrsqrts.f32 q9, q9, q8 @ q9 = (3 - 1/dist * dist) / 2 -- (error)
  126. vmul.f32 q8, q8, q9 @ q8 = (error) / dist -- (estimate)
  127. vcgt.f32 q9, q8, #0 @ q8 = 1 / dist > 0 (true if not NaN)
  128. vand q8, q8, q9 @ q8 = 1 / dist if valid, or 0 if NaN
  129. @ Since we expand the output to include 'weight', we need to preserve
  130. @ subsequent contacts. Note that there may be up to 7 contacts waiting
  131. @ to be post-processed, since we output contacts in up-to groups of 4.
  132. add r8, r4, #64
  133. vldmia r8, {q9, q10, q11}
  134. @ Load first four flags, 'or' them in pairs, then write to destination.
  135. ldr r9, [r7, r9, lsl #2]
  136. ldr r10, [r7, r10, lsl #2]
  137. ldr r11, [r7, r11, lsl #2]
  138. ldr r12, [r7, r12, lsl #2]
  139. orr r9, r9, r10
  140. orr r11, r11, r12
  141. str r9, [r4, #16]
  142. str r11, [r4, #36]
  143. @ Preload the next four flags into cache.
  144. ldrh r9, [r4, #32]
  145. ldrh r10, [r4, #34]
  146. ldrh r11, [r4, #48]
  147. ldrh r12, [r4, #50]
  148. pld [r7, r9, lsl #2]
  149. pld [r7, r10, lsl #2]
  150. pld [r7, r11, lsl #2]
  151. pld [r7, r12, lsl #2]
  152. @ Calculate normal and weight.
  153. vmul.f32 q1, q1, q8 @ q1 = distSq / dist = dist
  154. vmul.f32 q2, q2, q8 @ q2 = normX = diffX / dist
  155. vmul.f32 q1, q1, q14 @ q1 = dist / diameter
  156. vmul.f32 q3, q3, q8 @ q3 = normY = diffY / dist
  157. vsub.f32 q1, q12, q1 @ q1 = weight = 1 - dist / diameter
  158. @ Store again, making room for 'weight' member variable this time.
  159. @ TODO OPT: Interleave with 'or' instructions below.
  160. mov r8, #20 @ r8 = 20 = sizeof(b2ParticleContact)
  161. vst4.f32 {d0[0], d2[0], d4[0], d6[0]}, [r4], r8
  162. vst4.f32 {d0[1], d2[1], d4[1], d6[1]}, [r4], r8
  163. vst4.f32 {d1[0], d3[0], d5[0], d7[0]}, [r4], r8
  164. vst4.f32 {d1[1], d3[1], d5[1], d7[1]}, [r4], r8
  165. mov r8, #12 @ r8 = 12 = sizeof(FindContactInput)
  166. @ Load next four flags, 'or' them in pairs, then write to destination.
  167. ldr r9, [r7, r9, lsl #2]
  168. ldr r10, [r7, r10, lsl #2]
  169. ldr r11, [r7, r11, lsl #2]
  170. ldr r12, [r7, r12, lsl #2]
  171. orr r9, r9, r10
  172. orr r11, r11, r12
  173. str r9, [r4, #-24]
  174. str r11, [r4, #-4]
  175. @ Update output pointers. Since we output 4 contacts, and added 4 bytes
  176. @ for 'weight' on each contact, the output pointer must be advanced by
  177. @ 16 bytes.
  178. add r3, r3, #16
  179. add r5, r5, #4 @ numContacts += 4
  180. @ Restore subsequent contacts. That is, contacts that have yet to be
  181. @ post-processed.
  182. vstmia r4, {q9, q10, q11}
  183. bx lr
  184. @ When used with the 'vtbl' instruction, grabs the first byte of every
  185. @ word, and places it in the first word. Fills the second word with 0s.
  186. @ For example, (0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF)
  187. @ ==> (0xFF0000FF, 0x00000000)
  188. CONST_IS_CLOSE_TABLE_INDICES:
  189. .byte 0
  190. .byte 4
  191. .byte 8
  192. .byte 12
  193. .byte 0xFF
  194. .byte 0xFF
  195. .byte 0xFF
  196. .byte 0xFF
  197. .balign 4
  198. .global FindContactsFromChecks_Simd
  199. .thumb_func
  200. FindContactsFromChecks_Simd:
  201. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  202. @
  203. @ void FindContactsFromChecks_Simd(
  204. @ const FindContactInput* reordered,
  205. @ const FindContactCheck* checks,
  206. @ int numChecks,
  207. @ const float& particleDiameterSq,
  208. @ const float& particleDiameterInv,
  209. @ const uint32* flags,
  210. @ b2GrowableBuffer<b2ParticleContact>& contacts)
  211. @
  212. @ Parameters
  213. @ r0: *reordered
  214. @ r1: *checks
  215. @ r2: numChecks
  216. @ r3: particleDiameterSq
  217. @ [sp]: particleDiameterInv
  218. @ [sp+4]: *flags
  219. @ [sp+8]: contacts
  220. @
  221. @ Persistent Variables
  222. @ r0: *reordered (constant)
  223. @ r1: *checks (advance once per iteration)
  224. @ r2: numChecks (decrement once per iteration)
  225. @ r3: *out <-- next free entry of outContacts array
  226. @ r4: *postProcess <-- entry on-deck to be post-processed
  227. @ r5: numContacts
  228. @ r6: maxSafeContacts
  229. @ r7: *flags (constant)
  230. @ r8: 20 = sizeof(b2ParticleContact), or
  231. @ 12 = sizeof(FindContactInput) (constants)
  232. @
  233. @ Scratch Variables
  234. @ r9:
  235. @ r10: address of current particle position
  236. @ r11: address of comparator particle positions
  237. @ r12: isClose (compacted)
  238. @
  239. @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  240. @
  241. @ Scratch
  242. @ q0 == index ------> packedIndices
  243. @ q1 == positionX ---_ distBtParticlesSq
  244. @ q2 == positionY --_ --> normX
  245. @ q3 == ---> normY
  246. @
  247. @ Unused (note: these are callee-saved)
  248. @ q4 ==
  249. @ q5 ==
  250. @ q6 ==
  251. @ q7 ==
  252. @
  253. @ Scratch
  254. @ q8 == comparatorIndices
  255. @ q9 == comparatorPositionX
  256. @ q10 == comparatorPositionY
  257. @ q11 ==
  258. @
  259. @ Constants
  260. @ q12 == 1.0f
  261. @ q13 == isClose table indices
  262. @ q14 == 1 / particleDiameter
  263. @ q15 == particleDiameterSq
  264. push {r4-r11, lr}
  265. @ Load constants from registers and stack.
  266. vld1.f32 {d30[],d31[]}, [r3] @ q15 = particleDiameterSq
  267. ldr r12, [sp, #36] @ r12 = particleDiameterInv
  268. vld1.f32 {d28[],d29[]}, [r12] @ q14 = particleDiameterInv
  269. ldr r9, [sp, #44] @ r9 = contacts
  270. ldr r7, [sp, #40] @ r7 = flags
  271. ldr r3, [r9, #0] @ r3 = out = contacts.data
  272. ldr r6, [r9, #8] @ r6 = contacts.capacity
  273. mov r4, r3 @ r4 = postProcess = outContacts
  274. mov r5, #0 @ r5 = numContacts
  275. sub r6, r6, #8 @ r6 = maxSafeContacts = capacity - 8
  276. mov r8, #12 @ r8 = 12 = sizeof(FindContactInput)
  277. @ Perform zero iterations if 'numChecks' is empty.
  278. @ Must happen after initializing r5 = numContacts = 0.
  279. cmp r2, #0
  280. ble .L_FindContacts_Return
  281. @ Load and calculate remaining constants.
  282. vmov.f32 q12, #1.0 @ q12 = 1.0f splatted
  283. adr r12, CONST_IS_CLOSE_TABLE_INDICES
  284. vld1.8 {d26}, [r12] @ q13 = *CONST_IS_CLOSE_TABLE_INDICES
  285. .L_FindContacts_MainLoop:
  286. pld [r1, #8] @ prefetch two loops ahead
  287. @ r10 <== Address of 'position', the current particle position
  288. @ r11 <== Address of '&comparator[0]', the first particle position we
  289. @ compare against.
  290. ldr r10, [r1], #4 @ r10 = positionIndex|comparatorIndex
  291. smlatb r11, r10, r8, r0 @ r11 = address of first comparator
  292. smlabb r10, r10, r8, r0 @ r10 = address of current input
  293. add r12, r11, #24 @ r12 = address of third comparator
  294. @ Exit if not enough space in output array (part 1)
  295. cmp r5, r6
  296. @ {q0, q1, q2} == index, positionX, positionY, splatted across vector
  297. vld3.f32 {d0[], d2[], d4[]}, [r10]
  298. vld3.f32 {d1[], d3[], d5[]}, [r10]
  299. @ {q8, q9, q10} == comparatorIndices, comparatorPosX and comparatorPosY
  300. @ positions we compare against (positionX, positionY)
  301. vld3.f32 {d16, d18, d20}, [r11]
  302. vld3.f32 {d17, d19, d21}, [r12]
  303. @ q0 = packedIndices -- indices output to b2ParticleContact
  304. @ q1 = distBtParticlesSq -- will be used to calculate weight
  305. @ q2 = diffX -- will be used to calculate normal
  306. @ q3 = diffY -- will be used to calculate normal
  307. vsub.f32 q3, q10, q2 @ q3 = diffY = comparatorPosY - positionY
  308. vsub.f32 q2, q9, q1 @ q2 = diffX = comparatorPosX - positionX
  309. vsli.32 q0, q8, #16 @ q0 = comparatorIndex[i] << 16 | index
  310. vmul.f32 q1, q3, q3 @ q1 = diffX * diffX
  311. vmla.f32 q1, q2, q2 @ q1 = diffX * diffX + diffY * diffY
  312. @ Determine if each particle is close enough to output.
  313. @ Pack the isClose bitmap (four T or F) into a 32-bit bitmap.
  314. @ Move 32-bit bitmap to CPU register, for conditional operations.
  315. @ Note: NEON to CPU register moves are slow (20 cyclds) on some
  316. @ implementations of NEON.
  317. @ isClose = distBtParticlesSq < particleDiameterSq
  318. vclt.f32 q8, q1, q15 @ q8 == isClose
  319. vtbl.8 d16, {d16,d17}, d26 @ q8[0] == isClose(packed)
  320. vmov.32 r12, d16[0] @ q8[0] ==> r12.
  321. @ If not enough space in output array, grow it.
  322. @ This is a heavy operation, but should happen rarely.
  323. ble .L_FindContacts_Output
  324. ldr r9, [sp, #44] @ r9 = contacts
  325. str r5, [r9, #4] @ contacts.count = numContacts
  326. ldr r10, [r9, #0] @ r10 = contacts.data
  327. push {r0-r3, r9, r10, r12}
  328. vpush {q0, q1, q2, q3}
  329. vpush {q12, q13, q14, q15}
  330. mov r0, r9 @ r0 = contacts
  331. bl GrowParticleContactBuffer
  332. vpop {q12, q13, q14, q15}
  333. vpop {q0, q1, q2, q3}
  334. pop {r0-r3, r9, r10, r12}
  335. @ The output array was reallocated, so update 'out', 'postProcess' and
  336. @ 'maxSafeContacts' pointers.
  337. ldr r6, [r9, #8] @ r6 = contacts.capacity
  338. ldr r9, [r9, #0] @ r9 = contacts.data
  339. sub r9, r9, r10 @ r9 = data buffer offset
  340. sub r6, r6, #8 @ r6 = maxSafeContacts
  341. add r3, r3, r9 @ r3 += data buffer offset
  342. add r4, r4, r9 @ r4 += data buffer offset
  343. .L_FindContacts_Output:
  344. @ Store results to memory, but only results that are close
  345. tst r12, 0xFF
  346. it ne
  347. vst4ne.32 {d0[0],d2[0],d4[0],d6[0]}, [r3]! @ Store 1st contact
  348. tst r12, 0xFF00
  349. it ne
  350. vst4ne.32 {d0[1],d2[1],d4[1],d6[1]}, [r3]! @ Store 2nd contact
  351. tst r12, 0xFF0000
  352. it ne
  353. vst4ne.32 {d1[0],d3[0],d5[0],d7[0]}, [r3]! @ Store 3rd contact
  354. tst r12, 0xFF000000
  355. it ne
  356. vst4ne.32 {d1[1],d3[1],d5[1],d7[1]}, [r3]! @ Store 4th contact
  357. @ post-process the last four elements that have been output
  358. @ r12 = 5th element to not be post-processed yet
  359. add r12, r4, #64 @ r12 = nextPostProcess
  360. cmp r3, r12
  361. it ge
  362. blge FindContacts_PostProcess
  363. @ decrement loop counter; sets the 'gt' flag used in 'bgt' below
  364. subs r2, r2, #1
  365. bgt .L_FindContacts_MainLoop
  366. .L_FindContacts_PostProcessRemainingItems:
  367. @ If at least one output item needs post-processing, do it.
  368. subs r12, r3, r4
  369. ble .L_FindContacts_Return
  370. @ r12/16 = num extra contacts to process
  371. add r5, r5, r12, lsr #4 @ numContacts += num extra
  372. push {r5} @ Save numContacts, since stomped
  373. @ Ensure indices past end of array are zeroed out.
  374. @ We process 4 contacts in FindContacts_PostProcess, even if we only
  375. @ have one left to process.
  376. mov r12, #0
  377. str r12, [r3]
  378. str r12, [r3, #16]
  379. str r12, [r3, #32]
  380. bl FindContacts_PostProcess
  381. pop {r5} @ Restore numContacts
  382. .L_FindContacts_Return:
  383. @ Set the final number of contacts in the output buffer.
  384. ldr r9, [sp, #44] @ r9 = contacts
  385. str r5, [r9, #4] @ contacts.count = numContacts
  386. @ Return by popping the original lr into pc.
  387. pop {r4-r11, pc}