2
0

b3Solver.cpp 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128
  1. /*
  2. Copyright (c) 2012 Advanced Micro Devices, Inc.
  3. This software is provided 'as-is', without any express or implied warranty.
  4. In no event will the authors be held liable for any damages arising from the use of this software.
  5. Permission is granted to anyone to use this software for any purpose,
  6. including commercial applications, and to alter it and redistribute it freely,
  7. subject to the following restrictions:
  8. 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
  9. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
  10. 3. This notice may not be removed or altered from any source distribution.
  11. */
  12. //Originally written by Takahiro Harada
  13. #include "b3Solver.h"
  // Source-tree locations of the OpenCL kernel files. In this file they are handed to
  // b3OpenCLUtils::compileCLProgramFromString next to the embedded kernel strings.
  #define B3_SOLVER_SETUP_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup.cl"
  #define B3_SOLVER_SETUP2_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solverSetup2.cl"
  #define B3_SOLVER_CONTACT_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveContact.cl"
  #define B3_SOLVER_FRICTION_KERNEL_PATH "src/Bullet3OpenCL/RigidBody/kernels/solveFriction.cl"
  #define B3_BATCHING_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernels.cl"
  #define B3_BATCHING_NEW_PATH "src/Bullet3OpenCL/RigidBody/kernels/batchingKernelsNew.cl"

  /// Experimental toggle: the "new" batching kernel is a rewrite that uses just a single
  /// thread of the warp.
  bool useNewBatchingKernel = true;

  /// NOTE(review): not referenced in the visible part of this file; by its name it presumably
  /// selects a CPU path for contact-to-constraint conversion -- confirm against its users.
  bool gConvertConstraintOnCpu = false;
  23. #include "Bullet3Dynamics/shared/b3ConvertConstraint4.h"
  24. #include "kernels/solverSetup.h"
  25. #include "kernels/solverSetup2.h"
  26. #include "kernels/solveContact.h"
  27. #include "kernels/solveFriction.h"
  28. #include "kernels/batchingKernels.h"
  29. #include "kernels/batchingKernelsNew.h"
  30. #include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
  31. #include "Bullet3Common/b3Vector3.h"
  /// Plain record of debug values: sixteen integer slots followed by four float slots.
  /// In this file it is only touched inside DEBUG_ME sections, where one record per work
  /// item is read back and m_valInt2 / m_valInt3 are printed when positive.
  struct SolverDebugInfo
  {
      // Integer slots 0..15 (declaration order defines the memory layout).
      int m_valInt0, m_valInt1, m_valInt2, m_valInt3;
      int m_valInt4, m_valInt5, m_valInt6, m_valInt7;
      int m_valInt8, m_valInt9, m_valInt10, m_valInt11;
      int m_valInt12, m_valInt13, m_valInt14, m_valInt15;

      // Float slots 0..3.
      float m_val0, m_val1, m_val2, m_val3;
  };
  55. class SolverDeviceInl
  56. {
  57. public:
  58. struct ParallelSolveData
  59. {
  60. b3OpenCLArray<unsigned int>* m_numConstraints;
  61. b3OpenCLArray<unsigned int>* m_offsets;
  62. };
  63. };
  64. b3Solver::b3Solver(cl_context ctx, cl_device_id device, cl_command_queue queue, int pairCapacity)
  65. : m_context(ctx),
  66. m_device(device),
  67. m_queue(queue),
  68. m_batchSizes(ctx, queue),
  69. m_nIterations(4)
  70. {
  71. m_sort32 = new b3RadixSort32CL(ctx, device, queue);
  72. m_scan = new b3PrefixScanCL(ctx, device, queue, B3_SOLVER_N_CELLS);
  73. m_search = new b3BoundSearchCL(ctx, device, queue, B3_SOLVER_N_CELLS);
  74. const int sortSize = B3NEXTMULTIPLEOF(pairCapacity, 512);
  75. m_sortDataBuffer = new b3OpenCLArray<b3SortData>(ctx, queue, sortSize);
  76. m_contactBuffer2 = new b3OpenCLArray<b3Contact4>(ctx, queue);
  77. m_numConstraints = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS);
  78. m_numConstraints->resize(B3_SOLVER_N_CELLS);
  79. m_offsets = new b3OpenCLArray<unsigned int>(ctx, queue, B3_SOLVER_N_CELLS);
  80. m_offsets->resize(B3_SOLVER_N_CELLS);
  81. const char* additionalMacros = "";
  82. // const char* srcFileNameForCaching="";
  83. cl_int pErrNum;
  84. const char* batchKernelSource = batchingKernelsCL;
  85. const char* batchKernelNewSource = batchingKernelsNewCL;
  86. const char* solverSetupSource = solverSetupCL;
  87. const char* solverSetup2Source = solverSetup2CL;
  88. const char* solveContactSource = solveContactCL;
  89. const char* solveFrictionSource = solveFrictionCL;
  90. {
  91. cl_program solveContactProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveContactSource, &pErrNum, additionalMacros, B3_SOLVER_CONTACT_KERNEL_PATH);
  92. b3Assert(solveContactProg);
  93. cl_program solveFrictionProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solveFrictionSource, &pErrNum, additionalMacros, B3_SOLVER_FRICTION_KERNEL_PATH);
  94. b3Assert(solveFrictionProg);
  95. cl_program solverSetup2Prog = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetup2Source, &pErrNum, additionalMacros, B3_SOLVER_SETUP2_KERNEL_PATH);
  96. b3Assert(solverSetup2Prog);
  97. cl_program solverSetupProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, solverSetupSource, &pErrNum, additionalMacros, B3_SOLVER_SETUP_KERNEL_PATH);
  98. b3Assert(solverSetupProg);
  99. m_solveFrictionKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveFrictionSource, "BatchSolveKernelFriction", &pErrNum, solveFrictionProg, additionalMacros);
  100. b3Assert(m_solveFrictionKernel);
  101. m_solveContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solveContactSource, "BatchSolveKernelContact", &pErrNum, solveContactProg, additionalMacros);
  102. b3Assert(m_solveContactKernel);
  103. m_contactToConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetupSource, "ContactToConstraintKernel", &pErrNum, solverSetupProg, additionalMacros);
  104. b3Assert(m_contactToConstraintKernel);
  105. m_setSortDataKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "SetSortDataKernel", &pErrNum, solverSetup2Prog, additionalMacros);
  106. b3Assert(m_setSortDataKernel);
  107. m_reorderContactKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "ReorderContactKernel", &pErrNum, solverSetup2Prog, additionalMacros);
  108. b3Assert(m_reorderContactKernel);
  109. m_copyConstraintKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, solverSetup2Source, "CopyConstraintKernel", &pErrNum, solverSetup2Prog, additionalMacros);
  110. b3Assert(m_copyConstraintKernel);
  111. }
  112. {
  113. cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelSource, &pErrNum, additionalMacros, B3_BATCHING_PATH);
  114. //cl_program batchingProg = b3OpenCLUtils::compileCLProgramFromString( ctx, device, 0, &pErrNum,additionalMacros, B3_BATCHING_PATH,true);
  115. b3Assert(batchingProg);
  116. m_batchingKernel = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelSource, "CreateBatches", &pErrNum, batchingProg, additionalMacros);
  117. b3Assert(m_batchingKernel);
  118. }
  119. {
  120. cl_program batchingNewProg = b3OpenCLUtils::compileCLProgramFromString(ctx, device, batchKernelNewSource, &pErrNum, additionalMacros, B3_BATCHING_NEW_PATH);
  121. b3Assert(batchingNewProg);
  122. m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString(ctx, device, batchKernelNewSource, "CreateBatchesNew", &pErrNum, batchingNewProg, additionalMacros);
  123. //m_batchingKernelNew = b3OpenCLUtils::compileCLKernelFromString( ctx, device, batchKernelNewSource, "CreateBatchesBruteForce", &pErrNum, batchingNewProg,additionalMacros );
  124. b3Assert(m_batchingKernelNew);
  125. }
  126. }
  127. b3Solver::~b3Solver()
  128. {
  129. delete m_offsets;
  130. delete m_numConstraints;
  131. delete m_sortDataBuffer;
  132. delete m_contactBuffer2;
  133. delete m_sort32;
  134. delete m_scan;
  135. delete m_search;
  136. clReleaseKernel(m_batchingKernel);
  137. clReleaseKernel(m_batchingKernelNew);
  138. clReleaseKernel(m_solveContactKernel);
  139. clReleaseKernel(m_solveFrictionKernel);
  140. clReleaseKernel(m_contactToConstraintKernel);
  141. clReleaseKernel(m_setSortDataKernel);
  142. clReleaseKernel(m_reorderContactKernel);
  143. clReleaseKernel(m_copyConstraintKernel);
  144. }
  145. template <bool JACOBI>
  146. static __inline void solveContact(b3GpuConstraint4& cs,
  147. const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
  148. const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
  149. float maxRambdaDt[4], float minRambdaDt[4])
  150. {
  151. b3Vector3 dLinVelA;
  152. dLinVelA.setZero();
  153. b3Vector3 dAngVelA;
  154. dAngVelA.setZero();
  155. b3Vector3 dLinVelB;
  156. dLinVelB.setZero();
  157. b3Vector3 dAngVelB;
  158. dAngVelB.setZero();
  159. for (int ic = 0; ic < 4; ic++)
  160. {
  161. // dont necessary because this makes change to 0
  162. if (cs.m_jacCoeffInv[ic] == 0.f) continue;
  163. {
  164. b3Vector3 angular0, angular1, linear;
  165. b3Vector3 r0 = cs.m_worldPos[ic] - (b3Vector3&)posA;
  166. b3Vector3 r1 = cs.m_worldPos[ic] - (b3Vector3&)posB;
  167. setLinearAndAngular((const b3Vector3&)cs.m_linear, (const b3Vector3&)r0, (const b3Vector3&)r1, &linear, &angular0, &angular1);
  168. float rambdaDt = calcRelVel((const b3Vector3&)cs.m_linear, (const b3Vector3&)-cs.m_linear, angular0, angular1,
  169. linVelA, angVelA, linVelB, angVelB) +
  170. cs.m_b[ic];
  171. rambdaDt *= cs.m_jacCoeffInv[ic];
  172. {
  173. float prevSum = cs.m_appliedRambdaDt[ic];
  174. float updated = prevSum;
  175. updated += rambdaDt;
  176. updated = b3Max(updated, minRambdaDt[ic]);
  177. updated = b3Min(updated, maxRambdaDt[ic]);
  178. rambdaDt = updated - prevSum;
  179. cs.m_appliedRambdaDt[ic] = updated;
  180. }
  181. b3Vector3 linImp0 = invMassA * linear * rambdaDt;
  182. b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
  183. b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
  184. b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
  185. #ifdef _WIN32
  186. b3Assert(_finite(linImp0.getX()));
  187. b3Assert(_finite(linImp1.getX()));
  188. #endif
  189. if (JACOBI)
  190. {
  191. dLinVelA += linImp0;
  192. dAngVelA += angImp0;
  193. dLinVelB += linImp1;
  194. dAngVelB += angImp1;
  195. }
  196. else
  197. {
  198. linVelA += linImp0;
  199. angVelA += angImp0;
  200. linVelB += linImp1;
  201. angVelB += angImp1;
  202. }
  203. }
  204. }
  205. if (JACOBI)
  206. {
  207. linVelA += dLinVelA;
  208. angVelA += dAngVelA;
  209. linVelB += dLinVelB;
  210. angVelB += dAngVelB;
  211. }
  212. }
  213. static __inline void solveFriction(b3GpuConstraint4& cs,
  214. const b3Vector3& posA, b3Vector3& linVelA, b3Vector3& angVelA, float invMassA, const b3Matrix3x3& invInertiaA,
  215. const b3Vector3& posB, b3Vector3& linVelB, b3Vector3& angVelB, float invMassB, const b3Matrix3x3& invInertiaB,
  216. float maxRambdaDt[4], float minRambdaDt[4])
  217. {
  218. if (cs.m_fJacCoeffInv[0] == 0 && cs.m_fJacCoeffInv[0] == 0) return;
  219. const b3Vector3& center = (const b3Vector3&)cs.m_center;
  220. b3Vector3 n = -(const b3Vector3&)cs.m_linear;
  221. b3Vector3 tangent[2];
  222. #if 1
  223. b3PlaneSpace1(n, tangent[0], tangent[1]);
  224. #else
  225. b3Vector3 r = cs.m_worldPos[0] - center;
  226. tangent[0] = cross3(n, r);
  227. tangent[1] = cross3(tangent[0], n);
  228. tangent[0] = normalize3(tangent[0]);
  229. tangent[1] = normalize3(tangent[1]);
  230. #endif
  231. b3Vector3 angular0, angular1, linear;
  232. b3Vector3 r0 = center - posA;
  233. b3Vector3 r1 = center - posB;
  234. for (int i = 0; i < 2; i++)
  235. {
  236. setLinearAndAngular(tangent[i], r0, r1, &linear, &angular0, &angular1);
  237. float rambdaDt = calcRelVel(linear, -linear, angular0, angular1,
  238. linVelA, angVelA, linVelB, angVelB);
  239. rambdaDt *= cs.m_fJacCoeffInv[i];
  240. {
  241. float prevSum = cs.m_fAppliedRambdaDt[i];
  242. float updated = prevSum;
  243. updated += rambdaDt;
  244. updated = b3Max(updated, minRambdaDt[i]);
  245. updated = b3Min(updated, maxRambdaDt[i]);
  246. rambdaDt = updated - prevSum;
  247. cs.m_fAppliedRambdaDt[i] = updated;
  248. }
  249. b3Vector3 linImp0 = invMassA * linear * rambdaDt;
  250. b3Vector3 linImp1 = invMassB * (-linear) * rambdaDt;
  251. b3Vector3 angImp0 = (invInertiaA * angular0) * rambdaDt;
  252. b3Vector3 angImp1 = (invInertiaB * angular1) * rambdaDt;
  253. #ifdef _WIN32
  254. b3Assert(_finite(linImp0.getX()));
  255. b3Assert(_finite(linImp1.getX()));
  256. #endif
  257. linVelA += linImp0;
  258. angVelA += angImp0;
  259. linVelB += linImp1;
  260. angVelB += angImp1;
  261. }
  262. { // angular damping for point constraint
  263. b3Vector3 ab = (posB - posA).normalized();
  264. b3Vector3 ac = (center - posA).normalized();
  265. if (b3Dot(ab, ac) > 0.95f || (invMassA == 0.f || invMassB == 0.f))
  266. {
  267. float angNA = b3Dot(n, angVelA);
  268. float angNB = b3Dot(n, angVelB);
  269. angVelA -= (angNA * 0.1f) * n;
  270. angVelB -= (angNB * 0.1f) * n;
  271. }
  272. }
  273. }
  274. /*
  275. b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
  276. b3AlignedObjectArray<b3InertiaData>& m_shapes;
  277. b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
  278. b3AlignedObjectArray<int>* m_batchSizes;
  279. int m_cellIndex;
  280. int m_curWgidx;
  281. int m_start;
  282. int m_nConstraints;
  283. bool m_solveFriction;
  284. int m_maxNumBatches;
  285. */
  286. struct SolveTask // : public ThreadPool::Task
  287. {
  288. SolveTask(b3AlignedObjectArray<b3RigidBodyData>& bodies, b3AlignedObjectArray<b3InertiaData>& shapes, b3AlignedObjectArray<b3GpuConstraint4>& constraints,
  289. int start, int nConstraints, int maxNumBatches, b3AlignedObjectArray<int>* wgUsedBodies, int curWgidx, b3AlignedObjectArray<int>* batchSizes, int cellIndex)
  290. : m_bodies(bodies), m_shapes(shapes), m_constraints(constraints), m_batchSizes(batchSizes), m_cellIndex(cellIndex), m_curWgidx(curWgidx), m_start(start), m_nConstraints(nConstraints), m_solveFriction(true), m_maxNumBatches(maxNumBatches)
  291. {
  292. }
  293. unsigned short int getType() { return 0; }
  294. void run(int tIdx)
  295. {
  296. int offset = 0;
  297. for (int ii = 0; ii < B3_MAX_NUM_BATCHES; ii++)
  298. {
  299. int numInBatch = m_batchSizes->at(m_cellIndex * B3_MAX_NUM_BATCHES + ii);
  300. if (!numInBatch)
  301. break;
  302. for (int jj = 0; jj < numInBatch; jj++)
  303. {
  304. int i = m_start + offset + jj;
  305. int batchId = m_constraints[i].m_batchIdx;
  306. b3Assert(batchId == ii);
  307. float frictionCoeff = m_constraints[i].getFrictionCoeff();
  308. int aIdx = (int)m_constraints[i].m_bodyA;
  309. int bIdx = (int)m_constraints[i].m_bodyB;
  310. // int localBatch = m_constraints[i].m_batchIdx;
  311. b3RigidBodyData& bodyA = m_bodies[aIdx];
  312. b3RigidBodyData& bodyB = m_bodies[bIdx];
  313. if (!m_solveFriction)
  314. {
  315. float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
  316. float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
  317. solveContact<false>(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
  318. (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
  319. maxRambdaDt, minRambdaDt);
  320. }
  321. else
  322. {
  323. float maxRambdaDt[4] = {FLT_MAX, FLT_MAX, FLT_MAX, FLT_MAX};
  324. float minRambdaDt[4] = {0.f, 0.f, 0.f, 0.f};
  325. float sum = 0;
  326. for (int j = 0; j < 4; j++)
  327. {
  328. sum += m_constraints[i].m_appliedRambdaDt[j];
  329. }
  330. frictionCoeff = 0.7f;
  331. for (int j = 0; j < 4; j++)
  332. {
  333. maxRambdaDt[j] = frictionCoeff * sum;
  334. minRambdaDt[j] = -maxRambdaDt[j];
  335. }
  336. solveFriction(m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3&)m_shapes[aIdx].m_invInertiaWorld,
  337. (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3&)m_shapes[bIdx].m_invInertiaWorld,
  338. maxRambdaDt, minRambdaDt);
  339. }
  340. }
  341. offset += numInBatch;
  342. }
  343. /* for (int bb=0;bb<m_maxNumBatches;bb++)
  344. {
  345. //for(int ic=m_nConstraints-1; ic>=0; ic--)
  346. for(int ic=0; ic<m_nConstraints; ic++)
  347. {
  348. int i = m_start + ic;
  349. if (m_constraints[i].m_batchIdx != bb)
  350. continue;
  351. float frictionCoeff = m_constraints[i].getFrictionCoeff();
  352. int aIdx = (int)m_constraints[i].m_bodyA;
  353. int bIdx = (int)m_constraints[i].m_bodyB;
  354. int localBatch = m_constraints[i].m_batchIdx;
  355. b3RigidBodyData& bodyA = m_bodies[aIdx];
  356. b3RigidBodyData& bodyB = m_bodies[bIdx];
  357. if( !m_solveFriction )
  358. {
  359. float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
  360. float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
  361. solveContact<false>( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass, (const b3Matrix3x3 &)m_shapes[aIdx].m_invInertiaWorld,
  362. (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass, (const b3Matrix3x3 &)m_shapes[bIdx].m_invInertiaWorld,
  363. maxRambdaDt, minRambdaDt );
  364. }
  365. else
  366. {
  367. float maxRambdaDt[4] = {FLT_MAX,FLT_MAX,FLT_MAX,FLT_MAX};
  368. float minRambdaDt[4] = {0.f,0.f,0.f,0.f};
  369. float sum = 0;
  370. for(int j=0; j<4; j++)
  371. {
  372. sum +=m_constraints[i].m_appliedRambdaDt[j];
  373. }
  374. frictionCoeff = 0.7f;
  375. for(int j=0; j<4; j++)
  376. {
  377. maxRambdaDt[j] = frictionCoeff*sum;
  378. minRambdaDt[j] = -maxRambdaDt[j];
  379. }
  380. solveFriction( m_constraints[i], (b3Vector3&)bodyA.m_pos, (b3Vector3&)bodyA.m_linVel, (b3Vector3&)bodyA.m_angVel, bodyA.m_invMass,(const b3Matrix3x3 &) m_shapes[aIdx].m_invInertiaWorld,
  381. (b3Vector3&)bodyB.m_pos, (b3Vector3&)bodyB.m_linVel, (b3Vector3&)bodyB.m_angVel, bodyB.m_invMass,(const b3Matrix3x3 &) m_shapes[bIdx].m_invInertiaWorld,
  382. maxRambdaDt, minRambdaDt );
  383. }
  384. }
  385. }
  386. */
  387. }
  388. b3AlignedObjectArray<b3RigidBodyData>& m_bodies;
  389. b3AlignedObjectArray<b3InertiaData>& m_shapes;
  390. b3AlignedObjectArray<b3GpuConstraint4>& m_constraints;
  391. b3AlignedObjectArray<int>* m_batchSizes;
  392. int m_cellIndex;
  393. int m_curWgidx;
  394. int m_start;
  395. int m_nConstraints;
  396. bool m_solveFriction;
  397. int m_maxNumBatches;
  398. };
  399. void b3Solver::solveContactConstraintHost(b3OpenCLArray<b3RigidBodyData>* bodyBuf, b3OpenCLArray<b3InertiaData>* shapeBuf,
  400. b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches, b3AlignedObjectArray<int>* batchSizes)
  401. {
  402. #if 0
  403. {
  404. int nSplitX = B3_SOLVER_N_SPLIT_X;
  405. int nSplitY = B3_SOLVER_N_SPLIT_Y;
  406. int numWorkgroups = B3_SOLVER_N_CELLS/B3_SOLVER_N_BATCHES;
  407. for (int z=0;z<4;z++)
  408. {
  409. for (int y=0;y<4;y++)
  410. {
  411. for (int x=0;x<4;x++)
  412. {
  413. int newIndex = (x+y*nSplitX+z*nSplitX*nSplitY);
  414. // printf("newIndex=%d\n",newIndex);
  415. int zIdx = newIndex/(nSplitX*nSplitY);
  416. int remain = newIndex%(nSplitX*nSplitY);
  417. int yIdx = remain/nSplitX;
  418. int xIdx = remain%nSplitX;
  419. // printf("newIndex=%d\n",newIndex);
  420. }
  421. }
  422. }
  423. //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
  424. for (int cellBatch=0;cellBatch<B3_SOLVER_N_BATCHES;cellBatch++)
  425. {
  426. for (int wgIdx=0;wgIdx<numWorkgroups;wgIdx++)
  427. {
  428. int zIdx = (wgIdx/((nSplitX*nSplitY)/4))*2+((cellBatch&4)>>2);
  429. int remain= (wgIdx%((nSplitX*nSplitY)/4));
  430. int yIdx = (remain/(nSplitX/2))*2 + ((cellBatch&2)>>1);
  431. int xIdx = (remain%(nSplitX/2))*2 + (cellBatch&1);
  432. /*int zIdx = newIndex/(nSplitX*nSplitY);
  433. int remain = newIndex%(nSplitX*nSplitY);
  434. int yIdx = remain/nSplitX;
  435. int xIdx = remain%nSplitX;
  436. */
  437. int cellIdx = xIdx+yIdx*nSplitX+zIdx*(nSplitX*nSplitY);
  438. // printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
  439. }
  440. }
  441. }
  442. #endif
  443. b3AlignedObjectArray<b3RigidBodyData> bodyNative;
  444. bodyBuf->copyToHost(bodyNative);
  445. b3AlignedObjectArray<b3InertiaData> shapeNative;
  446. shapeBuf->copyToHost(shapeNative);
  447. b3AlignedObjectArray<b3GpuConstraint4> constraintNative;
  448. constraint->copyToHost(constraintNative);
  449. b3AlignedObjectArray<unsigned int> numConstraintsHost;
  450. m_numConstraints->copyToHost(numConstraintsHost);
  451. //printf("------------------------\n");
  452. b3AlignedObjectArray<unsigned int> offsetsHost;
  453. m_offsets->copyToHost(offsetsHost);
  454. static int frame = 0;
  455. bool useBatches = true;
  456. if (useBatches)
  457. {
  458. for (int iter = 0; iter < m_nIterations; iter++)
  459. {
  460. for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++)
  461. {
  462. int nSplitX = B3_SOLVER_N_SPLIT_X;
  463. int nSplitY = B3_SOLVER_N_SPLIT_Y;
  464. int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES;
  465. //printf("cell Batch %d\n",cellBatch);
  466. b3AlignedObjectArray<int> usedBodies[B3_SOLVER_N_CELLS];
  467. for (int i = 0; i < B3_SOLVER_N_CELLS; i++)
  468. {
  469. usedBodies[i].resize(0);
  470. }
  471. //for (int wgIdx=numWorkgroups-1;wgIdx>=0;wgIdx--)
  472. for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
  473. {
  474. int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2);
  475. int remain = (wgIdx % ((nSplitX * nSplitY) / 4));
  476. int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
  477. int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1);
  478. int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
  479. if (numConstraintsHost[cellIdx] == 0)
  480. continue;
  481. //printf("wgIdx %d: xIdx=%d, yIdx=%d, zIdx=%d, cellIdx=%d, cell Batch %d\n",wgIdx,xIdx,yIdx,zIdx,cellIdx,cellBatch);
  482. //printf("cell %d has %d constraints\n", cellIdx,numConstraintsHost[cellIdx]);
  483. if (zIdx)
  484. {
  485. //printf("?\n");
  486. }
  487. if (iter == 0)
  488. {
  489. //printf("frame=%d, Cell xIdx=%x, yIdx=%d ",frame, xIdx,yIdx);
  490. //printf("cellBatch=%d, wgIdx=%d, #constraints in cell=%d\n",cellBatch,wgIdx,numConstraintsHost[cellIdx]);
  491. }
  492. const int start = offsetsHost[cellIdx];
  493. int numConstraintsInCell = numConstraintsHost[cellIdx];
  494. // const int end = start + numConstraintsInCell;
  495. SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, usedBodies, wgIdx, batchSizes, cellIdx);
  496. task.m_solveFriction = false;
  497. task.run(0);
  498. }
  499. }
  500. }
  501. for (int iter = 0; iter < m_nIterations; iter++)
  502. {
  503. for (int cellBatch = 0; cellBatch < B3_SOLVER_N_BATCHES; cellBatch++)
  504. {
  505. int nSplitX = B3_SOLVER_N_SPLIT_X;
  506. int nSplitY = B3_SOLVER_N_SPLIT_Y;
  507. int numWorkgroups = B3_SOLVER_N_CELLS / B3_SOLVER_N_BATCHES;
  508. for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
  509. {
  510. int zIdx = (wgIdx / ((nSplitX * nSplitY) / 4)) * 2 + ((cellBatch & 4) >> 2);
  511. int remain = (wgIdx % ((nSplitX * nSplitY) / 4));
  512. int yIdx = (remain / (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
  513. int xIdx = (remain % (nSplitX / 2)) * 2 + (cellBatch & 1);
  514. int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
  515. if (numConstraintsHost[cellIdx] == 0)
  516. continue;
  517. //printf("yIdx=%d\n",yIdx);
  518. const int start = offsetsHost[cellIdx];
  519. int numConstraintsInCell = numConstraintsHost[cellIdx];
  520. // const int end = start + numConstraintsInCell;
  521. SolveTask task(bodyNative, shapeNative, constraintNative, start, numConstraintsInCell, maxNumBatches, 0, 0, batchSizes, cellIdx);
  522. task.m_solveFriction = true;
  523. task.run(0);
  524. }
  525. }
  526. }
  527. }
  528. else
  529. {
  530. for (int iter = 0; iter < m_nIterations; iter++)
  531. {
  532. SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0);
  533. task.m_solveFriction = false;
  534. task.run(0);
  535. }
  536. for (int iter = 0; iter < m_nIterations; iter++)
  537. {
  538. SolveTask task(bodyNative, shapeNative, constraintNative, 0, n, maxNumBatches, 0, 0, 0, 0);
  539. task.m_solveFriction = true;
  540. task.run(0);
  541. }
  542. }
  543. bodyBuf->copyFromHost(bodyNative);
  544. shapeBuf->copyFromHost(shapeNative);
  545. constraint->copyFromHost(constraintNative);
  546. frame++;
  547. }
  548. void checkConstraintBatch(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
  549. const b3OpenCLArray<b3InertiaData>* shapeBuf,
  550. b3OpenCLArray<b3GpuConstraint4>* constraint,
  551. b3OpenCLArray<unsigned int>* m_numConstraints,
  552. b3OpenCLArray<unsigned int>* m_offsets,
  553. int batchId)
  554. {
  555. // b3BufferInfoCL( m_numConstraints->getBufferCL() ),
  556. // b3BufferInfoCL( m_offsets->getBufferCL() )
  557. int cellBatch = batchId;
  558. const int nn = B3_SOLVER_N_CELLS;
  559. // int numWorkItems = 64*nn/B3_SOLVER_N_BATCHES;
  560. b3AlignedObjectArray<unsigned int> gN;
  561. m_numConstraints->copyToHost(gN);
  562. b3AlignedObjectArray<unsigned int> gOffsets;
  563. m_offsets->copyToHost(gOffsets);
  564. int nSplitX = B3_SOLVER_N_SPLIT_X;
  565. int nSplitY = B3_SOLVER_N_SPLIT_Y;
  566. // int bIdx = batchId;
  567. b3AlignedObjectArray<b3GpuConstraint4> cpuConstraints;
  568. constraint->copyToHost(cpuConstraints);
  569. printf("batch = %d\n", batchId);
  570. int numWorkgroups = nn / B3_SOLVER_N_BATCHES;
  571. b3AlignedObjectArray<int> usedBodies;
  572. for (int wgIdx = 0; wgIdx < numWorkgroups; wgIdx++)
  573. {
  574. printf("wgIdx = %d ", wgIdx);
  575. int zIdx = (wgIdx / ((nSplitX * nSplitY)) / 2) * 2 + ((cellBatch & 4) >> 2);
  576. int remain = wgIdx % ((nSplitX * nSplitY));
  577. int yIdx = (remain % (nSplitX / 2)) * 2 + ((cellBatch & 2) >> 1);
  578. int xIdx = (remain / (nSplitX / 2)) * 2 + (cellBatch & 1);
  579. int cellIdx = xIdx + yIdx * nSplitX + zIdx * (nSplitX * nSplitY);
  580. printf("cellIdx=%d\n", cellIdx);
  581. if (gN[cellIdx] == 0)
  582. continue;
  583. const int start = gOffsets[cellIdx];
  584. const int end = start + gN[cellIdx];
  585. for (int c = start; c < end; c++)
  586. {
  587. b3GpuConstraint4& constraint = cpuConstraints[c];
  588. //printf("constraint (%d,%d)\n", constraint.m_bodyA,constraint.m_bodyB);
  589. if (usedBodies.findLinearSearch(constraint.m_bodyA) < usedBodies.size())
  590. {
  591. printf("error?\n");
  592. }
  593. if (usedBodies.findLinearSearch(constraint.m_bodyB) < usedBodies.size())
  594. {
  595. printf("error?\n");
  596. }
  597. }
  598. for (int c = start; c < end; c++)
  599. {
  600. b3GpuConstraint4& constraint = cpuConstraints[c];
  601. usedBodies.push_back(constraint.m_bodyA);
  602. usedBodies.push_back(constraint.m_bodyB);
  603. }
  604. }
  605. }
  606. static bool verify = false;
  607. void b3Solver::solveContactConstraint(const b3OpenCLArray<b3RigidBodyData>* bodyBuf, const b3OpenCLArray<b3InertiaData>* shapeBuf,
  608. b3OpenCLArray<b3GpuConstraint4>* constraint, void* additionalData, int n, int maxNumBatches)
  609. {
  610. b3Int4 cdata = b3MakeInt4(n, 0, 0, 0);
  611. {
  612. const int nn = B3_SOLVER_N_CELLS;
  613. cdata.x = 0;
  614. cdata.y = maxNumBatches; //250;
  615. int numWorkItems = 64 * nn / B3_SOLVER_N_BATCHES;
  616. #ifdef DEBUG_ME
  617. SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
  618. adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
  619. #endif
  620. {
  621. B3_PROFILE("m_batchSolveKernel iterations");
  622. for (int iter = 0; iter < m_nIterations; iter++)
  623. {
  624. for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
  625. {
  626. if (verify)
  627. {
  628. checkConstraintBatch(bodyBuf, shapeBuf, constraint, m_numConstraints, m_offsets, ib);
  629. }
  630. #ifdef DEBUG_ME
  631. memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
  632. gpuDebugInfo.write(debugInfo, numWorkItems);
  633. #endif
  634. cdata.z = ib;
  635. b3LauncherCL launcher(m_queue, m_solveContactKernel, "m_solveContactKernel");
  636. #if 1
  637. b3BufferInfoCL bInfo[] = {
  638. b3BufferInfoCL(bodyBuf->getBufferCL()),
  639. b3BufferInfoCL(shapeBuf->getBufferCL()),
  640. b3BufferInfoCL(constraint->getBufferCL()),
  641. b3BufferInfoCL(m_numConstraints->getBufferCL()),
  642. b3BufferInfoCL(m_offsets->getBufferCL())
  643. #ifdef DEBUG_ME
  644. ,
  645. b3BufferInfoCL(&gpuDebugInfo)
  646. #endif
  647. };
  648. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  649. //launcher.setConst( cdata.x );
  650. launcher.setConst(cdata.y);
  651. launcher.setConst(cdata.z);
  652. b3Int4 nSplit;
  653. nSplit.x = B3_SOLVER_N_SPLIT_X;
  654. nSplit.y = B3_SOLVER_N_SPLIT_Y;
  655. nSplit.z = B3_SOLVER_N_SPLIT_Z;
  656. launcher.setConst(nSplit);
  657. launcher.launch1D(numWorkItems, 64);
  658. #else
  659. const char* fileName = "m_batchSolveKernel.bin";
  660. FILE* f = fopen(fileName, "rb");
  661. if (f)
  662. {
  663. int sizeInBytes = 0;
  664. if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
  665. {
  666. printf("error, cannot get file size\n");
  667. exit(0);
  668. }
  669. unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
  670. fread(buf, sizeInBytes, 1, f);
  671. int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
  672. int num = *(int*)&buf[serializedBytes];
  673. launcher.launch1D(num);
  674. //this clFinish is for testing on errors
  675. clFinish(m_queue);
  676. }
  677. #endif
  678. #ifdef DEBUG_ME
  679. clFinish(m_queue);
  680. gpuDebugInfo.read(debugInfo, numWorkItems);
  681. clFinish(m_queue);
  682. for (int i = 0; i < numWorkItems; i++)
  683. {
  684. if (debugInfo[i].m_valInt2 > 0)
  685. {
  686. printf("debugInfo[i].m_valInt2 = %d\n", i, debugInfo[i].m_valInt2);
  687. }
  688. if (debugInfo[i].m_valInt3 > 0)
  689. {
  690. printf("debugInfo[i].m_valInt3 = %d\n", i, debugInfo[i].m_valInt3);
  691. }
  692. }
  693. #endif //DEBUG_ME
  694. }
  695. }
  696. clFinish(m_queue);
  697. }
  698. cdata.x = 1;
  699. bool applyFriction = true;
  700. if (applyFriction)
  701. {
  702. B3_PROFILE("m_batchSolveKernel iterations2");
  703. for (int iter = 0; iter < m_nIterations; iter++)
  704. {
  705. for (int ib = 0; ib < B3_SOLVER_N_BATCHES; ib++)
  706. {
  707. cdata.z = ib;
  708. b3BufferInfoCL bInfo[] = {
  709. b3BufferInfoCL(bodyBuf->getBufferCL()),
  710. b3BufferInfoCL(shapeBuf->getBufferCL()),
  711. b3BufferInfoCL(constraint->getBufferCL()),
  712. b3BufferInfoCL(m_numConstraints->getBufferCL()),
  713. b3BufferInfoCL(m_offsets->getBufferCL())
  714. #ifdef DEBUG_ME
  715. ,
  716. b3BufferInfoCL(&gpuDebugInfo)
  717. #endif //DEBUG_ME
  718. };
  719. b3LauncherCL launcher(m_queue, m_solveFrictionKernel, "m_solveFrictionKernel");
  720. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  721. //launcher.setConst( cdata.x );
  722. launcher.setConst(cdata.y);
  723. launcher.setConst(cdata.z);
  724. b3Int4 nSplit;
  725. nSplit.x = B3_SOLVER_N_SPLIT_X;
  726. nSplit.y = B3_SOLVER_N_SPLIT_Y;
  727. nSplit.z = B3_SOLVER_N_SPLIT_Z;
  728. launcher.setConst(nSplit);
  729. launcher.launch1D(64 * nn / B3_SOLVER_N_BATCHES, 64);
  730. }
  731. }
  732. clFinish(m_queue);
  733. }
  734. #ifdef DEBUG_ME
  735. delete[] debugInfo;
  736. #endif //DEBUG_ME
  737. }
  738. }
  739. void b3Solver::convertToConstraints(const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
  740. const b3OpenCLArray<b3InertiaData>* shapeBuf,
  741. b3OpenCLArray<b3Contact4>* contactsIn, b3OpenCLArray<b3GpuConstraint4>* contactCOut, void* additionalData,
  742. int nContacts, const ConstraintCfg& cfg)
  743. {
  744. // b3OpenCLArray<b3GpuConstraint4>* constraintNative =0;
  745. contactCOut->resize(nContacts);
  746. struct CB
  747. {
  748. int m_nContacts;
  749. float m_dt;
  750. float m_positionDrift;
  751. float m_positionConstraintCoeff;
  752. };
  753. {
  754. CB cdata;
  755. cdata.m_nContacts = nContacts;
  756. cdata.m_dt = cfg.m_dt;
  757. cdata.m_positionDrift = cfg.m_positionDrift;
  758. cdata.m_positionConstraintCoeff = cfg.m_positionConstraintCoeff;
  759. if (gConvertConstraintOnCpu)
  760. {
  761. b3AlignedObjectArray<b3RigidBodyData> gBodies;
  762. bodyBuf->copyToHost(gBodies);
  763. b3AlignedObjectArray<b3Contact4> gContact;
  764. contactsIn->copyToHost(gContact);
  765. b3AlignedObjectArray<b3InertiaData> gShapes;
  766. shapeBuf->copyToHost(gShapes);
  767. b3AlignedObjectArray<b3GpuConstraint4> gConstraintOut;
  768. gConstraintOut.resize(nContacts);
  769. B3_PROFILE("cpu contactToConstraintKernel");
  770. for (int gIdx = 0; gIdx < nContacts; gIdx++)
  771. {
  772. int aIdx = abs(gContact[gIdx].m_bodyAPtrAndSignBit);
  773. int bIdx = abs(gContact[gIdx].m_bodyBPtrAndSignBit);
  774. b3Float4 posA = gBodies[aIdx].m_pos;
  775. b3Float4 linVelA = gBodies[aIdx].m_linVel;
  776. b3Float4 angVelA = gBodies[aIdx].m_angVel;
  777. float invMassA = gBodies[aIdx].m_invMass;
  778. b3Mat3x3 invInertiaA = gShapes[aIdx].m_initInvInertia;
  779. b3Float4 posB = gBodies[bIdx].m_pos;
  780. b3Float4 linVelB = gBodies[bIdx].m_linVel;
  781. b3Float4 angVelB = gBodies[bIdx].m_angVel;
  782. float invMassB = gBodies[bIdx].m_invMass;
  783. b3Mat3x3 invInertiaB = gShapes[bIdx].m_initInvInertia;
  784. b3ContactConstraint4_t cs;
  785. setConstraint4(posA, linVelA, angVelA, invMassA, invInertiaA, posB, linVelB, angVelB, invMassB, invInertiaB,
  786. &gContact[gIdx], cdata.m_dt, cdata.m_positionDrift, cdata.m_positionConstraintCoeff,
  787. &cs);
  788. cs.m_batchIdx = gContact[gIdx].m_batchIdx;
  789. gConstraintOut[gIdx] = (b3GpuConstraint4&)cs;
  790. }
  791. contactCOut->copyFromHost(gConstraintOut);
  792. }
  793. else
  794. {
  795. B3_PROFILE("gpu m_contactToConstraintKernel");
  796. b3BufferInfoCL bInfo[] = {b3BufferInfoCL(contactsIn->getBufferCL()), b3BufferInfoCL(bodyBuf->getBufferCL()), b3BufferInfoCL(shapeBuf->getBufferCL()),
  797. b3BufferInfoCL(contactCOut->getBufferCL())};
  798. b3LauncherCL launcher(m_queue, m_contactToConstraintKernel, "m_contactToConstraintKernel");
  799. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  800. //launcher.setConst( cdata );
  801. launcher.setConst(cdata.m_nContacts);
  802. launcher.setConst(cdata.m_dt);
  803. launcher.setConst(cdata.m_positionDrift);
  804. launcher.setConst(cdata.m_positionConstraintCoeff);
  805. launcher.launch1D(nContacts, 64);
  806. clFinish(m_queue);
  807. }
  808. }
  809. }
  810. /*
  811. void b3Solver::sortContacts( const b3OpenCLArray<b3RigidBodyData>* bodyBuf,
  812. b3OpenCLArray<b3Contact4>* contactsIn, void* additionalData,
  813. int nContacts, const b3Solver::ConstraintCfg& cfg )
  814. {
  815. const int sortAlignment = 512; // todo. get this out of sort
  816. if( cfg.m_enableParallelSolve )
  817. {
  818. int sortSize = NEXTMULTIPLEOF( nContacts, sortAlignment );
  819. b3OpenCLArray<unsigned int>* countsNative = m_numConstraints;//BufferUtils::map<TYPE_CL, false>( data->m_device, &countsHost );
  820. b3OpenCLArray<unsigned int>* offsetsNative = m_offsets;//BufferUtils::map<TYPE_CL, false>( data->m_device, &offsetsHost );
  821. { // 2. set cell idx
  822. struct CB
  823. {
  824. int m_nContacts;
  825. int m_staticIdx;
  826. float m_scale;
  827. int m_nSplit;
  828. };
  829. b3Assert( sortSize%64 == 0 );
  830. CB cdata;
  831. cdata.m_nContacts = nContacts;
  832. cdata.m_staticIdx = cfg.m_staticIdx;
  833. cdata.m_scale = 1.f/(N_OBJ_PER_SPLIT*cfg.m_averageExtent);
  834. cdata.m_nSplit = B3_SOLVER_N_SPLIT;
  835. b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( bodyBuf->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
  836. b3LauncherCL launcher( m_queue, m_setSortDataKernel );
  837. launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
  838. launcher.setConst( cdata );
  839. launcher.launch1D( sortSize, 64 );
  840. }
  841. { // 3. sort by cell idx
  842. int n = B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT;
  843. int sortBit = 32;
  844. //if( n <= 0xffff ) sortBit = 16;
  845. //if( n <= 0xff ) sortBit = 8;
  846. m_sort32->execute(*m_sortDataBuffer,sortSize);
  847. }
  848. { // 4. find entries
  849. m_search->execute( *m_sortDataBuffer, nContacts, *countsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT, b3BoundSearchCL::COUNT);
  850. m_scan->execute( *countsNative, *offsetsNative, B3_SOLVER_N_SPLIT*B3_SOLVER_N_SPLIT );
  851. }
  852. { // 5. sort constraints by cellIdx
  853. // todo. preallocate this
  854. // b3Assert( contactsIn->getType() == TYPE_HOST );
  855. // b3OpenCLArray<b3Contact4>* out = BufferUtils::map<TYPE_CL, false>( data->m_device, contactsIn ); // copying contacts to this buffer
  856. {
  857. b3Int4 cdata; cdata.x = nContacts;
  858. b3BufferInfoCL bInfo[] = { b3BufferInfoCL( contactsIn->getBufferCL() ), b3BufferInfoCL( m_contactBuffer->getBufferCL() ), b3BufferInfoCL( m_sortDataBuffer->getBufferCL() ) };
  859. b3LauncherCL launcher( m_queue, m_reorderContactKernel );
  860. launcher.setBuffers( bInfo, sizeof(bInfo)/sizeof(b3BufferInfoCL) );
  861. launcher.setConst( cdata );
  862. launcher.launch1D( nContacts, 64 );
  863. }
  864. // BufferUtils::unmap<true>( out, contactsIn, nContacts );
  865. }
  866. }
  867. }
  868. */
  869. void b3Solver::batchContacts(b3OpenCLArray<b3Contact4>* contacts, int nContacts, b3OpenCLArray<unsigned int>* nNative, b3OpenCLArray<unsigned int>* offsetsNative, int staticIdx)
  870. {
  871. int numWorkItems = 64 * B3_SOLVER_N_CELLS;
  872. {
  873. B3_PROFILE("batch generation");
  874. b3Int4 cdata;
  875. cdata.x = nContacts;
  876. cdata.y = 0;
  877. cdata.z = staticIdx;
  878. #ifdef BATCH_DEBUG
  879. SolverDebugInfo* debugInfo = new SolverDebugInfo[numWorkItems];
  880. adl::b3OpenCLArray<SolverDebugInfo> gpuDebugInfo(data->m_device, numWorkItems);
  881. memset(debugInfo, 0, sizeof(SolverDebugInfo) * numWorkItems);
  882. gpuDebugInfo.write(debugInfo, numWorkItems);
  883. #endif
  884. #if 0
  885. b3BufferInfoCL bInfo[] = {
  886. b3BufferInfoCL( contacts->getBufferCL() ),
  887. b3BufferInfoCL( m_contactBuffer2->getBufferCL()),
  888. b3BufferInfoCL( nNative->getBufferCL() ),
  889. b3BufferInfoCL( offsetsNative->getBufferCL() ),
  890. #ifdef BATCH_DEBUG
  891. , b3BufferInfoCL(&gpuDebugInfo)
  892. #endif
  893. };
  894. #endif
  895. {
  896. m_batchSizes.resize(nNative->size());
  897. B3_PROFILE("batchingKernel");
  898. //b3LauncherCL launcher( m_queue, m_batchingKernel);
  899. cl_kernel k = useNewBatchingKernel ? m_batchingKernelNew : m_batchingKernel;
  900. b3LauncherCL launcher(m_queue, k, "*batchingKernel");
  901. if (!useNewBatchingKernel)
  902. {
  903. launcher.setBuffer(contacts->getBufferCL());
  904. }
  905. launcher.setBuffer(m_contactBuffer2->getBufferCL());
  906. launcher.setBuffer(nNative->getBufferCL());
  907. launcher.setBuffer(offsetsNative->getBufferCL());
  908. launcher.setBuffer(m_batchSizes.getBufferCL());
  909. //launcher.setConst( cdata );
  910. launcher.setConst(staticIdx);
  911. launcher.launch1D(numWorkItems, 64);
  912. //clFinish(m_queue);
  913. //b3AlignedObjectArray<int> batchSizesCPU;
  914. //m_batchSizes.copyToHost(batchSizesCPU);
  915. //printf(".\n");
  916. }
  917. #ifdef BATCH_DEBUG
  918. aaaa
  919. b3Contact4* hostContacts = new b3Contact4[nContacts];
  920. m_contactBuffer->read(hostContacts, nContacts);
  921. clFinish(m_queue);
  922. gpuDebugInfo.read(debugInfo, numWorkItems);
  923. clFinish(m_queue);
  924. for (int i = 0; i < numWorkItems; i++)
  925. {
  926. if (debugInfo[i].m_valInt1 > 0)
  927. {
  928. printf("catch\n");
  929. }
  930. if (debugInfo[i].m_valInt2 > 0)
  931. {
  932. printf("catch22\n");
  933. }
  934. if (debugInfo[i].m_valInt3 > 0)
  935. {
  936. printf("catch666\n");
  937. }
  938. if (debugInfo[i].m_valInt4 > 0)
  939. {
  940. printf("catch777\n");
  941. }
  942. }
  943. delete[] debugInfo;
  944. #endif //BATCH_DEBUG
  945. }
  946. // copy buffer to buffer
  947. //b3Assert(m_contactBuffer->size()==nContacts);
  948. //contacts->copyFromOpenCLArray( *m_contactBuffer);
  949. //clFinish(m_queue);//needed?
  950. }