2
0

b3GpuSapBroadphase.cpp 38 KB


// Switch intended to select a GPU implementation of the incremental 3-axis SAP search.
// In the visible part of this file it is only referenced from a commented-out code path
// inside calculateOverlappingPairsHostIncremental3Sap(), so it currently has no effect there.
// NOTE(review): defined with external linkage ahead of the includes - confirm whether any
// other translation unit declares it 'extern' before making it static.
bool searchIncremental3dSapOnGpu = true;
#include <limits.h>
#include <string.h>
#include "b3GpuSapBroadphase.h"
#include "Bullet3Common/b3Vector3.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3LauncherCL.h"
#include "Bullet3OpenCL/ParallelPrimitives/b3PrefixScanFloat4CL.h"
#include "Bullet3OpenCL/Initialize/b3OpenCLUtils.h"
#include "kernels/sapKernels.h"
#include "Bullet3Common/b3MinMax.h"
  10. #define B3_BROADPHASE_SAP_PATH "src/Bullet3OpenCL/BroadphaseCollision/kernels/sap.cl"
  11. /*
  12. b3OpenCLArray<int> m_pairCount;
  13. b3OpenCLArray<b3SapAabb> m_allAabbsGPU;
  14. b3AlignedObjectArray<b3SapAabb> m_allAabbsCPU;
  15. virtual b3OpenCLArray<b3SapAabb>& getAllAabbsGPU()
  16. {
  17. return m_allAabbsGPU;
  18. }
  19. virtual b3AlignedObjectArray<b3SapAabb>& getAllAabbsCPU()
  20. {
  21. return m_allAabbsCPU;
  22. }
  23. b3OpenCLArray<b3Vector3> m_sum;
  24. b3OpenCLArray<b3Vector3> m_sum2;
  25. b3OpenCLArray<b3Vector3> m_dst;
  26. b3OpenCLArray<int> m_smallAabbsMappingGPU;
  27. b3AlignedObjectArray<int> m_smallAabbsMappingCPU;
  28. b3OpenCLArray<int> m_largeAabbsMappingGPU;
  29. b3AlignedObjectArray<int> m_largeAabbsMappingCPU;
  30. b3OpenCLArray<b3Int4> m_overlappingPairs;
  31. //temporary gpu work memory
  32. b3OpenCLArray<b3SortData> m_gpuSmallSortData;
  33. b3OpenCLArray<b3SapAabb> m_gpuSmallSortedAabbs;
  34. class b3PrefixScanFloat4CL* m_prefixScanFloat4;
  35. */
  36. b3GpuSapBroadphase::b3GpuSapBroadphase(cl_context ctx, cl_device_id device, cl_command_queue q, b3GpuSapKernelType kernelType)
  37. : m_context(ctx),
  38. m_device(device),
  39. m_queue(q),
  40. m_objectMinMaxIndexGPUaxis0(ctx, q),
  41. m_objectMinMaxIndexGPUaxis1(ctx, q),
  42. m_objectMinMaxIndexGPUaxis2(ctx, q),
  43. m_objectMinMaxIndexGPUaxis0prev(ctx, q),
  44. m_objectMinMaxIndexGPUaxis1prev(ctx, q),
  45. m_objectMinMaxIndexGPUaxis2prev(ctx, q),
  46. m_sortedAxisGPU0(ctx, q),
  47. m_sortedAxisGPU1(ctx, q),
  48. m_sortedAxisGPU2(ctx, q),
  49. m_sortedAxisGPU0prev(ctx, q),
  50. m_sortedAxisGPU1prev(ctx, q),
  51. m_sortedAxisGPU2prev(ctx, q),
  52. m_addedHostPairsGPU(ctx, q),
  53. m_removedHostPairsGPU(ctx, q),
  54. m_addedCountGPU(ctx, q),
  55. m_removedCountGPU(ctx, q),
  56. m_currentBuffer(-1),
  57. m_pairCount(ctx, q),
  58. m_allAabbsGPU(ctx, q),
  59. m_sum(ctx, q),
  60. m_sum2(ctx, q),
  61. m_dst(ctx, q),
  62. m_smallAabbsMappingGPU(ctx, q),
  63. m_largeAabbsMappingGPU(ctx, q),
  64. m_overlappingPairs(ctx, q),
  65. m_gpuSmallSortData(ctx, q),
  66. m_gpuSmallSortedAabbs(ctx, q)
  67. {
  68. const char* sapSrc = sapCL;
  69. cl_int errNum = 0;
  70. b3Assert(m_context);
  71. b3Assert(m_device);
  72. cl_program sapProg = b3OpenCLUtils::compileCLProgramFromString(m_context, m_device, sapSrc, &errNum, "", B3_BROADPHASE_SAP_PATH);
  73. b3Assert(errNum == CL_SUCCESS);
  74. b3Assert(errNum == CL_SUCCESS);
  75. #ifndef __APPLE__
  76. m_prefixScanFloat4 = new b3PrefixScanFloat4CL(m_context, m_device, m_queue);
  77. #else
  78. m_prefixScanFloat4 = 0;
  79. #endif
  80. m_sapKernel = 0;
  81. switch (kernelType)
  82. {
  83. case B3_GPU_SAP_KERNEL_BRUTE_FORCE_CPU:
  84. {
  85. m_sapKernel = 0;
  86. break;
  87. }
  88. case B3_GPU_SAP_KERNEL_BRUTE_FORCE_GPU:
  89. {
  90. m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBruteForce", &errNum, sapProg);
  91. break;
  92. }
  93. case B3_GPU_SAP_KERNEL_ORIGINAL:
  94. {
  95. m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelOriginal", &errNum, sapProg);
  96. break;
  97. }
  98. case B3_GPU_SAP_KERNEL_BARRIER:
  99. {
  100. m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelBarrier", &errNum, sapProg);
  101. break;
  102. }
  103. case B3_GPU_SAP_KERNEL_LOCAL_SHARED_MEMORY:
  104. {
  105. m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg);
  106. break;
  107. }
  108. default:
  109. {
  110. m_sapKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelLocalSharedMemory", &errNum, sapProg);
  111. b3Error("Unknown 3D GPU SAP provided, fallback to computePairsKernelLocalSharedMemory");
  112. }
  113. };
  114. m_sap2Kernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "computePairsKernelTwoArrays", &errNum, sapProg);
  115. b3Assert(errNum == CL_SUCCESS);
  116. m_prepareSumVarianceKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "prepareSumVarianceKernel", &errNum, sapProg);
  117. b3Assert(errNum == CL_SUCCESS);
  118. m_flipFloatKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "flipFloatKernel", &errNum, sapProg);
  119. m_copyAabbsKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "copyAabbsKernel", &errNum, sapProg);
  120. m_scatterKernel = b3OpenCLUtils::compileCLKernelFromString(m_context, m_device, sapSrc, "scatterKernel", &errNum, sapProg);
  121. m_sorter = new b3RadixSort32CL(m_context, m_device, m_queue);
  122. }
  123. b3GpuSapBroadphase::~b3GpuSapBroadphase()
  124. {
  125. delete m_sorter;
  126. delete m_prefixScanFloat4;
  127. clReleaseKernel(m_scatterKernel);
  128. clReleaseKernel(m_flipFloatKernel);
  129. clReleaseKernel(m_copyAabbsKernel);
  130. clReleaseKernel(m_sapKernel);
  131. clReleaseKernel(m_sap2Kernel);
  132. clReleaseKernel(m_prepareSumVarianceKernel);
  133. }
  134. /// conservative test for overlap between two aabbs
  135. static bool TestAabbAgainstAabb2(const b3Vector3& aabbMin1, const b3Vector3& aabbMax1,
  136. const b3Vector3& aabbMin2, const b3Vector3& aabbMax2)
  137. {
  138. bool overlap = true;
  139. overlap = (aabbMin1.getX() > aabbMax2.getX() || aabbMax1.getX() < aabbMin2.getX()) ? false : overlap;
  140. overlap = (aabbMin1.getZ() > aabbMax2.getZ() || aabbMax1.getZ() < aabbMin2.getZ()) ? false : overlap;
  141. overlap = (aabbMin1.getY() > aabbMax2.getY() || aabbMax1.getY() < aabbMin2.getY()) ? false : overlap;
  142. return overlap;
  143. }
  144. //http://stereopsis.com/radix.html
  145. static unsigned int FloatFlip(float fl)
  146. {
  147. unsigned int f = *(unsigned int*)&fl;
  148. unsigned int mask = -(int)(f >> 31) | 0x80000000;
  149. return f ^ mask;
  150. };
  151. void b3GpuSapBroadphase::init3dSap()
  152. {
  153. if (m_currentBuffer < 0)
  154. {
  155. m_allAabbsGPU.copyToHost(m_allAabbsCPU);
  156. m_currentBuffer = 0;
  157. for (int axis = 0; axis < 3; axis++)
  158. {
  159. for (int buf = 0; buf < 2; buf++)
  160. {
  161. int totalNumAabbs = m_allAabbsCPU.size();
  162. int numEndPoints = 2 * totalNumAabbs;
  163. m_sortedAxisCPU[axis][buf].resize(numEndPoints);
  164. if (buf == m_currentBuffer)
  165. {
  166. for (int i = 0; i < totalNumAabbs; i++)
  167. {
  168. m_sortedAxisCPU[axis][buf][i * 2].m_key = FloatFlip(m_allAabbsCPU[i].m_min[axis]) - 1;
  169. m_sortedAxisCPU[axis][buf][i * 2].m_value = i * 2;
  170. m_sortedAxisCPU[axis][buf][i * 2 + 1].m_key = FloatFlip(m_allAabbsCPU[i].m_max[axis]) + 1;
  171. m_sortedAxisCPU[axis][buf][i * 2 + 1].m_value = i * 2 + 1;
  172. }
  173. }
  174. }
  175. }
  176. for (int axis = 0; axis < 3; axis++)
  177. {
  178. m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]);
  179. }
  180. for (int axis = 0; axis < 3; axis++)
  181. {
  182. //int totalNumAabbs = m_allAabbsCPU.size();
  183. int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size();
  184. m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(numEndPoints);
  185. for (int i = 0; i < numEndPoints; i++)
  186. {
  187. int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
  188. int newDest = destIndex / 2;
  189. if (destIndex & 1)
  190. {
  191. m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i;
  192. }
  193. else
  194. {
  195. m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i;
  196. }
  197. }
  198. }
  199. }
  200. }
  201. static bool b3PairCmp(const b3Int4& p, const b3Int4& q)
  202. {
  203. return ((p.x < q.x) || ((p.x == q.x) && (p.y < q.y)));
  204. }
  205. static bool operator==(const b3Int4& a, const b3Int4& b)
  206. {
  207. return a.x == b.x && a.y == b.y;
  208. };
  209. static bool operator<(const b3Int4& a, const b3Int4& b)
  210. {
  211. return a.x < b.x || (a.x == b.x && a.y < b.y);
  212. };
  213. static bool operator>(const b3Int4& a, const b3Int4& b)
  214. {
  215. return a.x > b.x || (a.x == b.x && a.y > b.y);
  216. };
// File-scope scratch storage used by calculateOverlappingPairsHostIncremental3Sap().
// These are plain globals (external linkage), so they are shared by every
// b3GpuSapBroadphase instance in the process.
// NOTE(review): presumably kept global to reuse their capacity between frames - confirm.

// Pairs found to have started overlapping during the current incremental update.
b3AlignedObjectArray<b3Int4> addedHostPairs;
// Pairs found to have stopped overlapping during the current incremental update.
b3AlignedObjectArray<b3Int4> removedHostPairs;
// Copy of m_allAabbsCPU taken before the new AABBs are downloaded from the GPU.
b3AlignedObjectArray<b3SapAabb> preAabbs;
  220. void b3GpuSapBroadphase::calculateOverlappingPairsHostIncremental3Sap()
  221. {
  222. //static int framepje = 0;
  223. //printf("framepje=%d\n",framepje++);
  224. B3_PROFILE("calculateOverlappingPairsHostIncremental3Sap");
  225. addedHostPairs.resize(0);
  226. removedHostPairs.resize(0);
  227. b3Assert(m_currentBuffer >= 0);
  228. {
  229. preAabbs.resize(m_allAabbsCPU.size());
  230. for (int i = 0; i < preAabbs.size(); i++)
  231. {
  232. preAabbs[i] = m_allAabbsCPU[i];
  233. }
  234. }
  235. if (m_currentBuffer < 0)
  236. return;
  237. {
  238. B3_PROFILE("m_allAabbsGPU.copyToHost");
  239. m_allAabbsGPU.copyToHost(m_allAabbsCPU);
  240. }
  241. b3AlignedObjectArray<b3Int4> allPairs;
  242. {
  243. B3_PROFILE("m_overlappingPairs.copyToHost");
  244. m_overlappingPairs.copyToHost(allPairs);
  245. }
  246. if (0)
  247. {
  248. {
  249. printf("ab[40].min=%f,%f,%f,ab[40].max=%f,%f,%f\n",
  250. m_allAabbsCPU[40].m_min[0], m_allAabbsCPU[40].m_min[1], m_allAabbsCPU[40].m_min[2],
  251. m_allAabbsCPU[40].m_max[0], m_allAabbsCPU[40].m_max[1], m_allAabbsCPU[40].m_max[2]);
  252. }
  253. {
  254. printf("ab[53].min=%f,%f,%f,ab[53].max=%f,%f,%f\n",
  255. m_allAabbsCPU[53].m_min[0], m_allAabbsCPU[53].m_min[1], m_allAabbsCPU[53].m_min[2],
  256. m_allAabbsCPU[53].m_max[0], m_allAabbsCPU[53].m_max[1], m_allAabbsCPU[53].m_max[2]);
  257. }
  258. {
  259. b3Int4 newPair;
  260. newPair.x = 40;
  261. newPair.y = 53;
  262. int index = allPairs.findBinarySearch(newPair);
  263. printf("hasPair(40,53)=%d out of %d\n", index, allPairs.size());
  264. {
  265. int overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[40].m_min, (const b3Vector3&)m_allAabbsCPU[40].m_max, (const b3Vector3&)m_allAabbsCPU[53].m_min, (const b3Vector3&)m_allAabbsCPU[53].m_max);
  266. printf("overlap=%d\n", overlap);
  267. }
  268. if (preAabbs.size())
  269. {
  270. int prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[40].m_min, (const b3Vector3&)preAabbs[40].m_max, (const b3Vector3&)preAabbs[53].m_min, (const b3Vector3&)preAabbs[53].m_max);
  271. printf("prevoverlap=%d\n", prevOverlap);
  272. }
  273. else
  274. {
  275. printf("unknown prevoverlap\n");
  276. }
  277. }
  278. }
  279. if (0)
  280. {
  281. for (int i = 0; i < m_allAabbsCPU.size(); i++)
  282. {
  283. //printf("aabb[%d] min=%f,%f,%f max=%f,%f,%f\n",i,m_allAabbsCPU[i].m_min[0],m_allAabbsCPU[i].m_min[1],m_allAabbsCPU[i].m_min[2], m_allAabbsCPU[i].m_max[0],m_allAabbsCPU[i].m_max[1],m_allAabbsCPU[i].m_max[2]);
  284. }
  285. for (int axis = 0; axis < 3; axis++)
  286. {
  287. for (int buf = 0; buf < 2; buf++)
  288. {
  289. b3Assert(m_sortedAxisCPU[axis][buf].size() == m_allAabbsCPU.size() * 2);
  290. }
  291. }
  292. }
  293. m_currentBuffer = 1 - m_currentBuffer;
  294. int totalNumAabbs = m_allAabbsCPU.size();
  295. {
  296. B3_PROFILE("assign m_sortedAxisCPU(FloatFlip)");
  297. for (int i = 0; i < totalNumAabbs; i++)
  298. {
  299. unsigned int keyMin[3];
  300. unsigned int keyMax[3];
  301. for (int axis = 0; axis < 3; axis++)
  302. {
  303. float vmin = m_allAabbsCPU[i].m_min[axis];
  304. float vmax = m_allAabbsCPU[i].m_max[axis];
  305. keyMin[axis] = FloatFlip(vmin);
  306. keyMax[axis] = FloatFlip(vmax);
  307. m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_key = keyMin[axis] - 1;
  308. m_sortedAxisCPU[axis][m_currentBuffer][i * 2].m_value = i * 2;
  309. m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_key = keyMax[axis] + 1;
  310. m_sortedAxisCPU[axis][m_currentBuffer][i * 2 + 1].m_value = i * 2 + 1;
  311. }
  312. //printf("aabb[%d] min=%u,%u,%u max %u,%u,%u\n", i,keyMin[0],keyMin[1],keyMin[2],keyMax[0],keyMax[1],keyMax[2]);
  313. }
  314. }
  315. {
  316. B3_PROFILE("sort m_sortedAxisCPU");
  317. for (int axis = 0; axis < 3; axis++)
  318. m_sorter->executeHost(m_sortedAxisCPU[axis][m_currentBuffer]);
  319. }
  320. #if 0
  321. if (0)
  322. {
  323. for (int axis=0;axis<3;axis++)
  324. {
  325. //printf("axis %d\n",axis);
  326. for (int i=0;i<m_sortedAxisCPU[axis][m_currentBuffer].size();i++)
  327. {
  328. //int key = m_sortedAxisCPU[axis][m_currentBuffer][i].m_key;
  329. //int value = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
  330. //printf("[%d]=%d\n",i,value);
  331. }
  332. }
  333. }
  334. #endif
  335. {
  336. B3_PROFILE("assign m_objectMinMaxIndexCPU");
  337. for (int axis = 0; axis < 3; axis++)
  338. {
  339. int totalNumAabbs = m_allAabbsCPU.size();
  340. int numEndPoints = m_sortedAxisCPU[axis][m_currentBuffer].size();
  341. m_objectMinMaxIndexCPU[axis][m_currentBuffer].resize(totalNumAabbs);
  342. for (int i = 0; i < numEndPoints; i++)
  343. {
  344. int destIndex = m_sortedAxisCPU[axis][m_currentBuffer][i].m_value;
  345. int newDest = destIndex / 2;
  346. if (destIndex & 1)
  347. {
  348. m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].y = i;
  349. }
  350. else
  351. {
  352. m_objectMinMaxIndexCPU[axis][m_currentBuffer][newDest].x = i;
  353. }
  354. }
  355. }
  356. }
  357. #if 0
  358. if (0)
  359. {
  360. printf("==========================\n");
  361. for (int axis=0;axis<3;axis++)
  362. {
  363. unsigned int curMinIndex40 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][40].x;
  364. unsigned int curMaxIndex40 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][40].y;
  365. unsigned int prevMaxIndex40 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][40].y;
  366. unsigned int prevMinIndex40 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][40].x;
  367. int dmin40 = curMinIndex40 - prevMinIndex40;
  368. int dmax40 = curMinIndex40 - prevMinIndex40;
  369. printf("axis %d curMinIndex40=%d prevMinIndex40=%d\n",axis,curMinIndex40, prevMinIndex40);
  370. printf("axis %d curMaxIndex40=%d prevMaxIndex40=%d\n",axis,curMaxIndex40, prevMaxIndex40);
  371. }
  372. printf(".........................\n");
  373. for (int axis=0;axis<3;axis++)
  374. {
  375. unsigned int curMinIndex53 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][53].x;
  376. unsigned int curMaxIndex53 = m_objectMinMaxIndexCPU[axis][m_currentBuffer][53].y;
  377. unsigned int prevMaxIndex53 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][53].y;
  378. unsigned int prevMinIndex53 = m_objectMinMaxIndexCPU[axis][1-m_currentBuffer][53].x;
  379. int dmin40 = curMinIndex53 - prevMinIndex53;
  380. int dmax40 = curMinIndex53 - prevMinIndex53;
  381. printf("axis %d curMinIndex53=%d prevMinIndex53=%d\n",axis,curMinIndex53, prevMinIndex53);
  382. printf("axis %d curMaxIndex53=%d prevMaxIndex53=%d\n",axis,curMaxIndex53, prevMaxIndex53);
  383. }
  384. }
  385. #endif
  386. int a = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
  387. int b = m_objectMinMaxIndexCPU[1][m_currentBuffer].size();
  388. int c = m_objectMinMaxIndexCPU[2][m_currentBuffer].size();
  389. b3Assert(a == b);
  390. b3Assert(b == c);
  391. /*
  392. if (searchIncremental3dSapOnGpu)
  393. {
  394. B3_PROFILE("computePairsIncremental3dSapKernelGPU");
  395. int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
  396. int maxCapacity = 1024*1024;
  397. {
  398. B3_PROFILE("copy from host");
  399. m_objectMinMaxIndexGPUaxis0.copyFromHost(m_objectMinMaxIndexCPU[0][m_currentBuffer]);
  400. m_objectMinMaxIndexGPUaxis1.copyFromHost(m_objectMinMaxIndexCPU[1][m_currentBuffer]);
  401. m_objectMinMaxIndexGPUaxis2.copyFromHost(m_objectMinMaxIndexCPU[2][m_currentBuffer]);
  402. m_objectMinMaxIndexGPUaxis0prev.copyFromHost(m_objectMinMaxIndexCPU[0][1-m_currentBuffer]);
  403. m_objectMinMaxIndexGPUaxis1prev.copyFromHost(m_objectMinMaxIndexCPU[1][1-m_currentBuffer]);
  404. m_objectMinMaxIndexGPUaxis2prev.copyFromHost(m_objectMinMaxIndexCPU[2][1-m_currentBuffer]);
  405. m_sortedAxisGPU0.copyFromHost(m_sortedAxisCPU[0][m_currentBuffer]);
  406. m_sortedAxisGPU1.copyFromHost(m_sortedAxisCPU[1][m_currentBuffer]);
  407. m_sortedAxisGPU2.copyFromHost(m_sortedAxisCPU[2][m_currentBuffer]);
  408. m_sortedAxisGPU0prev.copyFromHost(m_sortedAxisCPU[0][1-m_currentBuffer]);
  409. m_sortedAxisGPU1prev.copyFromHost(m_sortedAxisCPU[1][1-m_currentBuffer]);
  410. m_sortedAxisGPU2prev.copyFromHost(m_sortedAxisCPU[2][1-m_currentBuffer]);
  411. m_addedHostPairsGPU.resize(maxCapacity);
  412. m_removedHostPairsGPU.resize(maxCapacity);
  413. m_addedCountGPU.resize(0);
  414. m_addedCountGPU.push_back(0);
  415. m_removedCountGPU.resize(0);
  416. m_removedCountGPU.push_back(0);
  417. }
  418. {
  419. B3_PROFILE("launch1D");
  420. b3LauncherCL launcher(m_queue, m_computePairsIncremental3dSapKernel,"m_computePairsIncremental3dSapKernel");
  421. launcher.setBuffer(m_objectMinMaxIndexGPUaxis0.getBufferCL());
  422. launcher.setBuffer(m_objectMinMaxIndexGPUaxis1.getBufferCL());
  423. launcher.setBuffer(m_objectMinMaxIndexGPUaxis2.getBufferCL());
  424. launcher.setBuffer(m_objectMinMaxIndexGPUaxis0prev.getBufferCL());
  425. launcher.setBuffer(m_objectMinMaxIndexGPUaxis1prev.getBufferCL());
  426. launcher.setBuffer(m_objectMinMaxIndexGPUaxis2prev.getBufferCL());
  427. launcher.setBuffer(m_sortedAxisGPU0.getBufferCL());
  428. launcher.setBuffer(m_sortedAxisGPU1.getBufferCL());
  429. launcher.setBuffer(m_sortedAxisGPU2.getBufferCL());
  430. launcher.setBuffer(m_sortedAxisGPU0prev.getBufferCL());
  431. launcher.setBuffer(m_sortedAxisGPU1prev.getBufferCL());
  432. launcher.setBuffer(m_sortedAxisGPU2prev.getBufferCL());
  433. launcher.setBuffer(m_addedHostPairsGPU.getBufferCL());
  434. launcher.setBuffer(m_removedHostPairsGPU.getBufferCL());
  435. launcher.setBuffer(m_addedCountGPU.getBufferCL());
  436. launcher.setBuffer(m_removedCountGPU.getBufferCL());
  437. launcher.setConst(maxCapacity);
  438. launcher.setConst( numObjects);
  439. launcher.launch1D( numObjects);
  440. clFinish(m_queue);
  441. }
  442. {
  443. B3_PROFILE("copy to host");
  444. int addedCountGPU = m_addedCountGPU.at(0);
  445. m_addedHostPairsGPU.resize(addedCountGPU);
  446. m_addedHostPairsGPU.copyToHost(addedHostPairs);
  447. //printf("addedCountGPU=%d\n",addedCountGPU);
  448. int removedCountGPU = m_removedCountGPU.at(0);
  449. m_removedHostPairsGPU.resize(removedCountGPU);
  450. m_removedHostPairsGPU.copyToHost(removedHostPairs);
  451. //printf("removedCountGPU=%d\n",removedCountGPU);
  452. }
  453. }
  454. else
  455. */
  456. {
  457. int numObjects = m_objectMinMaxIndexCPU[0][m_currentBuffer].size();
  458. B3_PROFILE("actual search");
  459. for (int i = 0; i < numObjects; i++)
  460. {
  461. //int numObjects = m_objectMinMaxIndexCPU[axis][m_currentBuffer].size();
  462. //int checkObjects[]={40,53};
  463. //int numCheckObjects = sizeof(checkObjects)/sizeof(int);
  464. //for (int a=0;a<numCheckObjects ;a++)
  465. for (int axis = 0; axis < 3; axis++)
  466. {
  467. //int i = checkObjects[a];
  468. unsigned int curMinIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].x;
  469. unsigned int curMaxIndex = m_objectMinMaxIndexCPU[axis][m_currentBuffer][i].y;
  470. unsigned int prevMinIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].x;
  471. int dmin = curMinIndex - prevMinIndex;
  472. unsigned int prevMaxIndex = m_objectMinMaxIndexCPU[axis][1 - m_currentBuffer][i].y;
  473. int dmax = curMaxIndex - prevMaxIndex;
  474. if (dmin != 0)
  475. {
  476. //printf("for object %d, dmin=%d\n",i,dmin);
  477. }
  478. if (dmax != 0)
  479. {
  480. //printf("for object %d, dmax=%d\n",i,dmax);
  481. }
  482. for (int otherbuffer = 0; otherbuffer < 2; otherbuffer++)
  483. {
  484. if (dmin != 0)
  485. {
  486. int stepMin = dmin < 0 ? -1 : 1;
  487. for (int j = prevMinIndex; j != curMinIndex; j += stepMin)
  488. {
  489. int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y;
  490. int otherIndex = otherIndex2 / 2;
  491. if (otherIndex != i)
  492. {
  493. bool otherIsMax = ((otherIndex2 & 1) != 0);
  494. if (otherIsMax)
  495. {
  496. //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max);
  497. //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max);
  498. bool overlap = true;
  499. for (int ax = 0; ax < 3; ax++)
  500. {
  501. if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) ||
  502. (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x))
  503. overlap = false;
  504. }
  505. // b3Assert(overlap2==overlap);
  506. bool prevOverlap = true;
  507. for (int ax = 0; ax < 3; ax++)
  508. {
  509. if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) ||
  510. (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x))
  511. prevOverlap = false;
  512. }
  513. //b3Assert(overlap==overlap2);
  514. if (dmin < 0)
  515. {
  516. if (overlap && !prevOverlap)
  517. {
  518. //add a pair
  519. b3Int4 newPair;
  520. if (i <= otherIndex)
  521. {
  522. newPair.x = i;
  523. newPair.y = otherIndex;
  524. }
  525. else
  526. {
  527. newPair.x = otherIndex;
  528. newPair.y = i;
  529. }
  530. addedHostPairs.push_back(newPair);
  531. }
  532. }
  533. else
  534. {
  535. if (!overlap && prevOverlap)
  536. {
  537. //remove a pair
  538. b3Int4 removedPair;
  539. if (i <= otherIndex)
  540. {
  541. removedPair.x = i;
  542. removedPair.y = otherIndex;
  543. }
  544. else
  545. {
  546. removedPair.x = otherIndex;
  547. removedPair.y = i;
  548. }
  549. removedHostPairs.push_back(removedPair);
  550. }
  551. } //otherisMax
  552. } //if (dmin<0)
  553. } //if (otherIndex!=i)
  554. } //for (int j=
  555. }
  556. if (dmax != 0)
  557. {
  558. int stepMax = dmax < 0 ? -1 : 1;
  559. for (int j = prevMaxIndex; j != curMaxIndex; j += stepMax)
  560. {
  561. int otherIndex2 = m_sortedAxisCPU[axis][otherbuffer][j].y;
  562. int otherIndex = otherIndex2 / 2;
  563. if (otherIndex != i)
  564. {
  565. //bool otherIsMin = ((otherIndex2&1)==0);
  566. //if (otherIsMin)
  567. {
  568. //bool overlap = TestAabbAgainstAabb2((const b3Vector3&)m_allAabbsCPU[i].m_min, (const b3Vector3&)m_allAabbsCPU[i].m_max,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_min,(const b3Vector3&)m_allAabbsCPU[otherIndex].m_max);
  569. //bool prevOverlap = TestAabbAgainstAabb2((const b3Vector3&)preAabbs[i].m_min, (const b3Vector3&)preAabbs[i].m_max,(const b3Vector3&)preAabbs[otherIndex].m_min,(const b3Vector3&)preAabbs[otherIndex].m_max);
  570. bool overlap = true;
  571. for (int ax = 0; ax < 3; ax++)
  572. {
  573. if ((m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].y) ||
  574. (m_objectMinMaxIndexCPU[ax][m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][m_currentBuffer][otherIndex].x))
  575. overlap = false;
  576. }
  577. //b3Assert(overlap2==overlap);
  578. bool prevOverlap = true;
  579. for (int ax = 0; ax < 3; ax++)
  580. {
  581. if ((m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].x > m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].y) ||
  582. (m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][i].y < m_objectMinMaxIndexCPU[ax][1 - m_currentBuffer][otherIndex].x))
  583. prevOverlap = false;
  584. }
  585. if (dmax > 0)
  586. {
  587. if (overlap && !prevOverlap)
  588. {
  589. //add a pair
  590. b3Int4 newPair;
  591. if (i <= otherIndex)
  592. {
  593. newPair.x = i;
  594. newPair.y = otherIndex;
  595. }
  596. else
  597. {
  598. newPair.x = otherIndex;
  599. newPair.y = i;
  600. }
  601. addedHostPairs.push_back(newPair);
  602. }
  603. }
  604. else
  605. {
  606. if (!overlap && prevOverlap)
  607. {
  608. //if (otherIndex2&1==0) -> min?
  609. //remove a pair
  610. b3Int4 removedPair;
  611. if (i <= otherIndex)
  612. {
  613. removedPair.x = i;
  614. removedPair.y = otherIndex;
  615. }
  616. else
  617. {
  618. removedPair.x = otherIndex;
  619. removedPair.y = i;
  620. }
  621. removedHostPairs.push_back(removedPair);
  622. }
  623. }
  624. } //if (dmin<0)
  625. } //if (otherIndex!=i)
  626. } //for (int j=
  627. }
  628. } //for (int otherbuffer
  629. } //for (int axis=0;
  630. } //for (int i=0;i<numObjects
  631. }
  632. //remove duplicates and add/remove then to existing m_overlappingPairs
  633. {
  634. {
  635. B3_PROFILE("sort allPairs");
  636. allPairs.quickSort(b3PairCmp);
  637. }
  638. {
  639. B3_PROFILE("sort addedHostPairs");
  640. addedHostPairs.quickSort(b3PairCmp);
  641. }
  642. {
  643. B3_PROFILE("sort removedHostPairs");
  644. removedHostPairs.quickSort(b3PairCmp);
  645. }
  646. }
  647. b3Int4 prevPair;
  648. prevPair.x = -1;
  649. prevPair.y = -1;
  650. int uniqueRemovedPairs = 0;
  651. b3AlignedObjectArray<int> removedPositions;
  652. {
  653. B3_PROFILE("actual removing");
  654. for (int i = 0; i < removedHostPairs.size(); i++)
  655. {
  656. b3Int4 removedPair = removedHostPairs[i];
  657. if ((removedPair.x != prevPair.x) || (removedPair.y != prevPair.y))
  658. {
  659. int index1 = allPairs.findBinarySearch(removedPair);
  660. //#ifdef _DEBUG
  661. int index2 = allPairs.findLinearSearch(removedPair);
  662. b3Assert(index1 == index2);
  663. //b3Assert(index1!=allPairs.size());
  664. if (index1 < allPairs.size())
  665. //#endif//_DEBUG
  666. {
  667. uniqueRemovedPairs++;
  668. removedPositions.push_back(index1);
  669. {
  670. //printf("framepje(%d) remove pair(%d):%d,%d\n",framepje,i,removedPair.x,removedPair.y);
  671. }
  672. }
  673. }
  674. prevPair = removedPair;
  675. }
  676. if (uniqueRemovedPairs)
  677. {
  678. for (int i = 0; i < removedPositions.size(); i++)
  679. {
  680. allPairs[removedPositions[i]].x = INT_MAX;
  681. allPairs[removedPositions[i]].y = INT_MAX;
  682. }
  683. allPairs.quickSort(b3PairCmp);
  684. allPairs.resize(allPairs.size() - uniqueRemovedPairs);
  685. }
  686. }
  687. //if (uniqueRemovedPairs)
  688. // printf("uniqueRemovedPairs=%d\n",uniqueRemovedPairs);
  689. //printf("removedHostPairs.size = %d\n",removedHostPairs.size());
  690. prevPair.x = -1;
  691. prevPair.y = -1;
  692. int uniqueAddedPairs = 0;
  693. b3AlignedObjectArray<b3Int4> actualAddedPairs;
  694. {
  695. B3_PROFILE("actual adding");
  696. for (int i = 0; i < addedHostPairs.size(); i++)
  697. {
  698. b3Int4 newPair = addedHostPairs[i];
  699. if ((newPair.x != prevPair.x) || (newPair.y != prevPair.y))
  700. {
  701. //#ifdef _DEBUG
  702. int index1 = allPairs.findBinarySearch(newPair);
  703. int index2 = allPairs.findLinearSearch(newPair);
  704. b3Assert(index1 == index2);
  705. b3Assert(index1 == allPairs.size());
  706. if (index1 != allPairs.size())
  707. {
  708. printf("??\n");
  709. }
  710. if (index1 == allPairs.size())
  711. //#endif //_DEBUG
  712. {
  713. uniqueAddedPairs++;
  714. actualAddedPairs.push_back(newPair);
  715. }
  716. }
  717. prevPair = newPair;
  718. }
  719. for (int i = 0; i < actualAddedPairs.size(); i++)
  720. {
  721. //printf("framepje (%d), new pair(%d):%d,%d\n",framepje,i,actualAddedPairs[i].x,actualAddedPairs[i].y);
  722. allPairs.push_back(actualAddedPairs[i]);
  723. }
  724. }
  725. //if (uniqueAddedPairs)
  726. // printf("uniqueAddedPairs=%d\n", uniqueAddedPairs);
  727. {
  728. B3_PROFILE("m_overlappingPairs.copyFromHost");
  729. m_overlappingPairs.copyFromHost(allPairs);
  730. }
  731. }
  732. void b3GpuSapBroadphase::calculateOverlappingPairsHost(int maxPairs)
  733. {
  734. //test
  735. // if (m_currentBuffer>=0)
  736. // return calculateOverlappingPairsHostIncremental3Sap();
  737. b3Assert(m_allAabbsCPU.size() == m_allAabbsGPU.size());
  738. m_allAabbsGPU.copyToHost(m_allAabbsCPU);
  739. int axis = 0;
  740. {
  741. B3_PROFILE("CPU compute best variance axis");
  742. b3Vector3 s = b3MakeVector3(0, 0, 0), s2 = b3MakeVector3(0, 0, 0);
  743. int numRigidBodies = m_smallAabbsMappingCPU.size();
  744. for (int i = 0; i < numRigidBodies; i++)
  745. {
  746. b3SapAabb aabb = this->m_allAabbsCPU[m_smallAabbsMappingCPU[i]];
  747. b3Vector3 maxAabb = b3MakeVector3(aabb.m_max[0], aabb.m_max[1], aabb.m_max[2]);
  748. b3Vector3 minAabb = b3MakeVector3(aabb.m_min[0], aabb.m_min[1], aabb.m_min[2]);
  749. b3Vector3 centerAabb = (maxAabb + minAabb) * 0.5f;
  750. s += centerAabb;
  751. s2 += centerAabb * centerAabb;
  752. }
  753. b3Vector3 v = s2 - (s * s) / (float)numRigidBodies;
  754. if (v[1] > v[0])
  755. axis = 1;
  756. if (v[2] > v[axis])
  757. axis = 2;
  758. }
  759. b3AlignedObjectArray<b3Int4> hostPairs;
  760. {
  761. int numSmallAabbs = m_smallAabbsMappingCPU.size();
  762. for (int i = 0; i < numSmallAabbs; i++)
  763. {
  764. b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]];
  765. //float reference = smallAabbi.m_max[axis];
  766. for (int j = i + 1; j < numSmallAabbs; j++)
  767. {
  768. b3SapAabb smallAabbj = m_allAabbsCPU[m_smallAabbsMappingCPU[j]];
  769. if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max,
  770. (b3Vector3&)smallAabbj.m_min, (b3Vector3&)smallAabbj.m_max))
  771. {
  772. b3Int4 pair;
  773. int a = smallAabbi.m_minIndices[3];
  774. int b = smallAabbj.m_minIndices[3];
  775. if (a <= b)
  776. {
  777. pair.x = a; //store the original index in the unsorted aabb array
  778. pair.y = b;
  779. }
  780. else
  781. {
  782. pair.x = b; //store the original index in the unsorted aabb array
  783. pair.y = a;
  784. }
  785. hostPairs.push_back(pair);
  786. }
  787. }
  788. }
  789. }
  790. {
  791. int numSmallAabbs = m_smallAabbsMappingCPU.size();
  792. for (int i = 0; i < numSmallAabbs; i++)
  793. {
  794. b3SapAabb smallAabbi = m_allAabbsCPU[m_smallAabbsMappingCPU[i]];
  795. //float reference = smallAabbi.m_max[axis];
  796. int numLargeAabbs = m_largeAabbsMappingCPU.size();
  797. for (int j = 0; j < numLargeAabbs; j++)
  798. {
  799. b3SapAabb largeAabbj = m_allAabbsCPU[m_largeAabbsMappingCPU[j]];
  800. if (TestAabbAgainstAabb2((b3Vector3&)smallAabbi.m_min, (b3Vector3&)smallAabbi.m_max,
  801. (b3Vector3&)largeAabbj.m_min, (b3Vector3&)largeAabbj.m_max))
  802. {
  803. b3Int4 pair;
  804. int a = largeAabbj.m_minIndices[3];
  805. int b = smallAabbi.m_minIndices[3];
  806. if (a <= b)
  807. {
  808. pair.x = a;
  809. pair.y = b; //store the original index in the unsorted aabb array
  810. }
  811. else
  812. {
  813. pair.x = b;
  814. pair.y = a; //store the original index in the unsorted aabb array
  815. }
  816. hostPairs.push_back(pair);
  817. }
  818. }
  819. }
  820. }
  821. if (hostPairs.size() > maxPairs)
  822. {
  823. hostPairs.resize(maxPairs);
  824. }
  825. if (hostPairs.size())
  826. {
  827. m_overlappingPairs.copyFromHost(hostPairs);
  828. }
  829. else
  830. {
  831. m_overlappingPairs.resize(0);
  832. }
  833. //init3dSap();
  834. }
  835. void b3GpuSapBroadphase::reset()
  836. {
  837. m_allAabbsGPU.resize(0);
  838. m_allAabbsCPU.resize(0);
  839. m_smallAabbsMappingGPU.resize(0);
  840. m_smallAabbsMappingCPU.resize(0);
  841. m_pairCount.resize(0);
  842. m_largeAabbsMappingGPU.resize(0);
  843. m_largeAabbsMappingCPU.resize(0);
  844. }
  // Computes the overlapping AABB pairs with the OpenCL kernels and leaves the
  // result in m_overlappingPairs, resized to the number of pairs found.
  //
  // maxPairs: capacity used for m_overlappingPairs while the kernels run; a pair
  //           count above it is reported with b3Error and clamped to maxPairs.
  //
  // Steps, in order: optionally pick the sort axis from a variance estimate,
  // build and radix-sort the sort data for the small AABBs, scatter the AABBs
  // into sorted order, then run m_sap2Kernel (large vs. small) and m_sapKernel
  // (sorted small AABBs). Falls back to calculateOverlappingPairsHost() when
  // m_sapKernel is 0.
  845. void b3GpuSapBroadphase::calculateOverlappingPairs(int maxPairs)
  846. {
  // No SAP kernel: delegate to the host implementation.
  847. if (m_sapKernel == 0)
  848. {
  849. calculateOverlappingPairsHost(maxPairs);
  850. return;
  851. }
  852. //if (m_currentBuffer>=0)
  853. // return calculateOverlappingPairsHostIncremental3Sap();
  854. //calculateOverlappingPairsHost(maxPairs);
  855. B3_PROFILE("GPU 1-axis SAP calculateOverlappingPairs");
  // Sort axis; stays 0 unless the variance estimate below selects 1 or 2.
  856. int axis = 0;
  857. {
  858. //bool syncOnHost = false;
  859. int numSmallAabbs = m_smallAabbsMappingCPU.size();
  // Axis selection runs only when a float4 prefix scan is available and there
  // is at least one small AABB (which also keeps the division below non-zero).
  860. if (m_prefixScanFloat4 && numSmallAabbs)
  861. {
  862. B3_PROFILE("GPU compute best variance axis");
  // NOTE(review): the check compares against numSmallAabbs + 1 while the
  // resize uses numSmallAabbs + 128, so this branch looks like it is taken on
  // every call -- confirm whether that is intended.
  863. if (m_dst.size() != (numSmallAabbs + 1))
  864. {
  865. m_dst.resize(numSmallAabbs + 128);
  866. m_sum.resize(numSmallAabbs + 128);
  867. m_sum2.resize(numSmallAabbs + 128);
  868. m_sum.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow?
  869. m_sum2.at(numSmallAabbs) = b3MakeVector3(0, 0, 0); //slow?
  870. }
  // Kernel arguments: all AABBs, the small-AABB mapping, the two output
  // buffers m_sum / m_sum2 and the element count; one work item per small AABB.
  871. b3LauncherCL launcher(m_queue, m_prepareSumVarianceKernel, "m_prepareSumVarianceKernel");
  872. launcher.setBuffer(m_allAabbsGPU.getBufferCL());
  873. launcher.setBuffer(m_smallAabbsMappingGPU.getBufferCL());
  874. launcher.setBuffer(m_sum.getBufferCL());
  875. launcher.setBuffer(m_sum2.getBufferCL());
  876. launcher.setConst(numSmallAabbs);
  877. int num = numSmallAabbs;
  878. launcher.launch1D(num);
  // s and s2 receive the totals of the scans over m_sum and m_sum2
  // (presumably the per-axis sum and sum of squares -- confirm in the kernel).
  879. b3Vector3 s;
  880. b3Vector3 s2;
  881. m_prefixScanFloat4->execute(m_sum, m_dst, numSmallAabbs + 1, &s);
  882. m_prefixScanFloat4->execute(m_sum2, m_dst, numSmallAabbs + 1, &s2);
  // Per-axis variance-like measure; the axis with the largest component wins.
  883. b3Vector3 v = s2 - (s * s) / (float)numSmallAabbs;
  884. if (v[1] > v[0])
  885. axis = 1;
  886. if (v[2] > v[axis])
  887. axis = 2;
  888. }
  889. m_gpuSmallSortData.resize(numSmallAabbs);
  // '#if 1' selects the live GPU path; the '#else' branch further down is a
  // disabled path that replays kernel arguments from a file.
  890. #if 1
  // Fill m_gpuSmallSortData from the small AABBs for the chosen axis.
  // The 'true' passed to b3BufferInfoCL presumably marks the buffer as
  // read-only -- confirm against b3BufferInfoCL.
  891. if (m_smallAabbsMappingGPU.size())
  892. {
  893. B3_PROFILE("flipFloatKernel");
  894. b3BufferInfoCL bInfo[] = {
  895. b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true),
  896. b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true),
  897. b3BufferInfoCL(m_gpuSmallSortData.getBufferCL())};
  898. b3LauncherCL launcher(m_queue, m_flipFloatKernel, "m_flipFloatKernel");
  899. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  900. launcher.setConst(numSmallAabbs);
  901. launcher.setConst(axis);
  902. int num = numSmallAabbs;
  903. launcher.launch1D(num);
  904. clFinish(m_queue);
  905. }
  // Sort the sort data with m_sorter.
  906. if (m_gpuSmallSortData.size())
  907. {
  908. B3_PROFILE("gpu radix sort");
  909. m_sorter->execute(m_gpuSmallSortData);
  910. clFinish(m_queue);
  911. }
  // Write the small AABBs into m_gpuSmallSortedAabbs using the sorted data.
  912. m_gpuSmallSortedAabbs.resize(numSmallAabbs);
  913. if (numSmallAabbs)
  914. {
  915. B3_PROFILE("scatterKernel");
  916. b3BufferInfoCL bInfo[] = {
  917. b3BufferInfoCL(m_allAabbsGPU.getBufferCL(), true),
  918. b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL(), true),
  919. b3BufferInfoCL(m_gpuSmallSortData.getBufferCL(), true),
  920. b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL())};
  921. b3LauncherCL launcher(m_queue, m_scatterKernel, "m_scatterKernel ");
  922. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  923. launcher.setConst(numSmallAabbs);
  924. int num = numSmallAabbs;
  925. launcher.launch1D(num);
  926. clFinish(m_queue);
  927. }
  // Size the output to maxPairs and reset m_pairCount to a single zero entry
  // before the pair kernels run.
  928. m_overlappingPairs.resize(maxPairs);
  929. m_pairCount.resize(0);
  930. m_pairCount.push_back(0);
  931. int numPairs = 0;
  932. {
  // Large-vs-small pass: only runs when both sets are non-empty.
  933. int numLargeAabbs = m_largeAabbsMappingGPU.size();
  934. if (numLargeAabbs && numSmallAabbs)
  935. {
  936. //@todo
  937. B3_PROFILE("sap2Kernel");
  938. b3BufferInfoCL bInfo[] = {
  939. b3BufferInfoCL(m_allAabbsGPU.getBufferCL()),
  940. b3BufferInfoCL(m_largeAabbsMappingGPU.getBufferCL()),
  941. b3BufferInfoCL(m_smallAabbsMappingGPU.getBufferCL()),
  942. b3BufferInfoCL(m_overlappingPairs.getBufferCL()),
  943. b3BufferInfoCL(m_pairCount.getBufferCL())};
  944. b3LauncherCL launcher(m_queue, m_sap2Kernel, "m_sap2Kernel");
  945. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  946. launcher.setConst(numLargeAabbs);
  947. launcher.setConst(numSmallAabbs);
  948. launcher.setConst(axis);
  949. launcher.setConst(maxPairs);
  950. //@todo: use actual maximum work item sizes of the device instead of hardcoded values
  // The trailing 4 and 64 are the hardcoded values the @todo above refers to.
  951. launcher.launch2D(numLargeAabbs, numSmallAabbs, 4, 64);
  // Take the pair count from m_pairCount[0] and clamp it to maxPairs.
  952. numPairs = m_pairCount.at(0);
  953. if (numPairs > maxPairs)
  954. {
  955. b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
  956. numPairs = maxPairs;
  957. }
  958. }
  959. }
  // Pass over the sorted small AABBs; it shares m_overlappingPairs and
  // m_pairCount with the large-vs-small pass above.
  960. if (m_gpuSmallSortedAabbs.size())
  961. {
  962. B3_PROFILE("sapKernel");
  963. b3BufferInfoCL bInfo[] = {b3BufferInfoCL(m_gpuSmallSortedAabbs.getBufferCL()), b3BufferInfoCL(m_overlappingPairs.getBufferCL()), b3BufferInfoCL(m_pairCount.getBufferCL())};
  964. b3LauncherCL launcher(m_queue, m_sapKernel, "m_sapKernel");
  965. launcher.setBuffers(bInfo, sizeof(bInfo) / sizeof(b3BufferInfoCL));
  966. launcher.setConst(numSmallAabbs);
  967. launcher.setConst(axis);
  968. launcher.setConst(maxPairs);
  969. int num = numSmallAabbs;
  // Disabled debug code: serializes the launcher arguments followed by 'num'
  // into m_sapKernelArgs.bin, the file the '#else' branch below reads.
  970. #if 0
  971. int buffSize = launcher.getSerializationBufferSize();
  972. unsigned char* buf = new unsigned char[buffSize+sizeof(int)];
  973. for (int i=0;i<buffSize+1;i++)
  974. {
  975. unsigned char* ptr = (unsigned char*)&buf[i];
  976. *ptr = 0xff;
  977. }
  978. int actualWrite = launcher.serializeArguments(buf,buffSize);
  979. unsigned char* cptr = (unsigned char*)&buf[buffSize];
  980. // printf("buf[buffSize] = %d\n",*cptr);
  981. assert(buf[buffSize]==0xff);//check for buffer overrun
  982. int* ptr = (int*)&buf[buffSize];
  983. *ptr = num;
  984. FILE* f = fopen("m_sapKernelArgs.bin","wb");
  985. fwrite(buf,buffSize+sizeof(int),1,f);
  986. fclose(f);
  987. #endif //
  988. launcher.launch1D(num);
  989. clFinish(m_queue);
  // Re-read the count. On overflow, clamp numPairs and also store maxPairs
  // back into m_pairCount.
  990. numPairs = m_pairCount.at(0);
  991. if (numPairs > maxPairs)
  992. {
  993. b3Error("Error running out of pairs: numPairs = %d, maxPairs = %d.\n", numPairs, maxPairs);
  994. numPairs = maxPairs;
  995. m_pairCount.resize(0);
  996. m_pairCount.push_back(maxPairs);
  997. }
  998. }
  // Disabled alternative (not compiled because of '#if 1' above): loads the
  // serialized m_sapKernel arguments from m_sapKernelArgs.bin, launches the
  // kernel with them and copies the resulting pairs into m_overlappingPairs.
  999. #else
  1000. int numPairs = 0;
  1001. b3LauncherCL launcher(m_queue, m_sapKernel);
  1002. const char* fileName = "m_sapKernelArgs.bin";
  1003. FILE* f = fopen(fileName, "rb");
  1004. if (f)
  1005. {
  1006. int sizeInBytes = 0;
  1007. if (fseek(f, 0, SEEK_END) || (sizeInBytes = ftell(f)) == EOF || fseek(f, 0, SEEK_SET))
  1008. {
  1009. printf("error, cannot get file size\n");
  1010. exit(0);
  1011. }
  1012. unsigned char* buf = (unsigned char*)malloc(sizeInBytes);
  1013. fread(buf, sizeInBytes, 1, f);
  1014. int serializedBytes = launcher.deserializeArgs(buf, sizeInBytes, m_context);
  // The work-item count is the int stored right after the serialized arguments.
  1015. int num = *(int*)&buf[serializedBytes];
  1016. launcher.launch1D(num);
  1017. b3OpenCLArray<int> pairCount(m_context, m_queue);
  1018. int numElements = launcher.m_arrays[2]->size() / sizeof(int);
  1019. pairCount.setFromOpenCLBuffer(launcher.m_arrays[2]->getBufferCL(), numElements);
  1020. numPairs = pairCount.at(0);
  1021. //printf("overlapping pairs = %d\n",numPairs);
  1022. b3AlignedObjectArray<b3Int4> hostOoverlappingPairs;
  1023. b3OpenCLArray<b3Int4> tmpGpuPairs(m_context, m_queue);
  1024. tmpGpuPairs.setFromOpenCLBuffer(launcher.m_arrays[1]->getBufferCL(), numPairs);
  1025. tmpGpuPairs.copyToHost(hostOoverlappingPairs);
  1026. m_overlappingPairs.copyFromHost(hostOoverlappingPairs);
  1027. //printf("hello %d\n", m_overlappingPairs.size());
  1028. free(buf);
  1029. fclose(f);
  1030. }
  1031. else
  1032. {
  1033. printf("error: cannot find file %s\n", fileName);
  1034. }
  1035. clFinish(m_queue);
  1036. #endif
  // Shrink the pair array from maxPairs to the (clamped) number of pairs.
  1037. m_overlappingPairs.resize(numPairs);
  1038. } //B3_PROFILE("GPU_RADIX SORT");
  1039. //init3dSap();
  1040. }
  1041. void b3GpuSapBroadphase::writeAabbsToGpu()
  1042. {
  1043. m_smallAabbsMappingGPU.copyFromHost(m_smallAabbsMappingCPU);
  1044. m_largeAabbsMappingGPU.copyFromHost(m_largeAabbsMappingCPU);
  1045. m_allAabbsGPU.copyFromHost(m_allAabbsCPU); //might not be necessary, the 'setupGpuAabbsFull' already takes care of this
  1046. }
  1047. void b3GpuSapBroadphase::createLargeProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
  1048. {
  1049. int index = userPtr;
  1050. b3SapAabb aabb;
  1051. for (int i = 0; i < 4; i++)
  1052. {
  1053. aabb.m_min[i] = aabbMin[i];
  1054. aabb.m_max[i] = aabbMax[i];
  1055. }
  1056. aabb.m_minIndices[3] = index;
  1057. aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
  1058. m_largeAabbsMappingCPU.push_back(m_allAabbsCPU.size());
  1059. m_allAabbsCPU.push_back(aabb);
  1060. }
  1061. void b3GpuSapBroadphase::createProxy(const b3Vector3& aabbMin, const b3Vector3& aabbMax, int userPtr, int collisionFilterGroup, int collisionFilterMask)
  1062. {
  1063. int index = userPtr;
  1064. b3SapAabb aabb;
  1065. for (int i = 0; i < 4; i++)
  1066. {
  1067. aabb.m_min[i] = aabbMin[i];
  1068. aabb.m_max[i] = aabbMax[i];
  1069. }
  1070. aabb.m_minIndices[3] = index;
  1071. aabb.m_signedMaxIndices[3] = m_allAabbsCPU.size();
  1072. m_smallAabbsMappingCPU.push_back(m_allAabbsCPU.size());
  1073. m_allAabbsCPU.push_back(aabb);
  1074. }
  1075. cl_mem b3GpuSapBroadphase::getAabbBufferWS()
  1076. {
  1077. return m_allAabbsGPU.getBufferCL();
  1078. }
  1079. int b3GpuSapBroadphase::getNumOverlap()
  1080. {
  1081. return m_overlappingPairs.size();
  1082. }
  1083. cl_mem b3GpuSapBroadphase::getOverlappingPairBuffer()
  1084. {
  1085. return m_overlappingPairs.getBufferCL();
  1086. }
  1087. b3OpenCLArray<b3Int4>& b3GpuSapBroadphase::getOverlappingPairsGPU()
  1088. {
  1089. return m_overlappingPairs;
  1090. }
  1091. b3OpenCLArray<int>& b3GpuSapBroadphase::getSmallAabbIndicesGPU()
  1092. {
  1093. return m_smallAabbsMappingGPU;
  1094. }
  1095. b3OpenCLArray<int>& b3GpuSapBroadphase::getLargeAabbIndicesGPU()
  1096. {
  1097. return m_largeAabbsMappingGPU;
  1098. }