basisu_opencl.cpp 39 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283848586878889909192939495969798991001011021031041051061071081091101111121131141151161171181191201211221231241251261271281291301311321331341351361371381391401411421431441451461471481491501511521531541551561571581591601611621631641651661671681691701711721731741751761771781791801811821831841851861871881891901911921931941951961971981992002012022032042052062072082092102112122132142152162172182192202212222232242252262272282292302312322332342352362372382392402412422432442452462472482492502512522532542552562572582592602612622632642652662672682692702712722732742752762772782792802812822832842852862872882892902912922932942952962972982993003013023033043053063073083093103113123133143153163173183193203213223233243253263273283293303313323333343353363373383393403413423433443453463473483493503513523533543553563573583593603613623633643653663673683693703713723733743753763773783793803813823833843853863873883893903913923933943953963973983994004014024034044054064074084094104114124134144154164174184194204214224234244254264274284294304314324334344354364374384394404414424434444454464474484494504514524534544554564574584594604614624634644654664674684694704714724734744754764774784794804814824834844854864874884894904914924934944954964974984995005015025035045055065075085095105115125135145155165175185195205215225235245255265275285295305315325335345355365375385395405415425435445455465475485495505515525535545555565575585595605615625635645655665675685695705715725735745755765775785795805815825835845855865875885895905915925935945955965975985996006016026036046056066076086096106116126136146156166176186196206216226236246256266276286296306316326336346356366376386396406416426436446456466476486496506516526536546556566576586596606616626636646656666676686696706716726736746756766776786796806816826836846856866876886896906916926936946956966976986997007017027037047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342
  1. // basisu_opencl.cpp
  2. // Copyright (C) 2019-2024 Binomial LLC. All Rights Reserved.
  3. //
  4. // Licensed under the Apache License, Version 2.0 (the "License");
  5. // you may not use this file except in compliance with the License.
  6. // You may obtain a copy of the License at
  7. //
  8. // http://www.apache.org/licenses/LICENSE-2.0
  9. //
  10. // Unless required by applicable law or agreed to in writing, software
  11. // distributed under the License is distributed on an "AS IS" BASIS,
  12. // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. // See the License for the specific language governing permissions and
  14. // limitations under the License.
  15. #include "basisu_opencl.h"
  16. // If 1, the kernel source code will come from encoders/ocl_kernels.h. Otherwise, it will be read from the "ocl_kernels.cl" file in the current directory (for development).
  17. #define BASISU_USE_OCL_KERNELS_HEADER (1)
  18. #define BASISU_OCL_KERNELS_FILENAME "ocl_kernels.cl"
  19. #if BASISU_SUPPORT_OPENCL
  20. #include "basisu_enc.h"
  21. // We only use OpenCL v1.2 or less.
  22. #define CL_TARGET_OPENCL_VERSION 120
  23. #ifdef __APPLE__
  24. #include <OpenCL/opencl.h>
  25. #else
  26. #include <CL/cl.h>
  27. #endif
  28. #ifndef BASISU_OPENCL_ASSERT_ON_ANY_ERRORS
  29. #define BASISU_OPENCL_ASSERT_ON_ANY_ERRORS (0)
  30. #endif
  31. namespace basisu
  32. {
  33. #if BASISU_USE_OCL_KERNELS_HEADER
  34. #include "basisu_ocl_kernels.h"
  35. #endif
  36. static void ocl_error_printf(const char* pFmt, ...)
  37. {
  38. va_list args;
  39. va_start(args, pFmt);
  40. error_vprintf(pFmt, args);
  41. va_end(args);
  42. #if BASISU_OPENCL_ASSERT_ON_ANY_ERRORS
  43. assert(0);
  44. #endif
  45. }
  46. class ocl
  47. {
  48. public:
  49. ocl()
  50. {
  51. memset(&m_dev_fp_config, 0, sizeof(m_dev_fp_config));
  52. m_ocl_mutex.lock();
  53. m_ocl_mutex.unlock();
  54. }
  55. ~ocl()
  56. {
  57. }
  58. bool is_initialized() const { return m_device_id != nullptr; }
  59. cl_device_id get_device_id() const { return m_device_id; }
  60. cl_context get_context() const { return m_context; }
  61. cl_command_queue get_command_queue() { return m_command_queue; }
  62. cl_program get_program() const { return m_program; }
  63. bool init(bool force_serialization)
  64. {
  65. deinit();
  66. interval_timer tm;
  67. tm.start();
  68. cl_uint num_platforms = 0;
  69. cl_int ret = clGetPlatformIDs(0, NULL, &num_platforms);
  70. if (ret != CL_SUCCESS)
  71. {
  72. ocl_error_printf("ocl::init: clGetPlatformIDs() failed with %i\n", ret);
  73. return false;
  74. }
  75. if ((!num_platforms) || (num_platforms > INT_MAX))
  76. {
  77. ocl_error_printf("ocl::init: clGetPlatformIDs() returned an invalid number of num_platforms\n");
  78. return false;
  79. }
  80. std::vector<cl_platform_id> platforms(num_platforms);
  81. ret = clGetPlatformIDs(num_platforms, platforms.data(), NULL);
  82. if (ret != CL_SUCCESS)
  83. {
  84. ocl_error_printf("ocl::init: clGetPlatformIDs() failed\n");
  85. return false;
  86. }
  87. cl_uint num_devices = 0;
  88. ret = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 1, &m_device_id, &num_devices);
  89. if (ret == CL_DEVICE_NOT_FOUND)
  90. {
  91. ocl_error_printf("ocl::init: Couldn't get any GPU device ID's, trying CL_DEVICE_TYPE_CPU\n");
  92. ret = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_CPU, 1, &m_device_id, &num_devices);
  93. }
  94. if (ret != CL_SUCCESS)
  95. {
  96. ocl_error_printf("ocl::init: Unable to get any device ID's\n");
  97. m_device_id = nullptr;
  98. return false;
  99. }
  100. ret = clGetDeviceInfo(m_device_id,
  101. CL_DEVICE_SINGLE_FP_CONFIG,
  102. sizeof(m_dev_fp_config),
  103. &m_dev_fp_config,
  104. nullptr);
  105. if (ret != CL_SUCCESS)
  106. {
  107. ocl_error_printf("ocl::init: clGetDeviceInfo() failed\n");
  108. return false;
  109. }
  110. char plat_vers[256];
  111. size_t rv = 0;
  112. ret = clGetPlatformInfo(platforms[0], CL_PLATFORM_VERSION, sizeof(plat_vers), plat_vers, &rv);
  113. if (ret == CL_SUCCESS)
  114. printf("OpenCL platform version: \"%s\"\n", plat_vers);
  115. // Serialize CL calls with the AMD driver to avoid lockups when multiple command queues per thread are used. This sucks, but what can we do?
  116. m_use_mutex = (strstr(plat_vers, "AMD") != nullptr) || force_serialization;
  117. printf("Serializing OpenCL calls across threads: %u\n", (uint32_t)m_use_mutex);
  118. m_context = clCreateContext(nullptr, 1, &m_device_id, nullptr, nullptr, &ret);
  119. if (ret != CL_SUCCESS)
  120. {
  121. ocl_error_printf("ocl::init: clCreateContext() failed\n");
  122. m_device_id = nullptr;
  123. m_context = nullptr;
  124. return false;
  125. }
  126. m_command_queue = clCreateCommandQueue(m_context, m_device_id, 0, &ret);
  127. if (ret != CL_SUCCESS)
  128. {
  129. ocl_error_printf("ocl::init: clCreateCommandQueue() failed\n");
  130. deinit();
  131. return false;
  132. }
  133. printf("OpenCL init time: %3.3f secs\n", tm.get_elapsed_secs());
  134. return true;
  135. }
  136. bool deinit()
  137. {
  138. if (m_program)
  139. {
  140. clReleaseProgram(m_program);
  141. m_program = nullptr;
  142. }
  143. if (m_command_queue)
  144. {
  145. clReleaseCommandQueue(m_command_queue);
  146. m_command_queue = nullptr;
  147. }
  148. if (m_context)
  149. {
  150. clReleaseContext(m_context);
  151. m_context = nullptr;
  152. }
  153. m_device_id = nullptr;
  154. return true;
  155. }
  156. cl_command_queue create_command_queue()
  157. {
  158. cl_serializer serializer(this);
  159. cl_int ret = 0;
  160. cl_command_queue p = clCreateCommandQueue(m_context, m_device_id, 0, &ret);
  161. if (ret != CL_SUCCESS)
  162. return nullptr;
  163. return p;
  164. }
  165. void destroy_command_queue(cl_command_queue p)
  166. {
  167. if (p)
  168. {
  169. cl_serializer serializer(this);
  170. clReleaseCommandQueue(p);
  171. }
  172. }
  173. bool init_program(const char* pSrc, size_t src_size)
  174. {
  175. cl_int ret;
  176. if (m_program != nullptr)
  177. {
  178. clReleaseProgram(m_program);
  179. m_program = nullptr;
  180. }
  181. m_program = clCreateProgramWithSource(m_context, 1, (const char**)&pSrc, (const size_t*)&src_size, &ret);
  182. if (ret != CL_SUCCESS)
  183. {
  184. ocl_error_printf("ocl::init_program: clCreateProgramWithSource() failed!\n");
  185. return false;
  186. }
  187. std::string options;
  188. if (m_dev_fp_config & CL_FP_CORRECTLY_ROUNDED_DIVIDE_SQRT)
  189. {
  190. options += "-cl-fp32-correctly-rounded-divide-sqrt";
  191. }
  192. options += " -cl-std=CL1.2";
  193. //options += " -cl-opt-disable";
  194. //options += " -cl-mad-enable";
  195. //options += " -cl-fast-relaxed-math";
  196. ret = clBuildProgram(m_program, 1, &m_device_id,
  197. options.size() ? options.c_str() : nullptr, // options
  198. nullptr, // notify
  199. nullptr); // user_data
  200. if (ret != CL_SUCCESS)
  201. {
  202. const cl_int build_program_result = ret;
  203. size_t ret_val_size;
  204. ret = clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size);
  205. if (ret != CL_SUCCESS)
  206. {
  207. ocl_error_printf("ocl::init_program: clGetProgramBuildInfo() failed!\n");
  208. return false;
  209. }
  210. std::vector<char> build_log(ret_val_size + 1);
  211. ret = clGetProgramBuildInfo(m_program, m_device_id, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log.data(), NULL);
  212. ocl_error_printf("\nclBuildProgram() failed with error %i:\n%s", build_program_result, build_log.data());
  213. return false;
  214. }
  215. return true;
  216. }
  217. cl_kernel create_kernel(const char* pName)
  218. {
  219. if (!m_program)
  220. return nullptr;
  221. cl_serializer serializer(this);
  222. cl_int ret;
  223. cl_kernel kernel = clCreateKernel(m_program, pName, &ret);
  224. if (ret != CL_SUCCESS)
  225. {
  226. ocl_error_printf("ocl::create_kernel: clCreateKernel() failed!\n");
  227. return nullptr;
  228. }
  229. return kernel;
  230. }
  231. bool destroy_kernel(cl_kernel k)
  232. {
  233. if (k)
  234. {
  235. cl_serializer serializer(this);
  236. cl_int ret = clReleaseKernel(k);
  237. if (ret != CL_SUCCESS)
  238. {
  239. ocl_error_printf("ocl::destroy_kernel: clReleaseKernel() failed!\n");
  240. return false;
  241. }
  242. }
  243. return true;
  244. }
  245. cl_mem alloc_read_buffer(size_t size)
  246. {
  247. cl_serializer serializer(this);
  248. cl_int ret;
  249. cl_mem obj = clCreateBuffer(m_context, CL_MEM_READ_ONLY, size, NULL, &ret);
  250. if (ret != CL_SUCCESS)
  251. {
  252. ocl_error_printf("ocl::alloc_read_buffer: clCreateBuffer() failed!\n");
  253. return nullptr;
  254. }
  255. return obj;
  256. }
  257. cl_mem alloc_and_init_read_buffer(cl_command_queue command_queue, const void *pInit, size_t size)
  258. {
  259. cl_serializer serializer(this);
  260. cl_int ret;
  261. cl_mem obj = clCreateBuffer(m_context, CL_MEM_READ_ONLY, size, NULL, &ret);
  262. if (ret != CL_SUCCESS)
  263. {
  264. ocl_error_printf("ocl::alloc_and_init_read_buffer: clCreateBuffer() failed!\n");
  265. return nullptr;
  266. }
  267. #if 0
  268. if (!write_to_buffer(command_queue, obj, pInit, size))
  269. {
  270. destroy_buffer(obj);
  271. return nullptr;
  272. }
  273. #else
  274. ret = clEnqueueWriteBuffer(command_queue, obj, CL_TRUE, 0, size, pInit, 0, NULL, NULL);
  275. if (ret != CL_SUCCESS)
  276. {
  277. ocl_error_printf("ocl::alloc_and_init_read_buffer: clEnqueueWriteBuffer() failed!\n");
  278. return nullptr;
  279. }
  280. #endif
  281. return obj;
  282. }
  283. cl_mem alloc_write_buffer(size_t size)
  284. {
  285. cl_serializer serializer(this);
  286. cl_int ret;
  287. cl_mem obj = clCreateBuffer(m_context, CL_MEM_WRITE_ONLY, size, NULL, &ret);
  288. if (ret != CL_SUCCESS)
  289. {
  290. ocl_error_printf("ocl::alloc_write_buffer: clCreateBuffer() failed!\n");
  291. return nullptr;
  292. }
  293. return obj;
  294. }
  295. bool destroy_buffer(cl_mem buf)
  296. {
  297. if (buf)
  298. {
  299. cl_serializer serializer(this);
  300. cl_int ret = clReleaseMemObject(buf);
  301. if (ret != CL_SUCCESS)
  302. {
  303. ocl_error_printf("ocl::destroy_buffer: clReleaseMemObject() failed!\n");
  304. return false;
  305. }
  306. }
  307. return true;
  308. }
  309. bool write_to_buffer(cl_command_queue command_queue, cl_mem clmem, const void* d, const size_t m)
  310. {
  311. cl_serializer serializer(this);
  312. cl_int ret = clEnqueueWriteBuffer(command_queue, clmem, CL_TRUE, 0, m, d, 0, NULL, NULL);
  313. if (ret != CL_SUCCESS)
  314. {
  315. ocl_error_printf("ocl::write_to_buffer: clEnqueueWriteBuffer() failed!\n");
  316. return false;
  317. }
  318. return true;
  319. }
  320. bool read_from_buffer(cl_command_queue command_queue, const cl_mem clmem, void* d, size_t m)
  321. {
  322. cl_serializer serializer(this);
  323. cl_int ret = clEnqueueReadBuffer(command_queue, clmem, CL_TRUE, 0, m, d, 0, NULL, NULL);
  324. if (ret != CL_SUCCESS)
  325. {
  326. ocl_error_printf("ocl::read_from_buffer: clEnqueueReadBuffer() failed!\n");
  327. return false;
  328. }
  329. return true;
  330. }
  331. cl_mem create_read_image_u8(uint32_t width, uint32_t height, const void* pPixels, uint32_t bytes_per_pixel, bool normalized)
  332. {
  333. cl_image_format fmt = get_image_format(bytes_per_pixel, normalized);
  334. cl_image_desc desc;
  335. memset(&desc, 0, sizeof(desc));
  336. desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  337. desc.image_width = width;
  338. desc.image_height = height;
  339. desc.image_row_pitch = width * bytes_per_pixel;
  340. cl_serializer serializer(this);
  341. cl_int ret;
  342. cl_mem img = clCreateImage(m_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &fmt, &desc, (void*)pPixels, &ret);
  343. if (ret != CL_SUCCESS)
  344. {
  345. ocl_error_printf("ocl::create_read_image_u8: clCreateImage() failed!\n");
  346. return nullptr;
  347. }
  348. return img;
  349. }
  350. cl_mem create_write_image_u8(uint32_t width, uint32_t height, uint32_t bytes_per_pixel, bool normalized)
  351. {
  352. cl_image_format fmt = get_image_format(bytes_per_pixel, normalized);
  353. cl_image_desc desc;
  354. memset(&desc, 0, sizeof(desc));
  355. desc.image_type = CL_MEM_OBJECT_IMAGE2D;
  356. desc.image_width = width;
  357. desc.image_height = height;
  358. cl_serializer serializer(this);
  359. cl_int ret;
  360. cl_mem img = clCreateImage(m_context, CL_MEM_WRITE_ONLY, &fmt, &desc, nullptr, &ret);
  361. if (ret != CL_SUCCESS)
  362. {
  363. ocl_error_printf("ocl::create_write_image_u8: clCreateImage() failed!\n");
  364. return nullptr;
  365. }
  366. return img;
  367. }
  368. bool read_from_image(cl_command_queue command_queue, cl_mem img, void* pPixels, uint32_t ofs_x, uint32_t ofs_y, uint32_t width, uint32_t height)
  369. {
  370. cl_serializer serializer(this);
  371. size_t origin[3] = { ofs_x, ofs_y, 0 }, region[3] = { width, height, 1 };
  372. cl_int err = clEnqueueReadImage(command_queue, img, CL_TRUE, origin, region, 0, 0, pPixels, 0, NULL, NULL);
  373. if (err != CL_SUCCESS)
  374. {
  375. ocl_error_printf("ocl::read_from_image: clEnqueueReadImage() failed!\n");
  376. return false;
  377. }
  378. return true;
  379. }
  380. bool run_1D(cl_command_queue command_queue, const cl_kernel kernel, size_t num_items)
  381. {
  382. cl_serializer serializer(this);
  383. cl_int ret = clEnqueueNDRangeKernel(command_queue, kernel,
  384. 1, // work_dim
  385. nullptr, // global_work_offset
  386. &num_items, // global_work_size
  387. nullptr, // local_work_size
  388. 0, // num_events_in_wait_list
  389. nullptr, // event_wait_list
  390. nullptr // event
  391. );
  392. if (ret != CL_SUCCESS)
  393. {
  394. ocl_error_printf("ocl::run_1D: clEnqueueNDRangeKernel() failed!\n");
  395. return false;
  396. }
  397. return true;
  398. }
  399. bool run_2D(cl_command_queue command_queue, const cl_kernel kernel, size_t width, size_t height)
  400. {
  401. cl_serializer serializer(this);
  402. size_t num_global_items[2] = { width, height };
  403. //size_t num_local_items[2] = { 1, 1 };
  404. cl_int ret = clEnqueueNDRangeKernel(command_queue, kernel,
  405. 2, // work_dim
  406. nullptr, // global_work_offset
  407. num_global_items, // global_work_size
  408. nullptr, // local_work_size
  409. 0, // num_events_in_wait_list
  410. nullptr, // event_wait_list
  411. nullptr // event
  412. );
  413. if (ret != CL_SUCCESS)
  414. {
  415. ocl_error_printf("ocl::run_2D: clEnqueueNDRangeKernel() failed!\n");
  416. return false;
  417. }
  418. return true;
  419. }
  420. bool run_2D(cl_command_queue command_queue, const cl_kernel kernel, size_t ofs_x, size_t ofs_y, size_t width, size_t height)
  421. {
  422. cl_serializer serializer(this);
  423. size_t global_ofs[2] = { ofs_x, ofs_y };
  424. size_t num_global_items[2] = { width, height };
  425. //size_t num_local_items[2] = { 1, 1 };
  426. cl_int ret = clEnqueueNDRangeKernel(command_queue, kernel,
  427. 2, // work_dim
  428. global_ofs, // global_work_offset
  429. num_global_items, // global_work_size
  430. nullptr, // local_work_size
  431. 0, // num_events_in_wait_list
  432. nullptr, // event_wait_list
  433. nullptr // event
  434. );
  435. if (ret != CL_SUCCESS)
  436. {
  437. ocl_error_printf("ocl::run_2D: clEnqueueNDRangeKernel() failed!\n");
  438. return false;
  439. }
  440. return true;
  441. }
  442. void flush(cl_command_queue command_queue)
  443. {
  444. cl_serializer serializer(this);
  445. clFlush(command_queue);
  446. clFinish(command_queue);
  447. }
  448. template<typename T>
  449. bool set_kernel_arg(cl_kernel kernel, uint32_t index, const T& obj)
  450. {
  451. cl_serializer serializer(this);
  452. cl_int ret = clSetKernelArg(kernel, index, sizeof(T), (void*)&obj);
  453. if (ret != CL_SUCCESS)
  454. {
  455. ocl_error_printf("ocl::set_kernel_arg: clSetKernelArg() failed!\n");
  456. return false;
  457. }
  458. return true;
  459. }
  460. template<typename T>
  461. bool set_kernel_args(cl_kernel kernel, const T& obj1)
  462. {
  463. cl_serializer serializer(this);
  464. cl_int ret = clSetKernelArg(kernel, 0, sizeof(T), (void*)&obj1);
  465. if (ret != CL_SUCCESS)
  466. {
  467. ocl_error_printf("ocl::set_kernel_arg: clSetKernelArg() failed!\n");
  468. return false;
  469. }
  470. return true;
  471. }
  472. #define BASISU_CHECK_ERR if (ret != CL_SUCCESS) { ocl_error_printf("ocl::set_kernel_args: clSetKernelArg() failed!\n"); return false; }
  473. template<typename T, typename U>
  474. bool set_kernel_args(cl_kernel kernel, const T& obj1, const U& obj2)
  475. {
  476. cl_serializer serializer(this);
  477. cl_int ret = clSetKernelArg(kernel, 0, sizeof(T), (void*)&obj1); BASISU_CHECK_ERR
  478. ret = clSetKernelArg(kernel, 1, sizeof(U), (void*)&obj2); BASISU_CHECK_ERR
  479. return true;
  480. }
  481. template<typename T, typename U, typename V>
  482. bool set_kernel_args(cl_kernel kernel, const T& obj1, const U& obj2, const V& obj3)
  483. {
  484. cl_serializer serializer(this);
  485. cl_int ret = clSetKernelArg(kernel, 0, sizeof(T), (void*)&obj1); BASISU_CHECK_ERR
  486. ret = clSetKernelArg(kernel, 1, sizeof(U), (void*)&obj2); BASISU_CHECK_ERR
  487. ret = clSetKernelArg(kernel, 2, sizeof(V), (void*)&obj3); BASISU_CHECK_ERR
  488. return true;
  489. }
  490. template<typename T, typename U, typename V, typename W>
  491. bool set_kernel_args(cl_kernel kernel, const T& obj1, const U& obj2, const V& obj3, const W& obj4)
  492. {
  493. cl_serializer serializer(this);
  494. cl_int ret = clSetKernelArg(kernel, 0, sizeof(T), (void*)&obj1); BASISU_CHECK_ERR
  495. ret = clSetKernelArg(kernel, 1, sizeof(U), (void*)&obj2); BASISU_CHECK_ERR
  496. ret = clSetKernelArg(kernel, 2, sizeof(V), (void*)&obj3); BASISU_CHECK_ERR
  497. ret = clSetKernelArg(kernel, 3, sizeof(W), (void*)&obj4); BASISU_CHECK_ERR
  498. return true;
  499. }
  500. template<typename T, typename U, typename V, typename W, typename X>
  501. bool set_kernel_args(cl_kernel kernel, const T& obj1, const U& obj2, const V& obj3, const W& obj4, const X& obj5)
  502. {
  503. cl_serializer serializer(this);
  504. cl_int ret = clSetKernelArg(kernel, 0, sizeof(T), (void*)&obj1); BASISU_CHECK_ERR
  505. ret = clSetKernelArg(kernel, 1, sizeof(U), (void*)&obj2); BASISU_CHECK_ERR
  506. ret = clSetKernelArg(kernel, 2, sizeof(V), (void*)&obj3); BASISU_CHECK_ERR
  507. ret = clSetKernelArg(kernel, 3, sizeof(W), (void*)&obj4); BASISU_CHECK_ERR
  508. ret = clSetKernelArg(kernel, 4, sizeof(X), (void*)&obj5); BASISU_CHECK_ERR
  509. return true;
  510. }
  511. template<typename T, typename U, typename V, typename W, typename X, typename Y>
  512. bool set_kernel_args(cl_kernel kernel, const T& obj1, const U& obj2, const V& obj3, const W& obj4, const X& obj5, const Y& obj6)
  513. {
  514. cl_serializer serializer(this);
  515. cl_int ret = clSetKernelArg(kernel, 0, sizeof(T), (void*)&obj1); BASISU_CHECK_ERR
  516. ret = clSetKernelArg(kernel, 1, sizeof(U), (void*)&obj2); BASISU_CHECK_ERR
  517. ret = clSetKernelArg(kernel, 2, sizeof(V), (void*)&obj3); BASISU_CHECK_ERR
  518. ret = clSetKernelArg(kernel, 3, sizeof(W), (void*)&obj4); BASISU_CHECK_ERR
  519. ret = clSetKernelArg(kernel, 4, sizeof(X), (void*)&obj5); BASISU_CHECK_ERR
  520. ret = clSetKernelArg(kernel, 5, sizeof(Y), (void*)&obj6); BASISU_CHECK_ERR
  521. return true;
  522. }
  523. template<typename T, typename U, typename V, typename W, typename X, typename Y, typename Z>
  524. bool set_kernel_args(cl_kernel kernel, const T& obj1, const U& obj2, const V& obj3, const W& obj4, const X& obj5, const Y& obj6, const Z& obj7)
  525. {
  526. cl_serializer serializer(this);
  527. cl_int ret = clSetKernelArg(kernel, 0, sizeof(T), (void*)&obj1); BASISU_CHECK_ERR
  528. ret = clSetKernelArg(kernel, 1, sizeof(U), (void*)&obj2); BASISU_CHECK_ERR
  529. ret = clSetKernelArg(kernel, 2, sizeof(V), (void*)&obj3); BASISU_CHECK_ERR
  530. ret = clSetKernelArg(kernel, 3, sizeof(W), (void*)&obj4); BASISU_CHECK_ERR
  531. ret = clSetKernelArg(kernel, 4, sizeof(X), (void*)&obj5); BASISU_CHECK_ERR
  532. ret = clSetKernelArg(kernel, 5, sizeof(Y), (void*)&obj6); BASISU_CHECK_ERR
  533. ret = clSetKernelArg(kernel, 6, sizeof(Z), (void*)&obj7); BASISU_CHECK_ERR
  534. return true;
  535. }
  536. template<typename T, typename U, typename V, typename W, typename X, typename Y, typename Z, typename A>
  537. bool set_kernel_args(cl_kernel kernel, const T& obj1, const U& obj2, const V& obj3, const W& obj4, const X& obj5, const Y& obj6, const Z& obj7, const A& obj8)
  538. {
  539. cl_serializer serializer(this);
  540. cl_int ret = clSetKernelArg(kernel, 0, sizeof(T), (void*)&obj1); BASISU_CHECK_ERR
  541. ret = clSetKernelArg(kernel, 1, sizeof(U), (void*)&obj2); BASISU_CHECK_ERR
  542. ret = clSetKernelArg(kernel, 2, sizeof(V), (void*)&obj3); BASISU_CHECK_ERR
  543. ret = clSetKernelArg(kernel, 3, sizeof(W), (void*)&obj4); BASISU_CHECK_ERR
  544. ret = clSetKernelArg(kernel, 4, sizeof(X), (void*)&obj5); BASISU_CHECK_ERR
  545. ret = clSetKernelArg(kernel, 5, sizeof(Y), (void*)&obj6); BASISU_CHECK_ERR
  546. ret = clSetKernelArg(kernel, 6, sizeof(Z), (void*)&obj7); BASISU_CHECK_ERR
  547. ret = clSetKernelArg(kernel, 7, sizeof(A), (void*)&obj8); BASISU_CHECK_ERR
  548. return true;
  549. }
  550. #undef BASISU_CHECK_ERR
  551. private:
  552. cl_device_id m_device_id = nullptr;
  553. cl_context m_context = nullptr;
  554. cl_command_queue m_command_queue = nullptr;
  555. cl_program m_program = nullptr;
  556. cl_device_fp_config m_dev_fp_config;
  557. bool m_use_mutex = false;
  558. std::mutex m_ocl_mutex;
  559. // This helper object is used to optionally serialize all calls to the CL driver after initialization.
  560. // Currently this is only used to work around race conditions in the Windows AMD driver.
  561. struct cl_serializer
  562. {
  563. inline cl_serializer(const cl_serializer&);
  564. cl_serializer& operator= (const cl_serializer&);
  565. inline cl_serializer(ocl *p) : m_p(p)
  566. {
  567. if (m_p->m_use_mutex)
  568. m_p->m_ocl_mutex.lock();
  569. }
  570. inline ~cl_serializer()
  571. {
  572. if (m_p->m_use_mutex)
  573. m_p->m_ocl_mutex.unlock();
  574. }
  575. private:
  576. ocl* m_p;
  577. };
  578. cl_image_format get_image_format(uint32_t bytes_per_pixel, bool normalized)
  579. {
  580. cl_image_format fmt;
  581. switch (bytes_per_pixel)
  582. {
  583. case 1: fmt.image_channel_order = CL_LUMINANCE; break;
  584. case 2: fmt.image_channel_order = CL_RG; break;
  585. case 3: fmt.image_channel_order = CL_RGB; break;
  586. case 4: fmt.image_channel_order = CL_RGBA; break;
  587. default: assert(0); fmt.image_channel_order = CL_LUMINANCE; break;
  588. }
  589. fmt.image_channel_data_type = normalized ? CL_UNORM_INT8 : CL_UNSIGNED_INT8;
  590. return fmt;
  591. }
  592. };
  593. // Library blobal state
  594. ocl g_ocl;
  595. bool opencl_init(bool force_serialization)
  596. {
  597. if (g_ocl.is_initialized())
  598. {
  599. assert(0);
  600. return false;
  601. }
  602. if (!g_ocl.init(force_serialization))
  603. {
  604. ocl_error_printf("opencl_init: Failed initializing OpenCL\n");
  605. return false;
  606. }
  607. const char* pKernel_src = nullptr;
  608. size_t kernel_src_size = 0;
  609. uint8_vec kernel_src;
  610. #if BASISU_USE_OCL_KERNELS_HEADER
  611. pKernel_src = reinterpret_cast<const char*>(ocl_kernels_cl);
  612. kernel_src_size = ocl_kernels_cl_len;
  613. #else
  614. if (!read_file_to_vec(BASISU_OCL_KERNELS_FILENAME, kernel_src))
  615. {
  616. ocl_error_printf("opencl_init: Cannot read OpenCL kernel source file \"%s\"\n", BASISU_OCL_KERNELS_FILENAME);
  617. g_ocl.deinit();
  618. return false;
  619. }
  620. pKernel_src = (char*)kernel_src.data();
  621. kernel_src_size = kernel_src.size();
  622. #endif
  623. if (!kernel_src_size)
  624. {
  625. ocl_error_printf("opencl_init: Invalid OpenCL kernel source file \"%s\"\n", BASISU_OCL_KERNELS_FILENAME);
  626. g_ocl.deinit();
  627. return false;
  628. }
  629. if (!g_ocl.init_program(pKernel_src, kernel_src_size))
  630. {
  631. ocl_error_printf("opencl_init: Failed compiling OpenCL program\n");
  632. g_ocl.deinit();
  633. return false;
  634. }
  635. printf("OpenCL support initialized successfully\n");
  636. return true;
  637. }
  638. void opencl_deinit()
  639. {
  640. g_ocl.deinit();
  641. }
  642. bool opencl_is_available()
  643. {
  644. return g_ocl.is_initialized();
  645. }
  646. struct opencl_context
  647. {
  648. size_t m_ocl_total_pixel_blocks;
  649. cl_mem m_ocl_pixel_blocks;
  650. cl_command_queue m_command_queue;
  651. cl_kernel m_ocl_encode_etc1s_blocks_kernel;
  652. cl_kernel m_ocl_refine_endpoint_clusterization_kernel;
  653. cl_kernel m_ocl_encode_etc1s_from_pixel_cluster_kernel;
  654. cl_kernel m_ocl_find_optimal_selector_clusters_for_each_block_kernel;
  655. cl_kernel m_ocl_determine_selectors_kernel;
  656. };
  657. opencl_context_ptr opencl_create_context()
  658. {
  659. if (!opencl_is_available())
  660. {
  661. ocl_error_printf("opencl_create_context: OpenCL not initialized\n");
  662. assert(0);
  663. return nullptr;
  664. }
  665. interval_timer tm;
  666. tm.start();
  667. opencl_context* pContext = static_cast<opencl_context * >(calloc(sizeof(opencl_context), 1));
  668. if (!pContext)
  669. return nullptr;
  670. // To avoid driver bugs in some drivers - serialize this. Likely not necessary, we don't know.
  671. // https://community.intel.com/t5/OpenCL-for-CPU/Bug-report-clCreateKernelsInProgram-is-not-thread-safe/td-p/1159771
  672. pContext->m_command_queue = g_ocl.create_command_queue();
  673. if (!pContext->m_command_queue)
  674. {
  675. ocl_error_printf("opencl_create_context: Failed creating OpenCL command queue!\n");
  676. opencl_destroy_context(pContext);
  677. return nullptr;
  678. }
  679. pContext->m_ocl_encode_etc1s_blocks_kernel = g_ocl.create_kernel("encode_etc1s_blocks");
  680. if (!pContext->m_ocl_encode_etc1s_blocks_kernel)
  681. {
  682. ocl_error_printf("opencl_create_context: Failed creating OpenCL kernel encode_etc1s_block\n");
  683. opencl_destroy_context(pContext);
  684. return nullptr;
  685. }
  686. pContext->m_ocl_refine_endpoint_clusterization_kernel = g_ocl.create_kernel("refine_endpoint_clusterization");
  687. if (!pContext->m_ocl_refine_endpoint_clusterization_kernel)
  688. {
  689. ocl_error_printf("opencl_create_context: Failed creating OpenCL kernel refine_endpoint_clusterization\n");
  690. opencl_destroy_context(pContext);
  691. return nullptr;
  692. }
  693. pContext->m_ocl_encode_etc1s_from_pixel_cluster_kernel = g_ocl.create_kernel("encode_etc1s_from_pixel_cluster");
  694. if (!pContext->m_ocl_encode_etc1s_from_pixel_cluster_kernel)
  695. {
  696. ocl_error_printf("opencl_create_context: Failed creating OpenCL kernel encode_etc1s_from_pixel_cluster\n");
  697. opencl_destroy_context(pContext);
  698. return nullptr;
  699. }
  700. pContext->m_ocl_find_optimal_selector_clusters_for_each_block_kernel = g_ocl.create_kernel("find_optimal_selector_clusters_for_each_block");
  701. if (!pContext->m_ocl_find_optimal_selector_clusters_for_each_block_kernel)
  702. {
  703. ocl_error_printf("opencl_create_context: Failed creating OpenCL kernel find_optimal_selector_clusters_for_each_block\n");
  704. opencl_destroy_context(pContext);
  705. return nullptr;
  706. }
  707. pContext->m_ocl_determine_selectors_kernel = g_ocl.create_kernel("determine_selectors");
  708. if (!pContext->m_ocl_determine_selectors_kernel)
  709. {
  710. ocl_error_printf("opencl_create_context: Failed creating OpenCL kernel determine_selectors\n");
  711. opencl_destroy_context(pContext);
  712. return nullptr;
  713. }
  714. debug_printf("opencl_create_context: Elapsed time: %f secs\n", tm.get_elapsed_secs());
  715. return pContext;
  716. }
  717. void opencl_destroy_context(opencl_context_ptr pContext)
  718. {
  719. if (!pContext)
  720. return;
  721. interval_timer tm;
  722. tm.start();
  723. g_ocl.destroy_buffer(pContext->m_ocl_pixel_blocks);
  724. g_ocl.destroy_kernel(pContext->m_ocl_determine_selectors_kernel);
  725. g_ocl.destroy_kernel(pContext->m_ocl_find_optimal_selector_clusters_for_each_block_kernel);
  726. g_ocl.destroy_kernel(pContext->m_ocl_encode_etc1s_from_pixel_cluster_kernel);
  727. g_ocl.destroy_kernel(pContext->m_ocl_encode_etc1s_blocks_kernel);
  728. g_ocl.destroy_kernel(pContext->m_ocl_refine_endpoint_clusterization_kernel);
  729. g_ocl.destroy_command_queue(pContext->m_command_queue);
  730. memset(pContext, 0, sizeof(opencl_context));
  731. free(pContext);
  732. debug_printf("opencl_destroy_context: Elapsed time: %f secs\n", tm.get_elapsed_secs());
  733. }
  734. #pragma pack(push, 1)
  735. struct cl_encode_etc1s_param_struct
  736. {
  737. int m_total_blocks;
  738. int m_perceptual;
  739. int m_total_perms;
  740. };
  741. #pragma pack(pop)
  742. bool opencl_set_pixel_blocks(opencl_context_ptr pContext, size_t total_blocks, const cl_pixel_block* pPixel_blocks)
  743. {
  744. if (!opencl_is_available())
  745. return false;
  746. if (pContext->m_ocl_pixel_blocks)
  747. {
  748. g_ocl.destroy_buffer(pContext->m_ocl_pixel_blocks);
  749. pContext->m_ocl_pixel_blocks = nullptr;
  750. }
  751. pContext->m_ocl_pixel_blocks = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue, pPixel_blocks, sizeof(cl_pixel_block) * total_blocks);
  752. if (!pContext->m_ocl_pixel_blocks)
  753. return false;
  754. pContext->m_ocl_total_pixel_blocks = total_blocks;
  755. return true;
  756. }
  757. bool opencl_encode_etc1s_blocks(opencl_context_ptr pContext, etc_block* pOutput_blocks, bool perceptual, uint32_t total_perms)
  758. {
  759. if (!opencl_is_available())
  760. return false;
  761. interval_timer tm;
  762. tm.start();
  763. assert(pContext->m_ocl_pixel_blocks);
  764. if (!pContext->m_ocl_pixel_blocks)
  765. return false;
  766. assert(pContext->m_ocl_total_pixel_blocks <= INT_MAX);
  767. cl_encode_etc1s_param_struct ps;
  768. ps.m_total_blocks = (int)pContext->m_ocl_total_pixel_blocks;
  769. ps.m_perceptual = perceptual;
  770. ps.m_total_perms = total_perms;
  771. bool status = false;
  772. cl_mem vars = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue , &ps, sizeof(ps));
  773. cl_mem block_buf = g_ocl.alloc_write_buffer(sizeof(etc_block) * pContext->m_ocl_total_pixel_blocks);
  774. if (!vars || !block_buf)
  775. goto exit;
  776. if (!g_ocl.set_kernel_args(pContext->m_ocl_encode_etc1s_blocks_kernel, vars, pContext->m_ocl_pixel_blocks, block_buf))
  777. goto exit;
  778. if (!g_ocl.run_2D(pContext->m_command_queue, pContext->m_ocl_encode_etc1s_blocks_kernel, pContext->m_ocl_total_pixel_blocks, 1))
  779. goto exit;
  780. if (!g_ocl.read_from_buffer(pContext->m_command_queue, block_buf, pOutput_blocks, pContext->m_ocl_total_pixel_blocks * sizeof(etc_block)))
  781. goto exit;
  782. status = true;
  783. debug_printf("opencl_encode_etc1s_blocks: Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
  784. exit:
  785. g_ocl.destroy_buffer(block_buf);
  786. g_ocl.destroy_buffer(vars);
  787. return status;
  788. }
  789. bool opencl_encode_etc1s_pixel_clusters(
  790. opencl_context_ptr pContext,
  791. etc_block* pOutput_blocks,
  792. uint32_t total_clusters,
  793. const cl_pixel_cluster* pClusters,
  794. uint64_t total_pixels,
  795. const color_rgba* pPixels, const uint32_t* pPixel_weights,
  796. bool perceptual, uint32_t total_perms)
  797. {
  798. if (!opencl_is_available())
  799. return false;
  800. interval_timer tm;
  801. tm.start();
  802. cl_encode_etc1s_param_struct ps;
  803. ps.m_total_blocks = total_clusters;
  804. ps.m_perceptual = perceptual;
  805. ps.m_total_perms = total_perms;
  806. bool status = false;
  807. if (sizeof(size_t) == sizeof(uint32_t))
  808. {
  809. if ( ((sizeof(cl_pixel_cluster) * total_clusters) > UINT32_MAX) ||
  810. ((sizeof(color_rgba) * total_pixels) > UINT32_MAX) ||
  811. ((sizeof(uint32_t) * total_pixels) > UINT32_MAX) )
  812. {
  813. return false;
  814. }
  815. }
  816. cl_mem vars = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue , &ps, sizeof(ps));
  817. cl_mem input_clusters = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue, pClusters, (size_t)(sizeof(cl_pixel_cluster) * total_clusters));
  818. cl_mem input_pixels = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue, pPixels, (size_t)(sizeof(color_rgba) * total_pixels));
  819. cl_mem weights_buf = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue, pPixel_weights, (size_t)(sizeof(uint32_t) * total_pixels));
  820. cl_mem block_buf = g_ocl.alloc_write_buffer(sizeof(etc_block) * total_clusters);
  821. if (!vars || !input_clusters || !input_pixels || !weights_buf || !block_buf)
  822. goto exit;
  823. if (!g_ocl.set_kernel_args(pContext->m_ocl_encode_etc1s_from_pixel_cluster_kernel, vars, input_clusters, input_pixels, weights_buf, block_buf))
  824. goto exit;
  825. if (!g_ocl.run_2D(pContext->m_command_queue, pContext->m_ocl_encode_etc1s_from_pixel_cluster_kernel, total_clusters, 1))
  826. goto exit;
  827. if (!g_ocl.read_from_buffer(pContext->m_command_queue, block_buf, pOutput_blocks, sizeof(etc_block) * total_clusters))
  828. goto exit;
  829. status = true;
  830. debug_printf("opencl_encode_etc1s_pixel_clusters: Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
  831. exit:
  832. g_ocl.destroy_buffer(block_buf);
  833. g_ocl.destroy_buffer(weights_buf);
  834. g_ocl.destroy_buffer(input_pixels);
  835. g_ocl.destroy_buffer(input_clusters);
  836. g_ocl.destroy_buffer(vars);
  837. return status;
  838. }
  839. #pragma pack(push, 1)
  840. struct cl_rec_param_struct
  841. {
  842. int m_total_blocks;
  843. int m_perceptual;
  844. };
  845. #pragma pack(pop)
  846. bool opencl_refine_endpoint_clusterization(
  847. opencl_context_ptr pContext,
  848. const cl_block_info_struct* pPixel_block_info,
  849. uint32_t total_clusters,
  850. const cl_endpoint_cluster_struct* pCluster_info,
  851. const uint32_t* pSorted_block_indices,
  852. uint32_t* pOutput_cluster_indices,
  853. bool perceptual)
  854. {
  855. if (!opencl_is_available())
  856. return false;
  857. interval_timer tm;
  858. tm.start();
  859. assert(pContext->m_ocl_pixel_blocks);
  860. if (!pContext->m_ocl_pixel_blocks)
  861. return false;
  862. assert(pContext->m_ocl_total_pixel_blocks <= INT_MAX);
  863. cl_rec_param_struct ps;
  864. ps.m_total_blocks = (int)pContext->m_ocl_total_pixel_blocks;
  865. ps.m_perceptual = perceptual;
  866. bool status = false;
  867. cl_mem pixel_block_info = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue, pPixel_block_info, sizeof(cl_block_info_struct) * pContext->m_ocl_total_pixel_blocks);
  868. cl_mem cluster_info = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue, pCluster_info, sizeof(cl_endpoint_cluster_struct) * total_clusters);
  869. cl_mem sorted_block_indices = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue, pSorted_block_indices, sizeof(uint32_t) * pContext->m_ocl_total_pixel_blocks);
  870. cl_mem output_buf = g_ocl.alloc_write_buffer(sizeof(uint32_t) * pContext->m_ocl_total_pixel_blocks);
  871. if (!pixel_block_info || !cluster_info || !sorted_block_indices || !output_buf)
  872. goto exit;
  873. if (!g_ocl.set_kernel_args(pContext->m_ocl_refine_endpoint_clusterization_kernel, ps, pContext->m_ocl_pixel_blocks, pixel_block_info, cluster_info, sorted_block_indices, output_buf))
  874. goto exit;
  875. if (!g_ocl.run_2D(pContext->m_command_queue, pContext->m_ocl_refine_endpoint_clusterization_kernel, pContext->m_ocl_total_pixel_blocks, 1))
  876. goto exit;
  877. if (!g_ocl.read_from_buffer(pContext->m_command_queue, output_buf, pOutput_cluster_indices, pContext->m_ocl_total_pixel_blocks * sizeof(uint32_t)))
  878. goto exit;
  879. debug_printf("opencl_refine_endpoint_clusterization: Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
  880. status = true;
  881. exit:
  882. g_ocl.destroy_buffer(pixel_block_info);
  883. g_ocl.destroy_buffer(cluster_info);
  884. g_ocl.destroy_buffer(sorted_block_indices);
  885. g_ocl.destroy_buffer(output_buf);
  886. return status;
  887. }
  888. bool opencl_find_optimal_selector_clusters_for_each_block(
  889. opencl_context_ptr pContext,
  890. const fosc_block_struct* pInput_block_info, // one per block
  891. uint32_t total_input_selectors,
  892. const fosc_selector_struct* pInput_selectors,
  893. const uint32_t* pSelector_cluster_indices,
  894. uint32_t* pOutput_selector_cluster_indices, // one per block
  895. bool perceptual)
  896. {
  897. if (!opencl_is_available())
  898. return false;
  899. interval_timer tm;
  900. tm.start();
  901. assert(pContext->m_ocl_pixel_blocks);
  902. if (!pContext->m_ocl_pixel_blocks)
  903. return false;
  904. assert(pContext->m_ocl_total_pixel_blocks <= INT_MAX);
  905. fosc_param_struct ps;
  906. ps.m_total_blocks = (int)pContext->m_ocl_total_pixel_blocks;
  907. ps.m_perceptual = perceptual;
  908. bool status = false;
  909. cl_mem input_block_info = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue, pInput_block_info, sizeof(fosc_block_struct) * pContext->m_ocl_total_pixel_blocks);
  910. cl_mem input_selectors = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue, pInput_selectors, sizeof(fosc_selector_struct) * total_input_selectors);
  911. cl_mem selector_cluster_indices = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue, pSelector_cluster_indices, sizeof(uint32_t) * total_input_selectors);
  912. cl_mem output_selector_cluster_indices = g_ocl.alloc_write_buffer(sizeof(uint32_t) * pContext->m_ocl_total_pixel_blocks);
  913. if (!input_block_info || !input_selectors || !selector_cluster_indices || !output_selector_cluster_indices)
  914. goto exit;
  915. if (!g_ocl.set_kernel_args(pContext->m_ocl_find_optimal_selector_clusters_for_each_block_kernel, ps, pContext->m_ocl_pixel_blocks, input_block_info, input_selectors, selector_cluster_indices, output_selector_cluster_indices))
  916. goto exit;
  917. if (!g_ocl.run_2D(pContext->m_command_queue, pContext->m_ocl_find_optimal_selector_clusters_for_each_block_kernel, pContext->m_ocl_total_pixel_blocks, 1))
  918. goto exit;
  919. if (!g_ocl.read_from_buffer(pContext->m_command_queue, output_selector_cluster_indices, pOutput_selector_cluster_indices, pContext->m_ocl_total_pixel_blocks * sizeof(uint32_t)))
  920. goto exit;
  921. debug_printf("opencl_find_optimal_selector_clusters_for_each_block: Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
  922. status = true;
  923. exit:
  924. g_ocl.destroy_buffer(input_block_info);
  925. g_ocl.destroy_buffer(input_selectors);
  926. g_ocl.destroy_buffer(selector_cluster_indices);
  927. g_ocl.destroy_buffer(output_selector_cluster_indices);
  928. return status;
  929. }
  930. bool opencl_determine_selectors(
  931. opencl_context_ptr pContext,
  932. const color_rgba* pInput_etc_color5_and_inten,
  933. etc_block* pOutput_blocks,
  934. bool perceptual)
  935. {
  936. if (!opencl_is_available())
  937. return false;
  938. interval_timer tm;
  939. tm.start();
  940. assert(pContext->m_ocl_pixel_blocks);
  941. if (!pContext->m_ocl_pixel_blocks)
  942. return false;
  943. assert(pContext->m_ocl_total_pixel_blocks <= INT_MAX);
  944. ds_param_struct ps;
  945. ps.m_total_blocks = (int)pContext->m_ocl_total_pixel_blocks;
  946. ps.m_perceptual = perceptual;
  947. bool status = false;
  948. cl_mem input_etc_color5_intens = g_ocl.alloc_and_init_read_buffer(pContext->m_command_queue, pInput_etc_color5_and_inten, sizeof(color_rgba) * pContext->m_ocl_total_pixel_blocks);
  949. cl_mem output_blocks = g_ocl.alloc_write_buffer(sizeof(etc_block) * pContext->m_ocl_total_pixel_blocks);
  950. if (!input_etc_color5_intens || !output_blocks)
  951. goto exit;
  952. if (!g_ocl.set_kernel_args(pContext->m_ocl_determine_selectors_kernel, ps, pContext->m_ocl_pixel_blocks, input_etc_color5_intens, output_blocks))
  953. goto exit;
  954. if (!g_ocl.run_2D(pContext->m_command_queue, pContext->m_ocl_determine_selectors_kernel, pContext->m_ocl_total_pixel_blocks, 1))
  955. goto exit;
  956. if (!g_ocl.read_from_buffer(pContext->m_command_queue, output_blocks, pOutput_blocks, pContext->m_ocl_total_pixel_blocks * sizeof(etc_block)))
  957. goto exit;
  958. debug_printf("opencl_determine_selectors: Elapsed time: %3.3f secs\n", tm.get_elapsed_secs());
  959. status = true;
  960. exit:
  961. g_ocl.destroy_buffer(input_etc_color5_intens);
  962. g_ocl.destroy_buffer(output_blocks);
  963. return status;
  964. }
  965. #else
  966. namespace basisu
  967. {
  968. // No OpenCL support - all dummy functions that return false;
  969. bool opencl_init(bool force_serialization)
  970. {
  971. BASISU_NOTE_UNUSED(force_serialization);
  972. return false;
  973. }
  974. void opencl_deinit()
  975. {
  976. }
  977. bool opencl_is_available()
  978. {
  979. return false;
  980. }
  981. opencl_context_ptr opencl_create_context()
  982. {
  983. return nullptr;
  984. }
  985. void opencl_destroy_context(opencl_context_ptr context)
  986. {
  987. BASISU_NOTE_UNUSED(context);
  988. }
  989. bool opencl_set_pixel_blocks(opencl_context_ptr pContext, size_t total_blocks, const cl_pixel_block* pPixel_blocks)
  990. {
  991. BASISU_NOTE_UNUSED(pContext);
  992. BASISU_NOTE_UNUSED(total_blocks);
  993. BASISU_NOTE_UNUSED(pPixel_blocks);
  994. return false;
  995. }
  996. bool opencl_encode_etc1s_blocks(opencl_context_ptr pContext, etc_block* pOutput_blocks, bool perceptual, uint32_t total_perms)
  997. {
  998. BASISU_NOTE_UNUSED(pContext);
  999. BASISU_NOTE_UNUSED(pOutput_blocks);
  1000. BASISU_NOTE_UNUSED(perceptual);
  1001. BASISU_NOTE_UNUSED(total_perms);
  1002. return false;
  1003. }
  1004. bool opencl_encode_etc1s_pixel_clusters(
  1005. opencl_context_ptr pContext,
  1006. etc_block* pOutput_blocks,
  1007. uint32_t total_clusters,
  1008. const cl_pixel_cluster* pClusters,
  1009. uint64_t total_pixels,
  1010. const color_rgba* pPixels, const uint32_t *pPixel_weights,
  1011. bool perceptual, uint32_t total_perms)
  1012. {
  1013. BASISU_NOTE_UNUSED(pContext);
  1014. BASISU_NOTE_UNUSED(pOutput_blocks);
  1015. BASISU_NOTE_UNUSED(total_clusters);
  1016. BASISU_NOTE_UNUSED(pClusters);
  1017. BASISU_NOTE_UNUSED(total_pixels);
  1018. BASISU_NOTE_UNUSED(pPixels);
  1019. BASISU_NOTE_UNUSED(pPixel_weights);
  1020. BASISU_NOTE_UNUSED(perceptual);
  1021. BASISU_NOTE_UNUSED(total_perms);
  1022. return false;
  1023. }
  1024. bool opencl_refine_endpoint_clusterization(
  1025. opencl_context_ptr pContext,
  1026. const cl_block_info_struct* pPixel_block_info,
  1027. uint32_t total_clusters,
  1028. const cl_endpoint_cluster_struct* pCluster_info,
  1029. const uint32_t* pSorted_block_indices,
  1030. uint32_t* pOutput_cluster_indices,
  1031. bool perceptual)
  1032. {
  1033. BASISU_NOTE_UNUSED(pContext);
  1034. BASISU_NOTE_UNUSED(pPixel_block_info);
  1035. BASISU_NOTE_UNUSED(total_clusters);
  1036. BASISU_NOTE_UNUSED(pCluster_info);
  1037. BASISU_NOTE_UNUSED(pSorted_block_indices);
  1038. BASISU_NOTE_UNUSED(pOutput_cluster_indices);
  1039. BASISU_NOTE_UNUSED(perceptual);
  1040. return false;
  1041. }
  1042. bool opencl_find_optimal_selector_clusters_for_each_block(
  1043. opencl_context_ptr pContext,
  1044. const fosc_block_struct* pInput_block_info, // one per block
  1045. uint32_t total_input_selectors,
  1046. const fosc_selector_struct* pInput_selectors,
  1047. const uint32_t* pSelector_cluster_indices,
  1048. uint32_t* pOutput_selector_cluster_indices, // one per block
  1049. bool perceptual)
  1050. {
  1051. BASISU_NOTE_UNUSED(pContext);
  1052. BASISU_NOTE_UNUSED(pInput_block_info);
  1053. BASISU_NOTE_UNUSED(total_input_selectors);
  1054. BASISU_NOTE_UNUSED(pInput_selectors);
  1055. BASISU_NOTE_UNUSED(pSelector_cluster_indices);
  1056. BASISU_NOTE_UNUSED(pOutput_selector_cluster_indices);
  1057. BASISU_NOTE_UNUSED(perceptual);
  1058. return false;
  1059. }
  1060. bool opencl_determine_selectors(
  1061. opencl_context_ptr pContext,
  1062. const color_rgba* pInput_etc_color5_and_inten,
  1063. etc_block* pOutput_blocks,
  1064. bool perceptual)
  1065. {
  1066. BASISU_NOTE_UNUSED(pContext);
  1067. BASISU_NOTE_UNUSED(pInput_etc_color5_and_inten);
  1068. BASISU_NOTE_UNUSED(pOutput_blocks);
  1069. BASISU_NOTE_UNUSED(perceptual);
  1070. return false;
  1071. }
  1072. #endif // BASISU_SUPPORT_OPENCL
  1073. } // namespace basisu