astcenc_entry.cpp 43 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434
  1. // SPDX-License-Identifier: Apache-2.0
  2. // ----------------------------------------------------------------------------
  3. // Copyright 2011-2022 Arm Limited
  4. //
  5. // Licensed under the Apache License, Version 2.0 (the "License"); you may not
  6. // use this file except in compliance with the License. You may obtain a copy
  7. // of the License at:
  8. //
  9. // http://www.apache.org/licenses/LICENSE-2.0
  10. //
  11. // Unless required by applicable law or agreed to in writing, software
  12. // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  13. // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
  14. // License for the specific language governing permissions and limitations
  15. // under the License.
  16. // ----------------------------------------------------------------------------
  17. /**
  18. * @brief Functions for the library entrypoint.
  19. */
  20. #include <array>
  21. #include <cstring>
  22. #include <new>
  23. #include "astcenc.h"
  24. #include "astcenc_internal_entry.h"
  25. #include "astcenc_diagnostic_trace.h"
  /**
   * @brief One row of quality tuning parameter values for a single search preset.
   *
   * See the @c astcenc_config structure for detailed parameter documentation.
   *
   * Note that the mse_overshoot entries are scaling factors relative to the base MSE to hit
   * db_limit. A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use
   * lower ratios for the more thorough search presets because the underlying db_limit is so much
   * higher.
   */
  struct astcenc_preset_config
  {
      /** @brief The quality level this row applies to; compared against the requested quality. */
      float quality;

      /** @brief Preset value for @c astcenc_config::tune_partition_count_limit. */
      unsigned int tune_partition_count_limit;

      /** @brief Preset value for @c astcenc_config::tune_2partition_index_limit. */
      unsigned int tune_2partition_index_limit;

      /** @brief Preset value for @c astcenc_config::tune_3partition_index_limit. */
      unsigned int tune_3partition_index_limit;

      /** @brief Preset value for @c astcenc_config::tune_4partition_index_limit. */
      unsigned int tune_4partition_index_limit;

      /** @brief Preset value for @c astcenc_config::tune_block_mode_limit. */
      unsigned int tune_block_mode_limit;

      /** @brief Preset value for @c astcenc_config::tune_refinement_limit. */
      unsigned int tune_refinement_limit;

      /** @brief Preset value for @c astcenc_config::tune_candidate_limit. */
      unsigned int tune_candidate_limit;

      /** @brief Preset value for @c astcenc_config::tune_2partitioning_candidate_limit. */
      unsigned int tune_2partitioning_candidate_limit;

      /** @brief Preset value for @c astcenc_config::tune_3partitioning_candidate_limit. */
      unsigned int tune_3partitioning_candidate_limit;

      /** @brief Preset value for @c astcenc_config::tune_4partitioning_candidate_limit. */
      unsigned int tune_4partitioning_candidate_limit;

      /**
       * @brief First base for @c astcenc_config::tune_db_limit.
       *
       * @c astcenc_config_init subtracts 35 * log10(texels per block) from this base, and uses the
       * larger of this result and the one derived from @c tune_db_limit_b_base.
       */
      float tune_db_limit_a_base;

      /**
       * @brief Second base for @c astcenc_config::tune_db_limit.
       *
       * @c astcenc_config_init subtracts 19 * log10(texels per block) from this base.
       */
      float tune_db_limit_b_base;

      /** @brief Preset value for @c astcenc_config::tune_mse_overshoot. */
      float tune_mse_overshoot;

      /** @brief Preset value for @c astcenc_config::tune_2_partition_early_out_limit_factor. */
      float tune_2_partition_early_out_limit_factor;

      /** @brief Preset value for @c astcenc_config::tune_3_partition_early_out_limit_factor. */
      float tune_3_partition_early_out_limit_factor;

      /** @brief Preset value for @c astcenc_config::tune_2_plane_early_out_limit_correlation. */
      float tune_2_plane_early_out_limit_correlation;
  };
  55. /**
  56. * @brief The static presets for high bandwidth encodings (x < 25 texels per block).
  57. */
  58. static const std::array<astcenc_preset_config, 6> preset_configs_high {{
  59. {
  60. ASTCENC_PRE_FASTEST,
  61. 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f
  62. }, {
  63. ASTCENC_PRE_FAST,
  64. 3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f
  65. }, {
  66. ASTCENC_PRE_MEDIUM,
  67. 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f
  68. }, {
  69. ASTCENC_PRE_THOROUGH,
  70. 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f
  71. }, {
  72. ASTCENC_PRE_VERYTHOROUGH,
  73. 4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
  74. }, {
  75. ASTCENC_PRE_EXHAUSTIVE,
  76. 4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
  77. }
  78. }};
  79. /**
  80. * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
  81. */
  82. static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
  83. {
  84. ASTCENC_PRE_FASTEST,
  85. 2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f
  86. }, {
  87. ASTCENC_PRE_FAST,
  88. 3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f
  89. }, {
  90. ASTCENC_PRE_MEDIUM,
  91. 4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f
  92. }, {
  93. ASTCENC_PRE_THOROUGH,
  94. 4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f
  95. }, {
  96. ASTCENC_PRE_VERYTHOROUGH,
  97. 4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
  98. }, {
  99. ASTCENC_PRE_EXHAUSTIVE,
  100. 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
  101. }
  102. }};
  103. /**
  104. * @brief The static presets for low bandwidth encodings (64 <= x texels per block).
  105. */
  106. static const std::array<astcenc_preset_config, 6> preset_configs_low {{
  107. {
  108. ASTCENC_PRE_FASTEST,
  109. 2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f
  110. }, {
  111. ASTCENC_PRE_FAST,
  112. 2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f
  113. }, {
  114. ASTCENC_PRE_MEDIUM,
  115. 3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f
  116. }, {
  117. ASTCENC_PRE_THOROUGH,
  118. 4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f
  119. }, {
  120. ASTCENC_PRE_VERYTHOROUGH,
  121. 4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
  122. }, {
  123. ASTCENC_PRE_EXHAUSTIVE,
  124. 4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
  125. }
  126. }};
  127. /**
  128. * @brief Validate CPU floating point meets assumptions made in the codec.
  129. *
  130. * The codec is written with the assumption that a float threaded through the @c if32 union will be
  131. * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the
  132. * case in an IEEE-754 compliant system, however not every system or compilation mode is actually
  133. * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled.
  134. *
  135. * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
  136. */
  137. static astcenc_error validate_cpu_float()
  138. {
  139. if32 p;
  140. volatile float xprec_testval = 2.51f;
  141. p.f = xprec_testval + 12582912.0f;
  142. float q = p.f - 12582912.0f;
  143. if (q != 3.0f)
  144. {
  145. return ASTCENC_ERR_BAD_CPU_FLOAT;
  146. }
  147. return ASTCENC_SUCCESS;
  148. }
  149. /**
  150. * @brief Validate CPU ISA support meets the requirements of this build of the library.
  151. *
  152. * Each library build is statically compiled for a particular set of CPU ISA features, such as the
  153. * SIMD support or other ISA extensions such as POPCNT. This function checks that the host CPU
  154. * actually supports everything this build needs.
  155. *
  156. * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
  157. */
  158. static astcenc_error validate_cpu_isa()
  159. {
  160. #if ASTCENC_SSE >= 41
  161. if (!cpu_supports_sse41())
  162. {
  163. return ASTCENC_ERR_BAD_CPU_ISA;
  164. }
  165. #endif
  166. #if ASTCENC_POPCNT >= 1
  167. if (!cpu_supports_popcnt())
  168. {
  169. return ASTCENC_ERR_BAD_CPU_ISA;
  170. }
  171. #endif
  172. #if ASTCENC_F16C >= 1
  173. if (!cpu_supports_f16c())
  174. {
  175. return ASTCENC_ERR_BAD_CPU_ISA;
  176. }
  177. #endif
  178. #if ASTCENC_AVX >= 2
  179. if (!cpu_supports_avx2())
  180. {
  181. return ASTCENC_ERR_BAD_CPU_ISA;
  182. }
  183. #endif
  184. return ASTCENC_SUCCESS;
  185. }
  186. /**
  187. * @brief Validate config profile.
  188. *
  189. * @param profile The profile to check.
  190. *
  191. * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
  192. */
  193. static astcenc_error validate_profile(
  194. astcenc_profile profile
  195. ) {
  196. // Values in this enum are from an external user, so not guaranteed to be
  197. // bounded to the enum values
  198. switch (static_cast<int>(profile))
  199. {
  200. case ASTCENC_PRF_LDR_SRGB:
  201. case ASTCENC_PRF_LDR:
  202. case ASTCENC_PRF_HDR_RGB_LDR_A:
  203. case ASTCENC_PRF_HDR:
  204. return ASTCENC_SUCCESS;
  205. default:
  206. return ASTCENC_ERR_BAD_PROFILE;
  207. }
  208. }
  209. /**
  210. * @brief Validate block size.
  211. *
  212. * @param block_x The block x dimensions.
  213. * @param block_y The block y dimensions.
  214. * @param block_z The block z dimensions.
  215. *
  216. * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
  217. */
  218. static astcenc_error validate_block_size(
  219. unsigned int block_x,
  220. unsigned int block_y,
  221. unsigned int block_z
  222. ) {
  223. // Test if this is a legal block size at all
  224. bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) ||
  225. ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z)));
  226. if (!is_legal)
  227. {
  228. return ASTCENC_ERR_BAD_BLOCK_SIZE;
  229. }
  230. // Test if this build has sufficient capacity for this block size
  231. bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS;
  232. if (!have_capacity)
  233. {
  234. return ASTCENC_ERR_NOT_IMPLEMENTED;
  235. }
  236. return ASTCENC_SUCCESS;
  237. }
  238. /**
  239. * @brief Validate flags.
  240. *
  241. * @param flags The flags to check.
  242. *
  243. * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
  244. */
  245. static astcenc_error validate_flags(
  246. unsigned int flags
  247. ) {
  248. // Flags field must not contain any unknown flag bits
  249. unsigned int exMask = ~ASTCENC_ALL_FLAGS;
  250. if (popcount(flags & exMask) != 0)
  251. {
  252. return ASTCENC_ERR_BAD_FLAGS;
  253. }
  254. // Flags field must only contain at most a single map type
  255. exMask = ASTCENC_FLG_MAP_MASK
  256. | ASTCENC_FLG_MAP_NORMAL
  257. | ASTCENC_FLG_MAP_RGBM;
  258. if (popcount(flags & exMask) > 1)
  259. {
  260. return ASTCENC_ERR_BAD_FLAGS;
  261. }
  262. return ASTCENC_SUCCESS;
  263. }
  264. #if !defined(ASTCENC_DECOMPRESS_ONLY)
  265. /**
  266. * @brief Validate single channel compression swizzle.
  267. *
  268. * @param swizzle The swizzle to check.
  269. *
  270. * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
  271. */
  272. static astcenc_error validate_compression_swz(
  273. astcenc_swz swizzle
  274. ) {
  275. // Not all enum values are handled; SWZ_Z is invalid for compression
  276. switch (static_cast<int>(swizzle))
  277. {
  278. case ASTCENC_SWZ_R:
  279. case ASTCENC_SWZ_G:
  280. case ASTCENC_SWZ_B:
  281. case ASTCENC_SWZ_A:
  282. case ASTCENC_SWZ_0:
  283. case ASTCENC_SWZ_1:
  284. return ASTCENC_SUCCESS;
  285. default:
  286. return ASTCENC_ERR_BAD_SWIZZLE;
  287. }
  288. }
  289. /**
  290. * @brief Validate overall compression swizzle.
  291. *
  292. * @param swizzle The swizzle to check.
  293. *
  294. * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
  295. */
  296. static astcenc_error validate_compression_swizzle(
  297. const astcenc_swizzle& swizzle
  298. ) {
  299. if (validate_compression_swz(swizzle.r) ||
  300. validate_compression_swz(swizzle.g) ||
  301. validate_compression_swz(swizzle.b) ||
  302. validate_compression_swz(swizzle.a))
  303. {
  304. return ASTCENC_ERR_BAD_SWIZZLE;
  305. }
  306. return ASTCENC_SUCCESS;
  307. }
  308. #endif
  309. /**
  310. * @brief Validate single channel decompression swizzle.
  311. *
  312. * @param swizzle The swizzle to check.
  313. *
  314. * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
  315. */
  316. static astcenc_error validate_decompression_swz(
  317. astcenc_swz swizzle
  318. ) {
  319. // Values in this enum are from an external user, so not guaranteed to be
  320. // bounded to the enum values
  321. switch (static_cast<int>(swizzle))
  322. {
  323. case ASTCENC_SWZ_R:
  324. case ASTCENC_SWZ_G:
  325. case ASTCENC_SWZ_B:
  326. case ASTCENC_SWZ_A:
  327. case ASTCENC_SWZ_0:
  328. case ASTCENC_SWZ_1:
  329. case ASTCENC_SWZ_Z:
  330. return ASTCENC_SUCCESS;
  331. default:
  332. return ASTCENC_ERR_BAD_SWIZZLE;
  333. }
  334. }
  335. /**
  336. * @brief Validate overall decompression swizzle.
  337. *
  338. * @param swizzle The swizzle to check.
  339. *
  340. * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
  341. */
  342. static astcenc_error validate_decompression_swizzle(
  343. const astcenc_swizzle& swizzle
  344. ) {
  345. if (validate_decompression_swz(swizzle.r) ||
  346. validate_decompression_swz(swizzle.g) ||
  347. validate_decompression_swz(swizzle.b) ||
  348. validate_decompression_swz(swizzle.a))
  349. {
  350. return ASTCENC_ERR_BAD_SWIZZLE;
  351. }
  352. return ASTCENC_SUCCESS;
  353. }
  354. /**
  355. * Validate that an incoming configuration is in-spec.
  356. *
  357. * This function can respond in two ways:
  358. *
  359. * * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown
  360. * for out-of-range inputs in this case.
  361. * * Numerical inputs and logic inputs are are logically invalid and which make no sense
  362. * algorithmically will return an error.
  363. *
  364. * @param[in,out] config The input compressor configuration.
  365. *
  366. * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure.
  367. */
  368. static astcenc_error validate_config(
  369. astcenc_config &config
  370. ) {
  371. astcenc_error status;
  372. status = validate_profile(config.profile);
  373. if (status != ASTCENC_SUCCESS)
  374. {
  375. return status;
  376. }
  377. status = validate_flags(config.flags);
  378. if (status != ASTCENC_SUCCESS)
  379. {
  380. return status;
  381. }
  382. status = validate_block_size(config.block_x, config.block_y, config.block_z);
  383. if (status != ASTCENC_SUCCESS)
  384. {
  385. return status;
  386. }
  387. #if defined(ASTCENC_DECOMPRESS_ONLY)
  388. // Decompress-only builds only support decompress-only contexts
  389. if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY))
  390. {
  391. return ASTCENC_ERR_BAD_PARAM;
  392. }
  393. #endif
  394. config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f);
  395. config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u);
  396. config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
  397. config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
  398. config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS);
  399. config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u);
  400. config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u);
  401. config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES);
  402. config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
  403. config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
  404. config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIIONING_CANDIDATES);
  405. config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f);
  406. config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f);
  407. config.tune_2_partition_early_out_limit_factor = astc::max(config.tune_2_partition_early_out_limit_factor, 0.0f);
  408. config.tune_3_partition_early_out_limit_factor = astc::max(config.tune_3_partition_early_out_limit_factor, 0.0f);
  409. config.tune_2_plane_early_out_limit_correlation = astc::max(config.tune_2_plane_early_out_limit_correlation, 0.0f);
  410. // Specifying a zero weight color component is not allowed; force to small value
  411. float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight),
  412. astc::max(config.cw_b_weight, config.cw_a_weight));
  413. if (max_weight > 0.0f)
  414. {
  415. max_weight /= 1000.0f;
  416. config.cw_r_weight = astc::max(config.cw_r_weight, max_weight);
  417. config.cw_g_weight = astc::max(config.cw_g_weight, max_weight);
  418. config.cw_b_weight = astc::max(config.cw_b_weight, max_weight);
  419. config.cw_a_weight = astc::max(config.cw_a_weight, max_weight);
  420. }
  421. // If all color components error weights are zero then return an error
  422. else
  423. {
  424. return ASTCENC_ERR_BAD_PARAM;
  425. }
  426. return ASTCENC_SUCCESS;
  427. }
  428. /* See header for documentation. */
  429. astcenc_error astcenc_config_init(
  430. astcenc_profile profile,
  431. unsigned int block_x,
  432. unsigned int block_y,
  433. unsigned int block_z,
  434. float quality,
  435. unsigned int flags,
  436. astcenc_config* configp
  437. ) {
  438. astcenc_error status;
  439. // Check basic library compatibility options here so they are checked early. Note, these checks
  440. // are repeated in context_alloc for cases where callers use a manually defined config struct
  441. status = validate_cpu_isa();
  442. if (status != ASTCENC_SUCCESS)
  443. {
  444. return status;
  445. }
  446. status = validate_cpu_float();
  447. if (status != ASTCENC_SUCCESS)
  448. {
  449. return status;
  450. }
  451. // Zero init all config fields; although most of will be over written
  452. astcenc_config& config = *configp;
  453. std::memset(&config, 0, sizeof(config));
  454. // Process the block size
  455. block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1
  456. status = validate_block_size(block_x, block_y, block_z);
  457. if (status != ASTCENC_SUCCESS)
  458. {
  459. return status;
  460. }
  461. config.block_x = block_x;
  462. config.block_y = block_y;
  463. config.block_z = block_z;
  464. float texels = static_cast<float>(block_x * block_y * block_z);
  465. float ltexels = logf(texels) / logf(10.0f);
  466. // Process the performance quality level or preset; note that this must be done before we
  467. // process any additional settings, such as color profile and flags, which may replace some of
  468. // these settings with more use case tuned values
  469. if (quality < ASTCENC_PRE_FASTEST ||
  470. quality > ASTCENC_PRE_EXHAUSTIVE)
  471. {
  472. return ASTCENC_ERR_BAD_QUALITY;
  473. }
  474. static const std::array<astcenc_preset_config, 6>* preset_configs;
  475. int texels_int = block_x * block_y * block_z;
  476. if (texels_int < 25)
  477. {
  478. preset_configs = &preset_configs_high;
  479. }
  480. else if (texels_int < 64)
  481. {
  482. preset_configs = &preset_configs_mid;
  483. }
  484. else
  485. {
  486. preset_configs = &preset_configs_low;
  487. }
  488. // Determine which preset to use, or which pair to interpolate
  489. size_t start;
  490. size_t end;
  491. for (end = 0; end < preset_configs->size(); end++)
  492. {
  493. if ((*preset_configs)[end].quality >= quality)
  494. {
  495. break;
  496. }
  497. }
  498. start = end == 0 ? 0 : end - 1;
  499. // Start and end node are the same - so just transfer the values.
  500. if (start == end)
  501. {
  502. config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit;
  503. config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit;
  504. config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit;
  505. config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit;
  506. config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit;
  507. config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit;
  508. config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES);
  509. config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
  510. config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
  511. config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIIONING_CANDIDATES);
  512. config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels,
  513. (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels);
  514. config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot;
  515. config.tune_2_partition_early_out_limit_factor = (*preset_configs)[start].tune_2_partition_early_out_limit_factor;
  516. config.tune_3_partition_early_out_limit_factor =(*preset_configs)[start].tune_3_partition_early_out_limit_factor;
  517. config.tune_2_plane_early_out_limit_correlation = (*preset_configs)[start].tune_2_plane_early_out_limit_correlation;
  518. }
  519. // Start and end node are not the same - so interpolate between them
  520. else
  521. {
  522. auto& node_a = (*preset_configs)[start];
  523. auto& node_b = (*preset_configs)[end];
  524. float wt_range = node_b.quality - node_a.quality;
  525. assert(wt_range > 0);
  526. // Compute interpolation factors
  527. float wt_node_a = (node_b.quality - quality) / wt_range;
  528. float wt_node_b = (quality - node_a.quality) / wt_range;
  529. #define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b))
  530. #define LERPI(param) astc::flt2int_rtn(\
  531. (static_cast<float>(node_a.param) * wt_node_a) + \
  532. (static_cast<float>(node_b.param) * wt_node_b))
  533. #define LERPUI(param) static_cast<unsigned int>(LERPI(param))
  534. config.tune_partition_count_limit = LERPI(tune_partition_count_limit);
  535. config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit);
  536. config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit);
  537. config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit);
  538. config.tune_block_mode_limit = LERPI(tune_block_mode_limit);
  539. config.tune_refinement_limit = LERPI(tune_refinement_limit);
  540. config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit),
  541. TUNE_MAX_TRIAL_CANDIDATES);
  542. config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit),
  543. BLOCK_MAX_PARTITIONINGS);
  544. config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit),
  545. BLOCK_MAX_PARTITIONINGS);
  546. config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit),
  547. BLOCK_MAX_PARTITIONINGS);
  548. config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels,
  549. LERP(tune_db_limit_b_base) - 19 * ltexels);
  550. config.tune_mse_overshoot = LERP(tune_mse_overshoot);
  551. config.tune_2_partition_early_out_limit_factor = LERP(tune_2_partition_early_out_limit_factor);
  552. config.tune_3_partition_early_out_limit_factor = LERP(tune_3_partition_early_out_limit_factor);
  553. config.tune_2_plane_early_out_limit_correlation = LERP(tune_2_plane_early_out_limit_correlation);
  554. #undef LERP
  555. #undef LERPI
  556. #undef LERPUI
  557. }
  558. // Set heuristics to the defaults for each color profile
  559. config.cw_r_weight = 1.0f;
  560. config.cw_g_weight = 1.0f;
  561. config.cw_b_weight = 1.0f;
  562. config.cw_a_weight = 1.0f;
  563. config.a_scale_radius = 0;
  564. config.rgbm_m_scale = 0.0f;
  565. config.profile = profile;
  566. // Values in this enum are from an external user, so not guaranteed to be
  567. // bounded to the enum values
  568. switch (static_cast<int>(profile))
  569. {
  570. case ASTCENC_PRF_LDR:
  571. case ASTCENC_PRF_LDR_SRGB:
  572. break;
  573. case ASTCENC_PRF_HDR_RGB_LDR_A:
  574. case ASTCENC_PRF_HDR:
  575. config.tune_db_limit = 999.0f;
  576. break;
  577. default:
  578. return ASTCENC_ERR_BAD_PROFILE;
  579. }
  580. // Flags field must not contain any unknown flag bits
  581. status = validate_flags(flags);
  582. if (status != ASTCENC_SUCCESS)
  583. {
  584. return status;
  585. }
  586. if (flags & ASTCENC_FLG_MAP_NORMAL)
  587. {
  588. // Normal map encoding uses L+A blocks, so allow one more partitioning
  589. // than normal. We need need fewer bits for endpoints, so more likely
  590. // to be able to use more partitions than an RGB/RGBA block
  591. config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u);
  592. config.cw_g_weight = 0.0f;
  593. config.cw_b_weight = 0.0f;
  594. config.tune_2_partition_early_out_limit_factor *= 1.5f;
  595. config.tune_3_partition_early_out_limit_factor *= 1.5f;
  596. config.tune_2_plane_early_out_limit_correlation = 0.99f;
  597. // Normals are prone to blocking artifacts on smooth curves
  598. // so force compressor to try harder here ...
  599. config.tune_db_limit *= 1.03f;
  600. }
  601. else if (flags & ASTCENC_FLG_MAP_MASK)
  602. {
  603. // Masks are prone to blocking artifacts on mask edges
  604. // so force compressor to try harder here ...
  605. config.tune_db_limit *= 1.03f;
  606. }
  607. else if (flags & ASTCENC_FLG_MAP_RGBM)
  608. {
  609. config.rgbm_m_scale = 5.0f;
  610. config.cw_a_weight = 2.0f * config.rgbm_m_scale;
  611. }
  612. else // (This is color data)
  613. {
  614. // This is a very basic perceptual metric for RGB color data, which weights error
  615. // significance by the perceptual luminance contribution of each color channel. For
  616. // luminance the usual weights to compute luminance from a linear RGB value are as
  617. // follows:
  618. //
  619. // l = r * 0.3 + g * 0.59 + b * 0.11
  620. //
  621. // ... but we scale these up to keep a better balance between color and alpha. Note
  622. // that if the content is using alpha we'd recommend using the -a option to weight
  623. // the color contribution by the alpha transparency.
  624. if (flags & ASTCENC_FLG_USE_PERCEPTUAL)
  625. {
  626. config.cw_r_weight = 0.30f * 2.25f;
  627. config.cw_g_weight = 0.59f * 2.25f;
  628. config.cw_b_weight = 0.11f * 2.25f;
  629. }
  630. }
  631. config.flags = flags;
  632. return ASTCENC_SUCCESS;
  633. }
  634. /* See header for documentation. */
  635. astcenc_error astcenc_context_alloc(
  636. const astcenc_config* configp,
  637. unsigned int thread_count,
  638. astcenc_context** context
  639. ) {
  640. astcenc_error status;
  641. const astcenc_config& config = *configp;
  642. status = validate_cpu_isa();
  643. if (status != ASTCENC_SUCCESS)
  644. {
  645. return status;
  646. }
  647. status = validate_cpu_float();
  648. if (status != ASTCENC_SUCCESS)
  649. {
  650. return status;
  651. }
  652. if (thread_count == 0)
  653. {
  654. return ASTCENC_ERR_BAD_PARAM;
  655. }
  656. #if defined(ASTCENC_DIAGNOSTICS)
  657. // Force single threaded compressor use in diagnostic mode.
  658. if (thread_count != 1)
  659. {
  660. return ASTCENC_ERR_BAD_PARAM;
  661. }
  662. #endif
  663. astcenc_context* ctxo = new astcenc_context;
  664. astcenc_contexti* ctx = &ctxo->context;
  665. ctx->thread_count = thread_count;
  666. ctx->config = config;
  667. ctx->working_buffers = nullptr;
  668. // These are allocated per-compress, as they depend on image size
  669. ctx->input_alpha_averages = nullptr;
  670. // Copy the config first and validate the copy (we may modify it)
  671. status = validate_config(ctx->config);
  672. if (status != ASTCENC_SUCCESS)
  673. {
  674. delete ctxo;
  675. return status;
  676. }
  677. ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN);
  678. bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY);
  679. init_block_size_descriptor(config.block_x, config.block_y, config.block_z,
  680. can_omit_modes,
  681. config.tune_partition_count_limit,
  682. static_cast<float>(config.tune_block_mode_limit) / 100.0f,
  683. *ctx->bsd);
  684. #if !defined(ASTCENC_DECOMPRESS_ONLY)
  685. // Do setup only needed by compression
  686. if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY))
  687. {
  688. // Turn a dB limit into a per-texel error for faster use later
  689. if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB))
  690. {
  691. ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f;
  692. }
  693. else
  694. {
  695. ctx->config.tune_db_limit = 0.0f;
  696. }
  697. size_t worksize = sizeof(compression_working_buffers) * thread_count;
  698. ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN);
  699. static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0,
  700. "compression_working_buffers size must be multiple of vector alignment");
  701. if (!ctx->working_buffers)
  702. {
  703. aligned_free<block_size_descriptor>(ctx->bsd);
  704. delete ctxo;
  705. *context = nullptr;
  706. return ASTCENC_ERR_OUT_OF_MEM;
  707. }
  708. }
  709. #endif
  710. #if defined(ASTCENC_DIAGNOSTICS)
  711. ctx->trace_log = new TraceLog(ctx->config.trace_file_path);
  712. if (!ctx->trace_log->m_file)
  713. {
  714. return ASTCENC_ERR_DTRACE_FAILURE;
  715. }
  716. trace_add_data("block_x", config.block_x);
  717. trace_add_data("block_y", config.block_y);
  718. trace_add_data("block_z", config.block_z);
  719. #endif
  720. *context = ctxo;
  721. #if !defined(ASTCENC_DECOMPRESS_ONLY)
  722. prepare_angular_tables();
  723. #endif
  724. return ASTCENC_SUCCESS;
  725. }
/* See header for documentation. */
  727. void astcenc_context_free(
  728. astcenc_context* ctxo
  729. ) {
  730. if (ctxo)
  731. {
  732. astcenc_contexti* ctx = &ctxo->context;
  733. aligned_free<compression_working_buffers>(ctx->working_buffers);
  734. aligned_free<block_size_descriptor>(ctx->bsd);
  735. #if defined(ASTCENC_DIAGNOSTICS)
  736. delete ctx->trace_log;
  737. #endif
  738. delete ctxo;
  739. }
  740. }
  741. #if !defined(ASTCENC_DECOMPRESS_ONLY)
  742. /**
  743. * @brief Compress an image, after any preflight has completed.
  744. *
  745. * @param[out] ctxo The compressor context.
  746. * @param thread_index The thread index.
  747. * @param image The intput image.
  748. * @param swizzle The input swizzle.
  749. * @param[out] buffer The output array for the compressed data.
  750. */
  751. static void compress_image(
  752. astcenc_context& ctxo,
  753. unsigned int thread_index,
  754. const astcenc_image& image,
  755. const astcenc_swizzle& swizzle,
  756. uint8_t* buffer
  757. ) {
  758. astcenc_contexti& ctx = ctxo.context;
  759. const block_size_descriptor& bsd = *ctx.bsd;
  760. astcenc_profile decode_mode = ctx.config.profile;
  761. image_block blk;
  762. int block_x = bsd.xdim;
  763. int block_y = bsd.ydim;
  764. int block_z = bsd.zdim;
  765. blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
  766. int dim_x = image.dim_x;
  767. int dim_y = image.dim_y;
  768. int dim_z = image.dim_z;
  769. int xblocks = (dim_x + block_x - 1) / block_x;
  770. int yblocks = (dim_y + block_y - 1) / block_y;
  771. int zblocks = (dim_z + block_z - 1) / block_z;
  772. int block_count = zblocks * yblocks * xblocks;
  773. int row_blocks = xblocks;
  774. int plane_blocks = xblocks * yblocks;
  775. // Populate the block channel weights
  776. blk.channel_weight = vfloat4(ctx.config.cw_r_weight,
  777. ctx.config.cw_g_weight,
  778. ctx.config.cw_b_weight,
  779. ctx.config.cw_a_weight);
  780. // Use preallocated scratch buffer
  781. auto& temp_buffers = ctx.working_buffers[thread_index];
  782. // Only the first thread actually runs the initializer
  783. ctxo.manage_compress.init(block_count);
  784. // Determine if we can use an optimized load function
  785. bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) ||
  786. (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A);
  787. bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) ||
  788. (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A);
  789. bool use_fast_load = !needs_swz && !needs_hdr &&
  790. block_z == 1 && image.data_type == ASTCENC_TYPE_U8;
  791. auto load_func = load_image_block;
  792. if (use_fast_load)
  793. {
  794. load_func = load_image_block_fast_ldr;
  795. }
  796. // All threads run this processing loop until there is no work remaining
  797. while (true)
  798. {
  799. unsigned int count;
  800. unsigned int base = ctxo.manage_compress.get_task_assignment(16, count);
  801. if (!count)
  802. {
  803. break;
  804. }
  805. for (unsigned int i = base; i < base + count; i++)
  806. {
  807. // Decode i into x, y, z block indices
  808. int z = i / plane_blocks;
  809. unsigned int rem = i - (z * plane_blocks);
  810. int y = rem / row_blocks;
  811. int x = rem - (y * row_blocks);
  812. // Test if we can apply some basic alpha-scale RDO
  813. bool use_full_block = true;
  814. if (ctx.config.a_scale_radius != 0 && block_z == 1)
  815. {
  816. int start_x = x * block_x;
  817. int end_x = astc::min(dim_x, start_x + block_x);
  818. int start_y = y * block_y;
  819. int end_y = astc::min(dim_y, start_y + block_y);
  820. // SATs accumulate error, so don't test exactly zero. Test for
  821. // less than 1 alpha in the expanded block footprint that
  822. // includes the alpha radius.
  823. int x_footprint = block_x + 2 * (ctx.config.a_scale_radius - 1);
  824. int y_footprint = block_y + 2 * (ctx.config.a_scale_radius - 1);
  825. float footprint = static_cast<float>(x_footprint * y_footprint);
  826. float threshold = 0.9f / (255.0f * footprint);
  827. // Do we have any alpha values?
  828. use_full_block = false;
  829. for (int ay = start_y; ay < end_y; ay++)
  830. {
  831. for (int ax = start_x; ax < end_x; ax++)
  832. {
  833. float a_avg = ctx.input_alpha_averages[ay * dim_x + ax];
  834. if (a_avg > threshold)
  835. {
  836. use_full_block = true;
  837. ax = end_x;
  838. ay = end_y;
  839. }
  840. }
  841. }
  842. }
  843. // Fetch the full block for compression
  844. if (use_full_block)
  845. {
  846. load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle);
  847. // Scale RGB error contribution by the maximum alpha in the block
  848. // This encourages preserving alpha accuracy in regions with high
  849. // transparency, and can buy up to 0.5 dB PSNR.
  850. if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT)
  851. {
  852. float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f);
  853. blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale,
  854. ctx.config.cw_g_weight * alpha_scale,
  855. ctx.config.cw_b_weight * alpha_scale,
  856. ctx.config.cw_a_weight);
  857. }
  858. }
  859. // Apply alpha scale RDO - substitute constant color block
  860. else
  861. {
  862. blk.origin_texel = vfloat4::zero();
  863. blk.data_min = vfloat4::zero();
  864. blk.data_mean = vfloat4::zero();
  865. blk.data_max = vfloat4::zero();
  866. blk.grayscale = true;
  867. }
  868. int offset = ((z * yblocks + y) * xblocks + x) * 16;
  869. uint8_t *bp = buffer + offset;
  870. physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp);
  871. compress_block(ctx, blk, *pcb, temp_buffers);
  872. }
  873. ctxo.manage_compress.complete_task_assignment(count);
  874. }
  875. }
  876. /**
  877. * @brief Compute regional averages in an image.
  878. *
  879. * This function can be called by multiple threads, but only after a single
  880. * thread calls the setup function @c init_compute_averages().
  881. *
  882. * Results are written back into @c img->input_alpha_averages.
  883. *
  884. * @param[out] ctx The context.
  885. * @param ag The average and variance arguments created during setup.
  886. */
  887. static void compute_averages(
  888. astcenc_context& ctx,
  889. const avg_args &ag
  890. ) {
  891. pixel_region_args arg = ag.arg;
  892. arg.work_memory = new vfloat4[ag.work_memory_size];
  893. int size_x = ag.img_size_x;
  894. int size_y = ag.img_size_y;
  895. int size_z = ag.img_size_z;
  896. int step_xy = ag.blk_size_xy;
  897. int step_z = ag.blk_size_z;
  898. int y_tasks = (size_y + step_xy - 1) / step_xy;
  899. // All threads run this processing loop until there is no work remaining
  900. while (true)
  901. {
  902. unsigned int count;
  903. unsigned int base = ctx.manage_avg.get_task_assignment(16, count);
  904. if (!count)
  905. {
  906. break;
  907. }
  908. for (unsigned int i = base; i < base + count; i++)
  909. {
  910. int z = (i / (y_tasks)) * step_z;
  911. int y = (i - (z * y_tasks)) * step_xy;
  912. arg.size_z = astc::min(step_z, size_z - z);
  913. arg.offset_z = z;
  914. arg.size_y = astc::min(step_xy, size_y - y);
  915. arg.offset_y = y;
  916. for (int x = 0; x < size_x; x += step_xy)
  917. {
  918. arg.size_x = astc::min(step_xy, size_x - x);
  919. arg.offset_x = x;
  920. compute_pixel_region_variance(ctx.context, arg);
  921. }
  922. }
  923. ctx.manage_avg.complete_task_assignment(count);
  924. }
  925. delete[] arg.work_memory;
  926. }
  927. #endif
  928. /* See header for documentation. */
  929. astcenc_error astcenc_compress_image(
  930. astcenc_context* ctxo,
  931. astcenc_image* imagep,
  932. const astcenc_swizzle* swizzle,
  933. uint8_t* data_out,
  934. size_t data_len,
  935. unsigned int thread_index
  936. ) {
  937. #if defined(ASTCENC_DECOMPRESS_ONLY)
  938. (void)ctxo;
  939. (void)imagep;
  940. (void)swizzle;
  941. (void)data_out;
  942. (void)data_len;
  943. (void)thread_index;
  944. return ASTCENC_ERR_BAD_CONTEXT;
  945. #else
  946. astcenc_contexti* ctx = &ctxo->context;
  947. astcenc_error status;
  948. astcenc_image& image = *imagep;
  949. if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
  950. {
  951. return ASTCENC_ERR_BAD_CONTEXT;
  952. }
  953. status = validate_compression_swizzle(*swizzle);
  954. if (status != ASTCENC_SUCCESS)
  955. {
  956. return status;
  957. }
  958. if (thread_index >= ctx->thread_count)
  959. {
  960. return ASTCENC_ERR_BAD_PARAM;
  961. }
  962. unsigned int block_x = ctx->config.block_x;
  963. unsigned int block_y = ctx->config.block_y;
  964. unsigned int block_z = ctx->config.block_z;
  965. unsigned int xblocks = (image.dim_x + block_x - 1) / block_x;
  966. unsigned int yblocks = (image.dim_y + block_y - 1) / block_y;
  967. unsigned int zblocks = (image.dim_z + block_z - 1) / block_z;
  968. // Check we have enough output space (16 bytes per block)
  969. size_t size_needed = xblocks * yblocks * zblocks * 16;
  970. if (data_len < size_needed)
  971. {
  972. return ASTCENC_ERR_OUT_OF_MEM;
  973. }
  974. // If context thread count is one then implicitly reset
  975. if (ctx->thread_count == 1)
  976. {
  977. astcenc_compress_reset(ctxo);
  978. }
  979. if (ctx->config.a_scale_radius != 0)
  980. {
  981. // First thread to enter will do setup, other threads will subsequently
  982. // enter the critical section but simply skip over the initialization
  983. auto init_avg = [ctx, &image, swizzle]() {
  984. // Perform memory allocations for the destination buffers
  985. size_t texel_count = image.dim_x * image.dim_y * image.dim_z;
  986. ctx->input_alpha_averages = new float[texel_count];
  987. return init_compute_averages(
  988. image, ctx->config.a_scale_radius, *swizzle,
  989. ctx->avg_preprocess_args);
  990. };
  991. // Only the first thread actually runs the initializer
  992. ctxo->manage_avg.init(init_avg);
  993. // All threads will enter this function and dynamically grab work
  994. compute_averages(*ctxo, ctx->avg_preprocess_args);
  995. }
  996. // Wait for compute_averages to complete before compressing
  997. ctxo->manage_avg.wait();
  998. compress_image(*ctxo, thread_index, image, *swizzle, data_out);
  999. // Wait for compress to complete before freeing memory
  1000. ctxo->manage_compress.wait();
  1001. auto term_compress = [ctx]() {
  1002. delete[] ctx->input_alpha_averages;
  1003. ctx->input_alpha_averages = nullptr;
  1004. };
  1005. // Only the first thread to arrive actually runs the term
  1006. ctxo->manage_compress.term(term_compress);
  1007. return ASTCENC_SUCCESS;
  1008. #endif
  1009. }
  1010. /* See header for documentation. */
  1011. astcenc_error astcenc_compress_reset(
  1012. astcenc_context* ctxo
  1013. ) {
  1014. #if defined(ASTCENC_DECOMPRESS_ONLY)
  1015. (void)ctxo;
  1016. return ASTCENC_ERR_BAD_CONTEXT;
  1017. #else
  1018. astcenc_contexti* ctx = &ctxo->context;
  1019. if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)
  1020. {
  1021. return ASTCENC_ERR_BAD_CONTEXT;
  1022. }
  1023. ctxo->manage_avg.reset();
  1024. ctxo->manage_compress.reset();
  1025. return ASTCENC_SUCCESS;
  1026. #endif
  1027. }
  1028. /* See header for documentation. */
  1029. astcenc_error astcenc_decompress_image(
  1030. astcenc_context* ctxo,
  1031. const uint8_t* data,
  1032. size_t data_len,
  1033. astcenc_image* image_outp,
  1034. const astcenc_swizzle* swizzle,
  1035. unsigned int thread_index
  1036. ) {
  1037. astcenc_error status;
  1038. astcenc_image& image_out = *image_outp;
  1039. astcenc_contexti* ctx = &ctxo->context;
  1040. // Today this doesn't matter (working set on stack) but might in future ...
  1041. if (thread_index >= ctx->thread_count)
  1042. {
  1043. return ASTCENC_ERR_BAD_PARAM;
  1044. }
  1045. status = validate_decompression_swizzle(*swizzle);
  1046. if (status != ASTCENC_SUCCESS)
  1047. {
  1048. return status;
  1049. }
  1050. unsigned int block_x = ctx->config.block_x;
  1051. unsigned int block_y = ctx->config.block_y;
  1052. unsigned int block_z = ctx->config.block_z;
  1053. unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x;
  1054. unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y;
  1055. unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z;
  1056. int row_blocks = xblocks;
  1057. int plane_blocks = xblocks * yblocks;
  1058. // Check we have enough output space (16 bytes per block)
  1059. size_t size_needed = xblocks * yblocks * zblocks * 16;
  1060. if (data_len < size_needed)
  1061. {
  1062. return ASTCENC_ERR_OUT_OF_MEM;
  1063. }
  1064. image_block blk;
  1065. blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z);
  1066. // If context thread count is one then implicitly reset
  1067. if (ctx->thread_count == 1)
  1068. {
  1069. astcenc_decompress_reset(ctxo);
  1070. }
  1071. // Only the first thread actually runs the initializer
  1072. ctxo->manage_decompress.init(zblocks * yblocks * xblocks);
  1073. // All threads run this processing loop until there is no work remaining
  1074. while (true)
  1075. {
  1076. unsigned int count;
  1077. unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count);
  1078. if (!count)
  1079. {
  1080. break;
  1081. }
  1082. for (unsigned int i = base; i < base + count; i++)
  1083. {
  1084. // Decode i into x, y, z block indices
  1085. int z = i / plane_blocks;
  1086. unsigned int rem = i - (z * plane_blocks);
  1087. int y = rem / row_blocks;
  1088. int x = rem - (y * row_blocks);
  1089. unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16;
  1090. const uint8_t* bp = data + offset;
  1091. const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(bp);
  1092. symbolic_compressed_block scb;
  1093. physical_to_symbolic(*ctx->bsd, pcb, scb);
  1094. decompress_symbolic_block(ctx->config.profile, *ctx->bsd,
  1095. x * block_x, y * block_y, z * block_z,
  1096. scb, blk);
  1097. store_image_block(image_out, blk, *ctx->bsd,
  1098. x * block_x, y * block_y, z * block_z, *swizzle);
  1099. }
  1100. ctxo->manage_decompress.complete_task_assignment(count);
  1101. }
  1102. return ASTCENC_SUCCESS;
  1103. }
  1104. /* See header for documentation. */
  1105. astcenc_error astcenc_decompress_reset(
  1106. astcenc_context* ctxo
  1107. ) {
  1108. ctxo->manage_decompress.reset();
  1109. return ASTCENC_SUCCESS;
  1110. }
  1111. /* See header for documentation. */
  1112. astcenc_error astcenc_get_block_info(
  1113. astcenc_context* ctxo,
  1114. const uint8_t data[16],
  1115. astcenc_block_info* info
  1116. ) {
  1117. #if defined(ASTCENC_DECOMPRESS_ONLY)
  1118. (void)ctxo;
  1119. (void)data;
  1120. (void)info;
  1121. return ASTCENC_ERR_BAD_CONTEXT;
  1122. #else
  1123. astcenc_contexti* ctx = &ctxo->context;
  1124. // Decode the compressed data into a symbolic form
  1125. const physical_compressed_block&pcb = *reinterpret_cast<const physical_compressed_block*>(data);
  1126. symbolic_compressed_block scb;
  1127. physical_to_symbolic(*ctx->bsd, pcb, scb);
  1128. // Fetch the appropriate partition and decimation tables
  1129. block_size_descriptor& bsd = *ctx->bsd;
  1130. // Start from a clean slate
  1131. memset(info, 0, sizeof(*info));
  1132. // Basic info we can always populate
  1133. info->profile = ctx->config.profile;
  1134. info->block_x = ctx->config.block_x;
  1135. info->block_y = ctx->config.block_y;
  1136. info->block_z = ctx->config.block_z;
  1137. info->texel_count = bsd.texel_count;
  1138. // Check for error blocks first
  1139. info->is_error_block = scb.block_type == SYM_BTYPE_ERROR;
  1140. if (info->is_error_block)
  1141. {
  1142. return ASTCENC_SUCCESS;
  1143. }
  1144. // Check for constant color blocks second
  1145. info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 ||
  1146. scb.block_type == SYM_BTYPE_CONST_U16;
  1147. if (info->is_constant_block)
  1148. {
  1149. return ASTCENC_SUCCESS;
  1150. }
  1151. // Otherwise handle a full block ; known to be valid after conditions above have been checked
  1152. int partition_count = scb.partition_count;
  1153. const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index);
  1154. const block_mode& bm = bsd.get_block_mode(scb.block_mode);
  1155. const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode);
  1156. info->weight_x = di.weight_x;
  1157. info->weight_y = di.weight_y;
  1158. info->weight_z = di.weight_z;
  1159. info->is_dual_plane_block = bm.is_dual_plane != 0;
  1160. info->partition_count = scb.partition_count;
  1161. info->partition_index = scb.partition_index;
  1162. info->dual_plane_component = scb.plane2_component;
  1163. info->color_level_count = get_quant_level(scb.get_color_quant_mode());
  1164. info->weight_level_count = get_quant_level(bm.get_weight_quant_mode());
  1165. // Unpack color endpoints for each active partition
  1166. for (unsigned int i = 0; i < scb.partition_count; i++)
  1167. {
  1168. bool rgb_hdr;
  1169. bool a_hdr;
  1170. vint4 endpnt[2];
  1171. unpack_color_endpoints(ctx->config.profile,
  1172. scb.color_formats[i],
  1173. scb.color_values[i],
  1174. rgb_hdr, a_hdr,
  1175. endpnt[0], endpnt[1]);
  1176. // Store the color endpoint mode info
  1177. info->color_endpoint_modes[i] = scb.color_formats[i];
  1178. info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr;
  1179. // Store the unpacked and decoded color endpoint
  1180. vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr);
  1181. for (int j = 0; j < 2; j++)
  1182. {
  1183. vint4 color_lns = lns_to_sf16(endpnt[j]);
  1184. vint4 color_unorm = unorm16_to_sf16(endpnt[j]);
  1185. vint4 datai = select(color_unorm, color_lns, hdr_mask);
  1186. store(float16_to_float(datai), info->color_endpoints[i][j]);
  1187. }
  1188. }
  1189. // Unpack weights for each texel
  1190. int weight_plane1[BLOCK_MAX_TEXELS];
  1191. int weight_plane2[BLOCK_MAX_TEXELS];
  1192. unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2);
  1193. for (unsigned int i = 0; i < bsd.texel_count; i++)
  1194. {
  1195. info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
  1196. if (info->is_dual_plane_block)
  1197. {
  1198. info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM);
  1199. }
  1200. }
  1201. // Unpack partition assignments for each texel
  1202. for (unsigned int i = 0; i < bsd.texel_count; i++)
  1203. {
  1204. info->partition_assignment[i] = pi.partition_of_texel[i];
  1205. }
  1206. return ASTCENC_SUCCESS;
  1207. #endif
  1208. }
  1209. /* See header for documentation. */
  1210. const char* astcenc_get_error_string(
  1211. astcenc_error status
  1212. ) {
  1213. // Values in this enum are from an external user, so not guaranteed to be
  1214. // bounded to the enum values
  1215. switch (static_cast<int>(status))
  1216. {
  1217. case ASTCENC_SUCCESS:
  1218. return "ASTCENC_SUCCESS";
  1219. case ASTCENC_ERR_OUT_OF_MEM:
  1220. return "ASTCENC_ERR_OUT_OF_MEM";
  1221. case ASTCENC_ERR_BAD_CPU_FLOAT:
  1222. return "ASTCENC_ERR_BAD_CPU_FLOAT";
  1223. case ASTCENC_ERR_BAD_CPU_ISA:
  1224. return "ASTCENC_ERR_BAD_CPU_ISA";
  1225. case ASTCENC_ERR_BAD_PARAM:
  1226. return "ASTCENC_ERR_BAD_PARAM";
  1227. case ASTCENC_ERR_BAD_BLOCK_SIZE:
  1228. return "ASTCENC_ERR_BAD_BLOCK_SIZE";
  1229. case ASTCENC_ERR_BAD_PROFILE:
  1230. return "ASTCENC_ERR_BAD_PROFILE";
  1231. case ASTCENC_ERR_BAD_QUALITY:
  1232. return "ASTCENC_ERR_BAD_QUALITY";
  1233. case ASTCENC_ERR_BAD_FLAGS:
  1234. return "ASTCENC_ERR_BAD_FLAGS";
  1235. case ASTCENC_ERR_BAD_SWIZZLE:
  1236. return "ASTCENC_ERR_BAD_SWIZZLE";
  1237. case ASTCENC_ERR_BAD_CONTEXT:
  1238. return "ASTCENC_ERR_BAD_CONTEXT";
  1239. case ASTCENC_ERR_NOT_IMPLEMENTED:
  1240. return "ASTCENC_ERR_NOT_IMPLEMENTED";
  1241. #if defined(ASTCENC_DIAGNOSTICS)
  1242. case ASTCENC_ERR_DTRACE_FAILURE:
  1243. return "ASTCENC_ERR_DTRACE_FAILURE";
  1244. #endif
  1245. default:
  1246. return nullptr;
  1247. }
  1248. }