bcn_common_api.h 36 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408
  1. //===============================================================================
  2. // Copyright (c) 2021 Advanced Micro Devices, Inc. All rights reserved.
  3. //
  4. // Permission is hereby granted, free of charge, to any person obtaining a copy
  5. // of this software and associated documentation files(the "Software"), to deal
  6. // in the Software without restriction, including without limitation the rights to
  7. // use, copy, modify, merge, publish, distribute, sublicense, and / or sell
  8. // copies of the Software, and to permit persons to whom the Software is
  9. // furnished to do so, subject to the following conditions :
  10. //
  11. // The above copyright notice and this permission notice shall be included in
  12. // all copies or substantial portions of the Software.
  13. //
  14. // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15. // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16. // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
  17. // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18. // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19. // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  20. // THE SOFTWARE.
  21. //
  22. //===============================================================================
  23. #ifndef BCN_COMMON_API_H_
  24. #define BCN_COMMON_API_H_
  25. //===================================================================
  26. // NOTE: Do not use these API in production code, subject to changes
  27. //===================================================================
  28. #ifndef ASPM_GPU
  29. #pragma warning(disable : 4244)
  30. #pragma warning(disable : 4201)
  31. #endif
  32. #include "common_def.h"
  33. #define CMP_MAX_16BITFLOAT 65504.0f
  34. #define CMP_FLT_MAX 3.402823466e+38F
  35. #define BC1ConstructColour(r, g, b) (((r) << 11) | ((g) << 5) | (b))
  36. #ifdef ASPM_HLSL
  37. #define fabs(x) abs(x)
  38. #endif
  39. CMP_STATIC CGU_FLOAT cmp_fabs(CMP_IN CGU_FLOAT x)
  40. {
  41. return fabs(x);
  42. }
  43. CMP_STATIC CGU_FLOAT cmp_linearToSrgbf(CMP_IN CGU_FLOAT Color)
  44. {
  45. if (Color <= 0.0f)
  46. return (0.0f);
  47. if (Color >= 1.0f)
  48. return (1.0f);
  49. // standard : 0.0031308f
  50. if (Color <= 0.00313066844250063)
  51. return (Color * 12.92f);
  52. return (pow(fabs(Color), 1.0f / 2.4f) * 1.055f - 0.055f);
  53. }
  54. CMP_STATIC CGU_Vec3f cmp_linearToSrgb(CMP_IN CGU_Vec3f Color)
  55. {
  56. Color.x = cmp_linearToSrgbf(Color.x);
  57. Color.y = cmp_linearToSrgbf(Color.y);
  58. Color.z = cmp_linearToSrgbf(Color.z);
  59. return Color;
  60. }
  61. CMP_STATIC CGU_FLOAT cmp_srgbToLinearf(CMP_IN CGU_FLOAT Color)
  62. {
  63. if (Color <= 0.0f)
  64. return (0.0f);
  65. if (Color >= 1.0f)
  66. return (1.0f);
  67. // standard 0.04045f
  68. if (Color <= 0.0404482362771082)
  69. return (Color / 12.92f);
  70. return pow((Color + 0.055f) / 1.055f, 2.4f);
  71. }
  72. CMP_STATIC CGU_Vec3f cmp_srgbToLinear(CMP_IN CGU_Vec3f Color)
  73. {
  74. Color.x = cmp_srgbToLinearf(Color.x);
  75. Color.y = cmp_srgbToLinearf(Color.y);
  76. Color.z = cmp_srgbToLinearf(Color.z);
  77. return Color;
  78. }
  79. CMP_STATIC CGU_Vec3f cmp_565ToLinear(CMP_IN CGU_UINT32 n565)
  80. {
  81. CGU_UINT32 r0;
  82. CGU_UINT32 g0;
  83. CGU_UINT32 b0;
  84. r0 = ((n565 & 0xf800) >> 8);
  85. g0 = ((n565 & 0x07e0) >> 3);
  86. b0 = ((n565 & 0x001f) << 3);
  87. // Apply the lower bit replication to give full dynamic range (5,6,5)
  88. r0 += (r0 >> 5);
  89. g0 += (g0 >> 6);
  90. b0 += (b0 >> 5);
  91. CGU_Vec3f LinearColor;
  92. LinearColor.x = (CGU_FLOAT)r0;
  93. LinearColor.y = (CGU_FLOAT)g0;
  94. LinearColor.z = (CGU_FLOAT)b0;
  95. return LinearColor;
  96. }
  97. CMP_STATIC CGU_UINT32 cmp_get2Bit32(CMP_IN CGU_UINT32 value, CMP_IN CGU_UINT32 indexPos)
  98. {
  99. return (value >> (indexPos * 2)) & 0x3;
  100. }
  101. CMP_STATIC CGU_UINT32 cmp_set2Bit32(CMP_IN CGU_UINT32 value, CMP_IN CGU_UINT32 indexPos)
  102. {
  103. return ((value & 0x3) << (indexPos * 2));
  104. }
  105. CMP_STATIC CGU_UINT32 cmp_constructColor(CMP_IN CGU_UINT32 R, CMP_IN CGU_UINT32 G, CMP_IN CGU_UINT32 B)
  106. {
  107. return (((R & 0x000000F8) << 8) | ((G & 0x000000FC) << 3) | ((B & 0x000000F8) >> 3));
  108. }
  109. CMP_STATIC CGU_FLOAT cmp_clampf(CMP_IN CGU_FLOAT v, CMP_IN CGU_FLOAT a, CMP_IN CGU_FLOAT b)
  110. {
  111. if (v < a)
  112. return a;
  113. else if (v > b)
  114. return b;
  115. return v;
  116. }
  117. CMP_STATIC CGU_Vec3f cmp_clampVec3f(CMP_IN CGU_Vec3f value, CMP_IN CGU_FLOAT minValue, CMP_IN CGU_FLOAT maxValue)
  118. {
  119. #ifdef ASPM_GPU
  120. return clamp(value, minValue, maxValue);
  121. #else
  122. CGU_Vec3f revalue;
  123. revalue.x = cmp_clampf(value.x, minValue, maxValue);
  124. revalue.y = cmp_clampf(value.y, minValue, maxValue);
  125. revalue.z = cmp_clampf(value.z, minValue, maxValue);
  126. return revalue;
  127. #endif
  128. }
  129. CMP_STATIC CGU_Vec3f cmp_saturate(CMP_IN CGU_Vec3f value)
  130. {
  131. #ifdef ASPM_HLSL
  132. return saturate(value);
  133. #else
  134. return cmp_clampVec3f(value, 0.0f, 1.0f);
  135. #endif
  136. }
  137. static CGU_Vec3f cmp_powVec3f(CGU_Vec3f color, CGU_FLOAT ex)
  138. {
  139. #ifdef ASPM_GPU
  140. return pow(color, ex);
  141. #else
  142. CGU_Vec3f ColorSrgbPower;
  143. ColorSrgbPower.x = pow(color.x, ex);
  144. ColorSrgbPower.y = pow(color.y, ex);
  145. ColorSrgbPower.z = pow(color.z, ex);
  146. return ColorSrgbPower;
  147. #endif
  148. }
  149. CMP_STATIC CGU_Vec3f cmp_minVec3f(CMP_IN CGU_Vec3f a, CMP_IN CGU_Vec3f b)
  150. {
  151. #ifdef ASPM_HLSL
  152. return min(a, b);
  153. #endif
  154. CGU_Vec3f res;
  155. if (a.x < b.x)
  156. res.x = a.x;
  157. else
  158. res.x = b.x;
  159. if (a.y < b.y)
  160. res.y = a.y;
  161. else
  162. res.y = b.y;
  163. if (a.z < b.z)
  164. res.z = a.z;
  165. else
  166. res.z = b.z;
  167. return res;
  168. }
  169. CMP_STATIC CGU_Vec3f cmp_maxVec3f(CMP_IN CGU_Vec3f a, CMP_IN CGU_Vec3f b)
  170. {
  171. #ifdef ASPM_HLSL
  172. return max(a, b);
  173. #endif
  174. CGU_Vec3f res;
  175. if (a.x > b.x)
  176. res.x = a.x;
  177. else
  178. res.x = b.x;
  179. if (a.y > b.y)
  180. res.y = a.y;
  181. else
  182. res.y = b.y;
  183. if (a.z > b.z)
  184. res.z = a.z;
  185. else
  186. res.z = b.z;
  187. return res;
  188. }
  189. inline CGU_Vec3f cmp_min3f(CMP_IN CGU_Vec3f value1, CMP_IN CGU_Vec3f value2)
  190. {
  191. #ifdef ASPM_GPU
  192. return min(value1, value2);
  193. #else
  194. CGU_Vec3f res;
  195. res.x = CMP_MIN(value1.x, value2.x);
  196. res.y = CMP_MIN(value1.y, value2.y);
  197. res.z = CMP_MIN(value1.z, value2.z);
  198. return res;
  199. #endif
  200. }
  201. inline CGU_Vec3f cmp_max3f(CMP_IN CGU_Vec3f value1, CMP_IN CGU_Vec3f value2)
  202. {
  203. #ifdef ASPM_GPU
  204. return max(value1, value2);
  205. #else
  206. CGU_Vec3f res;
  207. res.x = CMP_MAX(value1.x, value2.x);
  208. res.y = CMP_MAX(value1.y, value2.y);
  209. res.z = CMP_MAX(value1.z, value2.z);
  210. return res;
  211. #endif
  212. }
  213. CMP_STATIC CGU_FLOAT cmp_minf(CMP_IN CGU_FLOAT a, CMP_IN CGU_FLOAT b)
  214. {
  215. return a < b ? a : b;
  216. }
  217. CMP_STATIC CGU_FLOAT cmp_maxf(CMP_IN CGU_FLOAT a, CMP_IN CGU_FLOAT b)
  218. {
  219. return a > b ? a : b;
  220. }
  221. CMP_STATIC CGU_FLOAT cmp_floor(CMP_IN CGU_FLOAT value)
  222. {
  223. return floor(value);
  224. }
  225. CMP_STATIC CGU_Vec3f cmp_floorVec3f(CMP_IN CGU_Vec3f value)
  226. {
  227. #ifdef ASPM_GPU
  228. return floor(value);
  229. #else
  230. CGU_Vec3f revalue;
  231. revalue.x = floor(value.x);
  232. revalue.y = floor(value.y);
  233. revalue.z = floor(value.z);
  234. return revalue;
  235. #endif
  236. }
  237. #ifndef ASPM_OPENCL
  238. //=======================================================
  239. // COMMON GPU & CPU API
  240. //=======================================================
  241. //======================
  242. // implicit vector cast
  243. //======================
  244. CMP_STATIC CGU_Vec4i cmp_castimp(CGU_Vec4ui v1)
  245. {
  246. #ifdef ASPM_HLSL
  247. return (v1);
  248. #else
  249. return (v1.x, v1.y, v1.z, v1.w);
  250. #endif
  251. }
  252. CMP_STATIC CGU_Vec3i cmp_castimp(CGU_Vec3ui v1)
  253. {
  254. #ifdef ASPM_HLSL
  255. return (v1);
  256. #else
  257. return (v1.x, v1.y, v1.z);
  258. #endif
  259. }
  260. //======================
  261. // Min / Max
  262. //======================
  263. CMP_STATIC CGU_UINT8 cmp_min8(CMP_IN CGU_UINT8 a, CMP_IN CGU_UINT8 b)
  264. {
  265. return a < b ? a : b;
  266. }
  267. CMP_STATIC CGU_UINT8 cmp_max8(CMP_IN CGU_UINT8 a, CMP_IN CGU_UINT8 b)
  268. {
  269. return a > b ? a : b;
  270. }
  271. CMP_STATIC CGU_UINT32 cmp_mini(CMP_IN CGU_UINT32 a, CMP_IN CGU_UINT32 b)
  272. {
  273. return (a < b) ? a : b;
  274. }
  275. CMP_STATIC CGU_UINT32 cmp_maxi(CMP_IN CGU_UINT32 a, CMP_IN CGU_UINT32 b)
  276. {
  277. return (a > b) ? a : b;
  278. }
  279. CMP_STATIC CGU_FLOAT cmp_max3(CMP_IN CGU_FLOAT i, CMP_IN CGU_FLOAT j, CMP_IN CGU_FLOAT k)
  280. {
  281. #ifdef ASPM_GLSL
  282. return max3(i, j, k);
  283. #else
  284. CGU_FLOAT max = i;
  285. if (max < j)
  286. max = j;
  287. if (max < k)
  288. max = k;
  289. return (max);
  290. #endif
  291. }
  292. CMP_STATIC CGU_Vec4ui cmp_minVec4ui(CMP_IN CGU_Vec4ui a, CMP_IN CGU_Vec4ui b)
  293. {
  294. //#ifdef ASPM_HLSL
  295. // return min(a, b);
  296. //#endif
  297. //#ifndef ASPM_GPU
  298. CGU_Vec4ui res;
  299. if (a.x < b.x)
  300. res.x = a.x;
  301. else
  302. res.x = b.x;
  303. if (a.y < b.y)
  304. res.y = a.y;
  305. else
  306. res.y = b.y;
  307. if (a.z < b.z)
  308. res.z = a.z;
  309. else
  310. res.z = b.z;
  311. if (a.w < b.w)
  312. res.w = a.w;
  313. else
  314. res.w = b.w;
  315. return res;
  316. //#endif
  317. }
  318. CMP_STATIC CGU_Vec4ui cmp_maxVec4ui(CMP_IN CGU_Vec4ui a, CMP_IN CGU_Vec4ui b)
  319. {
  320. //#ifdef ASPM_HLSL
  321. // return max(a, b);
  322. //#endif
  323. //#ifndef ASPM_GPU
  324. CGU_Vec4ui res;
  325. if (a.x > b.x)
  326. res.x = a.x;
  327. else
  328. res.x = b.x;
  329. if (a.y > b.y)
  330. res.y = a.y;
  331. else
  332. res.y = b.y;
  333. if (a.z > b.z)
  334. res.z = a.z;
  335. else
  336. res.z = b.z;
  337. if (a.w > b.w)
  338. res.w = a.w;
  339. else
  340. res.w = b.w;
  341. return res;
  342. //#endif
  343. }
  344. //======================
  345. // Clamps
  346. //======================
  347. CMP_STATIC CGU_UINT32 cmp_clampui32(CMP_IN CGU_UINT32 v, CMP_IN CGU_UINT32 a, CMP_IN CGU_UINT32 b)
  348. {
  349. if (v < a)
  350. return a;
  351. else if (v > b)
  352. return b;
  353. return v;
  354. }
  355. // Test Ref:https://en.wikipedia.org/wiki/Half-precision_floating-point_format
  356. // Half (in Hex) Float Comment
  357. // ---------------------------------------------------------------------------
  358. // 0001 (approx) = 0.000000059604645 smallest positive subnormal number
  359. // 03ff (approx) = 0.000060975552 largest subnormal number
  360. // 0400 (approx) = 0.00006103515625 smallest positive normal number
  361. // 7bff (approx) = 65504 largest normal number
  362. // 3bff (approx) = 0.99951172 largest number less than one
  363. // 3c00 (approx) = 1.00097656 smallest number larger than one
  364. // 3555 = 0.33325195 the rounding of 1/3 to nearest
  365. // c000 = -2
  366. // 8000 = -0
  367. // 0000 = 0
  368. // 7c00 = infinity
  369. // fc00 = -infinity
  370. // Half Float Math
// Expand a 16-bit half (in the low bits of h) to a 32-bit float.
// CPU path is the classic bit-level trick: shift exponent+mantissa into float
// position, rescale the exponent by multiplying with a magic constant, then
// patch Inf/NaN and the sign bit.
// NOTE(review): the GPU path converts h NUMERICALLY via min16float((float)h),
// which is not a bit-level reinterpretation — confirm this is intentional for
// GPU callers.
CMP_STATIC CGU_FLOAT HalfToFloat(CGU_UINT32 h)
{
#if defined(ASPM_GPU)
    CGU_FLOAT f = min16float((float)(h));
    return f;
#else
    // Union for type punning between the float value and its bit pattern.
    union FP32
    {
        CGU_UINT32 u;
        CGU_FLOAT  f;
    };
    const FP32 magic      = {(254 - 15) << 23};  // 2^(127-15): rebias half exp -> float exp
    const FP32 was_infnan = {(127 + 16) << 23};  // threshold: values at/above were half Inf/NaN
    FP32 o;
    o.u = (h & 0x7fff) << 13;  // exponent/mantissa bits
    o.f *= magic.f;            // exponent adjust
    if (o.f >= was_infnan.f)   // check Inf/NaN
        o.u |= 255 << 23;      // force float exponent to all-ones
    o.u |= (h & 0x8000) << 16; // sign bit
    return o.f;
#endif
}
// From BC6HEcode.hlsl
// Assemble the 32-bit float bit pattern corresponding to the 16-bit half in
// Value: copy sign, rebias exponent (+112 = 127-15), shift mantissa by 13,
// normalizing denormal halves along the way.
CMP_STATIC CGU_FLOAT cmp_half2float1(CGU_UINT32 Value)
{
    CGU_UINT32 Mantissa = (CGU_UINT32)(Value & 0x03FF);
    CGU_UINT32 Exponent;
    if ((Value & 0x7C00) != 0)  // The value is normalized
    {
        Exponent = (CGU_UINT32)((Value >> 10) & 0x1F);
    }
    else if (Mantissa != 0)  // The value is denormalized
    {
        // Normalize the value in the resulting float:
        // shift the mantissa left until its hidden bit appears, decrementing
        // the (wrapping) exponent for each shift.
        Exponent = 1;
        do
        {
            Exponent--;
            Mantissa <<= 1;
        } while ((Mantissa & 0x0400) == 0);
        Mantissa &= 0x03FF;
    }
    else  // The value is zero
    {
        Exponent = (CGU_UINT32)(-112);  // chosen so (Exponent + 112) == 0 below
    }
    CGU_UINT32 Result = ((Value & 0x8000) << 16) |  // Sign
                        ((Exponent + 112) << 23) |  // Exponent
                        (Mantissa << 13);           // Mantissa
    // NOTE(review): CGU_FLOAT(Result) converts the assembled bit pattern to
    // float NUMERICALLY (producing the value of the integer Result), not via a
    // bit reinterpretation such as HLSL asfloat(). The DirectX original this
    // derives from reinterprets the bits — confirm whether callers depend on
    // the current numeric behavior before changing it.
    return CGU_FLOAT(Result);
}
  422. CMP_STATIC CGU_Vec3f cmp_half2floatVec3(CGU_Vec3ui color_h)
  423. {
  424. //uint3 sign = color_h & 0x8000;
  425. //uint3 expo = color_h & 0x7C00;
  426. //uint3 base = color_h & 0x03FF;
  427. //return ( expo == 0 ) ? asfloat( ( sign << 16 ) | asuint( float3(base) / 16777216 ) ) //16777216 = 2^24
  428. // : asfloat( ( sign << 16 ) | ( ( ( expo + 0x1C000 ) | base ) << 13 ) ); //0x1C000 = 0x1FC00 - 0x3C00
  429. return CGU_Vec3f(cmp_half2float1(color_h.x), cmp_half2float1(color_h.y), cmp_half2float1(color_h.z));
  430. }
// Convert a 32-bit float to its 16-bit half representation (CPU only; the GPU
// path is a stub returning 0). Handles zero/denormal, Inf/NaN, overflow to
// infinity, underflow to denormal, and round-to-nearest on the mantissa.
// NOTE(review): relies on implementation-defined bitfield layout (LSB-first
// allocation, little-endian) — true for the MSVC/GCC/Clang targets this
// project builds on, but not guaranteed by the standard.
CMP_STATIC CGU_UINT16 FloatToHalf(CGU_FLOAT value)
{
#if defined(ASPM_GPU)
    return 0;
#else
    union FP32
    {
        // NOTE(review): u is declared CGU_UINT16 here although the float f and
        // the bitfields are 32 bits wide; u is never read in this function, but
        // it looks like it was meant to be CGU_UINT32 — confirm before reuse.
        CGU_UINT16 u;
        float f;
        struct
        {
            CGU_UINT32 Mantissa : 23;
            CGU_UINT32 Exponent : 8;
            CGU_UINT32 Sign : 1;
        };
    };
    union FP16
    {
        CGU_UINT16 u;
        struct
        {
            CGU_UINT32 Mantissa : 10;
            CGU_UINT32 Exponent : 5;
            CGU_UINT32 Sign : 1;
        };
    };
    FP16 o = {0};
    FP32 f;
    f.f = value;
    // Based on ISPC reference code (with minor modifications)
    if (f.Exponent == 0)  // Signed zero/denormal (which will underflow)
        o.Exponent = 0;
    else if (f.Exponent == 255)  // Inf or NaN (all exponent bits set)
    {
        o.Exponent = 31;
        o.Mantissa = f.Mantissa ? 0x200 : 0;  // NaN->qNaN and Inf->Inf
    }
    else  // Normalized number
    {
        // Exponent unbias the single, then bias the halfp
        int newexp = f.Exponent - 127 + 15;
        if (newexp >= 31)  // Overflow, return signed infinity
            o.Exponent = 31;
        else if (newexp <= 0)  // Underflow
        {
            if ((14 - newexp) <= 24)  // Mantissa might be non-zero
            {
                CGU_UINT32 mant = f.Mantissa | 0x800000;  // Hidden 1 bit
                o.Mantissa = mant >> (14 - newexp);
                if ((mant >> (13 - newexp)) & 1)  // Check for rounding
                    o.u++;  // Round, might overflow into exp bit, but this is OK
            }
        }
        else
        {
            o.Exponent = newexp;
            o.Mantissa = f.Mantissa >> 13;
            if (f.Mantissa & 0x1000)  // Check for rounding
                o.u++;  // Round, might overflow to inf, this is OK
        }
    }
    o.Sign = f.Sign;
    return o.u;
#endif
}
// Pack a float into a 16-bit half bit pattern (returned widened to 32 bits),
// saturating to half infinity above the largest representable half and
// producing denormals below the smallest normal half. Rounds to nearest-even.
CMP_STATIC CGU_UINT32 cmp_float2halfui(CGU_FLOAT f)
{
    CGU_UINT32 Result;
    // NOTE(review): CGU_UINT32(f) converts f NUMERICALLY, not as a bit
    // reinterpretation — the identical sibling cmp_float2half1 carries an
    // "//asuint(f)" comment suggesting a bit cast was intended. Confirm which
    // behavior callers depend on before changing.
    CGU_UINT32 IValue = CGU_UINT32(f);
    CGU_UINT32 Sign = (IValue & 0x80000000U) >> 16U;  // move sign into half position
    IValue = IValue & 0x7FFFFFFFU;
    if (IValue > 0x47FFEFFFU)
    {
        // The number is too large to be represented as a half. Saturate to infinity.
        Result = 0x7FFFU;
    }
    else
    {
        if (IValue < 0x38800000U)
        {
            // The number is too small to be represented as a normalized half.
            // Convert it to a denormalized value.
            CGU_UINT32 Shift = 113U - (IValue >> 23U);
            IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
        }
        else
        {
            // Rebias the exponent to represent the value as a normalized half.
            IValue += 0xC8000000U;
        }
        // Shift into half position with round-to-nearest-even.
        Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U) & 0x7FFFU;
    }
    return (Result | Sign);
}
  525. CMP_STATIC CGU_Vec3ui cmp_float2half(CGU_Vec3f endPoint_f)
  526. {
  527. return CGU_Vec3ui(cmp_float2halfui(endPoint_f.x), cmp_float2halfui(endPoint_f.y), cmp_float2halfui(endPoint_f.z));
  528. }
  529. CMP_STATIC CGU_UINT32 cmp_float2half1(CGU_FLOAT f)
  530. {
  531. CGU_UINT32 Result;
  532. CGU_UINT32 IValue = CGU_UINT32(f); //asuint(f);
  533. CGU_UINT32 Sign = (IValue & 0x80000000U) >> 16U;
  534. IValue = IValue & 0x7FFFFFFFU;
  535. if (IValue > 0x47FFEFFFU)
  536. {
  537. // The number is too large to be represented as a half. Saturate to infinity.
  538. Result = 0x7FFFU;
  539. }
  540. else
  541. {
  542. if (IValue < 0x38800000U)
  543. {
  544. // The number is too small to be represented as a normalized half.
  545. // Convert it to a denormalized value.
  546. CGU_UINT32 Shift = 113U - (IValue >> 23U);
  547. IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
  548. }
  549. else
  550. {
  551. // Rebias the exponent to represent the value as a normalized half.
  552. IValue += 0xC8000000U;
  553. }
  554. Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U) & 0x7FFFU;
  555. }
  556. return (Result | Sign);
  557. }
  558. CMP_STATIC CGU_Vec3ui cmp_float2halfVec3(CGU_Vec3f endPoint_f)
  559. {
  560. return CGU_Vec3ui(cmp_float2half1(endPoint_f.x), cmp_float2half1(endPoint_f.y), cmp_float2half1(endPoint_f.z));
  561. }
  562. CMP_STATIC CGU_FLOAT cmp_f32tof16(CMP_IN CGU_FLOAT value)
  563. {
  564. #ifdef ASPM_GLSL
  565. return packHalf2x16(CGU_Vec2f(value.x, 0.0));
  566. #endif
  567. #ifdef ASPM_HLSL
  568. return f32tof16(value);
  569. #endif
  570. #ifndef ASPM_GPU
  571. return FloatToHalf(value);
  572. #endif
  573. }
  574. CMP_STATIC CGU_Vec3f cmp_f32tof16(CMP_IN CGU_Vec3f value)
  575. {
  576. #ifdef ASPM_GLSL
  577. return CGU_Vec3f(packHalf2x16(CGU_Vec2f(value.x, 0.0)), packHalf2x16(CGU_Vec2f(value.y, 0.0)), packHalf2x16(CGU_Vec2f(value.z, 0.0)));
  578. #endif
  579. #ifdef ASPM_HLSL
  580. return f32tof16(value);
  581. #endif
  582. #ifndef ASPM_GPU
  583. CGU_Vec3f res;
  584. res.x = FloatToHalf(value.x);
  585. res.y = FloatToHalf(value.y);
  586. res.z = FloatToHalf(value.z);
  587. return res;
  588. #endif
  589. }
  590. CMP_STATIC CGU_FLOAT cmp_f16tof32(CGU_UINT32 value)
  591. {
  592. #ifdef ASPM_GLSL
  593. return unpackHalf2x16(value).x;
  594. #endif
  595. #ifdef ASPM_HLSL
  596. return f16tof32(value);
  597. #endif
  598. #ifndef ASPM_GPU
  599. return HalfToFloat(value);
  600. #endif
  601. }
  602. CMP_STATIC CGU_Vec3f cmp_f16tof32(CGU_Vec3ui value)
  603. {
  604. #ifdef ASPM_GLSL
  605. return CGU_Vec3f(unpackHalf2x16(value.x).x, unpackHalf2x16(value.y).x, unpackHalf2x16(value.z).x);
  606. #endif
  607. #ifdef ASPM_HLSL
  608. return f16tof32(value);
  609. #endif
  610. #ifndef ASPM_GPU
  611. CGU_Vec3f res;
  612. res.x = HalfToFloat(value.x);
  613. res.y = HalfToFloat(value.y);
  614. res.z = HalfToFloat(value.z);
  615. return res;
  616. #endif
  617. }
  618. CMP_STATIC CGU_Vec3f cmp_f16tof32(CGU_Vec3f value)
  619. {
  620. #ifdef ASPM_GLSL
  621. return CGU_Vec3f(unpackHalf2x16(value.x).x, unpackHalf2x16(value.y).x, unpackHalf2x16(value.z).x);
  622. #endif
  623. #ifdef ASPM_HLSL
  624. return f16tof32(value);
  625. #endif
  626. #ifndef ASPM_GPU
  627. CGU_Vec3f res;
  628. res.x = HalfToFloat((CGU_UINT32)value.x);
  629. res.y = HalfToFloat((CGU_UINT32)value.y);
  630. res.z = HalfToFloat((CGU_UINT32)value.z);
  631. return res;
  632. #endif
  633. }
  634. CMP_STATIC void cmp_swap(CMP_INOUT CGU_Vec3f CMP_REFINOUT a, CMP_INOUT CGU_Vec3f CMP_REFINOUT b)
  635. {
  636. CGU_Vec3f tmp = a;
  637. a = b;
  638. b = tmp;
  639. }
  640. CMP_STATIC void cmp_swap(CMP_INOUT CGU_FLOAT CMP_REFINOUT a, CMP_INOUT CGU_FLOAT CMP_REFINOUT b)
  641. {
  642. CGU_FLOAT tmp = a;
  643. a = b;
  644. b = tmp;
  645. }
  646. CMP_STATIC void cmp_swap(CMP_INOUT CGU_Vec3i CMP_REFINOUT lhs, CMP_INOUT CGU_Vec3i CMP_REFINOUT rhs) // valided with msc code
  647. {
  648. CGU_Vec3i tmp = lhs;
  649. lhs = rhs;
  650. rhs = tmp;
  651. }
  652. CMP_STATIC CGU_INT cmp_dotVec2i(CMP_IN CGU_Vec2i value1, CMP_IN CGU_Vec2i value2)
  653. {
  654. #ifdef ASPM_GPU
  655. return dot(value1, value2);
  656. #else
  657. return (value1.x * value2.x) + (value1.y * value2.y);
  658. #endif
  659. }
  660. CMP_STATIC CGU_FLOAT cmp_dotVec3f(CMP_IN CGU_Vec3f value1, CMP_IN CGU_Vec3f value2)
  661. {
  662. #ifdef ASPM_GPU
  663. return dot(value1, value2);
  664. #else
  665. return (value1.x * value2.x) + (value1.y * value2.y) + (value1.z * value2.z);
  666. #endif
  667. }
  668. CMP_STATIC CGU_UINT32 cmp_dotVec3ui(CMP_IN CGU_Vec3ui value1, CMP_IN CGU_Vec3ui value2)
  669. {
  670. #ifdef ASPM_GPU
  671. return dot(value1, value2);
  672. #else
  673. return (value1.x * value2.x) + (value1.y * value2.y) + (value1.z * value2.z);
  674. #endif
  675. }
  676. CMP_STATIC CGU_UINT32 cmp_dotVec4i(CMP_IN CGU_Vec4i value1, CMP_IN CGU_Vec4i value2)
  677. {
  678. #ifdef ASPM_GPU
  679. return dot(value1, value2);
  680. #else
  681. return (value1.x * value2.x) + (value1.y * value2.y) + (value1.z * value2.z) + (value1.w * value2.w);
  682. #endif
  683. }
  684. CMP_STATIC CGU_UINT32 cmp_dotVec4ui(CMP_IN CGU_Vec4ui value1, CMP_IN CGU_Vec4ui value2)
  685. {
  686. #ifdef ASPM_GPU
  687. return dot(value1, value2);
  688. #else
  689. return (value1.x * value2.x) + (value1.y * value2.y) + (value1.z * value2.z) + (value1.w * value2.w);
  690. #endif
  691. }
  692. CMP_STATIC CGU_Vec3f cmp_clampVec3fi(CMP_IN CGU_Vec3f value, CMP_IN CGU_INT minValue, CMP_IN CGU_INT maxValue)
  693. {
  694. #ifdef ASPM_GPU
  695. return clamp(value, minValue, maxValue);
  696. #else
  697. CGU_Vec3f revalue;
  698. revalue.x = cmp_clampf(value.x, (CGU_FLOAT)minValue, (CGU_FLOAT)maxValue);
  699. revalue.y = cmp_clampf(value.y, (CGU_FLOAT)minValue, (CGU_FLOAT)maxValue);
  700. revalue.z = cmp_clampf(value.z, (CGU_FLOAT)minValue, (CGU_FLOAT)maxValue);
  701. return revalue;
  702. #endif
  703. }
  704. CMP_STATIC CGU_Vec4ui cmp_clampVec4ui(CMP_IN CGU_Vec4ui value, CMP_IN CGU_UINT32 minValue, CMP_IN CGU_UINT32 maxValue)
  705. {
  706. #ifdef ASPM_GPU
  707. return clamp(value, minValue, maxValue);
  708. #else
  709. CGU_Vec4ui revalue;
  710. revalue.x = cmp_clampui32(value.x, minValue, maxValue);
  711. revalue.y = cmp_clampui32(value.y, minValue, maxValue);
  712. revalue.z = cmp_clampui32(value.z, minValue, maxValue);
  713. revalue.w = cmp_clampui32(value.w, minValue, maxValue);
  714. return revalue;
  715. #endif
  716. }
  717. CMP_STATIC CGU_Vec4f cmp_clampVec4f(CMP_IN CGU_Vec4f value, CMP_IN CGU_FLOAT minValue, CMP_IN CGU_FLOAT maxValue)
  718. {
  719. #ifdef ASPM_GPU
  720. return clamp(value, minValue, maxValue);
  721. #else
  722. CGU_Vec4f revalue;
  723. revalue.x = cmp_clampf(value.x, minValue, maxValue);
  724. revalue.y = cmp_clampf(value.y, minValue, maxValue);
  725. revalue.z = cmp_clampf(value.z, minValue, maxValue);
  726. revalue.w = cmp_clampf(value.w, minValue, maxValue);
  727. return revalue;
  728. #endif
  729. }
  730. CMP_STATIC CGU_Vec3f cmp_clamp3Vec3f(CMP_IN CGU_Vec3f value, CMP_IN CGU_Vec3f minValue, CMP_IN CGU_Vec3f maxValue)
  731. {
  732. #ifdef ASPM_GPU
  733. return clamp(value, minValue, maxValue);
  734. #else
  735. CGU_Vec3f revalue;
  736. revalue.x = cmp_clampf(value.x, minValue.x, maxValue.x);
  737. revalue.y = cmp_clampf(value.y, minValue.y, maxValue.y);
  738. revalue.z = cmp_clampf(value.z, minValue.z, maxValue.z);
  739. return revalue;
  740. #endif
  741. }
  742. CMP_STATIC CGU_Vec3f cmp_exp2(CMP_IN CGU_Vec3f value)
  743. {
  744. #ifdef ASPM_GPU
  745. return exp2(value);
  746. #else
  747. CGU_Vec3f revalue;
  748. revalue.x = exp2(value.x);
  749. revalue.y = exp2(value.y);
  750. revalue.z = exp2(value.z);
  751. return revalue;
  752. #endif
  753. }
  754. CMP_STATIC CGU_Vec3f cmp_roundVec3f(CMP_IN CGU_Vec3f value)
  755. {
  756. #ifdef ASPM_HLSL
  757. return round(value);
  758. #endif
  759. #ifndef ASPM_HLSL
  760. CGU_Vec3f res;
  761. res.x = round(value.x);
  762. res.y = round(value.y);
  763. res.z = round(value.z);
  764. return res;
  765. #endif
  766. }
  767. CMP_STATIC CGU_Vec3f cmp_log2Vec3f(CMP_IN CGU_Vec3f value)
  768. {
  769. #ifdef ASPM_GPU
  770. return log2(value);
  771. #else
  772. CGU_Vec3f res;
  773. res.x = log2(value.x);
  774. res.y = log2(value.y);
  775. res.z = log2(value.z);
  776. return res;
  777. #endif
  778. }
  779. // used in BC1 LowQuality code
  780. CMP_STATIC CGU_FLOAT cmp_saturate(CMP_IN CGU_FLOAT value)
  781. {
  782. #ifdef ASPM_HLSL
  783. return saturate(value);
  784. #else
  785. return cmp_clampf(value, 0.0f, 1.0f);
  786. #endif
  787. }
  788. CMP_STATIC CGU_FLOAT cmp_rcp(CMP_IN CGU_FLOAT det)
  789. {
  790. #ifdef ASPM_HLSL
  791. return rcp(det);
  792. #else
  793. if (det > 0.0f)
  794. return (1 / det);
  795. else
  796. return 0.0f;
  797. #endif
  798. }
  799. CMP_STATIC CGU_UINT32 cmp_Get4BitIndexPos(CMP_IN CGU_FLOAT indexPos, CMP_IN CGU_FLOAT endPoint0Pos, CMP_IN CGU_FLOAT endPoint1Pos)
  800. {
  801. CGU_FLOAT r = (indexPos - endPoint0Pos) / (endPoint1Pos - endPoint0Pos);
  802. return cmp_clampui32(CGU_UINT32(r * 14.93333f + 0.03333f + 0.5f), 0, 15);
  803. }
  804. // Calculate Mean Square Least Error (MSLE) for 2 Vectors
  805. CMP_STATIC CGU_FLOAT cmp_CalcMSLE(CMP_IN CGU_Vec3f a, CMP_IN CGU_Vec3f b)
  806. {
  807. CGU_Vec3f err = cmp_log2Vec3f((b + 1.0f) / (a + 1.0f));
  808. err = err * err;
  809. return err.x + err.y + err.z;
  810. }
  811. // Compute Endpoints (min/max) bounding box
  812. CMP_STATIC void cmp_GetTexelMinMax(CMP_IN CGU_Vec3f texels[16], CMP_INOUT CGU_Vec3f CMP_REFINOUT blockMin, CMP_INOUT CGU_Vec3f CMP_REFINOUT blockMax)
  813. {
  814. blockMin = texels[0];
  815. blockMax = texels[0];
  816. for (CGU_UINT32 i = 1u; i < 16u; ++i)
  817. {
  818. blockMin = cmp_minVec3f(blockMin, texels[i]);
  819. blockMax = cmp_maxVec3f(blockMax, texels[i]);
  820. }
  821. }
// Refine Endpoints (min/max) by insetting bounding box in log2 RGB space.
// First finds the second-most-extreme texels (excluding those equal to the
// current min/max), then moves each bound toward them in log2 space, limited
// to 1/32 of the box extent per channel.
CMP_STATIC void cmp_RefineMinMaxAsLog2(CMP_IN CGU_Vec3f texels[16], CMP_INOUT CGU_Vec3f CMP_REFINOUT blockMin, CMP_INOUT CGU_Vec3f CMP_REFINOUT blockMax)
{
    CGU_Vec3f refinedBlockMin = blockMax;
    CGU_Vec3f refinedBlockMax = blockMin;
    for (CGU_UINT32 i = 0u; i < 16u; ++i)
    {
        // Skip texels sitting exactly on the current bound so the refined
        // bound comes from the next-most-extreme value.
        // NOTE(review): vector == / ?: semantics differ per target (HLSL
        // selects per component; the CPU operators come from common_def.h) —
        // keep this expression exactly as written.
        refinedBlockMin = cmp_minVec3f(refinedBlockMin, texels[i] == blockMin ? refinedBlockMin : texels[i]);
        refinedBlockMax = cmp_maxVec3f(refinedBlockMax, texels[i] == blockMax ? refinedBlockMax : texels[i]);
    }
    // Work in log2(x+1) space so the inset is perceptually scaled.
    CGU_Vec3f logBlockMax = cmp_log2Vec3f(blockMax + 1.0f);
    CGU_Vec3f logBlockMin = cmp_log2Vec3f(blockMin + 1.0f);
    CGU_Vec3f logRefinedBlockMax = cmp_log2Vec3f(refinedBlockMax + 1.0f);
    CGU_Vec3f logRefinedBlockMin = cmp_log2Vec3f(refinedBlockMin + 1.0f);
    // Maximum inset allowed: 1/32 of the box extent per channel.
    CGU_Vec3f logBlockMaxExt = (logBlockMax - logBlockMin) * (1.0f / 32.0f);
    logBlockMin += cmp_minVec3f(logRefinedBlockMin - logBlockMin, logBlockMaxExt);
    logBlockMax -= cmp_minVec3f(logBlockMax - logRefinedBlockMax, logBlockMaxExt);
    // Back to linear space.
    blockMin = cmp_exp2(logBlockMin) - 1.0f;
    blockMax = cmp_exp2(logBlockMax) - 1.0f;
}
// Refine Endpoints (min/max) by Least Squares Optimization
CMP_STATIC void cmp_RefineMinMaxAs16BitLeastSquares(CMP_IN CGU_Vec3f texels[16],
                                                    CMP_INOUT CGU_Vec3f CMP_REFINOUT blockMin,
                                                    CMP_INOUT CGU_Vec3f CMP_REFINOUT blockMax)
{
    // Direction between the two endpoints; texels are projected onto it.
    CGU_Vec3f blockDir = blockMax - blockMin;
    // Normalize by the component sum.
    // NOTE(review): divides by zero when blockMax == blockMin (flat block);
    // the NaNs then fail the abs(det) test below so the endpoints stay
    // unchanged — confirm this is the intended fallback.
    blockDir = blockDir / (blockDir.x + blockDir.y + blockDir.z);
    // Endpoint positions along the axis, converted to half-float space.
    CGU_FLOAT endPoint0Pos = cmp_f32tof16(cmp_dotVec3f(blockMin, blockDir));
    CGU_FLOAT endPoint1Pos = cmp_f32tof16(cmp_dotVec3f(blockMax, blockDir));
    // Accumulators for the 2x2 normal-equations system of the least-squares fit.
    CGU_Vec3f alphaTexelSum = 0.0f;
    CGU_Vec3f betaTexelSum = 0.0f;
    CGU_FLOAT alphaBetaSum = 0.0f;
    CGU_FLOAT alphaSqSum = 0.0f;
    CGU_FLOAT betaSqSum = 0.0f;
    for (CGU_UINT32 i = 0; i < 16; i++)
    {
        // Quantize each texel to its 4-bit index, then weight it by
        // alpha (toward endpoint 0) and beta (toward endpoint 1).
        CGU_FLOAT texelPos = cmp_f32tof16(cmp_dotVec3f(texels[i], blockDir));
        CGU_UINT32 texelIndex = cmp_Get4BitIndexPos(texelPos, endPoint0Pos, endPoint1Pos);
        CGU_FLOAT beta = cmp_saturate(texelIndex / 15.0f);
        CGU_FLOAT alpha = 1.0f - beta;
        // Texel channels converted to half-float space for the fit.
        CGU_Vec3f texelF16;
        texelF16.x = cmp_f32tof16(texels[i].x);
        texelF16.y = cmp_f32tof16(texels[i].y);
        texelF16.z = cmp_f32tof16(texels[i].z);
        alphaTexelSum += texelF16 * alpha;
        betaTexelSum += texelF16 * beta;
        alphaBetaSum += alpha * beta;
        alphaSqSum += alpha * alpha;
        betaSqSum += beta * beta;
    }
    // Solve the 2x2 system via Cramer's rule; skip when near singular.
    CGU_FLOAT det = alphaSqSum * betaSqSum - alphaBetaSum * alphaBetaSum;
    if (abs(det) > 0.00001f)
    {
        CGU_FLOAT detRcp = cmp_rcp(det);
        // Clamp the solved endpoints to the representable 16-bit float range,
        // then convert back from half-float space.
        blockMin = cmp_clampVec3f((alphaTexelSum * betaSqSum - betaTexelSum * alphaBetaSum) * detRcp, 0.0f, CMP_MAX_16BITFLOAT);
        blockMax = cmp_clampVec3f((betaTexelSum * alphaSqSum - alphaTexelSum * alphaBetaSum) * detRcp, 0.0f, CMP_MAX_16BITFLOAT);
        blockMin = cmp_f16tof32(blockMin);
        blockMax = cmp_f16tof32(blockMax);
    }
}
  882. //=============================================================================================
  883. CMP_STATIC CGU_Vec3f cmp_fabsVec3f(CGU_Vec3f value)
  884. {
  885. #ifdef ASPM_HLSL
  886. return abs(value);
  887. #else
  888. CGU_Vec3f res;
  889. res.x = abs(value.x);
  890. res.y = abs(value.y);
  891. res.z = abs(value.z);
  892. return res;
  893. #endif
  894. }
  895. CMP_STATIC CGU_UINT32 cmp_constructColor(CMP_IN CGU_Vec3ui EndPoints)
  896. {
  897. return (((EndPoints.r & 0x000000F8) << 8) | ((EndPoints.g & 0x000000FC) << 3) | ((EndPoints.b & 0x000000F8) >> 3));
  898. }
  899. CMP_STATIC CGU_UINT32 cmp_constructColorBGR(CMP_IN CGU_Vec3f EndPoints)
  900. {
  901. return (((CGU_UINT32(EndPoints.b) & 0x000000F8) << 8) | ((CGU_UINT32(EndPoints.g) & 0x000000FC) << 3) | ((CGU_UINT32(EndPoints.r) & 0x000000F8) >> 3));
  902. }
  903. CMP_STATIC CGU_FLOAT cmp_mod(CMP_IN CGU_FLOAT value, CMP_IN CGU_FLOAT modval)
  904. {
  905. #ifdef ASPM_GLSL
  906. return mod(value, modval);
  907. #endif
  908. return fmod(value, modval);
  909. }
  910. CMP_STATIC CGU_Vec3f cmp_truncVec3f(CMP_IN CGU_Vec3f value)
  911. {
  912. #ifdef ASPM_HLSL
  913. return trunc(value);
  914. #else
  915. CGU_Vec3f res;
  916. res.x = trunc(value.x);
  917. res.y = trunc(value.y);
  918. res.z = trunc(value.z);
  919. return res;
  920. #endif
  921. }
  922. CMP_STATIC CGU_Vec3f cmp_ceilVec3f(CMP_IN CGU_Vec3f value)
  923. {
  924. CGU_Vec3f res;
  925. res.x = ceil(value.x);
  926. res.y = ceil(value.y);
  927. res.z = ceil(value.z);
  928. return res;
  929. }
// Scalar square-root wrapper over the platform sqrt().
CMP_STATIC CGU_FLOAT cmp_sqrt(CGU_FLOAT value)
{
    return sqrt(value);
}
  934. // Computes inverse square root over an implementation-defined range. The maximum error is implementation-defined.
  935. CMP_STATIC CGV_FLOAT cmp_rsqrt(CGV_FLOAT f)
  936. {
  937. CGV_FLOAT sf = sqrt(f);
  938. if (sf != 0)
  939. return 1 / sqrt(f);
  940. else
  941. return 0.0f;
  942. }
// Common to BC7 API ------------------------------------------------------------------------------------------------------------------------
// valid bit range is 0..8 for mode 1
// Logical right shift of an unsigned 32-bit value by 'bits'.
CMP_STATIC INLINE CGU_UINT32 cmp_shift_right_uint32(CMP_IN CGU_UINT32 v, CMP_IN CGU_INT bits)
{
    return v >> bits; // (perf warning expected)
}
  949. CMP_STATIC INLINE CGU_INT cmp_clampi(CMP_IN CGU_INT value, CMP_IN CGU_INT low, CMP_IN CGU_INT high)
  950. {
  951. if (value < low)
  952. return low;
  953. else if (value > high)
  954. return high;
  955. return value;
  956. }
  957. CMP_STATIC INLINE CGU_INT32 cmp_clampi32(CMP_IN CGU_INT32 value, CMP_IN CGU_INT32 low, CMP_IN CGU_INT32 high)
  958. {
  959. if (value < low)
  960. value = low;
  961. else if (value > high)
  962. value = high;
  963. return value;
  964. }
  965. CMP_STATIC CGV_FLOAT cmp_dot4f(CMP_IN CGV_Vec4f value1, CMP_IN CGV_Vec4f value2)
  966. {
  967. #ifdef ASPM_GPU
  968. return dot(value1, value2);
  969. #else
  970. return (value1.x * value2.x) + (value1.y * value2.y) + (value1.z * value2.z) + (value1.w * value2.w);
  971. #endif
  972. }
  973. CMP_STATIC INLINE void cmp_set_vec4f(CMP_INOUT CGU_Vec4f CMP_REFINOUT pV, CMP_IN CGU_FLOAT x, CMP_IN CGU_FLOAT y, CMP_IN CGU_FLOAT z, CMP_IN CGU_FLOAT w)
  974. {
  975. pV[0] = x;
  976. pV[1] = y;
  977. pV[2] = z;
  978. pV[3] = w;
  979. }
  980. CMP_STATIC INLINE void cmp_set_vec4ui(CGU_Vec4ui CMP_REFINOUT pV, CMP_IN CGU_UINT8 x, CMP_IN CGU_UINT8 y, CMP_IN CGU_UINT8 z, CMP_IN CGU_UINT8 w)
  981. {
  982. pV[0] = x;
  983. pV[1] = y;
  984. pV[2] = z;
  985. pV[3] = w;
  986. }
  987. CMP_STATIC inline void cmp_set_vec4ui_clamped(CGU_Vec4ui CMP_REFINOUT pRes, CMP_IN CGU_INT32 r, CMP_IN CGU_INT32 g, CMP_IN CGU_INT32 b, CMP_IN CGU_INT32 a)
  988. {
  989. pRes[0] = (CGU_UINT8)cmp_clampi32(r, 0, 255);
  990. pRes[1] = (CGU_UINT8)cmp_clampi32(g, 0, 255);
  991. pRes[2] = (CGU_UINT8)cmp_clampi32(b, 0, 255);
  992. pRes[3] = (CGU_UINT8)cmp_clampi32(a, 0, 255);
  993. }
  994. CMP_STATIC inline CGU_Vec4f cmp_clampNorm4f(CMP_IN CGU_Vec4f pV)
  995. {
  996. CGU_Vec4f res;
  997. res[0] = cmp_clampf(pV[0], 0.0f, 1.0f);
  998. res[1] = cmp_clampf(pV[1], 0.0f, 1.0f);
  999. res[2] = cmp_clampf(pV[2], 0.0f, 1.0f);
  1000. res[3] = cmp_clampf(pV[3], 0.0f, 1.0f);
  1001. return res;
  1002. }
  1003. CMP_STATIC INLINE CGU_Vec4f cmp_vec4ui_to_vec4f(CMP_IN CGU_Vec4ui pC)
  1004. {
  1005. CGU_Vec4f res;
  1006. cmp_set_vec4f(res, (CGU_FLOAT)pC[0], (CGU_FLOAT)pC[1], (CGU_FLOAT)pC[2], (CGU_FLOAT)pC[3]);
  1007. return res;
  1008. }
  1009. CMP_STATIC INLINE void cmp_normalize(CGU_Vec4f CMP_REFINOUT pV)
  1010. {
  1011. CGU_FLOAT s = cmp_dot4f(pV, pV);
  1012. if (s != 0.0f)
  1013. {
  1014. s = 1.0f / cmp_sqrt(s);
  1015. pV *= s;
  1016. }
  1017. }
// Returns v squared.
CMP_STATIC INLINE CGV_FLOAT cmp_squaref(CMP_IN CGV_FLOAT v)
{
    return v * v;
}
// Returns i squared.
CMP_STATIC INLINE CGU_INT cmp_squarei(CMP_IN CGU_INT i)
{
    return i * i;
}
  1026. CMP_STATIC CGU_UINT8 cmp_clampui8(CMP_IN CGU_UINT8 v, CMP_IN CGU_UINT8 a, CMP_IN CGU_UINT8 b)
  1027. {
  1028. if (v < a)
  1029. return a;
  1030. else if (v > b)
  1031. return b;
  1032. return v;
  1033. }
// Branch-free absolute value of a 32-bit signed integer.
// NOTE(review): result for INT32_MIN cannot be represented — confirm callers
// never pass it.
CMP_STATIC CGU_INT32 cmp_abs32(CMP_IN CGU_INT32 v)
{
    // Arithmetic shift replicates the sign bit: msk is all-ones when v is
    // negative, zero otherwise.
    CGU_UINT32 msk = v >> 31;
    // Negative v: (~v) + 1, i.e. two's-complement negate; non-negative v: no-op.
    return (v ^ msk) - msk;
}
  1039. CMP_STATIC void cmp_swap32(CMP_INOUT CGU_UINT32 CMP_REFINOUT a, CMP_INOUT CGU_UINT32 CMP_REFINOUT b)
  1040. {
  1041. CGU_UINT32 t = a;
  1042. a = b;
  1043. b = t;
  1044. }
  1045. // Computes inverse square root over an implementation-defined range. The maximum error is implementation-defined.
  1046. CMP_STATIC CGV_FLOAT cmp_Image_rsqrt(CMP_IN CGV_FLOAT f)
  1047. {
  1048. CGV_FLOAT sf = sqrt(f);
  1049. if (sf != 0)
  1050. return 1 / sqrt(f);
  1051. else
  1052. return 0.0f;
  1053. }
  1054. CMP_STATIC void cmp_pack4bitindex32(CMP_INOUT CGU_UINT32 packed_index[2], CMP_IN CGU_UINT32 src_index[16])
  1055. {
  1056. // Converts from unpacked index to packed index
  1057. packed_index[0] = 0x0000;
  1058. packed_index[1] = 0x0000;
  1059. CGU_UINT32 shift = 0; // was CGU_UINT8
  1060. for (CGU_INT k = 0; k < 8; k++)
  1061. {
  1062. packed_index[0] |= (CGU_UINT32)(src_index[k] & 0x0F) << shift;
  1063. packed_index[1] |= (CGU_UINT32)(src_index[k + 8] & 0x0F) << shift;
  1064. shift += 4;
  1065. }
  1066. }
  1067. CMP_STATIC void cmp_pack4bitindex(CMP_INOUT CGU_UINT32 packed_index[2], CMP_IN CGU_UINT8 src_index[16])
  1068. {
  1069. // Converts from unpacked index to packed index
  1070. packed_index[0] = 0x0000;
  1071. packed_index[1] = 0x0000;
  1072. CGU_UINT32 shift = 0; // was CGU_UINT8
  1073. for (CGU_INT k = 0; k < 8; k++)
  1074. {
  1075. packed_index[0] |= (CGU_UINT32)(src_index[k] & 0x0F) << shift;
  1076. packed_index[1] |= (CGU_UINT32)(src_index[k + 8] & 0x0F) << shift;
  1077. shift += 4;
  1078. }
  1079. }
  1080. CMP_STATIC INLINE CGU_INT cmp_expandbits(CMP_IN CGU_INT v, CMP_IN CGU_INT bits)
  1081. {
  1082. CGU_INT vv = v << (8 - bits);
  1083. return vv + cmp_shift_right_uint32(vv, bits);
  1084. }
// This code needs further improvement and investigation
// Binary search for the largest quantized endpoint index whose expanded 8-bit
// value is <= v; the parity bit ('odd') is folded back into the result.
CMP_STATIC INLINE CGU_UINT8 cmp_ep_find_floor2(CMP_IN CGV_FLOAT v, CMP_IN CGU_UINT8 bits, CMP_IN CGU_UINT8 use_par, CMP_IN CGU_UINT8 odd)
{
    CGU_UINT8 i1 = 0;                     // lower bound of the search interval
    CGU_UINT8 i2 = 1 << (bits - use_par); // upper bound: index count excluding the parity bit
    odd = use_par ? odd : 0;              // parity only applies when use_par is set
    while (i2 - i1 > 1)
    {
        // Midpoint computed with a float multiply instead of integer division.
        CGU_UINT8 j = (CGU_UINT8)((i1 + i2) * 0.5f);
        // 8-bit value this candidate index expands to (parity bit re-attached).
        CGV_FLOAT ep_d = (CGV_FLOAT)cmp_expandbits((j << use_par) + odd, bits);
        if (v >= ep_d)
            i1 = j; // candidate still <= v: raise the lower bound
        else
            i2 = j; // candidate too large: lower the upper bound
    }
    return (i1 << use_par) + odd;
}
  1102. CMP_STATIC CGV_FLOAT cmp_absf(CMP_IN CGV_FLOAT a)
  1103. {
  1104. return a > 0.0F ? a : -a;
  1105. }
  1106. CMP_STATIC INLINE CGU_UINT32 cmp_pow2Packed(CMP_IN CGU_INT x)
  1107. {
  1108. return 1 << x;
  1109. }
  1110. CMP_STATIC INLINE CGU_UINT8 cmp_clampIndex(CMP_IN CGU_UINT8 v, CMP_IN CGU_UINT8 a, CMP_IN CGU_UINT8 b)
  1111. {
  1112. if (v < a)
  1113. return a;
  1114. else if (v > b)
  1115. return b;
  1116. return v;
  1117. }
// Logical right shift of an 8-bit value by 'bits'.
CMP_STATIC INLINE CGU_UINT8 shift_right_uint82(CMP_IN CGU_UINT8 v, CMP_IN CGU_UINT8 bits)
{
    return v >> bits; // (perf warning expected)
}
  1122. #endif
  1123. CMP_STATIC CGU_INT cmp_QuantizeToBitSize(CMP_IN CGU_INT value, CMP_IN CGU_INT prec, CMP_IN CGU_BOOL signedfloat16)
  1124. {
  1125. if (prec <= 1)
  1126. return 0;
  1127. CGU_BOOL negvalue = false;
  1128. // move data to use extra bits for processing
  1129. CGU_INT ivalue = value;
  1130. if (signedfloat16)
  1131. {
  1132. if (value < 0)
  1133. {
  1134. negvalue = true;
  1135. value = -value;
  1136. }
  1137. prec--;
  1138. }
  1139. else
  1140. {
  1141. // clamp -ve
  1142. if (value < 0)
  1143. value = 0;
  1144. }
  1145. CGU_INT iQuantized;
  1146. CGU_INT bias = (prec > 10 && prec != 16) ? ((1 << (prec - 11)) - 1) : 0;
  1147. bias = (prec == 16) ? 15 : bias;
  1148. iQuantized = ((ivalue << prec) + bias) / (0x7bff + 1); // 16 bit Float Max 0x7bff
  1149. return (negvalue ? -iQuantized : iQuantized);
  1150. }
//=======================================================
// CPU GPU Macro API
//=======================================================
#ifdef ASPM_GPU
// GPU targets: map directly onto the shader min()/max() intrinsics.
#define cmp_min(a, b) min(a, b)
#define cmp_max(a, b) max(a, b)
#else
// CPU targets: plain ternary expansions. Note that each argument is evaluated
// twice, so avoid side-effecting expressions (e.g. cmp_min(i++, j)).
#ifndef cmp_min
#define cmp_min(a, b) ((a) < (b) ? (a) : (b))
#endif
#ifndef cmp_max
#define cmp_max(a, b) ((a) > (b) ? (a) : (b))
#endif
#endif
  1165. //=======================================================
  1166. // CPU Template API
  1167. //=======================================================
  1168. #ifndef ASPM_GPU
  1169. #ifndef TEMPLATE_API_INTERFACED
  1170. #define TEMPLATE_API_INTERFACED
  1171. template <typename T>
  1172. T clamp(T& v, const T& lo, const T& hi)
  1173. {
  1174. if (v < lo)
  1175. return lo;
  1176. else if (v > hi)
  1177. return hi;
  1178. return v;
  1179. }
  1180. template <typename T>
  1181. Vec4T<T> clamp(Vec4T<T>& v, const T& lo, const T& hi)
  1182. {
  1183. Vec4T<T> res = v;
  1184. if (v.x < lo)
  1185. res.x = lo;
  1186. else if (v.x > hi)
  1187. res.x = hi;
  1188. if (v.y < lo)
  1189. res.y = lo;
  1190. else if (v.y > hi)
  1191. res.y = hi;
  1192. if (v.z < lo)
  1193. res.z = lo;
  1194. else if (v.w > hi)
  1195. res.w = hi;
  1196. if (v.w < lo)
  1197. res.w = lo;
  1198. else if (v.z > hi)
  1199. res.z = hi;
  1200. return res;
  1201. }
// Generic fallback "dot" for scalar-like operands: the plain product v1 * v2.
// NOTE(review): returns type T regardless of the promoted type of v1 * v2 —
// confirm callers only instantiate this with compatible scalar types.
template <typename T,typename T2>
T dot(T& v1, T2& v2)
{
    return (v1 * v2);
}
  1207. template <typename T>
  1208. T dot(Vec4T<T>& v1, Vec4T<T>& v2)
  1209. {
  1210. return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z + v1.w * v2.w);
  1211. }
  1212. template <typename T, typename T2>
  1213. T dot(Vec4T<T>& v1, Vec4T<T2>& v2)
  1214. {
  1215. return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z + v1.w * v2.w);
  1216. }
  1217. template <typename T>
  1218. T dot(Vec3T<T>& v1, Vec3T<T>& v2)
  1219. {
  1220. return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
  1221. }
  1222. template <typename T, typename T2>
  1223. T dot(Vec3T<T>& v1, Vec3T<T2>& v2)
  1224. {
  1225. return (v1.x * v2.x + v1.y * v2.y + v1.z * v2.z);
  1226. }
  1227. #endif // API_INTERFACED
  1228. #endif // ASPM_GPU
  1229. #endif //