//
// Copyright (c) Microsoft. All rights reserved.
// This code is licensed under the MIT License (MIT).
// THIS CODE IS PROVIDED *AS IS* WITHOUT WARRANTY OF
// ANY KIND, EITHER EXPRESS OR IMPLIED, INCLUDING ANY
// IMPLIED WARRANTIES OF FITNESS FOR A PARTICULAR
// PURPOSE, MERCHANTABILITY, OR NON-INFRINGEMENT.
//
// Developed by Minigraph
//
// Author:  James Stanard 
//

// RGBM stores an HDR color in 32 bits (suitable for R8G8B8A8_UNORM): three 8-bit color
// channels plus an 8-bit shared multiplier M.
//   rgb       - color to encode; it is divided by PeakValue and saturated to [0, 1].
//   PeakValue - the brightest value that can be represented (anything above it clips).
// Returns R in bits 31-24, G in bits 23-16, B in bits 15-8 and M in bits 7-0.
uint PackRGBM( float3 rgb, float PeakValue = 16.0 )
{
	rgb = saturate(rgb / PeakValue);

	// The multiplier is the largest channel, rounded UP to the next 8-bit step so that no
	// channel can exceed 255 once it is divided by the multiplier.  The 1e-6 floor keeps the
	// multiplier at 1 or more, so the division below is never by zero.
	float Multiplier = max(max(1e-6, rgb.x), max(rgb.y, rgb.z));
	Multiplier = ceil(Multiplier * 255.0);
	float Scale = 65025.0 / Multiplier;		// 65025 = 255 * 255

	// Quantize with round-to-nearest (the +0.5 before truncation).
	float3 Scaled = rgb * Scale + 0.5;

#if _XBOX_ONE
	// Presumably inserts each converted value into the given byte of the running result.
	uint Packed = (uint)Multiplier;
	Packed = __XB_PackF32ToU8(Scaled.r, 3, Packed);
	Packed = __XB_PackF32ToU8(Scaled.g, 2, Packed);
	Packed = __XB_PackF32ToU8(Scaled.b, 1, Packed);
	return Packed;
#else
	uint3 Quantized = (uint3)Scaled;
	return (Quantized.r << 24) | (Quantized.g << 16) | (Quantized.b << 8) | (uint)Multiplier;
#endif
}

// Reverses PackRGBM.
//   p         - packed value with R, G, B, M in bytes 3, 2, 1, 0.
//   PeakValue - must match the value that was used when packing.
// Returns (channel / 255) * (M / 255) * PeakValue for each color channel.
float3 UnpackRGBM( uint p, float PeakValue = 16.0 )
{
	float3 Color;
	float Multiplier;

#if _XBOX_ONE
	Color = float3(__XB_UnpackByte3(p), __XB_UnpackByte2(p), __XB_UnpackByte1(p));
	Multiplier = __XB_UnpackByte0(p);
#else
	Color = float3(p >> 24, (p >> 16) & 0xFF, (p >> 8) & 0xFF);
	Multiplier = p & 0xFF;
#endif

	// Both the channel and the multiplier are 8-bit UNORM, hence the 255 * 255 divisor.
	return Color * Multiplier * PeakValue / (255.0 * 255.0);
}

// RGBE packs 9 bits per color channel while encoding the multiplier as a perfect power of 2 (just the exponent)
// What's nice about this is that it gives you a lot more range than RGBM.  This isn't proven to be bitwise
// compatible with DXGI_FORMAT_R9G9B9E5_SHAREDEXP, but if it's not, it could be made so.
//
// Layout of the result: R in bits 0-8, G in bits 9-17, B in bits 18-26, and the 5-bit exponent of the
// shared power of 2 (taken from its half-float encoding) in bits 27-31.
// Channels are clamped to [0, 1.FF x 2^14]; there is no sign bit and no way to store INF or NaN.
uint PackRGBE(float3 rgb)
{
	// Largest value that survives the round trip exactly: 1.FF x 2^14.  A larger max channel would need
	// NextPow2 >= 2^16, which is beyond the range of a finite half float.
	const float kMaxVal = asfloat(0x46FF8000);

	// 1.0 x 2^-15.  Flooring the max channel here keeps NextPow2 at or above 2^-14, the smallest
	// normalized half.  Below that, f32tof16 would yield a denormal whose mantissa bits would be
	// shifted into the G and B fields by the "<< 17" at the end.
	const float kMinVal = asfloat(0x38000000);

	// Negative values cannot be represented, so they clamp to zero.
	rgb = clamp(rgb, 0.0, kMaxVal);

	float MaxChannel = max(max(kMinVal, rgb.r), max(rgb.g, rgb.b));

	// NextPow2 has to have the biggest exponent plus 1 (and nothing in the mantissa)
	float NextPow2 = asfloat((asuint(MaxChannel) + 0x800000) & 0x7F800000);

	// By adding NextPow2, all channels have the same exponent, shifting their mantissa bits
	// to the right to accommodate it.  This also shifts in the implicit '1' bit of all channels.
	// The largest channel will always have the high bit set.
	// The min() covers the single case where the addition rounds up to 2 * NextPow2 (a max channel
	// whose mantissa is all ones); that would bump the exponent and leave a zero mantissa behind.
	rgb = min(rgb + NextPow2, asfloat(asuint(NextPow2) | 0x7FFFFF));

	// Keep the top 9 mantissa bits (bits 14-22) of each channel.
#if _XBOX_ONE
	uint R = __XB_UBFE(9, 14, asuint(rgb.r));
	uint G = __XB_UBFE(9, 14, asuint(rgb.g));
	uint B = __XB_UBFE(9, 14, asuint(rgb.b));
#else
	uint R = (asuint(rgb.r) << 9) >> 23;
	uint G = (asuint(rgb.g) << 9) >> 23;
	uint B = (asuint(rgb.b) << 9) >> 23;
#endif

	// NextPow2 is a normalized power of 2, so its half encoding is just the exponent in bits 10-14.
	// Shifting by 17 moves those five bits to 27-31.
	uint E = f32tof16(NextPow2) << 17;
	return R | G << 9 | B << 18 | E;
}

// Reverses PackRGBE: rebuilds the shared power of 2 from the 5-bit exponent in bits 27-31,
// drops each channel's 9 bits into the top of that float's mantissa, and then subtracts
// the power of 2 again to recover the channel value.
float3 UnpackRGBE(uint p)
{
	float Pow2;
	uint3 Mantissa;		// each channel's 9 bits, positioned at float mantissa bits 14-22

#if _XBOX_ONE
	Pow2 = f16tof32(__XB_UBFE(5, 27, p) << 10);
	Mantissa = uint3(__XB_UBFE(9, 0, p), __XB_UBFE(9, 9, p), __XB_UBFE(9, 18, p)) << 14;
#else
	Pow2 = f16tof32((p >> 27) << 10);
	// R lives in bits 0-8, G in bits 9-17, B in bits 18-26.
	Mantissa = uint3(p << 14, p << 5, p >> 4) & 0x7FC000;
#endif

	float3 Biased = asfloat(asuint(Pow2) | Mantissa);
	return Biased - Pow2;
}

// Like LogLuv "minus the log".  The intention is to store Y in 16-bit float, and UV as 8-bit
// unorm values.  This is corrected from the widely publicized LogLuv encoding which used
// the wrong color primaries for sRGB and Rec.709.  Note the correct coefficients for computing
// luminance ("Y") in the 2nd row of RGBtoXYZ.
//
// Returns float3(Y, u", v") where Y is luminance and u", v" are saturated to [0, 1].
float3 EncodeYUV(float3 RGB)
{
	// How the matrix below was derived:
	//
	// Start with the right RGBtoXYZ matrix for your color space (this one is sRGB D65)
	// http://www.brucelindbloom.com/index.html?Eqn_RGB_XYZ_Matrix.html
	//
	//	static const float3x3 RGBtoXYZ =
	//	{
	//		0.4124564, 0.3575761, 0.1804375,
	//		0.2126729, 0.7151522, 0.0721750,	<-- The color primaries determining luminance
	//		0.0193339, 0.1191920, 0.9503041
	//	};
	//
	// Chrominance is packed into two normalized channels, u' and v':
	//	u' = 4X / (X + 15Y + 3Z)
	//	v' = 9Y / (X + 15Y + 3Z)
	//
	// The visible spectrum is expanded from (0, 0.62) to (0, 1):
	//	u" = u' / 0.62
	//	v" = v' / 0.62
	//
	// With these two helper values...
	//	X'   = 4 / 9 * X
	//	XYZ' = (X + 15 * Y + 3 * Z) * 0.62 / 9
	//
	// ...the final Yu"v" follows from X', Y, and XYZ':
	//	u" = X' / XYZ'
	//	v" = Y  / XYZ'
	//
	// (X', Y, XYZ') is XYZ multiplied by this matrix:
	//
	//	static const float3x3 FixupMatrix =
	//	{
	//		4.0 / 9.0, 0.0, 0.0,
	//		0.0, 1.0, 0.0,
	//		0.62 / 9.0, 15.0 * 0.62 / 9.0, 3.0 * 0.62 / 9.0
	//	};
	//
	// The two matrices are concatenated ahead of time: mul(FixupMatrix, RGBtoXYZ).
	static const float3x3 RGBtoXpYXYZp =
	{
		0.1833140, 0.1589227, 0.0801944,
		0.2126729, 0.7151522, 0.0721750,
		0.2521713, 0.7882566, 0.2834072
	};

	float3 Transformed = mul(RGBtoXpYXYZp, RGB);	// (X', Y, XYZ')

	// The 1e-6 floor avoids a divide by zero for black input.
	float Denominator = max(Transformed.z, 1e-6);
	float2 Chroma = saturate(Transformed.xy / Denominator);

	return float3(Transformed.y, Chroma);
}

// Reverses EncodeYUV: takes float3(Y, u", v") and returns the RGB color.
float3 DecodeYUV(float3 YUV)
{
	// Inverse of the matrix used by EncodeYUV
	static const float3x3 XpYXYZptoRGB =
	{
		 7.6649220,  0.9555189, -2.4122494,
		-2.2120164,  1.6682305,  0.2010778,
		-0.6677212, -5.4901517,  5.1156056
	};

	// Undo the divisions from the encoder:  v" = Y / XYZ'  and  u" = X' / XYZ'.
	// The 1e-6 floor avoids a divide by zero when v" is zero.
	float Luma = YUV.x;
	float XYZp = Luma / max(YUV.z, 1e-6);
	float3 Intermediate = float3(YUV.y * XYZp, Luma, XYZp);	// (X', Y, XYZ')

	return mul(XpYXYZptoRGB, Intermediate);
}

// If you can't write Y and UV to separate buffers (R16_FLOAT, R8G8_UNORM), then
// you can pack them into R32_UINT.
// Layout: Y as a half float in bits 0-15, V as 8-bit unorm in bits 16-23, U in bits 24-31.
// U and V are expected to already be in [0, 1] (EncodeYUV saturates them).
uint PackYUV(float3 YUV)
{
	uint Luma = f32tof16(YUV.x);

#if _XBOX_ONE
	// Presumably inserts each converted value into the given byte of its third argument.
	return __XB_PackF32ToU8(YUV.z * 255.0 + 0.5, 2, __XB_PackF32ToU8(YUV.y * 255.0 + 0.5, 3, Luma));
#else
	// Round to the nearest 8-bit step (the +0.5 before truncation).
	uint2 Chroma = (uint2)(YUV.yz * 255.0 + 0.5);
	return Luma | (Chroma.x << 24) | (Chroma.y << 16);
#endif
}

// Reverses PackYUV: Y comes from the half float in the low 16 bits, U from byte 3 and
// V from byte 2 (both rescaled from 0-255 to 0-1).
float3 UnpackYUV(uint YUV)
{
#if _XBOX_ONE
	float2 Chroma = float2(__XB_UnpackByte3(YUV), __XB_UnpackByte2(YUV)) / 255.0;
#else
	float2 Chroma = float2(YUV >> 24, (YUV >> 16) & 0xFF) / 255.0;
#endif

	return float3(f16tof32(YUV), Chroma);
}

// To understand this, know that all math on YUV should really be done on
// Y, Y*U, Y*V.  You can add and blend all three of those values the same
// as RGB, but for compact encoding, you only want to store Y, U, and V.
//
// Returns the sum of two (Y, U, V) colors.  If the summed luminance is exactly zero the
// chrominance is returned as zero rather than the NaN that 0 / 0 would produce.
float3 AddYUV( float3 YUV1, float3 YUV2 )
{
	// Luminance is simply added; chrominance becomes a weighted average
	float Y = YUV1.x + YUV2.x;
	float2 WeightedUV = YUV1.yz * YUV1.x + YUV2.yz * YUV2.x;

	// Two black inputs give a zero weight sum.  Substituting a divisor of 1 leaves the
	// (zero) numerator untouched; every other case divides by Y exactly as before.
	float2 UV = WeightedUV / (Y != 0.0 ? Y : 1.0);
	return float3(Y, UV);
}

// Blends two (Y, U, V) colors: t = 0 weights YUV1 fully, t = 1 weights YUV2 fully.
float3 LerpYUV( float3 YUV1, float3 YUV2, float t )
{
	// Rescaling a YUV value only requires rescaling Y; chroma is unaffected.  Once both
	// luminances carry their blend weights, the colors can simply be added.  This form is
	// more efficient than the (possibly more readable) reference version below.  Note that
	// if values were kept as Y, Y*U, Y*V they could be added or lerped directly.
	//
	//	float Y = lerp(YUV1.x, YUV2.x, t);
	//	float2 UV = lerp(YUV1.yz * YUV1.x, YUV2.yz * YUV2.x, t) / Y;
	//	return float3(Y, UV);
	float3 First = float3(YUV1.x * (1 - t), YUV1.yz);
	float3 Second = float3(YUV2.x * t, YUV2.yz);
	return AddYUV(First, Second);
}

// The standard 32-bit HDR color format.  Each float has a 5-bit exponent and no sign bit.
// Layout: R (6-bit mantissa) in bits 0-10, G (6-bit mantissa) in bits 11-21,
// B (5-bit mantissa) in bits 22-31.  Extra half-float mantissa bits are truncated.
uint Pack_R11G11B10_FLOAT( float3 rgb )
{
	// The masks below throw away the half-float sign bit, so without this clamp a negative
	// input would silently be stored as its absolute value.
	rgb = max(rgb, 0.0);

	uint r = (f32tof16(rgb.x) >>  4) & 0x000007FF;
	uint g = (f32tof16(rgb.y) <<  7) & 0x003FF800;
	uint b = (f32tof16(rgb.z) << 17) & 0xFFC00000;
	return r | g | b;
}

// Reverses Pack_R11G11B10_FLOAT by moving each field back to its half-float bit position
// (exponent in bits 10-14) and converting to 32-bit float.
float3 Unpack_R11G11B10_FLOAT( uint rgb )
{
	// R sits in bits 0-10, G in bits 11-21, B in bits 22-31 (B has one fewer mantissa bit).
	uint3 HalfBits = uint3(rgb << 4, rgb >> 7, rgb >> 17) & uint3(0x7FF0, 0x7FF0, 0x7FE0);
	return f16tof32(HalfBits);
}

// These next two encodings are great for LDR data.  By knowing that our values are [0.0, 1.0]
// (or [0.0, 2.0), incidentally), we can reduce how many bits we need in the exponent.  We can
// immediately eliminate all positive exponents.  By giving more bits to the mantissa, we can
// improve precision at the expense of range.  The 8E3 format goes one bit further, quadrupling
// mantissa precision but increasing smallest exponent from -14 to -6.  The smallest value of 8E3
// is 2^-14, while the smallest value of 7E4 is 2^-21.  Both are smaller than the smallest 8-bit
// sRGB value, which is close to 2^-12.

// This is like R11G11B10_FLOAT except that it moves one bit from each exponent to each mantissa.
// Layout: R (4-bit exponent, 7-bit mantissa) in bits 0-10, G (same split) in bits 11-21,
// B (4-bit exponent, 6-bit mantissa) in bits 22-31.
uint Pack_R11G11B10_E4_FLOAT( float3 rgb )
{
	// Clamp to [0.0, 2.0).  The magic number is 1.FFFFF x 2^0.  (We can't represent hex floats in HLSL.)
	// This trick works because clamping your exponent to 0 reduces the number of bits needed by 1.
	const float JustBelowTwo = asfloat(0x3FFFFFFF);
	uint3 HalfBits = f32tof16(clamp( rgb, 0.0, JustBelowTwo ));

	// Drop the (now unused) top exponent bit and the low mantissa bits of each half float.
	uint RedField   = (HalfBits.r >> 3 ) & 0x000007FF;
	uint GreenField = (HalfBits.g << 8 ) & 0x003FF800;
	uint BlueField  = (HalfBits.b << 18) & 0xFFC00000;
	return RedField | GreenField | BlueField;
}

// Reverses Pack_R11G11B10_E4_FLOAT by moving each field back to its half-float bit position
// (with the top exponent bit left at zero) and converting to 32-bit float.
float3 Unpack_R11G11B10_E4_FLOAT( uint rgb )
{
	// R sits in bits 0-10, G in bits 11-21, B in bits 22-31 (B has one fewer mantissa bit).
	uint3 HalfBits = uint3(rgb << 3, rgb >> 8, rgb >> 18) & uint3(0x3FF8, 0x3FF8, 0x3FF0);
	return f16tof32(HalfBits);
}

// This is like R11G11B10_FLOAT except that it moves two bits from each exponent to each mantissa.
// Layout: R (3-bit exponent, 8-bit mantissa) in bits 0-10, G (same split) in bits 11-21,
// B (3-bit exponent, 7-bit mantissa) in bits 22-31.
uint Pack_R11G11B10_E3_FLOAT( float3 rgb )
{
	// Clamp to [0.0, 2.0).  Divide by 256 to bias the exponent by -8.  This shifts it down to use one
	// fewer bit while still taking advantage of the denormalization hardware.  In half precision,
	// the exponent of 0 is 0xF.  Dividing by 256 makes the max exponent 0x7--one fewer bit.
	const float JustBelowTwo = asfloat(0x3FFFFFFF);
	uint3 HalfBits = f32tof16(clamp( rgb, 0.0, JustBelowTwo ) / 256.0);

	// Drop the two unused top exponent bits and the low mantissa bits of each half float.
	uint RedField   = (HalfBits.r >> 2 ) & 0x000007FF;
	uint GreenField = (HalfBits.g << 9 ) & 0x003FF800;
	uint BlueField  = (HalfBits.b << 19) & 0xFFC00000;
	return RedField | GreenField | BlueField;
}

// Reverses Pack_R11G11B10_E3_FLOAT: moves each field back to its half-float bit position,
// converts to 32-bit float, and multiplies by 256 to undo the packer's exponent bias.
float3 Unpack_R11G11B10_E3_FLOAT( uint rgb )
{
	// R sits in bits 0-10, G in bits 11-21, B in bits 22-31 (B has one fewer mantissa bit).
	uint3 HalfBits = uint3(rgb << 2, rgb >> 9, rgb >> 19) & uint3(0x1FFC, 0x1FFC, 0x1FF8);
	return f16tof32(HalfBits) * 256.0;
}