Added stripped down NVTT library.

Branimir Karadžić, 10 years ago
parent commit 8ab70bd8cf
56 changed files with 20626 additions and 0 deletions
  1. 3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt (+24, -0)
  2. 3rdparty/nvtt/bc6h/bits.h (+76, -0)
  3. 3rdparty/nvtt/bc6h/shapes_two.h (+133, -0)
  4. 3rdparty/nvtt/bc6h/tile.h (+83, -0)
  5. 3rdparty/nvtt/bc6h/zoh.cpp (+197, -0)
  6. 3rdparty/nvtt/bc6h/zoh.h (+65, -0)
  7. 3rdparty/nvtt/bc6h/zoh_utils.cpp (+324, -0)
  8. 3rdparty/nvtt/bc6h/zoh_utils.h (+73, -0)
  9. 3rdparty/nvtt/bc6h/zohone.cpp (+799, -0)
  10. 3rdparty/nvtt/bc6h/zohtwo.cpp (+883, -0)
  11. 3rdparty/nvtt/bc7/avpcl.cpp (+264, -0)
  12. 3rdparty/nvtt/bc7/avpcl.h (+99, -0)
  13. 3rdparty/nvtt/bc7/avpcl_mode0.cpp (+1066, -0)
  14. 3rdparty/nvtt/bc7/avpcl_mode1.cpp (+1047, -0)
  15. 3rdparty/nvtt/bc7/avpcl_mode2.cpp (+1004, -0)
  16. 3rdparty/nvtt/bc7/avpcl_mode3.cpp (+1059, -0)
  17. 3rdparty/nvtt/bc7/avpcl_mode4.cpp (+1214, -0)
  18. 3rdparty/nvtt/bc7/avpcl_mode5.cpp (+1216, -0)
  19. 3rdparty/nvtt/bc7/avpcl_mode6.cpp (+1055, -0)
  20. 3rdparty/nvtt/bc7/avpcl_mode7.cpp (+1094, -0)
  21. 3rdparty/nvtt/bc7/avpcl_utils.cpp (+389, -0)
  22. 3rdparty/nvtt/bc7/avpcl_utils.h (+61, -0)
  23. 3rdparty/nvtt/bc7/bits.h (+76, -0)
  24. 3rdparty/nvtt/bc7/endpts.h (+81, -0)
  25. 3rdparty/nvtt/bc7/shapes_three.h (+132, -0)
  26. 3rdparty/nvtt/bc7/shapes_two.h (+133, -0)
  27. 3rdparty/nvtt/bc7/tile.h (+41, -0)
  28. 3rdparty/nvtt/nvcore/Array.inl (+437, -0)
  29. 3rdparty/nvtt/nvcore/Debug.h (+216, -0)
  30. 3rdparty/nvtt/nvcore/array.h (+181, -0)
  31. 3rdparty/nvtt/nvcore/defsgnucdarwin.h (+53, -0)
  32. 3rdparty/nvtt/nvcore/defsgnuclinux.h (+59, -0)
  33. 3rdparty/nvtt/nvcore/defsgnucwin32.h (+65, -0)
  34. 3rdparty/nvtt/nvcore/defsvcwin32.h (+94, -0)
  35. 3rdparty/nvtt/nvcore/foreach.h (+68, -0)
  36. 3rdparty/nvtt/nvcore/hash.h (+83, -0)
  37. 3rdparty/nvtt/nvcore/memory.h (+29, -0)
  38. 3rdparty/nvtt/nvcore/nvcore.h (+299, -0)
  39. 3rdparty/nvtt/nvcore/posh.h (+1030, -0)
  40. 3rdparty/nvtt/nvcore/stdstream.h (+459, -0)
  41. 3rdparty/nvtt/nvcore/stream.h (+163, -0)
  42. 3rdparty/nvtt/nvcore/strlib.h (+429, -0)
  43. 3rdparty/nvtt/nvcore/utils.h (+281, -0)
  44. 3rdparty/nvtt/nvmath/Vector.inl (+921, -0)
  45. 3rdparty/nvtt/nvmath/fitting.cpp (+1200, -0)
  46. 3rdparty/nvtt/nvmath/fitting.h (+49, -0)
  47. 3rdparty/nvtt/nvmath/matrix.h (+112, -0)
  48. 3rdparty/nvtt/nvmath/matrix.inl (+1274, -0)
  49. 3rdparty/nvtt/nvmath/nvmath.h (+56, -0)
  50. 3rdparty/nvtt/nvmath/plane.h (+40, -0)
  51. 3rdparty/nvtt/nvmath/plane.inl (+49, -0)
  52. 3rdparty/nvtt/nvmath/vector.h (+148, -0)
  53. 3rdparty/nvtt/nvtt.cpp (+95, -0)
  54. 3rdparty/nvtt/nvtt.h (+13, -0)
  55. scripts/texturec.lua (+3, -0)
  56. tools/texturec/texturec.cpp (+32, -0)

+ 24 - 0
3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt

@@ -0,0 +1,24 @@
+NVIDIA Texture Tools 2.0 is licensed under the MIT license.
+
+Copyright (c) 2007 NVIDIA Corporation
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.

+ 76 - 0
3rdparty/nvtt/bc6h/bits.h

@@ -0,0 +1,76 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_BITS_H
+#define _ZOH_BITS_H
+
+// read/write a bitstream
+
+#include "nvcore/Debug.h"
+
+namespace ZOH {
+
+class Bits
+{
+public:
+
+	Bits(char *data, int maxdatabits) { nvAssert (data && maxdatabits > 0); bptr = bend = 0; bits = data; maxbits = maxdatabits; readonly = 0;}
+	Bits(const char *data, int availdatabits) { nvAssert (data && availdatabits > 0); bptr = 0; bend = availdatabits; cbits = data; maxbits = availdatabits; readonly = 1;}
+
+	void write(int value, int nbits) {
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		for (int i=0; i<nbits; ++i)
+			writeone(value>>i);
+	}
+	int read(int nbits) { 
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		int out = 0;
+		for (int i=0; i<nbits; ++i)
+			out |= readone() << i;
+		return out;
+	}
+	int getptr() { return bptr; }
+	void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; }
+	int getsize() { return bend; }
+
+private:
+	int	bptr;		// next bit to read
+	int bend;		// last written bit + 1
+	char *bits;		// ptr to user bit stream
+	const char *cbits;	// ptr to const user bit stream
+	int maxbits;	// max size of user bit stream
+	char readonly;	// 1 if this is a read-only stream
+
+	int readone() {
+		nvAssert (bptr < bend);
+		if (bptr >= bend) return 0;
+		int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7));
+		++bptr;
+		return bit != 0;
+	}
+	void writeone(int bit) {
+		nvAssert (!readonly); // "Writing a read-only bit stream"
+		nvAssert (bptr < maxbits);
+		if (bptr >= maxbits) return;
+		if (bit&1)
+			bits[bptr>>3] |= 1 << (bptr & 7);
+		else
+			bits[bptr>>3] &= ~(1 << (bptr & 7));
+		if (bptr++ >= bend) bend = bptr;
+	}
+};
+
+}
+
+#endif
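
For reference, a minimal sketch (not from the NVTT sources) of how this Bits class is driven: fields are written least-significant-bit first into a 128-bit block and read back in the same order. The helper name and field sizes below are illustrative; the 16-byte buffer matches ZOH::BLOCKSIZE.

#include "bc6h/bits.h"

static void bits_roundtrip_sketch()            // hypothetical helper, for illustration only
{
	char block[16] = {};                       // one BC6H block: 16 bytes = 128 bits
	ZOH::Bits out(block, 128);                 // writable stream over the block
	out.write(0x03, 5);                        // e.g. a 5-bit mode field, LSB first
	out.write(1023, 10);                       // e.g. a 10-bit endpoint component

	ZOH::Bits in((const char *)block, 128);    // read-only stream over the same bits
	int mode = in.read(5);                     // == 0x03
	int rw   = in.read(10);                    // == 1023
	(void)mode; (void)rw;
}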

+ 133 - 0
3rdparty/nvtt/bc6h/shapes_two.h

@@ -0,0 +1,133 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_SHAPES_TWO_H
+#define _ZOH_SHAPES_TWO_H
+
+// shapes for two regions
+
+#define NREGIONS 2
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+static const int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   0, 0, 0, 1,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   1, 1, 1, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 0,   0, 1, 1, 1,   
+1, 0, 0, 0,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 1, 1,   
+1, 1, 1, 0,   0, 0, 0, 0,   1, 0, 0, 0,   0, 0, 0, 1,   
+1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 0, 0, 0,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 0, 0,   1, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   0, 0, 0, 1,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 0,   1, 1, 0, 0,   0, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 1,   1, 0, 0, 1,   
+1, 1, 1, 0,   1, 1, 1, 1,   1, 0, 0, 0,   1, 0, 0, 1,   
+1, 0, 0, 0,   0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   0, 0, 1, 1,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   0, 0, 1, 1,   
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   1, 1, 0, 0,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 1, 1, 0,   0, 1, 0, 1,   
+1, 1, 0, 0,   0, 1, 0, 1,   1, 0, 0, 1,   1, 0, 1, 0,   
+0, 0, 1, 1,   1, 0, 1, 0,   0, 1, 1, 0,   1, 0, 1, 0,   
+1, 1, 0, 0,   1, 0, 1, 0,   1, 0, 0, 1,   0, 1, 0, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   1, 0, 1, 1,   
+1, 1, 0, 0,   1, 1, 0, 0,   0, 1, 0, 0,   1, 1, 0, 1,   
+1, 1, 1, 0,   1, 0, 0, 0,   1, 1, 0, 0,   1, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 0, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   0, 1, 1, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   0, 0, 0, 0,   
+
+0, 1, 0, 0,   0, 0, 1, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+1, 1, 1, 0,   0, 1, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 1, 0, 0,   0, 0, 1, 0,   0, 1, 1, 1,   1, 1, 1, 0,   
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 0,   0, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 1, 1,   
+1, 1, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   1, 1, 0, 0,   
+0, 0, 1, 1,   1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   
+
+0, 1, 1, 0,   0, 1, 1, 0,   0, 1, 1, 1,   0, 0, 0, 1,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 1, 1, 0,   1, 0, 0, 0,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 0, 0, 0,   1, 1, 1, 0,   
+1, 0, 0, 1,   1, 0, 0, 1,   0, 0, 0, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+1, 1, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 0,   0, 1, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 1, 1, 1,   
+
+};
+
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
+
+static const int shapeindex_to_compressed_indices[NSHAPES*2] = 
+{
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+
+	0,15,  0, 2,  0, 8,  0, 2,
+	0, 2,  0, 8,  0, 8,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 8,  0, 8,  0, 2,  0, 2,
+
+	0,15,  0,15,  0, 6,  0, 8,
+	0, 2,  0, 8,  0,15,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 2,  0,15,  0,15,  0, 6,
+
+	0, 6,  0, 2,  0, 6,  0, 8,
+	0,15,  0,15,  0, 2,  0, 2,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0, 2,  0, 2,  0,15
+
+};
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*2+(region)]
+
+#endif
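
For reference, how the table above is addressed: each group of four source lines stores four 4x4 partitions side by side, REGION(x,y,si) returns the region (0 or 1) of pixel (x,y) for shape si, and SHAPEINDEX_TO_COMPRESSED_INDICES(si,region) gives the anchor index whose high index bit is omitted from the block. A hypothetical helper (not from the NVTT sources):

#include <stdio.h>
#include "bc6h/shapes_two.h"

static void print_shape(int si)                 // si in [0, NSHAPES)
{
	for (int y = 0; y < 4; ++y)
	{
		for (int x = 0; x < 4; ++x)
			printf("%d ", REGION(x, y, si));    // region of pixel (x,y): 0 or 1
		printf("\n");
	}
	// anchor index for region 1 (region 0 is always anchored at index 0)
	printf("region 1 anchor: %d\n", SHAPEINDEX_TO_COMPRESSED_INDICES(si, 1));
}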

+ 83 - 0
3rdparty/nvtt/bc6h/tile.h

@@ -0,0 +1,83 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_TILE_H
+#define _ZOH_TILE_H
+
+#include "zoh_utils.h"
+#include "nvmath/Vector.h"
+#include <math.h>
+
+namespace ZOH {
+
+//#define	USE_IMPORTANCE_MAP	1		// define this if you want to increase importance of some pixels in tile
+class Tile
+{
+public:
+	// NOTE: this returns the appropriately-clamped BIT PATTERN of the half as an INTEGRAL float value
+	static float half2float(uint16 h)
+	{
+		return (float) Utils::ushort_to_format(h);
+	}
+	// NOTE: this is the inverse of the above operation
+	static uint16 float2half(float f)
+	{
+		return Utils::format_to_ushort((int)f);
+	}
+
+	// look for adjacent pixels that are identical. if there are enough of them, increase their importance
+	void generate_importance_map()
+	{
+		// initialize
+		for (int y=0; y<size_y; ++y)
+		for (int x=0; x<size_x; ++x)
+		{
+			// my importance is increased if I am identical to any of my 4-neighbors
+			importance_map[y][x] = match_4_neighbor(x,y) ? 5.0f : 1.0f;
+		}
+	}
+	bool is_equal(int x, int y, int xn, int yn)
+	{
+		if (xn < 0 || xn >= size_x || yn < 0 || yn >= size_y)
+			return false;
+		return( (data[y][x].x == data[yn][xn].x) &&
+				(data[y][x].y == data[yn][xn].y) &&
+				(data[y][x].z == data[yn][xn].z) );
+	}
+
+#ifdef USE_IMPORTANCE_MAP
+	bool match_4_neighbor(int x, int y)
+	{
+		return is_equal(x,y,x-1,y) || is_equal(x,y,x+1,y) || is_equal(x,y,x,y-1) || is_equal(x,y,x,y+1);
+	}
+#else
+	bool match_4_neighbor(int, int)
+	{
+		return false;
+	}
+#endif
+
+	Tile() {};
+	~Tile(){};
+	Tile(int xs, int ys) {size_x = xs; size_y = ys;}
+
+	static const int TILE_H = 4;
+	static const int TILE_W = 4;
+	static const int TILE_TOTAL = TILE_H * TILE_W;
+    nv::Vector3 data[TILE_H][TILE_W];
+	float importance_map[TILE_H][TILE_W];
+	int	size_x, size_y;			// actual size of tile
+};
+
+}
+
+#endif // _ZOH_TILE_H
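
For reference, the NOTE above is easy to misread: half2float/float2half do not convert between half and float values, they move the clamped half bit pattern in and out of a float. A small sketch (not from the NVTT sources), assuming Utils::FORMAT matches the data being handled:

#include "bc6h/tile.h"

static void half_bits_sketch()                          // hypothetical helper, for illustration only
{
	ZOH::Utils::FORMAT = ZOH::UNSIGNED_F16;
	// 0x3C00 is the half-float encoding of 1.0; half2float returns the clamped
	// bit pattern as an integral float, not the value 1.0f.
	float v          = ZOH::Tile::half2float(0x3C00);   // == 15360.0f
	unsigned short h = ZOH::Tile::float2half(v);        // == 0x3C00
	(void)h;
}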

+ 197 - 0
3rdparty/nvtt/bc6h/zoh.cpp

@@ -0,0 +1,197 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// the zoh compressor and decompressor
+
+#include "tile.h"
+#include "zoh.h"
+
+#include <string.h> // memcpy
+
+using namespace ZOH;
+
+
+bool ZOH::isone(const char *block)
+{
+	char code = block[0] & 0x1F;
+
+	return (code == 0x03 || code == 0x07 || code == 0x0b || code == 0x0f);
+}
+
+void ZOH::compress(const Tile &t, char *block)
+{
+	char oneblock[ZOH::BLOCKSIZE], twoblock[ZOH::BLOCKSIZE];
+
+	float mseone = ZOH::compressone(t, oneblock);
+	float msetwo = ZOH::compresstwo(t, twoblock);
+
+	if (mseone <= msetwo)
+		memcpy(block, oneblock, ZOH::BLOCKSIZE);
+	else
+		memcpy(block, twoblock, ZOH::BLOCKSIZE);
+}
+
+void ZOH::decompress(const char *block, Tile &t)
+{
+	if (ZOH::isone(block))
+		ZOH::decompressone(block, t);
+	else
+		ZOH::decompresstwo(block, t);
+}
+
+/*
+void ZOH::compress(string inf, string zohf)
+{
+	Array2D<Rgba> pixels;
+	int w, h;
+	char block[ZOH::BLOCKSIZE];
+
+	Exr::readRgba(inf, pixels, w, h);
+	FILE *zohfile = fopen(zohf.c_str(), "wb");
+	if (zohfile == NULL) throw "Unable to open .zoh file for write";
+
+	// stuff for progress bar O.o
+	int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
+	int tilecnt = 0;
+	int ndots = 25;
+	int dotcnt = 0;
+	printf("Progress [");
+	for (int i=0; i<ndots;++i) printf(" ");
+	printf("]\rProgress ["); fflush(stdout);
+
+	// convert to tiles and compress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			t.insert(pixels, x, y);
+
+			ZOH::compress(t, block);
+			if (fwrite(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
+				throw "File error on write";
+
+			// progress bar
+			++tilecnt;
+			if (tilecnt > (ntiles * dotcnt)/ndots) { printf("."); fflush(stdout); ++dotcnt; }
+		}
+	}
+
+	printf("]\n");		// advance to next line finally
+
+	if (fclose(zohfile)) throw "Close failed on .zoh file";
+}
+
+static int str2int(std::string s)
+{
+	int thing;
+	std::stringstream str (stringstream::in | stringstream::out);
+	str << s;
+	str >> thing;
+	return thing;
+}
+
+// zoh file name is ...-w-h.zoh, extract width and height
+static void extract(string zohf, int &w, int &h)
+{
+	size_t n = zohf.rfind('.', zohf.length()-1);
+	size_t n1 = zohf.rfind('-', n-1);
+	size_t n2 = zohf.rfind('-', n1-1);
+	string width = zohf.substr(n2+1, n1-n2-1);
+	w = str2int(width);
+	string height = zohf.substr(n1+1, n-n1-1);
+	h = str2int(height);
+}
+
+static int mode_to_prec[] = {
+	10,7,11,10,
+	10,7,11,11,
+	10,7,11,12,
+	10,7,9,16,
+	10,7,8,-1,
+	10,7,8,-1,
+	10,7,8,-1,
+	10,7,6,-1,
+};
+
+static int shapeindexhist[32], modehist[32], prechistone[16], prechisttwo[16], oneregion, tworegions;
+
+static void stats(char block[ZOH::BLOCKSIZE])
+{
+	char mode = block[0] & 0x1F; if ((mode & 0x3) == 0) mode = 0; if ((mode & 0x3) == 1) mode = 1; modehist[mode]++;
+	int prec = mode_to_prec[mode];
+	nvAssert (prec != -1);
+	if (!ZOH::isone(block))
+	{
+		tworegions++;
+		prechisttwo[prec]++;
+		int shapeindex = ((block[0] & 0xe0) >> 5) | ((block[1] & 0x3) << 3);
+		shapeindexhist[shapeindex]++;
+	}
+	else
+	{
+		oneregion++;
+		prechistone[prec]++;
+	}
+}
+
+static void printstats()
+{
+	printf("\nPrecision histogram 10b to 16b one region: "); for (int i=10; i<=16; ++i) printf("%d,", prechistone[i]);
+	printf("\nPrecision histogram 6b to 11b two regions: "); for (int i=6; i<=11; ++i) printf("%d,", prechisttwo[i]);
+	printf("\nMode histogram: "); for (int i=0; i<32; ++i) printf("%d,", modehist[i]);
+	printf("\nShape index histogram: "); for (int i=0; i<32; ++i) printf("%d,", shapeindexhist[i]);
+	printf("\nOne region %5.2f%%  Two regions %5.2f%%", 100.0*oneregion/float(oneregion+tworegions), 100.0*tworegions/float(oneregion+tworegions));
+	printf("\n");
+}
+
+void ZOH::decompress(string zohf, string outf)
+{
+	Array2D<Rgba> pixels;
+	int w, h;
+	char block[ZOH::BLOCKSIZE];
+
+	extract(zohf, w, h);
+	FILE *zohfile = fopen(zohf.c_str(), "rb");
+	if (zohfile == NULL) throw "Unable to open .zoh file for read";
+	pixels.resizeErase(h, w);
+
+	// convert to tiles and decompress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			if (fread(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
+				throw "File error on read";
+
+			stats(block);	// collect statistics
+
+			ZOH::decompress(block, t);
+
+			t.extract(pixels, x, y);
+		}
+	}
+	if (fclose(zohfile)) throw "Close failed on .zoh file";
+	Exr::writeRgba(outf, pixels, w, h);
+
+#ifndef EXTERNAL_RELEASE
+	printstats();	// print statistics
+#endif
+}
+*/

+ 65 - 0
3rdparty/nvtt/bc6h/zoh.h

@@ -0,0 +1,65 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_H
+#define _ZOH_H
+
+#include "tile.h"
+
+namespace ZOH {
+
+// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f
+
+static const int NREGIONS_TWO	= 2;
+static const int NREGIONS_ONE	= 1;
+static const int NCHANNELS		= 3;
+
+struct FltEndpts
+{
+    nv::Vector3 A;
+    nv::Vector3 B;
+};
+
+struct IntEndpts
+{
+	int A[NCHANNELS];
+	int B[NCHANNELS];
+};
+
+struct ComprEndpts
+{
+	uint A[NCHANNELS];
+	uint B[NCHANNELS];
+};
+
+static const int BLOCKSIZE=16;
+static const int BITSIZE=128;
+
+void compress(const Tile &t, char *block);
+void decompress(const char *block, Tile &t);
+
+float compressone(const Tile &t, char *block);
+float compresstwo(const Tile &t, char *block);
+void decompressone(const char *block, Tile &t);
+void decompresstwo(const char *block, Tile &t);
+
+float refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block);
+float roughtwo(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_TWO]);
+
+float refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block);
+float roughone(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_ONE]);
+
+bool isone(const char *block);
+
+}
+
+#endif // _ZOH_H
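
For reference, a minimal sketch (not from the NVTT sources) of driving the interface declared above: set Utils::FORMAT, fill a 4x4 tile with half bit patterns via Tile::half2float, and let compress() pick whichever of the one- and two-region encodings gives the lower error. Variable and function names are illustrative.

#include "bc6h/zoh.h"

static void zoh_block_sketch(ZOH::Tile &tile)   // tile.data[][] already filled via Tile::half2float
{
	ZOH::Utils::FORMAT = ZOH::UNSIGNED_F16;     // must match how the tile data was filled
	tile.generate_importance_map();             // all weights 1.0 unless USE_IMPORTANCE_MAP is defined

	char block[ZOH::BLOCKSIZE];                 // 16 bytes = one BC6H block
	ZOH::compress(tile, block);                 // tries one- and two-region fits, keeps the lower error

	ZOH::Tile out(4, 4);
	ZOH::decompress(block, out);                // out.data[][] holds the unquantized bit-pattern values
}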

+ 324 - 0
3rdparty/nvtt/bc6h/zoh_utils.cpp

@@ -0,0 +1,324 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Utility and common routines
+
+#include "zoh_utils.h"
+#include "nvmath/Vector.inl"
+#include <math.h>
+
+using namespace nv;
+using namespace ZOH;
+
+static const int denom7_weights_64[] = {0, 9, 18, 27, 37, 46, 55, 64};										// divided by 64
+static const int denom15_weights_64[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};		// divided by 64
+
+/*static*/ Format Utils::FORMAT;
+
+int Utils::lerp(int a, int b, int i, int denom)
+{
+	nvDebugCheck (denom == 3 || denom == 7 || denom == 15);
+	nvDebugCheck (i >= 0 && i <= denom);
+
+	int round = 32, shift = 6;
+	const int *weights;
+
+	switch(denom)
+	{
+	case 3:		denom *= 5; i *= 5;	// fall through to case 15
+	case 15:	weights = denom15_weights_64; break;
+	case 7:		weights = denom7_weights_64; break;
+	default:	nvDebugCheck(0);
+	}
+
+	return (a*weights[denom-i] +b*weights[i] + round) >> shift;
+}
+
+Vector3 Utils::lerp(const Vector3& a, const Vector3 &b, int i, int denom)
+{
+	nvDebugCheck (denom == 3 || denom == 7 || denom == 15);
+	nvDebugCheck (i >= 0 && i <= denom);
+
+	int shift = 6;
+	const int *weights;
+
+	switch(denom)
+	{
+	case 3:		denom *= 5; i *= 5;	// fall through to case 15
+	case 15:	weights = denom15_weights_64; break;
+	case 7:		weights = denom7_weights_64; break;
+	default:	nvUnreachable();
+	}
+
+	// no need to round these as this is an exact division
+	return (a*float(weights[denom-i]) +b*float(weights[i])) / float(1 << shift);
+}
+
+
+/*
+	For unsigned f16, clamp the input to [0,F16MAX]. Thus u15.
+	For signed f16, clamp the input to [-F16MAX,F16MAX]. Thus s16.
+
+	The conversions proceed as follows:
+
+	unsigned f16: get bits. if high bit set, clamp to 0, else clamp to F16MAX.
+	signed f16: get bits. extract exp+mantissa and clamp to F16MAX. return -value if sign bit was set, else value
+	unsigned int: get bits. return as a positive value.
+	signed int. get bits. return as a value in -32768..32767.
+
+	The inverse conversions are just the inverse of the above.
+*/
+
+// clamp the 3 channels of the input vector to the allowable range based on FORMAT
+// note that each channel is a float storing the allowable range as a bit pattern converted to float
+// that is, for unsigned f16 say, we would clamp each channel to the range [0, F16MAX]
+
+void Utils::clamp(Vector3 &v)
+{
+	for (int i=0; i<3; ++i)
+	{
+		switch(Utils::FORMAT)
+		{
+		case UNSIGNED_F16:
+			if (v.component[i] < 0.0) v.component[i] = 0;
+			else if (v.component[i] > F16MAX) v.component[i] = F16MAX;
+			break;
+
+		case SIGNED_F16:
+			if (v.component[i] < -F16MAX) v.component[i] = -F16MAX;
+			else if (v.component[i] > F16MAX) v.component[i] = F16MAX;
+			break;
+
+		default:
+			nvUnreachable();
+		}
+	}
+}
+
+// convert a u16 value to s17 (represented as an int) based on the format expected
+int Utils::ushort_to_format(unsigned short input)
+{
+	int out, s;
+
+	// clamp to the valid range we are expecting
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		if (input & F16S_MASK) out = 0;
+		else if (input > F16MAX) out = F16MAX;
+		else out = input;
+		break;
+
+	case SIGNED_F16:
+		s = input & F16S_MASK;
+		input &= F16EM_MASK;
+		if (input > F16MAX) out = F16MAX;
+		else out = input;
+		out = s ? -out : out;
+		break;
+	}
+	return out;
+}
+
+// convert a s17 value to u16 based on the format expected
+unsigned short Utils::format_to_ushort(int input)
+{
+	unsigned short out;
+
+	// clamp to the valid range we are expecting
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		nvDebugCheck (input >= 0 && input <= F16MAX);
+		out = input;
+		break;
+
+	case SIGNED_F16:
+		nvDebugCheck (input >= -F16MAX && input <= F16MAX);
+		// convert to sign-magnitude
+		int s;
+		if (input < 0) { s = F16S_MASK; input = -input; }
+		else           { s = 0; }
+		out = s | input;
+		break;
+	}
+	return out;
+}
+
+// quantize the input range into equal-sized bins
+int Utils::quantize(float value, int prec)
+{
+	int q, ivalue, s;
+
+	nvDebugCheck (prec > 1);	// didn't bother to make it work for 1
+
+	value = (float)floor(value + 0.5);
+
+	int bias = (prec > 10) ? ((1<<(prec-1))-1) : 0;	// bias precisions 11..16 to get a more accurate quantization
+
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		nvDebugCheck (value >= 0 && value <= F16MAX);
+		ivalue = (int)value;
+		q = ((ivalue << prec) + bias) / (F16MAX+1);
+		nvDebugCheck (q >= 0 && q < (1 << prec));
+		break;
+
+	case SIGNED_F16:
+		nvDebugCheck (value >= -F16MAX && value <= F16MAX);
+		// convert to sign-magnitude
+		ivalue = (int)value;
+		if (ivalue < 0) { s = 1; ivalue = -ivalue; } else s = 0;
+
+		q = ((ivalue << (prec-1)) + bias) / (F16MAX+1);
+		if (s)
+			q = -q;
+		nvDebugCheck (q > -(1 << (prec-1)) && q < (1 << (prec-1)));
+		break;
+	}
+
+	return q;
+}
+
+int Utils::finish_unquantize(int q, int prec)
+{
+	if (Utils::FORMAT == UNSIGNED_F16)
+		return (q * 31) >> 6;										// scale the magnitude by 31/64
+	else if (Utils::FORMAT == SIGNED_F16)
+		return (q < 0) ? -(((-q) * 31) >> 5) : (q * 31) >> 5;		// scale the magnitude by 31/32
+	else
+		return q;
+}
+
+// unquantize each bin to midpoint of original bin range, except
+// for the end bins which we push to an endpoint of the bin range.
+// we do this to ensure we can represent all possible original values.
+// the asymmetric end bins do not affect PSNR for the test images.
+//
+// code this function assuming an arbitrary bit pattern as the encoded block
+int Utils::unquantize(int q, int prec)
+{
+	int unq, s;
+
+	nvDebugCheck (prec > 1);	// not implemented for prec 1
+
+	switch (Utils::FORMAT)
+	{
+	// modify this case to move the multiplication by 31 after interpolation.
+	// Need to use finish_unquantize.
+
+	// since we have 16 bits available, let's unquantize this to 16 bits unsigned
+	// thus the scale factor is [0-7c00)/[0-10000) = 31/64
+	case UNSIGNED_F16:
+		if (prec >= 15) 
+			unq = q;
+		else if (q == 0) 
+			unq = 0;
+		else if (q == ((1<<prec)-1)) 
+			unq = U16MAX;
+		else
+			unq = (q * (U16MAX+1) + (U16MAX+1)/2) >> prec;
+		break;
+
+	// here, let's stick with S16 (no apparent quality benefit from going to S17)
+	// range is (-7c00..7c00)/(-8000..8000) = 31/32
+	case SIGNED_F16:
+		// don't remove this test even though it appears equivalent to the code below
+		// as it isn't -- the code below can overflow for prec = 16
+		if (prec >= 16)
+			unq = q;
+		else
+		{
+			if (q < 0) { s = 1; q = -q; } else s = 0;
+
+			if (q == 0)
+				unq = 0;
+			else if (q >= ((1<<(prec-1))-1))
+				unq = s ? -S16MAX : S16MAX;
+			else
+			{
+				unq = (q * (S16MAX+1) + (S16MAX+1)/2) >> (prec-1);
+				if (s)
+					unq = -unq;
+			}
+		}
+		break;
+	}
+	return unq;
+}
+
+
+
+// pick a norm!
+#define	NORM_EUCLIDEAN 1
+
+float Utils::norm(const Vector3 &a, const Vector3 &b)
+{
+#ifdef	NORM_EUCLIDEAN
+	return lengthSquared(a - b);
+#endif
+#ifdef	NORM_ABS
+	Vector3 err = a - b;
+	return fabs(err.x) + fabs(err.y) + fabs(err.z);
+#endif
+}
+
+// parse <name>[<start>{:<end>}]{,}	
+// the pointer starts here         ^
+// name is 1 or 2 chars and matches field names. start and end are decimal numbers
+void Utils::parse(const char *encoding, int &ptr, Field &field, int &endbit, int &len)
+{
+	if (ptr <= 0) return;
+	--ptr;
+	if (encoding[ptr] == ',') --ptr;
+	nvDebugCheck (encoding[ptr] == ']');
+	--ptr;
+	endbit = 0;
+	int scale = 1;
+	while (encoding[ptr] != ':' && encoding[ptr] != '[')
+	{
+		nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9');
+		endbit += (encoding[ptr--] - '0') * scale;
+		scale *= 10;
+	}
+	int startbit = 0; scale = 1;
+	if (encoding[ptr] == '[')
+		startbit = endbit;
+	else  
+	{
+		ptr--;
+		while (encoding[ptr] != '[')
+		{
+			nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9');
+			startbit += (encoding[ptr--] - '0') * scale;
+			scale *= 10;
+		}
+	}
+	len = startbit - endbit + 1;	// startbit>=endbit note
+	--ptr;
+	if (encoding[ptr] == 'm')		field = FIELD_M;
+	else if (encoding[ptr] == 'd')	field = FIELD_D;
+	else {
+		// it's wxyz
+		nvDebugCheck (encoding[ptr] >= 'w' && encoding[ptr] <= 'z');
+		int foo = encoding[ptr--] - 'w';
+		// now it is r g or b
+		if (encoding[ptr] == 'r')		foo += 10;
+		else if (encoding[ptr] == 'g')	foo += 20;
+		else if (encoding[ptr] == 'b')	foo += 30;
+		else nvDebugCheck(0);
+		field = (Field) foo;
+	}
+}
+
+
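
For reference (not from the NVTT sources), a worked example of the weighted lerp above: the weights sum to 64, so the blend is an integer multiply-accumulate followed by a rounding shift by 6.

// What Utils::lerp(a, b, i, denom) computes for a = 0, b = 100, i = 3, denom = 7:
//   weights = denom7_weights_64 = {0, 9, 18, 27, 37, 46, 55, 64}
//   result  = (a*weights[7-3] + b*weights[3] + 32) >> 6
//           = (0*37 + 100*27 + 32) >> 6
//           = 2732 >> 6
//           = 42                          // roughly 3/7 of the way from a to b
int lerp_example = ZOH::Utils::lerp(0, 100, 3, 7);   // == 42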

+ 73 - 0
3rdparty/nvtt/bc6h/zoh_utils.h

@@ -0,0 +1,73 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// utility class holding common routines
+#pragma once
+#ifndef _ZOH_UTILS_H
+#define _ZOH_UTILS_H
+
+#include "nvmath/Vector.h"
+
+namespace ZOH {
+
+inline int SIGN_EXTEND(int x, int nb) { return ((((signed(x))&(1<<((nb)-1)))?((~0)<<(nb)):0)|(signed(x))); }
+
+enum Field {
+    FIELD_M = 1,	// mode
+    FIELD_D = 2,	// distribution/shape
+    FIELD_RW = 10+0, FIELD_RX = 10+1, FIELD_RY = 10+2, FIELD_RZ = 10+3,	// red channel endpoints or deltas
+    FIELD_GW = 20+0, FIELD_GX = 20+1, FIELD_GY = 20+2, FIELD_GZ = 20+3,	// green channel endpoints or deltas
+    FIELD_BW = 30+0, FIELD_BX = 30+1, FIELD_BY = 30+2, FIELD_BZ = 30+3,	// blue channel endpoints or deltas
+};
+
+// some constants
+static const int F16S_MASK	=  0x8000;		// f16 sign mask
+static const int F16EM_MASK	=  0x7fff;		// f16 exp & mantissa mask
+static const int U16MAX		=  0xffff;
+static const int S16MIN		= -0x8000;
+static const int S16MAX		=  0x7fff;
+static const int INT16_MASK	=  0xffff;
+static const int F16MAX		=  0x7bff;		// MAXFLT bit pattern for halfs
+
+enum Format { UNSIGNED_F16, SIGNED_F16 };
+
+class Utils
+{
+public:
+    static Format FORMAT;     // this is a global -- we're either handling signed or unsigned half values
+
+    // error metrics
+    static float norm(const nv::Vector3 &a, const nv::Vector3 &b);
+    static float mpsnr_norm(const nv::Vector3 &a, int exposure, const nv::Vector3 &b);
+
+    // conversion & clamp
+    static int ushort_to_format(unsigned short input);
+    static unsigned short format_to_ushort(int input);
+
+    // clamp to format
+    static void clamp(nv::Vector3 &v);
+
+    // quantization and unquantization
+    static int finish_unquantize(int q, int prec);
+    static int unquantize(int q, int prec);
+    static int quantize(float value, int prec);
+
+    static void parse(const char *encoding, int &ptr, Field & field, int &endbit, int &len);
+
+    // lerping
+    static int lerp(int a, int b, int i, int denom);
+    static nv::Vector3 lerp(const nv::Vector3 & a, const nv::Vector3 & b, int i, int denom);
+};
+
+}
+
+#endif // _ZOH_UTILS_H
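
For reference (not from the NVTT sources), SIGN_EXTEND above turns an nb-bit two's-complement field into a full int; it is what makes the transformed (delta) endpoint fields signed when decoded. Two illustrative values:

int se_a = ZOH::SIGN_EXTEND(0x1F, 5);   // 0b11111 as a 5-bit field -> -1
int se_b = ZOH::SIGN_EXTEND(0x0F, 5);   // 0b01111, high bit clear  -> 15 (unchanged)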

+ 799 - 0
3rdparty/nvtt/bc6h/zohone.cpp

@@ -0,0 +1,799 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// one region zoh compress/decompress code
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+#include "bits.h"
+#include "tile.h"
+#include "zoh.h"
+#include "zoh_utils.h"
+
+#include "nvmath/Vector.inl"
+#include "nvmath/Fitting.h"
+
+#include <string.h> // strlen
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+using namespace ZOH;
+
+#define NINDICES	16
+#define	INDEXBITS	4
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+
+#define	NSHAPES	1
+
+static const int shapes[NSHAPES] =
+{
+    0x0000
+};	// only 1 shape
+
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NDELTA	2
+
+struct Chanpat
+{
+    int prec[NDELTA];		// precision pattern for one channel
+};
+
+struct Pattern
+{
+    Chanpat chan[NCHANNELS];// allow different bit patterns per channel -- but we still want constant precision per channel
+    int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+    int mode;				// associated mode value
+    int modebits;			// number of mode bits
+    const char *encoding;	// verilog description of encoding for this mode
+};
+
+#define MAXMODEBITS	5
+#define	MAXMODES (1<<MAXMODEBITS)
+
+#define	NPATTERNS 4
+
+static const Pattern patterns[NPATTERNS] =
+{
+    16,4,  16,4,  16,4,   1, 0x0f, 5, "bw[10],bw[11],bw[12],bw[13],bw[14],bw[15],bx[3:0],gw[10],gw[11],gw[12],gw[13],gw[14],gw[15],gx[3:0],rw[10],rw[11],rw[12],rw[13],rw[14],rw[15],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    12,8,  12,8,  12,8,   1, 0x0b, 5, "bw[10],bw[11],bx[7:0],gw[10],gw[11],gx[7:0],rw[10],rw[11],rx[7:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,9,  11,9,  11,9,   1, 0x07, 5, "bw[10],bx[8:0],gw[10],gx[8:0],rw[10],rx[8:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    10,10, 10,10, 10,10,  0, 0x03, 5, "bx[9:0],gx[9:0],rx[9:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+};
+
+// mapping of mode to the corresponding index in pattern
+static const int mode_to_pat[MAXMODES] = {
+    -1,-1,-1,
+    3,	// 0x03
+    -1,-1,-1,
+    2,	// 0x07
+    -1,-1,-1,
+    1,	// 0x0b
+    -1,-1,-1,
+    0,	// 0x0f
+    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+};
+
+#define	R_0(ep)	(ep)[0].A[i]
+#define	R_1(ep)	(ep)[0].B[i]
+#define	MASK(n)	((1<<(n))-1)
+
+// compress endpoints
+static void compress_endpts(const IntEndpts in[NREGIONS_ONE], ComprEndpts out[NREGIONS_ONE], const Pattern &p)
+{
+    if (p.transformed)
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
+            R_1(out) = (R_1(in) - R_0(in)) & MASK(p.chan[i].prec[1]);
+        }
+    }
+    else
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
+            R_1(out) = R_1(in) & MASK(p.chan[i].prec[1]);
+        }
+    }
+}
+
+// decompress endpoints
+static void decompress_endpts(const ComprEndpts in[NREGIONS_ONE], IntEndpts out[NREGIONS_ONE], const Pattern &p)
+{
+    bool issigned = Utils::FORMAT == SIGNED_F16;
+
+    if (p.transformed)
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+            int t;
+            t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+        }
+    }
+    else
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+            R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
+        }
+    }
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_ONE], int prec, IntEndpts q_endpts[NREGIONS_ONE])
+{
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, prec);
+        q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, prec);
+        q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, prec);
+        q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, prec);
+        q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, prec);
+        q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, prec);
+    }
+}
+
+// swap endpoints as needed to ensure that the index at index_one has a 0 high-order bit
+// index_one is 0 at x=0 y=0 and 15 at x=3 y=3 so y = (index >> 2) & 3 and x = index & 3
+static void swap_indices(IntEndpts endpts[NREGIONS_ONE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+    int index_positions[NREGIONS_ONE];
+
+    index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        int x = index_positions[region] & 3;
+        int y = (index_positions[region] >> 2) & 3;
+        nvDebugCheck(REGION(x,y,shapeindex) == region);		// double check the table
+        if (indices[y][x] & HIGH_INDEXBIT)
+        {
+            // high bit is set, swap the endpts and indices for this region
+            int t;
+            for (int i=0; i<NCHANNELS; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+            for (int y = 0; y < Tile::TILE_H; y++)
+                for (int x = 0; x < Tile::TILE_W; x++)
+                    if (REGION(x,y,shapeindex) == region)
+                        indices[y][x] = NINDICES - 1 - indices[y][x];
+        }
+    }
+}
+
+// endpoints fit only if the compression was lossless
+static bool endpts_fit(const IntEndpts orig[NREGIONS_ONE], const ComprEndpts compressed[NREGIONS_ONE], const Pattern &p)
+{
+    IntEndpts uncompressed[NREGIONS_ONE];
+
+    decompress_endpts(compressed, uncompressed, p);
+
+    for (int j=0; j<NREGIONS_ONE; ++j)
+	for (int i=0; i<NCHANNELS; ++i)
+	{
+        if (orig[j].A[i] != uncompressed[j].A[i]) return false;
+        if (orig[j].B[i] != uncompressed[j].B[i]) return false;
+    }
+    return true;
+}
+
+static void write_header(const ComprEndpts endpts[NREGIONS_ONE], const Pattern &p, Bits &out)
+{
+    // interpret the verilog backwards and process it
+    int m = p.mode;
+    int rw = endpts[0].A[0], rx = endpts[0].B[0];
+    int gw = endpts[0].A[1], gx = endpts[0].B[1];
+    int bw = endpts[0].A[2], bx = endpts[0].B[2];
+    int ptr = int(strlen(p.encoding));
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+		// !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+        switch(field)
+        {
+        case FIELD_M:	out.write( m >> endbit, len); break;
+        case FIELD_RW:	out.write(rw >> endbit, len); break;
+        case FIELD_RX:	out.write(rx >> endbit, len); break;
+        case FIELD_GW:	out.write(gw >> endbit, len); break;
+        case FIELD_GX:	out.write(gx >> endbit, len); break;
+        case FIELD_BW:	out.write(bw >> endbit, len); break;
+        case FIELD_BX:	out.write(bx >> endbit, len); break;
+
+        case FIELD_D:
+        case FIELD_RY:
+        case FIELD_RZ:
+        case FIELD_GY:
+        case FIELD_GZ:
+        case FIELD_BY:
+        case FIELD_BZ:
+        default: nvUnreachable();
+        }
+    }
+}
+
+static void read_header(Bits &in, ComprEndpts endpts[NREGIONS_ONE], Pattern &p)
+{
+    // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
+    int mode = in.read(2);
+    if (mode != 0x00 && mode != 0x01)
+        mode = (in.read(3) << 2) | mode;
+
+    int pat_index = mode_to_pat[mode];
+
+    nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
+    nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
+
+    p = patterns[pat_index];
+
+    int d;
+    int rw, rx;
+    int gw, gx;
+    int bw, bx;
+
+    d = 0;
+    rw = rx = 0;
+    gw = gx = 0;
+    bw = bx = 0;
+
+    int ptr = int(strlen(p.encoding));
+
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+		// !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        switch(field)
+        {
+        case FIELD_M:	break;	// already processed so ignore
+        case FIELD_RW:	rw |= in.read(len) << endbit; break;
+        case FIELD_RX:	rx |= in.read(len) << endbit; break;
+        case FIELD_GW:	gw |= in.read(len) << endbit; break;
+        case FIELD_GX:	gx |= in.read(len) << endbit; break;
+        case FIELD_BW:	bw |= in.read(len) << endbit; break;
+        case FIELD_BX:	bx |= in.read(len) << endbit; break;
+
+        case FIELD_D:
+        case FIELD_RY:
+        case FIELD_RZ:
+        case FIELD_GY:
+        case FIELD_GZ:
+        case FIELD_BY:
+        case FIELD_BZ:
+        default: nvUnreachable();
+        }
+    }
+
+    nvDebugCheck (in.getptr() == 128 - 63);
+
+    endpts[0].A[0] = rw; endpts[0].B[0] = rx;
+    endpts[0].A[1] = gw; endpts[0].B[1] = gx;
+    endpts[0].A[2] = bw; endpts[0].B[2] = bx;
+}
+
+// compress index 0
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        int x = POS_TO_X(pos);
+        int y = POS_TO_Y(pos);
+
+        out.write(indices[y][x], INDEXBITS - ((pos == 0) ? 1 : 0));
+    }
+}
+
+static void emit_block(const ComprEndpts endpts[NREGIONS_ONE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+    Bits out(block, ZOH::BITSIZE);
+
+    write_header(endpts, p, out);
+
+    write_indices(indices, shapeindex, out);
+
+    nvDebugCheck(out.getptr() == ZOH::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
+{
+    // scale endpoints
+    int a, b;			// really need a IntVector3...
+
+    a = Utils::unquantize(endpts.A[0], prec);
+    b = Utils::unquantize(endpts.B[0], prec);
+
+    // interpolate
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+
+    a = Utils::unquantize(endpts.A[1], prec);
+    b = Utils::unquantize(endpts.B[1], prec);
+
+    // interpolate
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+
+    a = Utils::unquantize(endpts.A[2], prec);
+    b = Utils::unquantize(endpts.B[2], prec);
+
+    // interpolate
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+}
+
+// position 0 was compressed
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        int x = POS_TO_X(pos);
+        int y = POS_TO_Y(pos);
+
+        indices[y][x]= in.read(INDEXBITS - ((pos == 0) ? 1 : 0));
+    }
+}
+
+void ZOH::decompressone(const char *block, Tile &t)
+{
+    Bits in(block, ZOH::BITSIZE);
+
+    Pattern p;
+    IntEndpts endpts[NREGIONS_ONE];
+    ComprEndpts compr_endpts[NREGIONS_ONE];
+
+    read_header(in, compr_endpts, p);
+    int shapeindex = 0;		// only one shape
+
+    decompress_endpts(compr_endpts, endpts, p);
+
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+    for (int r = 0; r < NREGIONS_ONE; ++r)
+        generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);
+
+    // read indices
+    int indices[Tile::TILE_H][Tile::TILE_W];
+
+    read_indices(in, shapeindex, indices);
+
+    nvDebugCheck(in.getptr() == ZOH::BITSIZE);
+
+    // lookup
+    for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+            t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
+{
+    Vector3 palette[NINDICES];
+    float toterr = 0;
+    Vector3 err;
+
+    generate_palette_quantized(endpts, prec, palette);
+
+    for (int i = 0; i < np; ++i)
+    {
+        float err, besterr;
+
+        besterr = Utils::norm(colors[i], palette[0]) * importance[i];
+
+        for (int j = 1; j < NINDICES && besterr > 0; ++j)
+        {
+            err = Utils::norm(colors[i], palette[j]) * importance[i];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_ONE], int prec, 
+                           int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_ONE])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        generate_palette_quantized(endpts[region], prec, &palette[region][0]);
+        toterr[region] = 0;
+    }
+
+    Vector3 err;
+
+    for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+        int region = REGION(x,y,shapeindex);
+        float err, besterr;
+
+        besterr = Utils::norm(tile.data[y][x], palette[region][0]);
+        indices[y][x] = 0;
+
+        for (int i = 1; i < NINDICES && besterr > 0; ++i)
+        {
+            err = Utils::norm(tile.data[y][x], palette[region][i]);
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+            {
+                besterr = err;
+                indices[y][x] = i;
+            }
+        }
+        toterr[region] += besterr;
+    }
+}
+
+static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
+                          float old_err, int do_b)
+{
+    // we have the old endpoints: old_endpts
+    // we have the perturbed endpoints: new_endpts
+    // we have the temporary endpoints: temp_endpts
+
+    IntEndpts temp_endpts;
+    float min_err = old_err;		// start with the best current error
+    int beststep;
+
+    // copy real endpoints so we can perturb them
+    for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
+
+    // do a logarithmic search for the best error for this endpoint (which)
+    for (int step = 1 << (prec-1); step; step >>= 1)
+    {
+        bool improved = false;
+        for (int sign = -1; sign <= 1; sign += 2)
+        {
+            if (do_b == 0)
+            {
+                temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+                if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+                    continue;
+            }
+            else
+            {
+                temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+                if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+                    continue;
+            }
+
+            float err = map_colors(colors, importance, np, temp_endpts, prec);
+
+            if (err < min_err)
+            {
+                improved = true;
+                min_err = err;
+                beststep = sign * step;
+            }
+        }
+        // if this was an improvement, move the endpoint and continue search from there
+        if (improved)
+        {
+            if (do_b == 0)
+                new_endpts.A[ch] += beststep;
+            else
+                new_endpts.B[ch] += beststep;
+        }
+    }
+    return min_err;
+}
+
+static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
+{
+    float opt_err = orig_err;
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        opt_endpts.A[ch] = orig_endpts.A[ch];
+        opt_endpts.B[ch] = orig_endpts.B[ch];
+    }
+    /*
+        err0 = perturb(rgb0, delta0)
+        err1 = perturb(rgb1, delta1)
+        if (err0 < err1)
+            if (err0 >= initial_error) break
+            rgb0 += delta0
+            next = 1
+        else
+            if (err1 >= initial_error) break
+            rgb1 += delta1
+            next = 0
+        initial_err = map()
+        for (;;)
+            err = perturb(next ? rgb1:rgb0, delta)
+            if (err >= initial_err) break
+            next? rgb1 : rgb0 += delta
+            initial_err = err
+	*/
+    IntEndpts new_a, new_b;
+    IntEndpts new_endpt;
+    int do_b;
+
+    // now optimize each channel separately
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        // figure out which endpoint when perturbed gives the most improvement and start there
+        // if we just alternate, we can easily end up in a local minima
+        float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1);	// perturb endpt B
+
+        if (err0 < err1)
+        {
+            if (err0 >= opt_err)
+                continue;
+
+            opt_endpts.A[ch] = new_a.A[ch];
+            opt_err = err0;
+            do_b = 1;		// do B next
+        }
+        else
+        {
+            if (err1 >= opt_err)
+                continue;
+            opt_endpts.B[ch] = new_b.B[ch];
+            opt_err = err1;
+            do_b = 0;		// do A next
+        }
+
+        // now alternate endpoints and keep trying until there is no improvement
+        for (;;)
+        {
+            float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
+            if (err >= opt_err)
+                break;
+            if (do_b == 0)
+                opt_endpts.A[ch] = new_endpt.A[ch];
+            else
+                opt_endpts.B[ch] = new_endpt.B[ch];
+            opt_err = err;
+            do_b = 1 - do_b;	// now move the other endpoint
+        }
+    }
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_ONE], 
+                            const IntEndpts orig_endpts[NREGIONS_ONE], int prec, IntEndpts opt_endpts[NREGIONS_ONE])
+{
+    Vector3 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+    float err = 0;
+
+    for (int region=0; region<NREGIONS_ONE; ++region)
+    {
+        // collect the pixels in the region
+        int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    ++np;
+                }
+            }
+        }
+
+        optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
+    }
+}
+
+/* optimization algorithm
+    for each pattern
+        convert endpoints using pattern precision
+        assign indices and get initial error
+        compress indices (and possibly reorder endpoints)
+        transform endpoints
+        if transformed endpoints fit pattern
+            get original endpoints back
+            optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+            compress new indices
+            transform new endpoints
+            if new endpoints fit pattern AND if error is improved
+                emit compressed block with new data
+            else
+                emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+float ZOH::refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block)
+{
+    float orig_err[NREGIONS_ONE], opt_err[NREGIONS_ONE], orig_toterr, opt_toterr;
+    IntEndpts orig_endpts[NREGIONS_ONE], opt_endpts[NREGIONS_ONE];
+    ComprEndpts compr_orig[NREGIONS_ONE], compr_opt[NREGIONS_ONE];
+    int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+    for (int sp = 0; sp < NPATTERNS; ++sp)
+    {
+        // precisions for all channels need to be the same
+        for (int i=1; i<NCHANNELS; ++i) nvDebugCheck (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);
+
+        quantize_endpts(endpts, patterns[sp].chan[0].prec[0], orig_endpts);
+        assign_indices(tile, shapeindex_best, orig_endpts, patterns[sp].chan[0].prec[0], orig_indices, orig_err);
+        swap_indices(orig_endpts, orig_indices, shapeindex_best);
+        compress_endpts(orig_endpts, compr_orig, patterns[sp]);
+        if (endpts_fit(orig_endpts, compr_orig, patterns[sp]))
+        {
+            optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, patterns[sp].chan[0].prec[0], opt_endpts);
+            assign_indices(tile, shapeindex_best, opt_endpts, patterns[sp].chan[0].prec[0], opt_indices, opt_err);
+            swap_indices(opt_endpts, opt_indices, shapeindex_best);
+            compress_endpts(opt_endpts, compr_opt, patterns[sp]);
+            orig_toterr = opt_toterr = 0;
+            for (int i=0; i < NREGIONS_ONE; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+
+            if (endpts_fit(opt_endpts, compr_opt, patterns[sp]) && opt_toterr < orig_toterr)
+            {
+                emit_block(compr_opt, shapeindex_best, patterns[sp], opt_indices, block);
+                return opt_toterr;
+            }
+            else
+            {
+                // either it stopped fitting when we optimized it, or there was no improvement
+                // so go back to the unoptimized endpoints which we know will fit
+                emit_block(compr_orig, shapeindex_best, patterns[sp], orig_indices, block);
+                return orig_toterr;
+            }
+        }
+    }
+
+	nvAssert (false); // "No candidate found, should never happen (refineone.)";
+	return FLT_MAX;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_ONE], Vector3 palette[NREGIONS_ONE][NINDICES])
+{
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+            palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_ONE])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+
+    generate_palette_unquantized(endpts, palette);
+
+    float toterr = 0;
+    Vector3 err;
+
+    for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+        int region = REGION(x,y,shapeindex);
+        float err, besterr;
+
+        besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
+
+        for (int i = 1; i < NINDICES && besterr > 0; ++i)
+        {
+            err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+float ZOH::roughone(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_ONE])
+{
+    for (int region=0; region<NREGIONS_ONE; ++region)
+    {
+        int np = 0;
+        Vector3 colors[Tile::TILE_TOTAL];
+        Vector3 mean(0,0,0);
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x,y,shapeindex) == region)
+                {
+                    colors[np] = tile.data[y][x];
+                    mean += tile.data[y][x];
+                    ++np;
+                }
+            }
+        }
+
+        // handle simple cases
+        if (np == 0)
+        {
+            Vector3 zero(0,0,0);
+            endpts[region].A = zero;
+            endpts[region].B = zero;
+            continue;
+        }
+        else if (np == 1)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[0];
+            continue;
+        }
+        else if (np == 2)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[1];
+            continue;
+        }
+
+        mean /= float(np);
+
+        Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+        // project each pixel value along the principal direction
+        float minp = FLT_MAX, maxp = -FLT_MAX;
+        for (int i = 0; i < np; i++)
+        {
+            float dp = dot(colors[i]-mean, direction);
+            if (dp < minp) minp = dp;
+            if (dp > maxp) maxp = dp;
+        }
+
+        // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+        endpts[region].A = mean + minp*direction;
+        endpts[region].B = mean + maxp*direction;
+
+        // clamp endpoints
+        // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+        // shape based on endpoints being clamped
+        Utils::clamp(endpts[region].A);
+        Utils::clamp(endpts[region].B);
+    }
+
+    return map_colors(tile, shapeindex, endpts);
+}
+
+float ZOH::compressone(const Tile &t, char *block)
+{
+    int shapeindex_best = 0;
+    FltEndpts endptsbest[NREGIONS_ONE], tempendpts[NREGIONS_ONE];
+    float msebest = FLT_MAX;
+
+    /*
+		collect the mse values that are within 5% of the best values
+		optimize each one and choose the best
+	*/
+    // hack for now -- just use the best value WORK
+    for (int i=0; i<NSHAPES && msebest>0.0; ++i)
+    {
+        float mse = roughone(t, i, tempendpts);
+        if (mse < msebest)
+        {
+            msebest = mse;
+            shapeindex_best = i;
+            memcpy(endptsbest, tempendpts, sizeof(endptsbest));
+        }
+
+    }
+    return refineone(t, shapeindex_best, endptsbest, block);
+}

+ 883 - 0
3rdparty/nvtt/bc6h/zohtwo.cpp

@@ -0,0 +1,883 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// two regions zoh compress/decompress code
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+/* optimization algorithm
+
+	get initial float endpoints
+	convert endpoints using 16 bit precision, transform, and get bit delta. choose likely endpoint compression candidates.
+		note that there will be 1 or 2 candidates; 2 will be chosen when the delta values are close to the max possible.
+	for each EC candidate in order from max precision to smaller precision
+		convert endpoints using the appropriate precision.
+		optimize the endpoints and minimize square error. save the error and index assignments. apply index compression as well.
+			(thus the endpoints and indices are in final form.)
+		transform and get bit delta.
+		if the bit delta fits, exit
+	if we ended up with no candidates somehow, choose the tail set of EC candidates and retry. this should hardly ever happen.
+		add a state variable and an nvDebugCheck to verify we only do this once.
+	convert to bit stream.
+	return the error.
+
+	Global optimization
+		order all tiles based on their errors
+		do something special for high-error tiles
+			the goal here is to try to avoid tiling artifacts. but I think this is a research problem. let's just generate an error image...
+
+	display an image that shows partitioning and precision selected for each tile
+*/
+
+#include "bits.h"
+#include "tile.h"
+#include "zoh.h"
+#include "zoh_utils.h"
+
+#include "nvmath/Fitting.h"
+#include "nvmath/Vector.inl"
+
+#include <string.h> // strlen
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+using namespace ZOH;
+
+#define NINDICES	8
+#define	INDEXBITS	3
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
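+// two-region blocks use 3-bit indices: 8 palette entries per region, interpolated with denominator NINDICES-1 = 7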
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#include "shapes_two.h"
+// use only the first 32 available shapes
+#undef NSHAPES
+#undef SHAPEBITS
+#define NSHAPES 32
+#define SHAPEBITS 5
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NDELTA	4
+
+struct Chanpat
+{
+    int prec[NDELTA];		// precision pattern for one channel
+};
+
+struct Pattern
+{
+    Chanpat chan[NCHANNELS];    // allow different bit patterns per channel -- but we still want constant precision per channel
+    int transformed;            // if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+    int mode;                   // associated mode value
+    int modebits;               // number of mode bits
+    const char *encoding;       // verilog description of encoding for this mode
+};
+
+#define MAXMODEBITS	5
+#define	MAXMODES (1<<MAXMODEBITS)
+
+#define	NPATTERNS 10
+
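+// each row below gives, per channel (r,g,b), the precisions of the four endpoint fields {w,x,y,z},
+// then the transformed flag, the mode value, the mode bit count, and the header bit-layout string;
+// write_header processes the layout string from its end, so the mode bits (listed last) are written first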
+static const Pattern patterns[NPATTERNS] =
+{
+    11,5,5,5,	11,4,4,4,	11,4,4,4,	1,	0x02, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],rw[10],rx[4:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,4,4,4,	11,5,5,5,	11,4,4,4,	1,	0x06, 5, "d[4:0],bz[3],gy[4],rz[3:0],bz[2],bz[0],ry[3:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],gw[10],gx[4:0],gy[3:0],gz[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,4,4,4,	11,4,4,4,	11,5,5,5,	1,	0x0a, 5, "d[4:0],bz[3],bz[4],rz[3:0],bz[2:1],ry[3:0],by[3:0],bw[10],bx[4:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],by[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    10,5,5,5,	10,5,5,5,	10,5,5,5,	1,	0x00, 2, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bw[9:0],gw[9:0],rw[9:0],bz[4],by[4],gy[4],m[1:0]",
+    9,5,5,5,	9,5,5,5,	9,5,5,5,	1,	0x0e, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bw[8:0],gy[4],gw[8:0],by[4],rw[8:0],m[4:0]",
+    8,6,6,6,	8,5,5,5,	8,5,5,5,	1,	0x12, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],rx[5:0],bz[4:3],bw[7:0],gy[4],bz[2],gw[7:0],by[4],gz[4],rw[7:0],m[4:0]",
+    8,5,5,5,	8,6,6,6,	8,5,5,5,	1,	0x16, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],gx[5:0],gy[3:0],gz[4],rx[4:0],bz[4],gz[5],bw[7:0],gy[4],gy[5],gw[7:0],by[4],bz[0],rw[7:0],m[4:0]",
+    8,5,5,5,	8,5,5,5,	8,6,6,6,	1,	0x1a, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bx[5:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bz[5],bw[7:0],gy[4],by[5],gw[7:0],by[4],bz[1],rw[7:0],m[4:0]",
+    7,6,6,6,	7,6,6,6,	7,6,6,6,	1,	0x01, 2, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],bw[6:0],gy[4],bz[2],by[5],gw[6:0],by[4],bz[1:0],rw[6:0],gz[5:4],gy[5],m[1:0]",
+    6,6,6,6,	6,6,6,6,	6,6,6,6,	0,	0x1e, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],gz[5],bw[5:0],gy[4],bz[2],by[5],gy[5],gw[5:0],by[4],bz[1:0],gz[4],rw[5:0],m[4:0]",
+};
+
+// mapping of mode to the corresponding index in pattern
+// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f -- return -2 for these
+static const int mode_to_pat[MAXMODES] = {	
+    3,	// 0x00
+    8,	// 0x01
+    0,	// 0x02
+    -1,-1,-1,
+    1,	// 0x06
+    -1,-1,-1,
+    2,	// 0x0a
+    -1,-1,-1,
+    4,	// 0x0e
+    -1,-1,-1,
+    5,	// 0x12
+    -2,-1,-1,
+    6,	// 0x16
+    -2,-1,-1,
+    7,	// 0x1a
+    -2,-1,-1,
+    9,	// 0x1e
+    -2
+};
+
+#define	R_0(ep)	(ep)[0].A[i]
+#define	R_1(ep)	(ep)[0].B[i]
+#define	R_2(ep)	(ep)[1].A[i]
+#define	R_3(ep)	(ep)[1].B[i]
+#define	MASK(n)	((1<<(n))-1)
+
+// compress endpoints
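+// when the pattern is transformed, endpoint A of region 0 is stored as-is and the other three
+// endpoints are stored as deltas from it, each masked to that field's precision; otherwise all
+// four endpoints are stored directly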
+static void compress_endpts(const IntEndpts in[NREGIONS_TWO], ComprEndpts out[NREGIONS_TWO], const Pattern &p)
+{
+    if (p.transformed)
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
+            R_1(out) = (R_1(in) - R_0(in)) & MASK(p.chan[i].prec[1]);
+            R_2(out) = (R_2(in) - R_0(in)) & MASK(p.chan[i].prec[2]);
+            R_3(out) = (R_3(in) - R_0(in)) & MASK(p.chan[i].prec[3]);
+        }
+    }
+    else
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
+            R_1(out) = R_1(in) & MASK(p.chan[i].prec[1]);
+            R_2(out) = R_2(in) & MASK(p.chan[i].prec[2]);
+            R_3(out) = R_3(in) & MASK(p.chan[i].prec[3]);
+        }
+    }
+}
+
+// decompress endpoints
+static void decompress_endpts(const ComprEndpts in[NREGIONS_TWO], IntEndpts out[NREGIONS_TWO], const Pattern &p)
+{
+    bool issigned = Utils::FORMAT == SIGNED_F16;
+
+    if (p.transformed)
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+            int t;
+            t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+            t = SIGN_EXTEND(R_2(in), p.chan[i].prec[2]);
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_2(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+            t = SIGN_EXTEND(R_3(in), p.chan[i].prec[3]);
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_3(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+        }
+    }
+    else
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+            R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
+            R_2(out) = issigned ? SIGN_EXTEND(R_2(in),p.chan[i].prec[2]) : R_2(in);
+            R_3(out) = issigned ? SIGN_EXTEND(R_3(in),p.chan[i].prec[3]) : R_3(in);
+        }
+    }
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_TWO], int prec, IntEndpts q_endpts[NREGIONS_TWO])
+{
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, prec);
+        q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, prec);
+        q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, prec);
+        q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, prec);
+        q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, prec);
+        q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, prec);
+    }
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
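+// (the anchor index of each region is written with one less bit, so its high-order bit must be zero)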
+static void swap_indices(IntEndpts endpts[NREGIONS_TWO], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+        int x = POS_TO_X(position);
+        int y = POS_TO_Y(position);
+        nvDebugCheck(REGION(x,y,shapeindex) == region);		// double check the table
+        if (indices[y][x] & HIGH_INDEXBIT)
+        {
+            // high bit is set, swap the endpts and indices for this region
+            int t;
+            for (int i=0; i<NCHANNELS; ++i)
+            {
+                t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
+            }
+
+            for (int y = 0; y < Tile::TILE_H; y++)
+                for (int x = 0; x < Tile::TILE_W; x++)
+                    if (REGION(x,y,shapeindex) == region)
+                        indices[y][x] = NINDICES - 1 - indices[y][x];
+        }
+    }
+}
+
+// endpoints fit only if the compression was lossless
+static bool endpts_fit(const IntEndpts orig[NREGIONS_TWO], const ComprEndpts compressed[NREGIONS_TWO], const Pattern &p)
+{
+    IntEndpts uncompressed[NREGIONS_TWO];
+
+    decompress_endpts(compressed, uncompressed, p);
+
+    for (int j=0; j<NREGIONS_TWO; ++j)
+    {
+	for (int i=0; i<NCHANNELS; ++i)
+	{
+            if (orig[j].A[i] != uncompressed[j].A[i]) return false;
+            if (orig[j].B[i] != uncompressed[j].B[i]) return false;
+        }
+    }
+    return true;
+}
+
+static void write_header(const ComprEndpts endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, Bits &out)
+{
+    // interpret the verilog backwards and process it
+    int m = p.mode;
+    int d = shapeindex;
+    int rw = endpts[0].A[0], rx = endpts[0].B[0], ry = endpts[1].A[0], rz = endpts[1].B[0];
+    int gw = endpts[0].A[1], gx = endpts[0].B[1], gy = endpts[1].A[1], gz = endpts[1].B[1];
+    int bw = endpts[0].A[2], bx = endpts[0].B[2], by = endpts[1].A[2], bz = endpts[1].B[2];
+    int ptr = int(strlen(p.encoding));
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+		// !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+        switch(field)
+        {
+        case FIELD_M:	out.write( m >> endbit, len); break;
+        case FIELD_D:	out.write( d >> endbit, len); break;
+        case FIELD_RW:	out.write(rw >> endbit, len); break;
+        case FIELD_RX:	out.write(rx >> endbit, len); break;
+        case FIELD_RY:	out.write(ry >> endbit, len); break;
+        case FIELD_RZ:	out.write(rz >> endbit, len); break;
+        case FIELD_GW:	out.write(gw >> endbit, len); break;
+        case FIELD_GX:	out.write(gx >> endbit, len); break;
+        case FIELD_GY:	out.write(gy >> endbit, len); break;
+        case FIELD_GZ:	out.write(gz >> endbit, len); break;
+        case FIELD_BW:	out.write(bw >> endbit, len); break;
+        case FIELD_BX:	out.write(bx >> endbit, len); break;
+        case FIELD_BY:	out.write(by >> endbit, len); break;
+        case FIELD_BZ:	out.write(bz >> endbit, len); break;
+        default: nvUnreachable();
+        }
+    }
+}
+
+static bool read_header(Bits &in, ComprEndpts endpts[NREGIONS_TWO], int &shapeindex, Pattern &p)
+{
+    // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
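+    // the mode field is 2 bits for modes 0x00 and 0x01, and 5 bits for everything else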
+    int mode = in.read(2);
+    if (mode != 0x00 && mode != 0x01)
+        mode = (in.read(3) << 2) | mode;
+
+    int pat_index = mode_to_pat[mode];
+
+    if (pat_index == -2)
+        return false;		// reserved mode found
+
+    nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
+    nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
+
+    p = patterns[pat_index];
+
+    int d;
+    int rw, rx, ry, rz;
+    int gw, gx, gy, gz;
+    int bw, bx, by, bz;
+
+    d = 0;
+    rw = rx = ry = rz = 0;
+    gw = gx = gy = gz = 0;
+    bw = bx = by = bz = 0;
+
+    int ptr = int(strlen(p.encoding));
+
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+		// !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        switch(field)
+        {
+        case FIELD_M:	break;	// already processed so ignore
+        case FIELD_D:	 d |= in.read(len) << endbit; break;
+        case FIELD_RW:	rw |= in.read(len) << endbit; break;
+        case FIELD_RX:	rx |= in.read(len) << endbit; break;
+        case FIELD_RY:	ry |= in.read(len) << endbit; break;
+        case FIELD_RZ:	rz |= in.read(len) << endbit; break;
+        case FIELD_GW:	gw |= in.read(len) << endbit; break;
+        case FIELD_GX:	gx |= in.read(len) << endbit; break;
+        case FIELD_GY:	gy |= in.read(len) << endbit; break;
+        case FIELD_GZ:	gz |= in.read(len) << endbit; break;
+        case FIELD_BW:	bw |= in.read(len) << endbit; break;
+        case FIELD_BX:	bx |= in.read(len) << endbit; break;
+        case FIELD_BY:	by |= in.read(len) << endbit; break;
+        case FIELD_BZ:	bz |= in.read(len) << endbit; break;
+        default: nvUnreachable();
+        }
+    }
+
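+    // header is 82 bits: the 128-bit block minus 46 index bits (16 pixels * 3 bits - 2 anchor bits)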
+    nvDebugCheck (in.getptr() == 128 - 46);
+
+    shapeindex = d;
+    endpts[0].A[0] = rw; endpts[0].B[0] = rx; endpts[1].A[0] = ry; endpts[1].B[0] = rz;
+    endpts[0].A[1] = gw; endpts[0].B[1] = gx; endpts[1].A[1] = gy; endpts[1].B[1] = gz;
+    endpts[0].A[2] = bw; endpts[0].B[2] = bx; endpts[1].A[2] = by; endpts[1].B[2] = bz;
+
+    return true;
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+    int positions[NREGIONS_TWO];
+
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+        positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        int x = POS_TO_X(pos);
+        int y = POS_TO_Y(pos);
+
+        bool match = false;
+
+        for (int r = 0; r < NREGIONS_TWO; ++r)
+            if (positions[r] == pos) { match = true; break; }
+
+        out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+    }
+}
+
+static void emit_block(const ComprEndpts compr_endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+    Bits out(block, ZOH::BITSIZE);
+
+    write_header(compr_endpts, shapeindex, p, out);
+
+    write_indices(indices, shapeindex, out);
+
+    nvDebugCheck(out.getptr() == ZOH::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
+{
+    // scale endpoints
+    int a, b;			// really need an IntVector3...
+
+    a = Utils::unquantize(endpts.A[0], prec);
+    b = Utils::unquantize(endpts.B[0], prec);
+
+    // interpolate
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+
+    a = Utils::unquantize(endpts.A[1], prec);
+    b = Utils::unquantize(endpts.B[1], prec);
+
+    // interpolate
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+
+    a = Utils::unquantize(endpts.A[2], prec);
+    b = Utils::unquantize(endpts.B[2], prec);
+
+    // interpolate
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+    int positions[NREGIONS_TWO];
+
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+        positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        int x = POS_TO_X(pos);
+        int y = POS_TO_Y(pos);
+
+        bool match = false;
+
+        for (int r = 0; r < NREGIONS_TWO; ++r)
+            if (positions[r] == pos) { match = true; break; }
+
+        indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+    }
+}
+
+void ZOH::decompresstwo(const char *block, Tile &t)
+{
+    Bits in(block, ZOH::BITSIZE);
+
+    Pattern p;
+    IntEndpts endpts[NREGIONS_TWO];
+    ComprEndpts compr_endpts[NREGIONS_TWO];
+    int shapeindex;
+
+    if (!read_header(in, compr_endpts, shapeindex, p))
+    {
+        // reserved mode, return all zeroes
+        for (int y = 0; y < Tile::TILE_H; y++)
+            for (int x = 0; x < Tile::TILE_W; x++)
+                t.data[y][x] = Vector3(0.0f);
+
+        return;
+    }
+
+    decompress_endpts(compr_endpts, endpts, p);
+
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+        generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);
+
+    int indices[Tile::TILE_H][Tile::TILE_W];
+
+    read_indices(in, shapeindex, indices);
+
+    nvDebugCheck(in.getptr() == ZOH::BITSIZE);
+
+    // lookup
+    for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+        t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
+{
+    Vector3 palette[NINDICES];
+    float toterr = 0;
+    Vector3 err;
+
+    generate_palette_quantized(endpts, prec, palette);
+
+    for (int i = 0; i < np; ++i)
+    {
+        float err, besterr;
+
+        besterr = Utils::norm(colors[i], palette[0]) * importance[i];
+
+        for (int j = 1; j < NINDICES && besterr > 0; ++j)
+        {
+            err = Utils::norm(colors[i], palette[j]) * importance[i];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_TWO], int prec, 
+                           int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_TWO])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        generate_palette_quantized(endpts[region], prec, &palette[region][0]);
+        toterr[region] = 0;
+    }
+
+    Vector3 err;
+
+    for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+        int region = REGION(x,y,shapeindex);
+        float err, besterr;
+
+        besterr = Utils::norm(tile.data[y][x], palette[region][0]);
+        indices[y][x] = 0;
+
+        for (int i = 1; i < NINDICES && besterr > 0; ++i)
+        {
+            err = Utils::norm(tile.data[y][x], palette[region][i]);
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+            {
+                besterr = err;
+                indices[y][x] = i;
+            }
+        }
+        toterr[region] += besterr;
+    }
+}
+
+static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
+                          float old_err, int do_b)
+{
+    // we have the old endpoints: old_endpts
+    // we have the perturbed endpoints: new_endpts
+    // we have the temporary endpoints: temp_endpts
+
+    IntEndpts temp_endpts;
+    float min_err = old_err;		// start with the best current error
+    int beststep;
+
+    // copy real endpoints so we can perturb them
+    for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
+
+    // do a logarithmic search for the best error for this endpoint (A if do_b == 0, else B)
+    for (int step = 1 << (prec-1); step; step >>= 1)
+    {
+        bool improved = false;
+        for (int sign = -1; sign <= 1; sign += 2)
+        {
+            if (do_b == 0)
+            {
+                temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+                if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+                    continue;
+            }
+            else
+            {
+                temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+                if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+                    continue;
+            }
+
+            float err = map_colors(colors, importance, np, temp_endpts, prec);
+
+            if (err < min_err)
+            {
+                improved = true;
+                min_err = err;
+                beststep = sign * step;
+            }
+        }
+        // if this was an improvement, move the endpoint and continue search from there
+        if (improved)
+        {
+            if (do_b == 0)
+                new_endpts.A[ch] += beststep;
+            else
+                new_endpts.B[ch] += beststep;
+        }
+    }
+    return min_err;
+}
+
+static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
+{
+    float opt_err = orig_err;
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        opt_endpts.A[ch] = orig_endpts.A[ch];
+        opt_endpts.B[ch] = orig_endpts.B[ch];
+    }
+    /*
+        err0 = perturb(rgb0, delta0)
+        err1 = perturb(rgb1, delta1)
+        if (err0 < err1)
+            if (err0 >= initial_error) break
+            rgb0 += delta0
+            next = 1
+        else
+            if (err1 >= initial_error) break
+            rgb1 += delta1
+            next = 0
+        initial_err = map()
+        for (;;)
+            err = perturb(next ? rgb1:rgb0, delta)
+            if (err >= initial_err) break
+            next? rgb1 : rgb0 += delta
+            initial_err = err
+    */
+    IntEndpts new_a, new_b;
+    IntEndpts new_endpt;
+    int do_b;
+
+    // now optimize each channel separately
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        // figure out which endpoint when perturbed gives the most improvement and start there
+        // if we just alternate, we can easily end up in a local minima
+        float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1);	// perturb endpt B
+
+        if (err0 < err1)
+        {
+            if (err0 >= opt_err)
+                continue;
+
+            opt_endpts.A[ch] = new_a.A[ch];
+            opt_err = err0;
+            do_b = 1;		// do B next
+        }
+        else
+        {
+            if (err1 >= opt_err)
+                continue;
+            opt_endpts.B[ch] = new_b.B[ch];
+            opt_err = err1;
+            do_b = 0;		// do A next
+        }
+
+        // now alternate endpoints and keep trying until there is no improvement
+        for (;;)
+        {
+            float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
+            if (err >= opt_err)
+                break;
+            if (do_b == 0)
+                opt_endpts.A[ch] = new_endpt.A[ch];
+            else
+                opt_endpts.B[ch] = new_endpt.B[ch];
+            opt_err = err;
+            do_b = 1 - do_b;	// now move the other endpoint
+        }
+    }
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_TWO], 
+                            const IntEndpts orig_endpts[NREGIONS_TWO], int prec, IntEndpts opt_endpts[NREGIONS_TWO])
+{
+    Vector3 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+    float err = 0;
+
+    for (int region=0; region<NREGIONS_TWO; ++region)
+    {
+        // collect the pixels in the region
+        int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++)
+            for (int x = 0; x < tile.size_x; x++)
+                if (REGION(x,y,shapeindex) == region)
+                {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    ++np;
+                }
+
+        optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
+    }
+}
+
+/* optimization algorithm
+    for each pattern
+        convert endpoints using pattern precision
+        assign indices and get initial error
+        compress indices (and possibly reorder endpoints)
+        transform endpoints
+        if transformed endpoints fit pattern
+            get original endpoints back
+            optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+            compress new indices
+            transform new endpoints
+            if new endpoints fit pattern AND if error is improved
+                emit compressed block with new data
+            else
+                emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+float ZOH::refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block)
+{
+    float orig_err[NREGIONS_TWO], opt_err[NREGIONS_TWO], orig_toterr, opt_toterr;
+    IntEndpts orig_endpts[NREGIONS_TWO], opt_endpts[NREGIONS_TWO];
+    ComprEndpts compr_orig[NREGIONS_TWO], compr_opt[NREGIONS_TWO];
+    int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+    for (int sp = 0; sp < NPATTERNS; ++sp)
+    {
+        // precisions for all channels need to be the same
+        for (int i=1; i<NCHANNELS; ++i) nvDebugCheck (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);
+
+        quantize_endpts(endpts, patterns[sp].chan[0].prec[0], orig_endpts);
+        assign_indices(tile, shapeindex_best, orig_endpts, patterns[sp].chan[0].prec[0], orig_indices, orig_err);
+        swap_indices(orig_endpts, orig_indices, shapeindex_best);
+        compress_endpts(orig_endpts, compr_orig, patterns[sp]);
+        if (endpts_fit(orig_endpts, compr_orig, patterns[sp]))
+        {
+            optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, patterns[sp].chan[0].prec[0], opt_endpts);
+            assign_indices(tile, shapeindex_best, opt_endpts, patterns[sp].chan[0].prec[0], opt_indices, opt_err);
+            swap_indices(opt_endpts, opt_indices, shapeindex_best);
+            compress_endpts(opt_endpts, compr_opt, patterns[sp]);
+            orig_toterr = opt_toterr = 0;
+            for (int i=0; i < NREGIONS_TWO; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+            if (endpts_fit(opt_endpts, compr_opt, patterns[sp]) && opt_toterr < orig_toterr)
+            {
+                emit_block(compr_opt, shapeindex_best, patterns[sp], opt_indices, block);
+                return opt_toterr;
+            }
+            else
+            {
+                // either it stopped fitting when we optimized it, or there was no improvement
+                // so go back to the unoptimized endpoints which we know will fit
+                emit_block(compr_orig, shapeindex_best, patterns[sp], orig_indices, block);
+                return orig_toterr;
+            }
+        }
+    }
+    nvAssert(false); // throw "No candidate found, should never happen (refinetwo).";
+    return FLT_MAX;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_TWO], Vector3 palette[NREGIONS_TWO][NINDICES])
+{
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+            palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_TWO])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+
+    generate_palette_unquantized(endpts, palette);
+
+    float toterr = 0;
+    Vector3 err;
+
+    for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+        int region = REGION(x,y,shapeindex);
+        float err, besterr;
+
+        besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
+
+        for (int i = 1; i < NINDICES && besterr > 0; ++i)
+        {
+            err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+float ZOH::roughtwo(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_TWO])
+{
+    for (int region=0; region<NREGIONS_TWO; ++region)
+    {
+        int np = 0;
+        Vector3 colors[Tile::TILE_TOTAL];
+        Vector3 mean(0,0,0);
+
+        for (int y = 0; y < tile.size_y; y++)
+            for (int x = 0; x < tile.size_x; x++)
+                if (REGION(x,y,shapeindex) == region)
+                {
+                    colors[np] = tile.data[y][x];
+                    mean += tile.data[y][x];
+                    ++np;
+                }
+
+        // handle simple cases
+        if (np == 0)
+        {
+            Vector3 zero(0,0,0);
+            endpts[region].A = zero;
+            endpts[region].B = zero;
+            continue;
+        }
+        else if (np == 1)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[0];
+            continue;
+        }
+        else if (np == 2)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[1];
+            continue;
+        }
+
+        mean /= float(np);
+
+        Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+        // project each pixel value along the principal direction
+        float minp = FLT_MAX, maxp = -FLT_MAX;
+        for (int i = 0; i < np; i++)
+        {
+            float dp = dot(colors[i]-mean, direction);
+            if (dp < minp) minp = dp;
+            if (dp > maxp) maxp = dp;
+        }
+
+        // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+        endpts[region].A = mean + minp*direction;
+        endpts[region].B = mean + maxp*direction;
+
+        // clamp endpoints
+        // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+        // shape based on endpoints being clamped
+        Utils::clamp(endpts[region].A);
+        Utils::clamp(endpts[region].B);
+    }
+
+    return map_colors(tile, shapeindex, endpts);
+}
+
+float ZOH::compresstwo(const Tile &t, char *block)
+{
+    int shapeindex_best = 0;
+    FltEndpts endptsbest[NREGIONS_TWO], tempendpts[NREGIONS_TWO];
+    float msebest = FLT_MAX;
+
+    /*
+    collect the mse values that are within 5% of the best values
+    optimize each one and choose the best
+    */
+    // hack for now -- just use the best value WORK
+    for (int i=0; i<NSHAPES && msebest>0.0; ++i)
+    {
+        float mse = roughtwo(t, i, tempendpts);
+        if (mse < msebest)
+        {
+            msebest = mse;
+            shapeindex_best = i;
+            memcpy(endptsbest, tempendpts, sizeof(endptsbest));
+        }
+
+    }
+    return refinetwo(t, shapeindex_best, endptsbest, block);
+}
+

+ 264 - 0
3rdparty/nvtt/bc7/avpcl.cpp

@@ -0,0 +1,264 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// the avpcl compressor and decompressor
+
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include <cstring>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+// global flags
+bool AVPCL::flag_premult = false;
+bool AVPCL::flag_nonuniform = false;
+bool AVPCL::flag_nonuniform_ati = false;
+
+// global mode
+bool AVPCL::mode_rgb = false;		// true if image had constant alpha = 255
+
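+// try all eight BC7 modes and keep the candidate block with the lowest error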
+void AVPCL::compress(const Tile &t, char *block)
+{
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	float mse_mode0 = AVPCL::compress_mode0(t, tempblock);		if(mse_mode0 < msebest) { msebest = mse_mode0; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode1 = AVPCL::compress_mode1(t, tempblock);		if(mse_mode1 < msebest) { msebest = mse_mode1; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode2 = AVPCL::compress_mode2(t, tempblock);		if(mse_mode2 < msebest) { msebest = mse_mode2; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode3 = AVPCL::compress_mode3(t, tempblock);		if(mse_mode3 < msebest) { msebest = mse_mode3; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode4 = AVPCL::compress_mode4(t, tempblock);		if(mse_mode4 < msebest) { msebest = mse_mode4; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode5 = AVPCL::compress_mode5(t, tempblock);		if(mse_mode5 < msebest) { msebest = mse_mode5; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode6 = AVPCL::compress_mode6(t, tempblock);		if(mse_mode6 < msebest) { msebest = mse_mode6; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode7 = AVPCL::compress_mode7(t, tempblock);		if(mse_mode7 < msebest) { msebest = mse_mode7; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+		
+	/*if (errfile)
+	{
+		float errs[21];
+		int nerrs = 8;
+		errs[0] = mse_mode0; 
+		errs[1] = mse_mode1; 
+		errs[2] = mse_mode2; 
+		errs[3] = mse_mode3; 
+		errs[4] = mse_mode4; 
+		errs[5] = mse_mode5; 
+		errs[6] = mse_mode6; 
+		errs[7] = mse_mode7;
+		if (fwrite(errs, sizeof(float), nerrs, errfile) != nerrs)
+			throw "Write error on error file";
+	}*/
+}
+
+/*
+static int getbit(char *b, int start)
+{
+	if (start < 0 || start >= 128) return 0; // out of range
+
+	int ix = start >> 3;
+	return (b[ix] & (1 << (start & 7))) != 0;
+}
+
+static int getbits(char *b, int start, int len)
+{
+	int out = 0;
+	for (int i=0; i<len; ++i)
+		out |= getbit(b, start+i) << i;
+	return out;
+}
+
+static void setbit(char *b, int start, int bit)
+{
+	if (start < 0 || start >= 128) return; // out of range
+
+	int ix = start >> 3;
+
+	if (bit & 1)
+		b[ix] |= (1 << (start & 7));
+	else
+		b[ix] &= ~(1 << (start & 7));
+}
+
+static void setbits(char *b, int start, int len, int bits)
+{
+	for (int i=0; i<len; ++i)
+		setbit(b, start+i, bits >> i);
+}
+*/
+
+void AVPCL::decompress(const char *cblock, Tile &t)
+{
+	char block[AVPCL::BLOCKSIZE];
+	memcpy(block, cblock, AVPCL::BLOCKSIZE);
+
+	switch(getmode(block))
+	{
+	case 0:	AVPCL::decompress_mode0(block, t);	break;
+	case 1:	AVPCL::decompress_mode1(block, t);	break;
+	case 2:	AVPCL::decompress_mode2(block, t);	break;
+	case 3:	AVPCL::decompress_mode3(block, t);	break;
+	case 4:	AVPCL::decompress_mode4(block, t);	break;
+	case 5:	AVPCL::decompress_mode5(block, t);	break;
+	case 6:	AVPCL::decompress_mode6(block, t);	break;
+	case 7:	AVPCL::decompress_mode7(block, t);	break;
+	case 8: // return a black tile if you get a reserved mode
+		for (int y=0; y<Tile::TILE_H; ++y)
+			for (int x=0; x<Tile::TILE_W; ++x)
+				t.data[y][x].set(0, 0, 0, 0);
+		break;
+	default: nvUnreachable();
+	}
+}
+
+/*
+void AVPCL::compress(string inf, string avpclf, string errf)
+{
+	Array2D<RGBA> pixels;
+	int w, h;
+	char block[AVPCL::BLOCKSIZE];
+
+	Targa::read(inf, pixels, w, h);
+	FILE *avpclfile = fopen(avpclf.c_str(), "wb");
+	if (avpclfile == NULL) throw "Unable to open .avpcl file for write";
+	FILE *errfile = NULL;
+	if (errf != "")
+	{
+		errfile = fopen(errf.c_str(), "wb");
+		if (errfile == NULL) throw "Unable to open error file for write";
+	}
+
+	// Look at alpha channel and override the premult flag if alpha is constant (but only if premult is set)
+	if (AVPCL::flag_premult)
+	{
+		if (AVPCL::mode_rgb)
+		{
+			AVPCL::flag_premult = false;
+			cout << endl << "NOTE: Source image alpha is constant 255, turning off premultiplied-alpha error metric." << endl << endl;
+		}
+	}
+
+	// stuff for progress bar O.o
+	int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
+	int tilecnt = 0;
+	clock_t start, prev, cur;
+
+	start = prev = clock();
+
+	// convert to tiles and compress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			if ((tilecnt%100) == 0) { cur = clock(); printf("Progress %d of %d, %5.2f seconds per 100 tiles\r", tilecnt, ntiles, float(cur-prev)/CLOCKS_PER_SEC); fflush(stdout); prev = cur; }
+
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			t.insert(pixels, x, y);
+
+			AVPCL::compress(t, block, errfile);
+			if (fwrite(block, sizeof(char), AVPCL::BLOCKSIZE, avpclfile) != AVPCL::BLOCKSIZE)
+				throw "File error on write";
+
+			// progress bar
+			++tilecnt;
+		}
+	}
+
+	cur = clock();
+	printf("\nTotal time to compress: %.2f seconds\n\n", float(cur-start)/CLOCKS_PER_SEC);		// advance to next line finally
+
+	if (fclose(avpclfile)) throw "Close failed on .avpcl file";
+	if (errfile && fclose(errfile)) throw "Close failed on error file";
+}
+
+static int str2int(std::string s) 
+{
+	int thing;
+	std::stringstream str (stringstream::in | stringstream::out);
+	str << s;
+	str >> thing;
+	return thing;
+}
+
+// avpcl file name is ...-w-h-RGB[A].avpcl, extract width and height
+static void extract(string avpclf, int &w, int &h, bool &mode_rgb)
+{
+	size_t n = avpclf.rfind('.', avpclf.length()-1);
+	size_t n1 = avpclf.rfind('-', n-1);
+	size_t n2 = avpclf.rfind('-', n1-1);
+	size_t n3 = avpclf.rfind('-', n2-1);
+	//	...-wwww-hhhh-RGB[A].avpcl
+	//     ^    ^    ^      ^
+	//     n3   n2   n1     n n3<n2<n1<n
+	string width = avpclf.substr(n3+1, n2-n3-1);
+	w = str2int(width);
+	string height = avpclf.substr(n2+1, n1-n2-1);
+	h = str2int(height);
+	string mode = avpclf.substr(n1+1, n-n1-1);
+	mode_rgb = mode == "RGB";
+}
+
+static int modehist[8];
+
+static void stats(char block[AVPCL::BLOCKSIZE])
+{
+	int m = AVPCL::getmode(block);
+	modehist[m]++;
+}
+
+static void printstats()
+{
+	printf("\nMode histogram: "); for (int i=0; i<8; ++i) { printf("%d,", modehist[i]); }
+	printf("\n");
+}
+
+void AVPCL::decompress(string avpclf, string outf)
+{
+	Array2D<RGBA> pixels;
+	int w, h;
+	char block[AVPCL::BLOCKSIZE];
+
+	extract(avpclf, w, h, AVPCL::mode_rgb);
+	FILE *avpclfile = fopen(avpclf.c_str(), "rb");
+	if (avpclfile == NULL) throw "Unable to open .avpcl file for read";
+	pixels.resizeErase(h, w);
+
+	// convert to tiles and decompress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			if (fread(block, sizeof(char), AVPCL::BLOCKSIZE, avpclfile) != AVPCL::BLOCKSIZE)
+				throw "File error on read";
+
+			stats(block);	// collect statistics
+		
+			AVPCL::decompress(block, t);
+
+			t.extract(pixels, x, y);
+		}
+	}
+	if (fclose(avpclfile)) throw "Close failed on .avpcl file";
+
+	Targa::write(outf, pixels, w, h);
+
+	printstats();	// print statistics
+}
+*/

+ 99 - 0
3rdparty/nvtt/bc7/avpcl.h

@@ -0,0 +1,99 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_H
+#define _AVPCL_H
+
+#include "tile.h"
+#include "bits.h"
+
+#define	DISABLE_EXHAUSTIVE	1	// define this if you don't want to spend a lot of time on exhaustive compression
+#define	USE_ZOH_INTERP		1	// use zoh interpolator, otherwise use exact avpcl interpolators
+#define	USE_ZOH_INTERP_ROUNDED 1	// use the rounded versions!
+
+namespace AVPCL {
+
+static const int NREGIONS_TWO	= 2;
+static const int NREGIONS_THREE	= 3;
+
+static const int BLOCKSIZE=16;
+static const int BITSIZE=128;
+
+// global flags
+extern bool flag_premult;
+extern bool flag_nonuniform;
+extern bool flag_nonuniform_ati;
+
+// global mode
+extern bool mode_rgb;		// true if image had constant alpha = 255
+
+void compress(const Tile &t, char *block);
+void decompress(const char *block, Tile &t);
+
+float compress_mode0(const Tile &t, char *block);
+void decompress_mode0(const char *block, Tile &t);
+
+float compress_mode1(const Tile &t, char *block);
+void decompress_mode1(const char *block, Tile &t);
+
+float compress_mode2(const Tile &t, char *block);
+void decompress_mode2(const char *block, Tile &t);
+
+float compress_mode3(const Tile &t, char *block);
+void decompress_mode3(const char *block, Tile &t);
+
+float compress_mode4(const Tile &t, char *block);
+void decompress_mode4(const char *block, Tile &t);
+
+float compress_mode5(const Tile &t, char *block);
+void decompress_mode5(const char *block, Tile &t);
+
+float compress_mode6(const Tile &t, char *block);
+void decompress_mode6(const char *block, Tile &t);
+
+float compress_mode7(const Tile &t, char *block);
+void decompress_mode7(const char *block, Tile &t);
+
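+// the BC7 mode number m is stored as a unary prefix: m zero bits followed by a one bit;
+// a first byte of all zeros marks a reserved block, reported here as mode 8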
+inline int getmode(Bits &in)
+{
+	int mode = 0;
+
+	if (in.read(1))			mode = 0;
+	else if (in.read(1))	mode = 1;
+	else if (in.read(1))	mode = 2;
+	else if (in.read(1))	mode = 3;
+	else if (in.read(1))	mode = 4;
+	else if (in.read(1))	mode = 5;
+	else if (in.read(1))	mode = 6;
+	else if (in.read(1))	mode = 7;
+	else mode = 8;	// reserved
+	return mode;
+}
+inline int getmode(const char *block)
+{
+	int bits = block[0], mode = 0;
+
+	if (bits & 1) mode = 0;
+	else if ((bits&3) == 2) mode = 1;
+	else if ((bits&7) == 4) mode = 2;
+	else if ((bits & 0xF) == 8) mode = 3;
+	else if ((bits & 0x1F) == 16) mode = 4;
+	else if ((bits & 0x3F) == 32) mode = 5;
+	else if ((bits & 0x7F) == 64) mode = 6;
+	else if ((bits & 0xFF) == 128) mode = 7;
+	else mode = 8;	// reserved
+	return mode;
+}
+
+}
+
+#endif

+ 1066 - 0
3rdparty/nvtt/bc7/avpcl_mode0.cpp

@@ -0,0 +1,1066 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+//  x1		444.1x6 16p 45b (3bi)
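+//  i.e. 3 subsets, RGB 4.4.4 endpoints each with a shared lsb (6 endpoints total),
+//  16 partition choices (4 shape bits), 45 index bits at 3 bits per index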
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+#include "shapes_three.h"
+
+// use only the first 16 available shapes
+#undef NSHAPES
+#undef SHAPEBITS
+#define NSHAPES 16
+#define SHAPEBITS 4
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	8
+#define	INDEXBITS	3
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+    const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red			green			blue			xfm	mode  mb
+	4,4,4,4,4,4,	4,4,4,4,4,4,	4,4,4,4,4,4,	0,	0x1, 1, "",	// really 444.1 x 6
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGB];
+	int endpt_b_prec[NCHANNELS_RGB];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	4,4,4, 4,4,4, 4,4,4, 4,4,4, 4,4,4, 4,4,4, 
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
+
+static void transform_forward(IntEndptsRGB_2 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+static void transform_inverse(IntEndptsRGB_2 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+// endpoints are 555,555; reduce to 444,444 and put the lsb bit majority in compr_bits
+static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_2& compr_endpts)
+{
+	int onescnt;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		onescnt += endpts.A[j] & 1;
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		nvAssert (compr_endpts.A[j] < 16);
+	}
+	compr_endpts.a_lsb = onescnt >= 2;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		onescnt += endpts.B[j] & 1;
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		nvAssert (compr_endpts.B[j] < 16);
+	}
+	compr_endpts.b_lsb = onescnt >= 2;
+}
+
+static void uncompress_one(const IntEndptsRGB_2& compr_endpts, IntEndptsRGB& endpts)
+{
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		endpts.A[j] = (compr_endpts.A[j] << 1) | compr_endpts.a_lsb;
+		endpts.B[j] = (compr_endpts.B[j] << 1) | compr_endpts.b_lsb;
+	}
+}
+
+static void uncompress_endpoints(const IntEndptsRGB_2 compr_endpts[NREGIONS], IntEndptsRGB endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		uncompress_one(compr_endpts[i], endpts[i]);
+}
+
+static void compress_endpoints(const IntEndptsRGB endpts[NREGIONS], IntEndptsRGB_2 compr_endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		compress_one(endpts[i], compr_endpts[i]);
+}
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGB_2 q_endpts[NREGIONS])
+{
+	IntEndptsRGB full_endpts[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		full_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+		full_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]+1);
+		full_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]+1);
+		full_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]+1);
+		full_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]+1);
+		full_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]+1);
+		compress_one(full_endpts[region], q_endpts[region]);
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
+static void swap_indices(IntEndptsRGB_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+		int x = POS_TO_X(position);
+		int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGB; ++i) 
+			{
+				t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
+			}
+			t = endpts[region].a_lsb; endpts[region].a_lsb = endpts[region].b_lsb; endpts[region].b_lsb = t;
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB_2 endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		out.write(endpts[i].a_lsb, 1);
+		out.write(endpts[i].b_lsb, 1);
+	}
+
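+	// header is 83 bits: 1 mode bit + 4 shape bits + 72 endpoint bits (3 regions * 2 endpoints * 3 channels * 4 bits) + 6 lsb bits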
+	nvAssert (out.getptr() == 83);
+}
+
+static void read_header(Bits &in, IntEndptsRGB_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+	
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		endpts[i].a_lsb  = in.read(1);
+		endpts[i].b_lsb  = in.read(1);
+	}
+
+	nvAssert (in.getptr() == 83);
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void emit_block(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGB_2 &endpts_2, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	IntEndptsRGB endpts;
+
+	uncompress_one(endpts_2, endpts);
+
+	// scale endpoints
+	int a, b;			// really need an IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	// constant alpha
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = 255.0f;
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGB_2 endpts[NREGIONS])
+{
+	nvUnreachable();
+}
+
+void AVPCL::decompress_mode0(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGB_2 endpts[NREGIONS];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB_2 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			float err = Utils::metric4(colors[i], palette[j]) * importance[i];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, 
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB_2 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (A or B, selected by do_b)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error, the more time it is worth spending on an exhaustive search.
+// perturb the endpoints over a range of at least -3 to +3.
+// if err > 5000, perturb endpoints over 50% of their precision range
+// if err > 1000, over 25%
+// if err > 200,  over 12.5%
+// if err > 40,   over 6.25%
+// these thresholds assume np = 16 -- below they are scaled as a function of np
+// always ensure endpoint ordering is preserved (so there is no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
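+// (illustrative arithmetic, not part of the original source: with a hypothetical 5-bit endpoint
+// precision, (1 << aprec) is 32, so the four thresholds below give scan half-widths of 16, 8, 4 and 2,
+// and the max() with 3 then widens the smallest case back up to the -3..+3 minimum.)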
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB_2 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB_2 &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
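+	// (clarifying note, not part of the original source: in the code below, the pseudocode's rgb0/rgb1
+	// correspond to opt_endpts.A/B for the current channel, and "perturb" is perturb_one() above.)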
+	IntEndptsRGB_2 new_a, new_b;
+	IntEndptsRGB_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minimum
+        float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	// note this is independent of the search above, so we don't care about the indices it produced;
+	// if a later improvement produces different indices, we simply restart this pass at ch = 0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+// this will return a valid set of endpoints in opt_endpts regardless of whether it improves on orig_endpts or not
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							const IntEndptsRGB_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB_2 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGB_2 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.a_lsb = lsbmode & 1;
+			temp_in.b_lsb = (lsbmode >> 1) & 1;
+
+			// make sure we have a valid error for temp_in
+			// we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts
+			// (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position)
+			float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+			float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+    nvAssert(false); // throw "No candidate found, should never happen (mode avpcl 0).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+// for this mode, we assume alpha = 255 constant and compress only the RGB portion.
+// however, we do the error check against the actual alpha values supplied for the tile.
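+// (clarifying note, not part of the original source: generate_palette_quantized() above forces the
+// palette alpha to 255, while the error metric still operates on the full Vector4, so tiles whose
+// actual alpha differs from 255 are penalized accordingly.)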
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[2];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				if (np < 2) alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*Vector4(direction, 0);
+		endpts[region].B = mean + maxp*Vector4(direction, 0);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	float t = list1[i]; list1[i] = list1[j]; list1[j] = t;
+	int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1;
+}
+
+float AVPCL::compress_mode0(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+

+ 1047 - 0
3rdparty/nvtt/bc7/avpcl_mode1.cpp

@@ -0,0 +1,1047 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x10	(666x2).1 (666x2).1 64p 3bi
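+// (interpretation of the shorthand above, not part of the original source: "x10" denotes the 2-bit
+// mode field (value 0x2 in the pattern table below); "(666x2).1" means a subset's two RGB endpoints
+// get 6 bits per channel plus one shared lsb (p-bit) per subset; "64p" means 64 partitions, i.e. a
+// 6-bit shape index; "3bi" means 3-bit palette indices.)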
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+#include "shapes_two.h"
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	2		// number of different lsb modes per region. since we have one .1 per region, that can have 2 values
+
+#define NINDICES	8
+#define	INDEXBITS	3
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		xfm	mode  mb
+	6,6,6,6,	6,6,6,6,	6,6,6,6,	0,	0x2, 2, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGB];
+	int endpt_b_prec[NCHANNELS_RGB];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	6,6,6, 6,6,6, 6,6,6, 6,6,6,	
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
+
+
+static void transform_forward(IntEndptsRGB_1 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+static void transform_inverse(IntEndptsRGB_1 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+// endpoints are 777,777; reduce to 666,666 and store the majority vote of the dropped lsbs in the shared lsb bit
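+// (illustrative example, not part of the original source: if the six dropped low bits of A.rgb and
+// B.rgb are 1,0,1 and 1,0,0, onescnt is 3, so the shared lsb is set; uncompress_one() then restores
+// that bit to every channel of both endpoints.)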
+static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_1& compr_endpts)
+{
+	int onescnt;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		onescnt += endpts.A[j] & 1;
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		onescnt += endpts.B[j] & 1;
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		nvAssert (compr_endpts.A[j] < 64);
+		nvAssert (compr_endpts.B[j] < 64);
+	}
+	compr_endpts.lsb = onescnt >= 3;
+}
+
+static void uncompress_one(const IntEndptsRGB_1& compr_endpts, IntEndptsRGB& endpts)
+{
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		endpts.A[j] = (compr_endpts.A[j] << 1) | compr_endpts.lsb;
+		endpts.B[j] = (compr_endpts.B[j] << 1) | compr_endpts.lsb;
+	}
+}
+
+static void uncompress_endpoints(const IntEndptsRGB_1 compr_endpts[NREGIONS], IntEndptsRGB endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		uncompress_one(compr_endpts[i], endpts[i]);
+}
+
+static void compress_endpoints(const IntEndptsRGB endpts[NREGIONS], IntEndptsRGB_1 compr_endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		compress_one(endpts[i], compr_endpts[i]);
+}
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGB_1 q_endpts[NREGIONS])
+{
+	IntEndptsRGB full_endpts[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		full_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+		full_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]+1);
+		full_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]+1);
+		full_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]+1);
+		full_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]+1);
+		full_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]+1);
+		compress_one(full_endpts[region], q_endpts[region]);
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
+static void swap_indices(IntEndptsRGB_1 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+		int x = POS_TO_X(position);
+		int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGB; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB_1 endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+
+static void write_header(const IntEndptsRGB_1 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)
+		out.write(endpts[i].lsb, 1);
+
+	nvAssert (out.getptr() == 82);
+}
+
+static void read_header(Bits &in, IntEndptsRGB_1 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)
+		endpts[i].lsb  = in.read(1);
+	
+	nvAssert (in.getptr() == 82);
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void emit_block(const IntEndptsRGB_1 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGB_1 &endpts_1, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	IntEndptsRGB endpts;
+
+	uncompress_one(endpts_1, endpts);
+
+	// scale endpoints
+	int a, b;			// really need an IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// note: don't simplify to a + ((b-a)*i + BIAS)/DENOM as that doesn't work due to the way C handles integer division of negatives
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	// constant alpha
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = 255.0f;
+}
+
+// sign extend but only if it was transformed
+static void sign_extend(Pattern &p, IntEndptsRGB_1 endpts[NREGIONS])
+{
+	nvUnreachable();
+}
+
+void AVPCL::decompress_mode1(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGB_1 endpts[NREGIONS];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB_1 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			float err = Utils::metric4(colors[i], palette[j]) * importance[i];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_1 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB_1 &old_endpts, IntEndptsRGB_1 &new_endpts, 
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB_1 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (A or B, selected by do_b)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error, the more time it is worth spending on an exhaustive search.
+// perturb the endpoints over a range of at least -3 to +3.
+// if err > 5000, perturb endpoints over 50% of their precision range
+// if err > 1000, over 25%
+// if err > 200,  over 12.5%
+// if err > 40,   over 6.25%
+// these thresholds assume np = 16 -- below they are scaled as a function of np
+// always ensure endpoint ordering is preserved (so there is no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGB_1 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB_1 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB_1 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB_1 &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB_1 new_a, new_b;
+	IntEndptsRGB_1 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minimum
+		float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	// note this is independent of the search above, so we don't care about the indices it produced;
+	// if a later improvement produces different indices, we simply restart this pass at ch = 0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							IntEndptsRGB_1 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB_1 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGB_1 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.lsb = lsbmode;
+
+			// make sure we have a valid error for temp_in
+			// we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts
+			// (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position)
+            float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+			float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
+
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB_1 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			//nvAssert(opt_toterr <= orig_toterr);
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 1).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			float err = Utils::metric4(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[2];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				if (np < 2) alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*Vector4(direction, 0);
+		endpts[region].B = mean + maxp*Vector4(direction, 0);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	float t = list1[i]; list1[i] = list1[j]; list1[j] = t;
+	int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1;
+}
+
+float AVPCL::compress_mode1(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
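+	// (illustrative note, not part of the original source: this mode writes a 6-bit shape index,
+	// presumably 64 two-subset shapes, so NSHAPES/4 would refine the 16 most promising shapes found
+	// by the rough pass.)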
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+

+ 1004 - 0
3rdparty/nvtt/bc7/avpcl_mode2.cpp

@@ -0,0 +1,1004 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x100 555x6 64p 2bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+#include "shapes_three.h"
+
+using namespace nv;
+using namespace AVPCL;
+
+#define NINDICES	4
+#define	INDEXBITS	2
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	6
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red			green			blue			xfm	mode  mb
+	5,5,5,5,5,5,	5,5,5,5,5,5,	5,5,5,5,5,5,	0,	0x4, 3, "",
+};
+
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGB];
+	int endpt_b_prec[NCHANNELS_RGB];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS_THREE];
+};
+
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	5,5,5, 5,5,5, 5,5,5, 5,5,5, 5,5,5, 5,5,5, 
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
+
+#define	R_0	ep[0].A[i]
+#define	R_1 ep[0].B[i]
+#define	R_2 ep[1].A[i]
+#define	R_3	ep[1].B[i]
+
+static void transform_forward(IntEndptsRGB ep[NREGIONS])
+{
+	for (int i=0; i<NCHANNELS_RGB; ++i)
+	{
+		R_1 -= R_3; R_2 -= R_3; R_0 -= R_3;
+	}
+}
+
+static void transform_inverse(IntEndptsRGB ep[NREGIONS])
+{
+	for (int i=0; i<NCHANNELS_RGB; ++i)
+	{
+		R_0 += R_3; R_2 += R_3; R_1 += R_3;
+	}
+}
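Worth noting: the R_0..R_3 macros only reach ep[0] and ep[1], and this mode's single pattern is declared with transformed == 0, so neither transform_forward nor transform_inverse is ever invoked on the mode-2 path; they are presumably retained for symmetry with the delta-transformed modes. Where the transform is active, the forward step stores R_0, R_1 and R_2 as signed deltas against R_3 -- e.g. endpoints (10, 12, 9, 11) become (-1, 1, -2, 11) -- which is why decoding first sign-extends and then applies transform_inverse.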
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, IntEndptsRGB q_endpts[NREGIONS_THREE])
+{
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	{
+		q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]);
+		q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]);
+		q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]);
+		q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]);
+		q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]);
+		q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]);
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
+static void swap_indices(IntEndptsRGB endpts[NREGIONS_THREE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	{
+		int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+		int x = POS_TO_X(position);
+		int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGB; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
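A concrete example of the swap above: with 2-bit indices the high bit is 2, so if a region's anchor pixel lands on index 2 or 3, that region's endpoints are exchanged and every index i in the region is rewritten as 3 - i. The anchor then holds 0 or 1, its high bit is guaranteed to be 0, and write_indices below can drop that bit -- one bit saved per region without changing the decoded colors.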
+
+static bool endpts_fit(IntEndptsRGB endpts[NREGIONS_THREE], const Pattern &p)
+{
+	return true;
+}
+
+
+static void write_header(const IntEndptsRGB endpts[NREGIONS_THREE], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS_THREE; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[i*2+0]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[i*2+1]);
+		}
+	nvAssert (out.getptr() == 99);
+}
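The 99 in the assertion is this mode's fixed header budget; assuming SHAPEBITS == 6 (64 three-region partitions), it works out as

    mode (3) + shapeindex (6) + 3 regions x 2 endpoints x 3 channels x 5 bits (90) = 99 bits,

leaving 16 x 2 - 3 anchor bits = 29 index bits, for a 128-bit (AVPCL::BITSIZE) block in total.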
+
+static void read_header(Bits &in, IntEndptsRGB endpts[NREGIONS_THREE], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+
+	p = patterns[pat_index];
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS_THREE; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[i*2+0]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[i*2+1]);
+		}
+	nvAssert (in.getptr() == 99);
+}
+
+
+// WORK PLACEHOLDER -- keep it simple for now
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	int positions[NREGIONS_THREE];
+
+	for (int r = 0; r < NREGIONS_THREE; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS_THREE; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	int positions[NREGIONS_THREE];
+
+	for (int r = 0; r < NREGIONS_THREE; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS_THREE; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void emit_block(const IntEndptsRGB endpts[NREGIONS_THREE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGB &endpts, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	// scale endpoints
+	int a, b;			// really need an IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]); 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	// constant alpha
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = 255.0f;
+}
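As a worked example of the palette above, assuming Utils::lerp follows the standard BC7 2-bit interpolation weights {0, 21, 43, 64}, i.e. roughly (a*(64-w) + b*w + 32) >> 6: unquantized endpoints a = 0 and b = 255 yield the four per-channel palette entries {0, 84, 171, 255}, with a constant alpha of 255.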
+
+// sign extend but only if it was transformed
+static void sign_extend(Pattern &p, IntEndptsRGB endpts[NREGIONS_THREE])
+{
+	nvAssert (p.transformed != 0);
+
+	for (int i=0; i<NCHANNELS_RGB; ++i)
+	{
+		// endpts[0].A[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]);	// always positive here
+		endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[1]);
+		endpts[1].A[i] = SIGN_EXTEND(endpts[1].A[i], p.chan[i].nbitsizes[2]);
+		endpts[1].B[i] = SIGN_EXTEND(endpts[1].B[i], p.chan[i].nbitsizes[3]);
+		endpts[2].A[i] = SIGN_EXTEND(endpts[2].A[i], p.chan[i].nbitsizes[4]);
+		endpts[2].B[i] = SIGN_EXTEND(endpts[2].B[i], p.chan[i].nbitsizes[5]);
+	}
+}
+
+void AVPCL::decompress_mode2(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGB endpts[NREGIONS_THREE];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS_THREE][NINDICES];
+	for (int r = 0; r < NREGIONS_THREE; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			float err = Utils::metric4(colors[i], palette[j]) * importance[i];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_THREE])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS_THREE][NINDICES];
+
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB &old_endpts, IntEndptsRGB &new_endpts, 
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGB &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
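To put numbers on the thresholds above: with 5-bit endpoints (aprec = bprec = 5) and orig_err above 5000 * np/16, adelta = bdelta = (1 << 5)/2 = 16, so A is scanned over [max(0, A-16), min(31, A+16)] -- at most 33 candidate values -- and likewise for B; at the lowest tier the window shrinks to the +/-3 floor. Keeping the original A <= B (or B <= A) ordering means the two nested loops never evaluate a pair that merely swaps the endpoints.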
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB new_a, new_b;
+	IntEndptsRGB new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_THREE], 
+							const IntEndptsRGB orig_endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB opt_endpts[NREGIONS_THREE])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGB temp_in, temp_out;
+
+	for (int region=0; region<NREGIONS_THREE; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		// make sure we have a valid error for temp_in
+		// we didn't change temp_in, so orig_err[region] is still valid
+		float temp_in_err = orig_err[region];
+
+		// now try to optimize these endpoints
+		float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+		// if we find an improvement, update the best so far and correct the output endpoints and errors
+		if (temp_out_err < best_err)
+		{
+			best_err = temp_out_err;
+			opt_err[region] = temp_out_err;
+			opt_endpts[region] = temp_out;
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_THREE], char *block)
+{
+	float orig_err[NREGIONS_THREE], opt_err[NREGIONS_THREE], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB orig_endpts[NREGIONS_THREE], opt_endpts[NREGIONS_THREE];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS_THREE; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 2).";
+	return FLT_MAX;
+
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_THREE], Vector4 palette[NREGIONS_THREE][NINDICES])
+{
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_THREE])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS_THREE][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_THREE])
+{
+	for (int region=0; region<NREGIONS_THREE; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[2];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				if (np < 2) alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*Vector4(direction, 0);
+		endpts[region].B = mean + maxp*Vector4(direction, 0);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
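The principal direction above comes from Fit::computePrincipalComponent_EigenSolver, which solves a closed-form 3x3 eigen problem. As a minimal sketch of the same idea (illustrative only, not the library's implementation; assumes <math.h>), the dominant eigenvector of the region's color covariance can be approximated with a few power-iteration steps:

    // Estimate the dominant eigenvector of the 3x3 color covariance by power iteration.
    static void principal_direction_sketch(int np, const float colors[][3], float dir[3])
    {
        float mean[3] = {0, 0, 0};
        for (int i = 0; i < np; ++i)
            for (int c = 0; c < 3; ++c)
                mean[c] += colors[i][c] / float(np);

        float cov[3][3] = {};
        for (int i = 0; i < np; ++i)
            for (int r = 0; r < 3; ++r)
                for (int c = 0; c < 3; ++c)
                    cov[r][c] += (colors[i][r] - mean[r]) * (colors[i][c] - mean[c]);

        dir[0] = dir[1] = dir[2] = 1.0f;            // arbitrary nonzero start vector
        for (int iter = 0; iter < 8; ++iter)        // a handful of iterations is plenty for a 4x4 tile
        {
            float next[3] = {0, 0, 0};
            for (int r = 0; r < 3; ++r)
                for (int c = 0; c < 3; ++c)
                    next[r] += cov[r][c] * dir[c];
            float len = sqrtf(next[0]*next[0] + next[1]*next[1] + next[2]*next[2]);
            if (len == 0.0f) break;                 // degenerate region (all pixels identical)
            for (int c = 0; c < 3; ++c)
                dir[c] = next[c] / len;
        }
    }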
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	float t = list1[i]; list1[i] = list1[j]; list1[j] = t;
+	int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1;
+}
+
+float AVPCL::compress_mode2(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS_THREE];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+

+ 1059 - 0
3rdparty/nvtt/bc7/avpcl_mode3.cpp

@@ -0,0 +1,1059 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x1000 777.1x4 64p 2bi (30b)
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+#include "shapes_two.h"
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	4
+#define	INDEXBITS	2
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+#define	NREGIONS  2
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		xfm	mode  mb
+	7,7,7,7,	7,7,7,7,	7,7,7,7,	0,	0x8, 4, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGB];
+	int endpt_b_prec[NCHANNELS_RGB];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	7,7,7, 7,7,7, 7,7,7, 7,7,7,
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
+
+static void transform_forward(IntEndptsRGB_2 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+static void transform_inverse(IntEndptsRGB_2 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+// endpoints are 888,888; reduce to 777,777 and put the lsb-bit majority in the shared a_lsb/b_lsb fields of compr_endpts
+static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_2& compr_endpts)
+{
+	int onescnt;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		onescnt += endpts.A[j] & 1;
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		nvAssert (compr_endpts.A[j] < 128);
+	}
+	compr_endpts.a_lsb = onescnt >= 2;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		onescnt += endpts.B[j] & 1;
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		nvAssert (compr_endpts.B[j] < 128);
+	}
+	compr_endpts.b_lsb = onescnt >= 2;
+}
+
+static void uncompress_one(const IntEndptsRGB_2& compr_endpts, IntEndptsRGB& endpts)
+{
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		endpts.A[j] = (compr_endpts.A[j] << 1) | compr_endpts.a_lsb;
+		endpts.B[j] = (compr_endpts.B[j] << 1) | compr_endpts.b_lsb;
+	}
+}
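A quick round trip through compress_one/uncompress_one: an 8-bit endpoint A = (201, 100, 57) has channel LSBs {1, 0, 1}, so the majority vote sets a_lsb = 1 and the stored 7-bit values are (100, 50, 28); uncompress_one then reconstructs (201, 101, 57). Channels whose original LSB matched the majority come back exactly, the others end up off by one -- the price of sharing a single LSB (the BC7 p-bit) across the three channels of an endpoint.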
+
+static void uncompress_endpoints(const IntEndptsRGB_2 compr_endpts[NREGIONS], IntEndptsRGB endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		uncompress_one(compr_endpts[i], endpts[i]);
+}
+
+static void compress_endpoints(const IntEndptsRGB endpts[NREGIONS], IntEndptsRGB_2 compr_endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		compress_one(endpts[i], compr_endpts[i]);
+}
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGB_2 q_endpts[NREGIONS])
+{
+	IntEndptsRGB full_endpts[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		full_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+		full_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]+1);
+		full_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]+1);
+		full_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]+1);
+		full_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]+1);
+		full_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]+1);
+		compress_one(full_endpts[region], q_endpts[region]);
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
+static void swap_indices(IntEndptsRGB_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+		int x = POS_TO_X(position);
+		int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGB; ++i) 
+			{
+				t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
+			}
+			t = endpts[region].a_lsb; endpts[region].a_lsb = endpts[region].b_lsb; endpts[region].b_lsb = t;
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB_2 endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		out.write(endpts[i].a_lsb, 1);
+		out.write(endpts[i].b_lsb, 1);
+	}
+
+	nvAssert (out.getptr() == 98);
+}
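As with mode 2, the 98 asserted here is the fixed header budget; assuming SHAPEBITS == 6, it is

    mode (4) + shapeindex (6) + 2 regions x 2 endpoints x 3 channels x 7 bits (84) + 4 shared lsbs (p-bits) = 98 bits,

leaving 16 x 2 - 2 anchor bits = 30 index bits for a 128-bit block.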
+
+static void read_header(Bits &in, IntEndptsRGB_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+	
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		endpts[i].a_lsb  = in.read(1);
+		endpts[i].b_lsb  = in.read(1);
+	}
+
+	nvAssert (in.getptr() == 98);
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void emit_block(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGB_2 &endpts_2, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	IntEndptsRGB endpts;
+
+	uncompress_one(endpts_2, endpts);
+
+	// scale endpoints
+	int a, b;			// really need an IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	// constant alpha
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = 255.0f;
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGB_2 endpts[NREGIONS])
+{
+	nvUnreachable();
+}
+
+void AVPCL::decompress_mode3(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGB_2 endpts[NREGIONS];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB_2 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+            float err = Utils::metric4(colors[i], palette[j]) * importance[i];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, 
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB_2 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB_2 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB_2 &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB_2 new_a, new_b;
+	IntEndptsRGB_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+// this will return a valid set of endpoints in opt_endpts regardless of whether it improves orig_endpts or not
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							const IntEndptsRGB_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB_2 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGB_2 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.a_lsb = lsbmode & 1;
+			temp_in.b_lsb = (lsbmode >> 1) & 1;
+
+			// make sure we have a valid error for temp_in
+			// we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts
+			// (map_colors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position)
+            float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+            float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
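Note that perturb_one and exhaustive only walk the 7-bit A/B fields; the shared LSBs sit outside their search space. That is why the loop above re-seeds the optimization with each of the four (a_lsb, b_lsb) combinations -- 00, 01, 10, 11 -- recomputes a fresh baseline error with map_colors, and keeps whichever combination converges to the lowest error.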
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 3).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	v.w = 255.0f;	// this mode carries no alpha channel, so force it fully opaque
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
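+// note on the early exit in the index searches below: the palette entries lie in order along the segment from
+// endpoint A to endpoint B, so for a squared-error style metric the per-pixel error is unimodal in the index --
+// once the error starts increasing we have passed the minimum and can stop searching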
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[2];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				if (np < 2) alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*Vector4(direction, 0);
+		endpts[region].B = mean + maxp*Vector4(direction, 0);
+
+		// clamp endpoints
+		// the rationale for clamping here is that the actual endpoints will be clamped anyway, so the best
+		// shape should be chosen based on the clamped values
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	float t = list1[i]; list1[i] = list1[j]; list1[j] = t;
+	int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1;
+}
+
+float AVPCL::compress_mode3(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// partial selection sort -- we only need the smallest NITEMS errors placed in the first NITEMS slots
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+

+ 1214 - 0
3rdparty/nvtt/bc7/avpcl_mode4.cpp

@@ -0,0 +1,1214 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x10000 2r 1i 555x2 6x2 2bi 3bi
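+// (decoding the shorthand above: 5 mode bits, 2 rotation bits, 1 index-selection bit, two 5.5.5-bit RGB
+//  endpoints, two 6-bit alpha endpoints, then a 2-bit and a 3-bit index array; 5+2+1+30+12 = 50 header bits
+//  plus 31+47 index bits = 128 total, matching the getptr() asserts below)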
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+// there are 2 index arrays. INDEXMODE selects between the arrays being 2 & 3 bits or 3 & 2 bits
+// array 0 is always the RGB array and array 1 is always the A array
+#define	NINDEXARRAYS	2
+#define	INDEXARRAY_RGB	0
+#define INDEXARRAY_A	1
+#define INDEXARRAY_2BITS(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+#define INDEXARRAY_3BITS(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_3BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+
+#define NINDICES3	8
+#define	INDEXBITS3	3
+#define	HIGH_INDEXBIT3	(1<<(INDEXBITS3-1))
+#define	DENOM3		(NINDICES3-1)
+#define	BIAS3		(DENOM3/2)
+
+#define NINDICES2	4
+#define	INDEXBITS2	2
+#define	HIGH_INDEXBIT2	(1<<(INDEXBITS2-1))
+#define	DENOM2		(NINDICES2-1)
+#define	BIAS2		(DENOM2/2)
+
+#define	NINDICES_RGB(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES3 : NINDICES2)
+#define	INDEXBITS_RGB(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS3 : INDEXBITS2)
+#define	HIGH_INDEXBIT_RGB(indexmode)((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT3 : HIGH_INDEXBIT2)
+#define	DENOM_RGB(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM3 : DENOM2)
+#define	BIAS_RGB(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS3 : BIAS2)
+
+#define	NINDICES_A(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES2 : NINDICES3)
+#define	INDEXBITS_A(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS2 : INDEXBITS3)
+#define	HIGH_INDEXBIT_A(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT2 : HIGH_INDEXBIT3)
+#define	DENOM_A(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM2 : DENOM3)
+#define	BIAS_A(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS2 : BIAS3)
+
+#define	NSHAPES	1
+
+static int shapes[NSHAPES] =
+{
+	0x0000,
+};
+
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
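+// (REGION tests bit 15-x-4*y of the shape mask, e.g. pixel (x=2,y=1) tests bit 9; with the single all-zero
+//  shape above every pixel lands in region 0)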
+
+#define NREGIONS	1			// keep the region stuff in just in case...
+
+// encoded index compression location: region 0 is always at 0,0.
+
+#define	NBITSIZES	2			// one endpoint pair
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int transform_mode;		// bit 0 (TRANSFORM_MODE_ALPHA) set: alpha endpoints are delta-transformed; bit 1 (TRANSFORM_MODE_RGB) set: rgb endpoints are delta-transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	TRANSFORM_MODE_ALPHA	1
+#define	TRANSFORM_MODE_RGB	2
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		alpha	xfm	mode  mb encoding
+	5,5,		5,5,		5,5,		6,6,	0x0, 0x10, 5, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];
+	int endpt_b_prec[NCHANNELS_RGBA];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	5,5,5,6,	5,5,5,6,
+};
+
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
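+// (worked examples: nbits(5, false) == 3 since 5 is binary 101; nbits(5, true) == 4 to leave room for a sign
+//  bit; nbits(-5, true) == 4 since -5 fits in 4-bit two's complement 1011)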
+
+#define	R_0	ep[0].A[i]
+#define	R_1 ep[0].B[i]
+
+static void transform_forward(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	int i;
+
+	if (transform_mode & TRANSFORM_MODE_RGB)
+		for (i=CHANNEL_R; i<CHANNEL_A; ++i)
+			R_1 -= R_0;
+	if (transform_mode & TRANSFORM_MODE_ALPHA)
+	{
+		i = CHANNEL_A;
+		R_1 -= R_0;
+	}
+}
+
+static void transform_inverse(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	int i;
+
+	if (transform_mode & TRANSFORM_MODE_RGB)
+		for (i=CHANNEL_R; i<CHANNEL_A; ++i)
+			R_1 += R_0;
+	if (transform_mode & TRANSFORM_MODE_ALPHA)
+	{
+		i = CHANNEL_A;
+		R_1 += R_0;
+	}
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA q_endpts[NREGIONS])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]);
+		q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]);
+		q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]);
+		q_endpts[region].A[3] = Utils::quantize(endpts[region].A.w, pattern_prec.region_precs[region].endpt_a_prec[3]);
+
+		q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]);
+		q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]);
+		q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]);
+		q_endpts[region].B[3] = Utils::quantize(endpts[region].B.w, pattern_prec.region_precs[region].endpt_b_prec[3]);
+	}
+}
+
+// swap endpoints as needed to ensure that the index at each region's anchor position has a 0 high-order bit
+// anchor positions run 0..15 in raster order (0 at x=0 y=0, 15 at x=3 y=3), so y = (pos >> 2) & 3 and x = pos & 3 (e.g. pos 5 -> x=1, y=1)
+static void swap_indices(int shapeindex, int indexmode, IntEndptsRGBA endpts[NREGIONS], int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	int index_positions[NREGIONS];
+
+	index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int x = index_positions[region] & 3;
+		int y = (index_positions[region] >> 2) & 3;
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+
+		// swap RGB
+		if (indices[INDEXARRAY_RGB][y][x] & HIGH_INDEXBIT_RGB(indexmode))
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=CHANNEL_R; i<=CHANNEL_B; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[INDEXARRAY_RGB][y][x] = NINDICES_RGB(indexmode) - 1 - indices[INDEXARRAY_RGB][y][x];
+		}
+
+		// swap A
+		if (indices[INDEXARRAY_A][y][x] & HIGH_INDEXBIT_A(indexmode))
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=CHANNEL_A; i<=CHANNEL_A; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[INDEXARRAY_A][y][x] = NINDICES_A(indexmode) - 1 - indices[INDEXARRAY_A][y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, int rotatemode, int indexmode, Bits &out)
+{
+	// ignore shapeindex
+	out.write(p.mode, p.modebits);
+	out.write(rotatemode, ROTATEMODE_BITS);
+	out.write(indexmode, INDEXMODE_BITS);
+	for (int i=0; i<NREGIONS; ++i)
+		for (int j=0; j<NCHANNELS_RGBA; ++j)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[0]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[1]);
+		}
+	nvAssert (out.getptr() == 50);
+}
+
+static void read_header(Bits &in, IntEndptsRGBA endpts[NREGIONS], int &shapeindex, int &rotatemode, int &indexmode, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	p = patterns[pat_index];
+
+	shapeindex = 0;		// we don't have any
+
+	rotatemode = in.read(ROTATEMODE_BITS);
+	indexmode = in.read(INDEXMODE_BITS);
+	for (int i=0; i<NREGIONS; ++i)
+		for (int j=0; j<NCHANNELS_RGBA; ++j)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[0]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[1]);
+		}
+	nvAssert (in.getptr() == 50);
+}
+
+static void write_indices(const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int shapeindex, int indexmode, Bits &out)
+{
+	// the index we shorten is always the one at position 0
+
+	// do the 2 bit indices first
+	nvAssert ((indices[INDEXARRAY_2BITS(indexmode)][0][0] & HIGH_INDEXBIT2) == 0);
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		out.write(indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3], INDEXBITS2 - (i==0?1:0));	// write i..[1:0] or i..[0]
+
+	// then the 3 bit indices
+	nvAssert ((indices[INDEXARRAY_3BITS(indexmode)][0][0] & HIGH_INDEXBIT3) == 0);
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		out.write(indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3], INDEXBITS3 - (i==0?1:0));	// write i..[2:0] or i..[1:0]
+}
+
+static void read_indices(Bits &in, int shapeindex, int indexmode, int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	// the index we shorten is always the one at position 0
+
+	// do the 2 bit indices first
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS2 - (i==0?1:0));		// read i..[1:0] or i..[0]
+
+	// then the 3 bit indices
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS3 - (i==0?1:0));		// read i..[2:0] or i..[1:0]
+}
+
+static void emit_block(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int rotatemode, int indexmode, char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, rotatemode, indexmode, out);
+
+	write_indices(indices, shapeindex, indexmode, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized_rgb_a(const IntEndptsRGBA &endpts, const RegionPrec &region_prec, int indexmode, Vector3 palette_rgb[NINDICES3], float palette_a[NINDICES3])
+{
+	// scale endpoints for RGB
+	int a, b;
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]); 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]);
+
+	// interpolate R
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+		palette_rgb[i].x = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]);
+
+	// interpolate G
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+		palette_rgb[i].y = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]);
+
+	// interpolate B
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+		palette_rgb[i].z = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]); 
+	b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]);
+
+	// interpolate A
+	for (int i = 0; i < NINDICES_A(indexmode); ++i)
+		palette_a[i] = float(Utils::lerp(a, b, i, BIAS_A(indexmode), DENOM_A(indexmode)));
+
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS])
+{
+	for (int i=0; i<NCHANNELS_RGBA; ++i)
+	{
+		if (p.transform_mode)
+		{
+			// endpts[0].A[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]);	// always positive here
+			endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]);
+			endpts[1].A[i] = SIGN_EXTEND(endpts[1].A[i], p.chan[i].nbitsizes[1]);
+			endpts[1].B[i] = SIGN_EXTEND(endpts[1].B[i], p.chan[i].nbitsizes[1]);
+		}
+	}
+}
+
+static void rotate_tile(const Tile &in, int rotatemode, Tile &out)
+{
+	out.size_x = in.size_x;
+	out.size_y = in.size_y;
+
+	for (int y=0; y<in.size_y; ++y)
+	for (int x=0; x<in.size_x; ++x)
+	{
+		float t;
+		out.data[y][x] = in.data[y][x];
+
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: break;
+		case ROTATEMODE_RGBA_AGBR: t = (out.data[y][x]).x; (out.data[y][x]).x = (out.data[y][x]).w; (out.data[y][x]).w = t; break;
+		case ROTATEMODE_RGBA_RABG: t = (out.data[y][x]).y; (out.data[y][x]).y = (out.data[y][x]).w; (out.data[y][x]).w = t; break;
+		case ROTATEMODE_RGBA_RGAB: t = (out.data[y][x]).z; (out.data[y][x]).z = (out.data[y][x]).w; (out.data[y][x]).w = t; break;
+		default: nvUnreachable();
+		}
+	}
+}
+
+void AVPCL::decompress_mode4(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGBA endpts[NREGIONS];
+	int shapeindex, pat_index, rotatemode, indexmode;
+
+	read_header(in, endpts, shapeindex, rotatemode, indexmode, p, pat_index);
+	
+	sign_extend(p, endpts);
+
+	if (p.transform_mode)
+		transform_inverse(p.transform_mode, endpts);
+
+	Vector3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized_rgb_a(endpts[region], pattern_precs[pat_index].region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+
+	int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indexmode, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	Tile temp(t.size_x, t.size_y);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		temp.data[y][x] = Vector4(palette_rgb[REGION(x,y,shapeindex)][indices[INDEXARRAY_RGB][y][x]], palette_a[REGION(x,y,shapeindex)][indices[INDEXARRAY_A][y][x]]);
+
+	rotate_tile(temp, rotatemode, t);
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+// we already have a candidate mapping (and hence an error) when this is called, so take an early exit
+// if the accumulated error so far exceeds the error we already have (current_besterr)
+static float map_colors(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, const IntEndptsRGBA &endpts, const RegionPrec &region_prec, float current_besterr, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	Vector3 palette_rgb[NINDICES3];	// could be nindices2
+	float palette_a[NINDICES3];	// could be nindices2
+	float toterr = 0;
+
+	generate_palette_quantized_rgb_a(endpts, region_prec, indexmode, &palette_rgb[0], &palette_a[0]);
+
+	Vector3 rgb;
+	float a;
+
+	for (int i = 0; i < np; ++i)
+	{
+		float err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (colors[i]).x :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (colors[i]).y :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (colors[i]).z : (colors[i]).w;
+
+		rgb.x = (colors[i]).x;
+		rgb.y = (colors[i]).y;
+		rgb.z = (colors[i]).z;
+		a = (colors[i]).w;
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = Utils::metric1(a, palette_a[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					palette_alpha = palette_a[j];
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[j], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			toterr += besterr;
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return FLT_MAX;
+			}
+		}
+		else
+		{
+			// do RGB index
+			besterr = FLT_MAX;
+			int bestindex;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					bestindex = j;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).x :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).y :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : nvCheckMacro(0);
+			toterr += besterr;
+
+			// do A index
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[j], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[j], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return FLT_MAX;
+			}
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int indexmode, IntEndptsRGBA endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	Vector3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized_rgb_a(endpts[region], pattern_prec.region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector3 rgb;
+	float a;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		rgb.x = (tile.data[y][x]).x;
+		rgb.y = (tile.data[y][x]).y;
+		rgb.z = (tile.data[y][x]).z;
+		a = (tile.data[y][x]).w;
+
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (tile.data[y][x]).x :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (tile.data[y][x]).y :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (tile.data[y][x]).z : (tile.data[y][x]).w;
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = Utils::metric1(a, palette_a[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+					palette_alpha = palette_a[region][i];
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[region][i], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;
+		}
+		else
+		{
+			// do RGB index first as it has the alpha
+			besterr = FLT_MAX;
+			int bestindex;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+					bestindex = i;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).x :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).y :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : nvCheckMacro(0);
+			toterr[region] += besterr;
+
+			// do A index
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[region][i], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[region][i], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+		}
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, 
+						  float old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (A or B, selected by do_b);
+	// e.g. for a 5-bit endpoint the step sizes tried are 16, 8, 4, 2, 1
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					indices[j][i] = temp_indices[j][i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
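+// (worked example: for a full 16-pixel tile thr_scale is 1, so an orig_err of 1500 falls in the ">1000" bucket
+//  and a 5-bit endpoint gets adelta = (1<<5)/4 = 8, i.e. we scan +/-8 quantized steps around the current value)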
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		for (int j=0; j<NINDEXARRAYS; ++j)
+		for (int i=0; i<np; ++i)
+			indices[j][i] = good_indices[j][i];
+	}
+
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, float orig_err, const IntEndptsRGBA &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA new_a, new_b;
+	IntEndptsRGBA new_endpt;
+	int do_b;
+	int orig_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int new_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices0[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices1[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices0[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices1[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = temp_indices0[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[INDEXARRAY_RGB][i] != new_indices[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != new_indices[INDEXARRAY_A][i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[j][i] = temp_indices0[j][i];
+					nvAssert (orig_indices[j][i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[INDEXARRAY_RGB][i] != temp_indices0[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != temp_indices0[INDEXARRAY_A][i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, int rotatemode, int indexmode, const float orig_err[NREGIONS], 
+							const IntEndptsRGBA orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGBA temp_in, temp_out;
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		// make sure we have a valid error for temp_in
+		// we didn't change temp_in, so orig_err[region] is still valid
+		float temp_in_err = orig_err[region];
+
+		// now try to optimize these endpoints
+        float temp_out_err = optimize_one(pixels, importance, np, rotatemode, indexmode, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+		// if we find an improvement, update the best so far and correct the output endpoints and errors
+		if (temp_out_err < best_err)
+		{
+			best_err = temp_out_err;
+			opt_err[region] = temp_out_err;
+			opt_endpts[region] = temp_out;
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, int rotatemode, int indexmode, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], opt_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+
+		assign_indices(tile, shapeindex_best, rotatemode, indexmode, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(shapeindex_best, indexmode, orig_endpts, orig_indices);
+
+		if (patterns[sp].transform_mode)
+			transform_forward(patterns[sp].transform_mode, orig_endpts);
+
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transform_mode)
+				transform_inverse(patterns[sp].transform_mode, orig_endpts);
+
+			optimize_endpts(tile, shapeindex_best, rotatemode, indexmode, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+
+			assign_indices(tile, shapeindex_best, rotatemode, indexmode, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(shapeindex_best, indexmode, opt_endpts, opt_indices);
+
+			if (patterns[sp].transform_mode)
+				transform_forward(patterns[sp].transform_mode, opt_endpts);
+
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, rotatemode, indexmode, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transform_mode)
+					transform_forward(patterns[sp].transform_mode, orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, rotatemode, indexmode, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 4).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	if (v.w < 0.0f) v.w = 0.0f;
+	if (v.w > 255.0f) v.w = 255.0f;
+}
+
+// compute initial endpoints for the "RGB" portion and the "A" portion. 
+// Note these channels may have been rotated.
+static void rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[Tile::TILE_TOTAL];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		float mina = FLT_MAX, maxa = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+
+			dp = alphas[i] - mean.w;
+			if (dp < mina) mina = dp;
+			if (dp > maxa) maxa = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + Vector4(minp*direction, mina);
+		endpts[region].B = mean + Vector4(maxp*direction, maxa);
+
+		// clamp endpoints
+		// the rationale for clamping here is that the actual endpoints will be clamped anyway, so the best
+		// shape should be chosen based on the clamped values
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+}
+
+float AVPCL::compress_mode4(const Tile &t, char *block)
+{
+	FltEndpts endpts[NREGIONS];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+	int shape = 0;
+	Tile t1;
+
+	// try all rotations. refine tries the 2 different indexings.
+	for (int r = 0; r < NROTATEMODES && msebest > 0; ++r)
+	{
+		rotate_tile(t, r, t1);
+		rough(t1, shape, endpts);
+		for (int i = 0; i < NINDEXMODES && msebest > 0; ++i)
+		{
+			float mse = refine(t1, shape, r, i, endpts, tempblock);
+			if (mse < msebest)
+			{
+				memcpy(block, tempblock, sizeof(tempblock));
+				msebest = mse;
+			}
+		}
+	}
+	return msebest;
+}

+ 1216 - 0
3rdparty/nvtt/bc7/avpcl_mode5.cpp

@@ -0,0 +1,1216 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x100000 2r 777x2 8x2 2bi 2bi
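+// (decoding the shorthand above: 6 mode bits, 2 rotation bits, two 7.7.7-bit RGB endpoints, two 8-bit alpha
+//  endpoints, then two 2-bit index arrays; 6+2+42+16 = 66 header bits plus 31+31 index bits = 128 total,
+//  matching the getptr() asserts below)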
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+// there are 2 index arrays. in this mode both arrays are 2 bits wide, so INDEXMODE has no effect and is not written (the plumbing is kept for symmetry with mode 4)
+// array 0 is always the RGB array and array 1 is always the A array
+#define	NINDEXARRAYS	2
+#define	INDEXARRAY_RGB	0
+#define INDEXARRAY_A	1
+#define INDEXARRAY_2BITS(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+#define INDEXARRAY_3BITS(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_3BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+
+#define NINDICES3	4
+#define	INDEXBITS3	2
+#define	HIGH_INDEXBIT3	(1<<(INDEXBITS3-1))
+#define	DENOM3		(NINDICES3-1)
+#define	BIAS3		(DENOM3/2)
+
+#define NINDICES2	4
+#define	INDEXBITS2	2
+#define	HIGH_INDEXBIT2	(1<<(INDEXBITS2-1))
+#define	DENOM2		(NINDICES2-1)
+#define	BIAS2		(DENOM2/2)
+
+#define	NINDICES_RGB(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES3 : NINDICES2)
+#define	INDEXBITS_RGB(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS3 : INDEXBITS2)
+#define	HIGH_INDEXBIT_RGB(indexmode)((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT3 : HIGH_INDEXBIT2)
+#define	DENOM_RGB(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM3 : DENOM2)
+#define	BIAS_RGB(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS3 : BIAS2)
+
+#define	NINDICES_A(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES2 : NINDICES3)
+#define	INDEXBITS_A(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS2 : INDEXBITS3)
+#define	HIGH_INDEXBIT_A(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT2 : HIGH_INDEXBIT3)
+#define	DENOM_A(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM2 : DENOM3)
+#define	BIAS_A(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS2 : BIAS3)
+
+#define	NSHAPES	1
+
+static int shapes[NSHAPES] =
+{
+	0x0000,
+};
+
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+#define NREGIONS	1			// keep the region stuff in just in case...
+
+// encoded index compression location: region 0 is always at 0,0.
+
+#define	NBITSIZES	2			// one endpoint pair
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int transform_mode;		// bit 0 (TRANSFORM_MODE_ALPHA) set: alpha endpoints are delta-transformed; bit 1 (TRANSFORM_MODE_RGB) set: rgb endpoints are delta-transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	TRANSFORM_MODE_ALPHA	1
+#define	TRANSFORM_MODE_RGB	2
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		alpha	xfm	mode  mb encoding
+	7,7,		7,7,		7,7,		8,8,	0x0, 0x20, 6, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];
+	int endpt_b_prec[NCHANNELS_RGBA];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	7,7,7,8,	7,7,7,8,
+};
+
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
+
+#define	R_0	ep[0].A[i]
+#define	R_1 ep[0].B[i]
+
+static void transform_forward(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	int i;
+
+	if (transform_mode & TRANSFORM_MODE_RGB)
+		for (i=CHANNEL_R; i<CHANNEL_A; ++i)
+			R_1 -= R_0;
+	if (transform_mode & TRANSFORM_MODE_ALPHA)
+	{
+		i = CHANNEL_A;
+		R_1 -= R_0;
+	}
+}
+
+static void transform_inverse(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	int i;
+
+	if (transform_mode & TRANSFORM_MODE_RGB)
+		for (i=CHANNEL_R; i<CHANNEL_A; ++i)
+			R_1 += R_0;
+	if (transform_mode & TRANSFORM_MODE_ALPHA)
+	{
+		i = CHANNEL_A;
+		R_1 += R_0;
+	}
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA q_endpts[NREGIONS])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]);
+		q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]);
+		q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]);
+		q_endpts[region].A[3] = Utils::quantize(endpts[region].A.w, pattern_prec.region_precs[region].endpt_a_prec[3]);
+
+		q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]);
+		q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]);
+		q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]);
+		q_endpts[region].B[3] = Utils::quantize(endpts[region].B.w, pattern_prec.region_precs[region].endpt_b_prec[3]);
+	}
+}
+
+// swap endpoints as needed to ensure that the index at each region's anchor position has a 0 high-order bit
+// anchor positions run 0..15 in raster order (0 at x=0 y=0, 15 at x=3 y=3), so y = (pos >> 2) & 3 and x = pos & 3 (e.g. pos 5 -> x=1, y=1)
+static void swap_indices(int shapeindex, int indexmode, IntEndptsRGBA endpts[NREGIONS], int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	int index_positions[NREGIONS];
+
+	index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int x = index_positions[region] & 3;
+		int y = (index_positions[region] >> 2) & 3;
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+
+		// swap RGB
+		if (indices[INDEXARRAY_RGB][y][x] & HIGH_INDEXBIT_RGB(indexmode))
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=CHANNEL_R; i<=CHANNEL_B; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[INDEXARRAY_RGB][y][x] = NINDICES_RGB(indexmode) - 1 - indices[INDEXARRAY_RGB][y][x];
+		}
+
+		// swap A
+		if (indices[INDEXARRAY_A][y][x] & HIGH_INDEXBIT_A(indexmode))
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=CHANNEL_A; i<=CHANNEL_A; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[INDEXARRAY_A][y][x] = NINDICES_A(indexmode) - 1 - indices[INDEXARRAY_A][y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, int rotatemode, int indexmode, Bits &out)
+{
+	// ignore shapeindex
+	out.write(p.mode, p.modebits);
+	out.write(rotatemode, ROTATEMODE_BITS);
+//	out.write(indexmode, INDEXMODE_BITS);
+	for (int i=0; i<NREGIONS; ++i)
+		for (int j=0; j<NCHANNELS_RGBA; ++j)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[0]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[1]);
+		}
+	nvAssert (out.getptr() == 66);
+}
+
+static void read_header(Bits &in, IntEndptsRGBA endpts[NREGIONS], int &shapeindex, int &rotatemode, int &indexmode, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	p = patterns[pat_index];
+
+	shapeindex = 0;		// we don't have any
+
+	rotatemode = in.read(ROTATEMODE_BITS);
+
+	indexmode = 0;		// we don't have any
+
+	for (int i=0; i<NREGIONS; ++i)
+		for (int j=0; j<NCHANNELS_RGBA; ++j)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[0]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[1]);
+		}
+	nvAssert (in.getptr() == 66);
+}
+
+static void write_indices(const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int shapeindex, int indexmode, Bits &out)
+{
+	// the index we shorten is always the one at position 0
+
+	// do the 2 bit indices first
+	nvAssert ((indices[INDEXARRAY_2BITS(indexmode)][0][0] & HIGH_INDEXBIT2) == 0);
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		out.write(indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3], INDEXBITS2 - (i==0?1:0));	// write i..[1:0] or i..[0]
+
+	// then the 3 bit indices
+	nvAssert ((indices[INDEXARRAY_3BITS(indexmode)][0][0] & HIGH_INDEXBIT3) == 0);
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		out.write(indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3], INDEXBITS3 - (i==0?1:0));	// write i..[1:0] or i..[0]
+}
+
+static void read_indices(Bits &in, int shapeindex, int indexmode, int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	// the indices we shorten are always at position 0
+
+	// do the 2 bit indices first
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS2 - (i==0?1:0));		// read i..[1:0] or i..[0]
+
+	// then the 3 bit indices
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS3 - (i==0?1:0));		// read i..[1:0] or i..[0]
+}
+
+static void emit_block(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int rotatemode, int indexmode, char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, rotatemode, indexmode, out);
+
+	write_indices(indices, shapeindex, indexmode, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized_rgb_a(const IntEndptsRGBA &endpts, const RegionPrec &region_prec, int indexmode, Vector3 palette_rgb[NINDICES3], float palette_a[NINDICES3])
+{
+	// scale endpoints for RGB
+	int a, b;
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]); 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]);
+
+	// interpolate R
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+		palette_rgb[i].x = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]);
+
+	// interpolate G
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+		palette_rgb[i].y = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]);
+
+	// interpolate B
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+		palette_rgb[i].z = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]); 
+	b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]);
+
+	// interpolate A
+	for (int i = 0; i < NINDICES_A(indexmode); ++i)
+		palette_a[i] = float(Utils::lerp(a, b, i, BIAS_A(indexmode), DENOM_A(indexmode)));
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS])
+{
+	for (int i=0; i<NCHANNELS_RGBA; ++i)
+	{
+		if (p.transform_mode)
+		{
+			// endpts[0].A[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]);	// always positive here
+			endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]);
+			endpts[1].A[i] = SIGN_EXTEND(endpts[1].A[i], p.chan[i].nbitsizes[1]);
+			endpts[1].B[i] = SIGN_EXTEND(endpts[1].B[i], p.chan[i].nbitsizes[1]);
+		}
+	}
+}
+
+static void rotate_tile(const Tile &in, int rotatemode, Tile &out)
+{
+	out.size_x = in.size_x;
+	out.size_y = in.size_y;
+
+	for (int y=0; y<in.size_y; ++y)
+	for (int x=0; x<in.size_x; ++x)
+	{
+		float t;
+		out.data[y][x] = in.data[y][x];
+
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: break;
+		case ROTATEMODE_RGBA_AGBR: t = (out.data[y][x]).x; (out.data[y][x]).x = (out.data[y][x]).w; (out.data[y][x]).w = t; break;
+		case ROTATEMODE_RGBA_RABG: t = (out.data[y][x]).y; (out.data[y][x]).y = (out.data[y][x]).w; (out.data[y][x]).w = t; break;
+		case ROTATEMODE_RGBA_RGAB: t = (out.data[y][x]).z; (out.data[y][x]).z = (out.data[y][x]).w; (out.data[y][x]).w = t; break;
+		default: nvUnreachable();
+		}
+	}
+}
+
+void AVPCL::decompress_mode5(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGBA endpts[NREGIONS];
+	int shapeindex, pat_index, rotatemode, indexmode;
+
+	read_header(in, endpts, shapeindex, rotatemode, indexmode, p, pat_index);
+	
+	sign_extend(p, endpts);
+
+	if (p.transform_mode)
+		transform_inverse(p.transform_mode, endpts);
+
+	Vector3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized_rgb_a(endpts[region], pattern_precs[pat_index].region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+
+	int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indexmode, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	Tile temp(t.size_x, t.size_y);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		temp.data[y][x] = Vector4(palette_rgb[REGION(x,y,shapeindex)][indices[INDEXARRAY_RGB][y][x]], palette_a[REGION(x,y,shapeindex)][indices[INDEXARRAY_A][y][x]]);
+
+	rotate_tile(temp, rotatemode, t);
+}
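+
+// A minimal usage sketch for the routine above: decode one 16-byte block into a 4x4 tile. It assumes
+// the caller has already checked that the block's mode bits select mode 5; decode_block_sketch is
+// illustrative only and not part of the library.
+#if 0
+static void decode_block_sketch(const char block[16])
+{
+	Tile t(4, 4);
+	AVPCL::decompress_mode5(block, t);
+	// t.data[y][x] now holds the decoded RGBA values as floats in [0,255]
+}
+#endif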
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+// the caller already has a candidate mapping (and therefore an error), so take an early exit
+// as soon as the accumulated error exceeds that value
+static float map_colors(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, const IntEndptsRGBA &endpts, const RegionPrec &region_prec, float current_besterr, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	Vector3 palette_rgb[NINDICES3];	// could be nindices2
+	float palette_a[NINDICES3];	// could be nindices2
+	float toterr = 0;
+
+	generate_palette_quantized_rgb_a(endpts, region_prec, indexmode, &palette_rgb[0], &palette_a[0]);
+
+	Vector3 rgb;
+	float a;
+
+	for (int i = 0; i < np; ++i)
+	{
+		float err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (colors[i]).x :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (colors[i]).y :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (colors[i]).z : (colors[i]).w;
+
+		rgb.x = (colors[i]).x;
+		rgb.y = (colors[i]).y;
+		rgb.z = (colors[i]).z;
+		a = (colors[i]).w;
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = Utils::metric1(a, palette_a[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					palette_alpha = palette_a[j];
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[j], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			toterr += besterr;
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return FLT_MAX;
+			}
+		}
+		else
+		{
+			// do RGB index
+			besterr = FLT_MAX;
+			int bestindex = 0;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					bestindex = j;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).x :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).y :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : nvCheckMacro(0);
+			toterr += besterr;
+
+			// do A index
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[j], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[j], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return FLT_MAX;
+			}
+		}
+	}
+	return toterr;
+}
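+
+// Minimal sketch of the early-out pattern map_colors uses: accumulate the best per-pixel error and
+// abandon the candidate as soon as the running total can no longer beat current_besterr, returning
+// FLT_MAX so the caller knows the indices are not valid. per_pixel_err is a hypothetical stand-in
+// for the palette search above; the block is kept inside #if 0 so it has no effect on the library.
+#if 0
+static float early_out_sketch(int np, float current_besterr, float (*per_pixel_err)(int))
+{
+	float toterr = 0;
+	for (int i = 0; i < np; ++i)
+	{
+		toterr += per_pixel_err(i);
+		if (toterr > current_besterr)
+			return FLT_MAX;		// not better; the caller must ignore any partially written indices
+	}
+	return toterr;
+}
+#endif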
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int indexmode, IntEndptsRGBA endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	Vector3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized_rgb_a(endpts[region], pattern_prec.region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector3 rgb;
+	float a;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		rgb.x = (tile.data[y][x]).x;
+		rgb.y = (tile.data[y][x]).y;
+		rgb.z = (tile.data[y][x]).z;
+		a = (tile.data[y][x]).w;
+
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (tile.data[y][x]).x :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (tile.data[y][x]).y :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (tile.data[y][x]).z : (tile.data[y][x]).w;
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = Utils::metric1(a, palette_a[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+					palette_alpha = palette_a[region][i];
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[region][i], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;
+		}
+		else
+		{
+			// do RGB index first as it has the alpha
+			besterr = FLT_MAX;
+			int bestindex = 0;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+					bestindex = i;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).x :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).y :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : nvCheckMacro(0);
+			toterr[region] += besterr;
+
+			// do A index
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[region][i], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[region][i], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+		}
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts,
+						  float old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					indices[j][i] = temp_indices[j][i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
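+
+// One-dimensional sketch of the logarithmic search above: starting from the current quantized value,
+// try +/- steps of decreasing power-of-two size, committing a step only after both signs have been
+// tested at that size. f and prec stand in for the per-endpoint error evaluation and the channel
+// precision; this illustrates the search pattern and is not an additional code path.
+#if 0
+static int log_search_sketch(int v, int prec, float (*f)(int))
+{
+	float best = f(v);
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		int beststep = 0;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			int cand = v + sign * step;
+			if (cand < 0 || cand >= (1 << prec))
+				continue;		// stay inside the quantized range, as perturb_one does
+			float e = f(cand);
+			if (e < best) { best = e; beststep = sign * step; }
+		}
+		v += beststep;			// beststep stays 0 if neither direction helped
+	}
+	return v;
+}
+#endif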
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// the thresholds above assume np = 16; they are scaled by np / Tile::TILE_TOTAL below
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		for (int j=0; j<NINDEXARRAYS; ++j)
+		for (int i=0; i<np; ++i)
+			indices[j][i] = good_indices[j][i];
+	}
+
+	return best_err;
+}
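+
+// Worked example of the search-radius heuristic above for a 7-bit channel (1 << 7 = 128 levels)
+// on a full 16-pixel region (thr_scale = 1):
+//   err > 5000  ->  +/- 64 levels      err > 1000  ->  +/- 32
+//   err > 200   ->  +/- 16             err > 40    ->  +/- 8
+// anything smaller still gets the minimum +/- 3 window, so every candidate receives at least a
+// small exhaustive neighborhood around the endpoints found so far.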
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, float orig_err, const IntEndptsRGBA &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA new_a, new_b;
+	IntEndptsRGBA new_endpt;
+	int do_b;
+	int orig_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int new_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices0[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices1[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+        float err0 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices0[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices1[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = temp_indices0[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[INDEXARRAY_RGB][i] != new_indices[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != new_indices[INDEXARRAY_A][i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[j][i] = temp_indices0[j][i];
+					nvAssert (orig_indices[j][i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[INDEXARRAY_RGB][i] != temp_indices0[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != temp_indices0[INDEXARRAY_A][i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, int rotatemode, int indexmode, const float orig_err[NREGIONS], 
+							const IntEndptsRGBA orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGBA temp_in, temp_out;
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		// make sure we have a valid error for temp_in
+		// we didn't change temp_in, so orig_err[region] is still valid
+		float temp_in_err = orig_err[region];
+
+		// now try to optimize these endpoints
+        float temp_out_err = optimize_one(pixels, importance, np, rotatemode, indexmode, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+		// if we find an improvement, update the best so far and correct the output endpoints and errors
+		if (temp_out_err < best_err)
+		{
+			best_err = temp_out_err;
+			opt_err[region] = temp_out_err;
+			opt_endpts[region] = temp_out;
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, int rotatemode, int indexmode, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], opt_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+
+		assign_indices(tile, shapeindex_best, rotatemode, indexmode, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(shapeindex_best, indexmode, orig_endpts, orig_indices);
+
+		if (patterns[sp].transform_mode)
+			transform_forward(patterns[sp].transform_mode, orig_endpts);
+
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transform_mode)
+				transform_inverse(patterns[sp].transform_mode, orig_endpts);
+
+			optimize_endpts(tile, shapeindex_best, rotatemode, indexmode, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+
+			assign_indices(tile, shapeindex_best, rotatemode, indexmode, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(shapeindex_best, indexmode, opt_endpts, opt_indices);
+
+			if (patterns[sp].transform_mode)
+				transform_forward(patterns[sp].transform_mode, opt_endpts);
+
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, rotatemode, indexmode, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transform_mode)
+					transform_forward(patterns[sp].transform_mode, orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, rotatemode, indexmode, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 5).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	if (v.w < 0.0f) v.w = 0.0f;
+	if (v.w > 255.0f) v.w = 255.0f;
+}
+
+// compute initial endpoints for the "RGB" portion and the "A" portion. 
+// Note these channels may have been rotated.
+static void rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[Tile::TILE_TOTAL];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		float mina = FLT_MAX, maxa = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+
+			dp = alphas[i] - mean.w;
+			if (dp < mina) mina = dp;
+			if (dp > maxa) maxa = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + Vector4(minp*direction, mina);
+		endpts[region].B = mean + Vector4(maxp*direction, maxa);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+}
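+
+// Condensed sketch of the endpoint estimate above: project every color onto the principal direction
+// through the mean and take the two extreme projections as the endpoints (rough() then clamps them
+// to [0,255]). Vector3 and dot are the nvmath types already used in this file; rough_sketch itself
+// is illustrative only and kept inside #if 0.
+#if 0
+static void rough_sketch(const Vector3 colors[], int np, const Vector3 &mean, const Vector3 &dir,
+						 Vector3 &endA, Vector3 &endB)
+{
+	float minp = FLT_MAX, maxp = -FLT_MAX;
+	for (int i = 0; i < np; ++i)
+	{
+		float dp = dot(colors[i] - mean, dir);
+		if (dp < minp) minp = dp;
+		if (dp > maxp) maxp = dp;
+	}
+	endA = mean + minp * dir;
+	endB = mean + maxp * dir;
+}
+#endif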
+
+float AVPCL::compress_mode5(const Tile &t, char *block)
+{
+	FltEndpts endpts[NREGIONS];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+	int shape = 0;
+	Tile t1;
+
+	// try all rotations. refine tries the 2 different indexings.
+	for (int r = 0; r < NROTATEMODES && msebest > 0; ++r)
+	{
+		rotate_tile(t, r, t1);
+		rough(t1, shape, endpts);
+//		for (int i = 0; i < NINDEXMODES && msebest > 0; ++i)
+		for (int i = 0; i < 1 && msebest > 0; ++i)
+		{
+			float mse = refine(t1, shape, r, i, endpts, tempblock);
+			if (mse < msebest)
+			{
+				memcpy(block, tempblock, sizeof(tempblock));
+				msebest = mse;
+			}
+		}
+	}
+	return msebest;
+}

+ 1055 - 0
3rdparty/nvtt/bc7/avpcl_mode6.cpp

@@ -0,0 +1,1055 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x1000000 7777.1x2 4bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	16
+#define	INDEXBITS	4
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+#define	NSHAPES	1
+
+static int shapes[NSHAPES] =
+{
+	0x0000,
+};
+
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+#define	NREGIONS	1
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red	green	blue	alpha	mode  mb verilog
+	7,7,	7,7,	7,7,	7,7,	0x40, 7, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];
+	int endpt_b_prec[NCHANNELS_RGBA];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	7,7,7,7,	7,7,7,7,
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
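+
+// A few evaluated cases of nbits as defined above:
+//   nbits(0, *) = 0        nbits(7, false) = 3      nbits(8, false) = 4
+//   nbits(7, true) = 4     nbits(-1, true) = 1      nbits(-2, true) = 2
+// i.e. signed values get one extra bit for the sign, and -1 needs only the sign bit itself.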
+
+/*
+we're using this table to assign lsbs
+abgr	>=2	correct
+0000	0	0
+0001	0	0
+0010	0	0
+0011	1	x1
+0100	0	0
+0101	1	x1
+0110	1	x1
+0111	1	1
+1000	0	0
+1001	1	x0
+1010	1	x0
+1011	1	1
+1100	1	x0
+1101	1	1
+1110	1	1
+1111	1	1
+
+we need 8 0's and 8 1's. the x's can be either 0 or 1 as long as you get 8/8.
+I choose to assign the lsbs so that the rgb channels are as good as possible.
+*/
+
+// 8888 ->7777.1, use the "correct" column above to assign the lsb
+static void compress_one(const IntEndptsRGBA& endpts, IntEndptsRGBA_2& compr_endpts)
+{
+	int onescnt;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		// ignore the alpha channel in the count
+		onescnt += (j==CHANNEL_A) ? 0 : (endpts.A[j] & 1);
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		nvAssert (compr_endpts.A[j] < 128);
+	}
+	compr_endpts.a_lsb = onescnt >= 2;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		onescnt += (j==CHANNEL_A) ? 0 : (endpts.B[j] & 1);
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		nvAssert (compr_endpts.B[j] < 128);
+	}
+	compr_endpts.b_lsb = onescnt >= 2;
+}
+
+static void uncompress_one(const IntEndptsRGBA_2& compr_endpts, IntEndptsRGBA& endpts)
+{
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		endpts.A[j] = (compr_endpts.A[j] << 1) | compr_endpts.a_lsb;
+		endpts.B[j] = (compr_endpts.B[j] << 1) | compr_endpts.b_lsb;
+	}
+}
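+
+// Worked example of the 7777.1 round trip above, using hypothetical endpoint values:
+//   A = {0x53, 0x2A, 0x7F, 0x80}  ->  R,G,B lsbs are 1,0,1 (alpha is ignored)  ->  a_lsb = 1
+//   stored 7-bit values = {0x29, 0x15, 0x3F, 0x40}
+//   reconstructed       = {0x53, 0x2B, 0x7F, 0x81}
+// channels whose own lsb disagreed with the majority vote (G and A here) come back off by one,
+// which is the price of sharing a single lsb across the endpoint.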
+
+static void uncompress_endpoints(const IntEndptsRGBA_2 compr_endpts[NREGIONS], IntEndptsRGBA endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		uncompress_one(compr_endpts[i], endpts[i]);
+}
+
+static void compress_endpoints(const IntEndptsRGBA endpts[NREGIONS], IntEndptsRGBA_2 compr_endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		compress_one(endpts[i], compr_endpts[i]);
+}
+
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA_2 q_endpts[NREGIONS])
+{
+	IntEndptsRGBA full_endpts[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		full_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+		full_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]+1);
+		full_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]+1);
+		full_endpts[region].A[3] = Utils::quantize(endpts[region].A.w, pattern_prec.region_precs[region].endpt_a_prec[3]+1);
+
+		full_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]+1);
+		full_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]+1);
+		full_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]+1);
+		full_endpts[region].B[3] = Utils::quantize(endpts[region].B.w, pattern_prec.region_precs[region].endpt_b_prec[3]+1);
+
+		compress_one(full_endpts[region], q_endpts[region]);
+	}
+}
+
+// swap endpoints as needed to ensure that the anchor index has a 0 high-order bit
+// a linear index position p maps to the tile as y = (p >> 2) & 3 and x = p & 3, so position 0 is x=0 y=0 and position 15 is x=3 y=3
+static void swap_indices(IntEndptsRGBA_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	int index_positions[NREGIONS];
+
+	index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int x = index_positions[region] & 3;
+		int y = (index_positions[region] >> 2) & 3;
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGBA; ++i) 
+			{
+				t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
+			}
+			t = endpts[region].a_lsb; endpts[region].a_lsb = endpts[region].b_lsb; endpts[region].b_lsb = t;
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA_2 endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		out.write(endpts[i].a_lsb, 1);
+		out.write(endpts[i].b_lsb, 1);
+	}
+
+	nvAssert (out.getptr() == 65);
+}
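+
+// Worked bit budget matching the assert above: 7 mode bits + 4 channels * 2 endpoints * 7 bits
+// + 2 shared lsbs = 7 + 56 + 2 = 65 header bits; with the 4-bit indices written later
+// (16 of them, the anchor one bit short) the block totals 65 + 63 = 128 bits.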
+
+static void read_header(Bits &in, IntEndptsRGBA_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	p = patterns[pat_index];
+
+	shapeindex = 0;		// we don't have any
+
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+	
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		endpts[i].a_lsb  = in.read(1);
+		endpts[i].b_lsb  = in.read(1);
+	}
+
+	nvAssert (in.getptr() == 65);
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	nvAssert ((indices[0][0] & HIGH_INDEXBIT) == 0);
+
+	// the index we shorten is always index 0
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+	{
+		if (i==0)
+			out.write(indices[i>>2][i&3], INDEXBITS-1);	// write i..[2:0]
+		else
+			out.write(indices[i>>2][i&3], INDEXBITS);	// write i..[3:0]
+	}
+
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	// the index we shorten is always index 0
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+	{
+		if (i==0)
+			indices[i>>2][i&3] = in.read(INDEXBITS-1);	// read i..[2:0]
+		else
+			indices[i>>2][i&3] = in.read(INDEXBITS);	// read i..[3:0]
+	}
+}
+
+static void emit_block(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGBA_2 &endpts_2, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	IntEndptsRGBA endpts;
+
+	uncompress_one(endpts_2, endpts);
+
+	// scale endpoints
+	int a, b;			// really need a IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]+1); 
+	b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = float(Utils::lerp(a, b, i, BIAS, DENOM));
+}
+
+void AVPCL::decompress_mode6(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGBA_2 endpts[NREGIONS];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float err, besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(colors[i], palette[j]) :
+									     Utils::metric4premult(colors[i], palette[j]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(tile.data[y][x], palette[region][i]) :
+										 Utils::metric4premult(tile.data[y][x], palette[region][i]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts,
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA_2 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// the thresholds above assume np = 16; they are scaled by np / Tile::TILE_TOTAL below
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA_2 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGBA_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA_2 &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA_2 new_a, new_b;
+	IntEndptsRGBA_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+        float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	// this pass is independent of the search above, so the indices gathered there no longer matter;
+	// if the exhaustive pass changes them we simply restart from ch = 0 below
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							IntEndptsRGBA_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA_2 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGBA_2 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		// try all lsb modes as we search for better endpoints
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.a_lsb = lsbmode & 1;
+			temp_in.b_lsb = (lsbmode >> 1) & 1;
+
+			// make sure we have a valid error for temp_in
+			// we pass FLT_MAX here because we want an accurate temp_in_err with no early exit
+			// (map_colors computes a mapping but stops early once the error exceeds the threshold passed in that position)
+            float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+            float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+
+     simplify the above given that there is no transform now and that endpoints will always fit
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+
+		optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+
+		assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+		// (nreed) Commented out asserts because they go off all the time...not sure why
+		//for (int i=0; i<NREGIONS; ++i)
+		//	nvAssert(expected_opt_err[i] == opt_err[i]);
+		swap_indices(opt_endpts, opt_indices, shapeindex_best);
+
+		orig_toterr = opt_toterr = 0;
+		for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+		//nvAssert(opt_toterr <= orig_toterr);
+
+		if (opt_toterr < orig_toterr)
+		{
+			emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+			return opt_toterr;
+		}
+		else
+		{
+			emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+			return orig_toterr;
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 6).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	if (v.w < 0.0f) v.w = 0.0f;
+	if (v.w > 255.0f) v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr;
+
+		besterr = Utils::metric4(tile.data[y][x], palette[region][0]);
+
+		for (int i = 1; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector4 colors[Tile::TILE_TOTAL];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[0];
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[1];
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector4 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean, direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	float t = list1[i]; list1[i] = list1[j]; list1[j] = t;
+	int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1;
+}
+
+float AVPCL::compress_mode6(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=1;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+
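
For orientation between the two mode files, here is a minimal sketch of driving the mode-6 entry point above on one 4x4 tile. compress_mode6, BLOCKSIZE and the Tile members (size_x, size_y, data, importance_map) are taken from the code in this commit; the default-constructed Tile, the direct member fill, and the helper name are illustrative assumptions, not part of the library.

    // Minimal sketch (not part of the library): encode one 4x4 RGBA tile with mode 6.
    // Assumption: Tile is default-constructible and its members can be filled directly.
    #include "avpcl.h"
    #include "tile.h"
    #include "nvmath/Vector.inl"

    using namespace nv;
    using namespace AVPCL;

    void encode_tile_mode6_sketch(const Vector4 pixels[4][4], char block[AVPCL::BLOCKSIZE])
    {
        Tile t;                                    // assumption: default constructor exists
        t.size_x = 4;
        t.size_y = 4;

        for (int y = 0; y < 4; ++y)
            for (int x = 0; x < 4; ++x)
            {
                t.data[y][x] = pixels[y][x];       // RGBA values in [0, 255]
                t.importance_map[y][x] = 1.0f;     // uniform per-pixel importance
            }

        // returns the total error of the best candidate it found
        float err = AVPCL::compress_mode6(t, block);
        (void)err;
    }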

+ 1094 - 0
3rdparty/nvtt/bc7/avpcl_mode7.cpp

@@ -0,0 +1,1094 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x10000000 5555.1x4 64p 2bi (30b)
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+#include "shapes_two.h"
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	4
+#define	INDEXBITS	2
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+#define	NREGIONS  2
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		alpha		xfm	mode  mb
+	5,5,5,5,	5,5,5,5,	5,5,5,5,	5,5,5,5,	0,	0x80, 8, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];
+	int endpt_b_prec[NCHANNELS_RGBA];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	5,5,5,5,  5,5,5,5,  5,5,5,5,  5,5,5,5,
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
+
+static void transform_forward(IntEndptsRGBA_2 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+static void transform_inverse(IntEndptsRGBA_2 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+/*
+we're using this table to assign lsbs
+abgr	>=2	correct
+0000	0	0
+0001	0	0
+0010	0	0
+0011	1	x1
+0100	0	0
+0101	1	x1
+0110	1	x1
+0111	1	1
+1000	0	0
+1001	1	x0
+1010	1	x0
+1011	1	1
+1100	1	x0
+1101	1	1
+1110	1	1
+1111	1	1
+
+we need 8 0's and 8 1's. the x's can be either 0 or 1 as long as you get 8/8.
+I choose to assign the lsbs so that the rgb channels are as good as possible.
+*/
+
+// 6666 ->5555.1, use the "correct" column above to assign the lsb
+static void compress_one(const IntEndptsRGBA& endpts, IntEndptsRGBA_2& compr_endpts)
+{
+	int onescnt;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		// ignore the alpha channel in the count
+		onescnt += (j==CHANNEL_A) ? 0 : (endpts.A[j] & 1);
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		nvAssert (compr_endpts.A[j] < 32);
+	}
+	compr_endpts.a_lsb = onescnt >= 2;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		onescnt += (j==CHANNEL_A) ? 0 : (endpts.B[j] & 1);
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		nvAssert (compr_endpts.B[j] < 32);
+	}
+	compr_endpts.b_lsb = onescnt >= 2;
+}
+
+static void uncompress_one(const IntEndptsRGBA_2& compr_endpts, IntEndptsRGBA& endpts)
+{
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		endpts.A[j] = (compr_endpts.A[j] << 1) | compr_endpts.a_lsb;
+		endpts.B[j] = (compr_endpts.B[j] << 1) | compr_endpts.b_lsb;
+	}
+}
+static void uncompress_endpoints(const IntEndptsRGBA_2 compr_endpts[NREGIONS], IntEndptsRGBA endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		uncompress_one(compr_endpts[i], endpts[i]);
+}
+
+static void compress_endpoints(const IntEndptsRGBA endpts[NREGIONS], IntEndptsRGBA_2 compr_endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		compress_one(endpts[i], compr_endpts[i]);
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA_2 q_endpts[NREGIONS])
+{
+	IntEndptsRGBA full_endpts[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		full_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+		full_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]+1);
+		full_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]+1);
+		full_endpts[region].A[3] = Utils::quantize(endpts[region].A.w, pattern_prec.region_precs[region].endpt_a_prec[3]+1);
+
+		full_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]+1);
+		full_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]+1);
+		full_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]+1);
+		full_endpts[region].B[3] = Utils::quantize(endpts[region].B.w, pattern_prec.region_precs[region].endpt_b_prec[3]+1);
+
+		compress_one(full_endpts[region], q_endpts[region]);
+	}
+}
+
+// swap endpoints as needed to ensure that the anchor index of each region has a 0 high-order bit
+// an anchor position is 0 at x=0 y=0 and 15 at x=3 y=3, so y = (position >> 2) & 3 and x = position & 3
+static void swap_indices(IntEndptsRGBA_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+		int x = POS_TO_X(position);
+		int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGBA; ++i) 
+			{
+				t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
+			}
+			t = endpts[region].a_lsb; endpts[region].a_lsb = endpts[region].b_lsb; endpts[region].b_lsb = t;
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA_2 endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		out.write(endpts[i].a_lsb, 1);
+		out.write(endpts[i].b_lsb, 1);
+	}
+
+	nvAssert (out.getptr() == 98);
+}
+
+static void read_header(Bits &in, IntEndptsRGBA_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+	
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		endpts[i].a_lsb  = in.read(1);
+		endpts[i].b_lsb  = in.read(1);
+	}
+
+	nvAssert (in.getptr() == 98);
+}
+
+// WORK PLACEHOLDER -- keep it simple for now
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void emit_block(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGBA_2 &endpts_2, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	IntEndptsRGBA endpts;
+
+	uncompress_one(endpts_2, endpts);
+
+	// scale endpoints
+	int a, b;			// really need a IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]+1); 
+	b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = float(Utils::lerp(a, b, i, BIAS, DENOM));
+}
+
+// sign extend but only if it was transformed
+static void sign_extend(Pattern &p, IntEndptsRGBA_2 endpts[NREGIONS])
+{
+	nvUnreachable();
+}
+
+void AVPCL::decompress_mode7(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGBA_2 endpts[NREGIONS];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float err, besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(colors[i], palette[j]) :
+									     Utils::metric4premult(colors[i], palette[j]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(tile.data[y][x], palette[region][i]) :
+										 Utils::metric4premult(tile.data[y][x], palette[region][i]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts,
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA_2 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (endpoint A or B, selected by do_b)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA_2 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGBA_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA_2 &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA_2 new_a, new_b;
+	IntEndptsRGBA_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minimum
+        float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	// note this is independent of the above search, so we don't reuse the indices it produced;
+	// if the indices change during this pass we simply start over again from ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							IntEndptsRGBA_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA_2 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGBA_2 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		// try all lsb modes as we search for better endpoints
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.a_lsb = lsbmode & 1;
+			temp_in.b_lsb = (lsbmode >> 1) & 1;
+
+			// make sure we have a valid error for temp_in
+			// we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts
+			// (map_colors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position)
+			float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+            float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 7).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	if (v.w < 0.0f) v.w = 0.0f;
+	if (v.w > 255.0f) v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector4 colors[Tile::TILE_TOTAL];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[0];
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[1];
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector4 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean, direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	float t = list1[i]; list1[i] = list1[j]; list1[j] = t;
+	int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1;
+}
+
+float AVPCL::compress_mode7(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+
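
The table comment next to compress_one() above describes how the shared low bit is chosen when the 6-bit-per-channel endpoints are packed down to 5 bits plus one lsb per endpoint. The following standalone sketch restates that rule with plain arrays so it can be tried in isolation; it mirrors compress_one()/uncompress_one() but is not library code, and the function names are illustrative.

    // Standalone restatement of the 6666 -> 5555.1 packing rule: drop each channel's low
    // bit and keep a single shared lsb per endpoint, chosen by majority vote over the
    // RGB (not alpha) low bits, exactly as compress_one() does.
    #include <cassert>

    void pack_5551_sketch(const int in6[4], int out5[4], int &lsb)   // channel 3 = alpha
    {
        int ones = 0;
        for (int ch = 0; ch < 4; ++ch)
        {
            if (ch != 3)                  // alpha is excluded from the vote
                ones += in6[ch] & 1;
            out5[ch] = in6[ch] >> 1;      // 6-bit value -> 5-bit value
            assert(out5[ch] < 32);
        }
        lsb = (ones >= 2) ? 1 : 0;        // majority of the three RGB low bits
    }

    void unpack_5551_sketch(const int in5[4], int lsb, int out6[4])
    {
        for (int ch = 0; ch < 4; ++ch)
            out6[ch] = (in5[ch] << 1) | lsb;   // the shared lsb becomes every channel's low bit
    }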

+ 389 - 0
3rdparty/nvtt/bc7/avpcl_utils.cpp

@@ -0,0 +1,389 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Utility and common routines
+
+#include "avpcl_utils.h"
+#include "avpcl.h"
+#include "nvmath/Vector.inl"
+#include <math.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+static const int denom7_weights[] = {0, 9, 18, 27, 37, 46, 55, 64};										// divided by 64
+static const int denom15_weights[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};		// divided by 64
+
+int Utils::lerp(int a, int b, int i, int bias, int denom)
+{
+#ifdef	USE_ZOH_INTERP
+	nvAssert (denom == 3 || denom == 7 || denom == 15);
+	nvAssert (i >= 0 && i <= denom);
+	nvAssert (bias >= 0 && bias <= denom/2);
+	nvAssert (a >= 0 && b >= 0);
+
+	int round = 0;
+#ifdef	USE_ZOH_INTERP_ROUNDED
+	round = 32;
+#endif
+
+	switch (denom)
+	{
+	case 3:	denom *= 5; i *= 5;	// fall through to case 15
+	case 15:return (a*denom15_weights[denom-i] + b*denom15_weights[i] + round) >> 6;
+	case 7:	return (a*denom7_weights[denom-i] + b*denom7_weights[i] + round) >> 6;
+	default: nvUnreachable(); return 0;
+	}
+#else
+	return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom));		// simple exact interpolation
+#endif
+}
+
+Vector4 Utils::lerp(Vector4::Arg a, Vector4::Arg b, int i, int bias, int denom)
+{
+#ifdef	USE_ZOH_INTERP
+	nvAssert (denom == 3 || denom == 7 || denom == 15);
+	nvAssert (i >= 0 && i <= denom);
+	nvAssert (bias >= 0 && bias <= denom/2);
+//	nvAssert (a >= 0 && b >= 0);
+
+	// no need to bias these as this is an exact division
+
+	switch (denom)
+	{
+	case 3:	denom *= 5; i *= 5;	// fall through to case 15
+	case 15:return (a*float(denom15_weights[denom-i]) + b*float(denom15_weights[i])) / 64.0f;
+	case 7:	return (a*float(denom7_weights[denom-i]) + b*float(denom7_weights[i])) / 64.0f;
+	default: nvUnreachable(); return Vector4(0);
+	}
+#else
+	return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom));		// simple exact interpolation
+#endif
+}
+
+
+int Utils::unquantize(int q, int prec)
+{
+	int unq;
+
+	nvAssert (prec > 3);	// we only want to do one replicate
+
+#ifdef USE_ZOH_QUANT
+	if (prec >= 8)
+		unq = q;
+	else if (q == 0) 
+		unq = 0;
+	else if (q == ((1<<prec)-1)) 
+		unq = 255;
+	else
+		unq = (q * 256 + 128) >> prec;
+#else
+	// avpcl unquantizer -- bit replicate
+	unq = (q << (8-prec)) | (q >> (2*prec-8));
+#endif
+
+	return unq;
+}
+
+// quantize to the best value -- i.e., minimize unquantize error
+int Utils::quantize(float value, int prec)
+{
+	int q, unq;
+
+	nvAssert (prec > 3);	// we only want to do one replicate
+
+	unq = (int)floor(value + 0.5f);
+	nvAssert (unq <= 255);
+
+#ifdef USE_ZOH_QUANT
+	q = (prec >= 8) ? unq : (unq << prec) / 256;
+#else
+	// avpcl quantizer -- scale properly for best possible bit-replicated result
+	q = (unq * ((1<<prec)-1) + 127)/255;
+#endif
+
+	nvAssert (q >= 0 && q < (1 << prec));
+
+	return q;
+}
+
+float Utils::metric4(Vector4::Arg a, Vector4::Arg b)
+{
+	Vector4 err = a - b;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// weigh the components
+		err.x *= rwt;
+		err.y *= gwt;
+		err.z *= bwt;
+	}
+
+	return lengthSquared(err);
+}
+
+// WORK -- implement rotatemode for the below -- that changes where the rwt, gwt, and bwt's go.
+float Utils::metric3(Vector3::Arg a, Vector3::Arg b, int rotatemode)
+{
+	Vector3 err = a - b;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else if (AVPCL::flag_nonuniform_ati)
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// adjust weights based on rotatemode
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: break;
+		case ROTATEMODE_RGBA_AGBR: rwt = 1.0f; break;
+		case ROTATEMODE_RGBA_RABG: gwt = 1.0f; break;
+		case ROTATEMODE_RGBA_RGAB: bwt = 1.0f; break;
+		default: nvUnreachable();
+		}
+
+		// weigh the components
+		err.x *= rwt;
+		err.y *= gwt;
+		err.z *= bwt;
+	}
+
+	return lengthSquared(err);
+}
+
+float Utils::metric1(const float a, const float b, int rotatemode)
+{
+	float err = a - b;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt, awt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else if (AVPCL::flag_nonuniform_ati)
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// adjust weights based on rotatemode
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break;
+		case ROTATEMODE_RGBA_AGBR: awt = rwt; break;
+		case ROTATEMODE_RGBA_RABG: awt = gwt; break;
+		case ROTATEMODE_RGBA_RGAB: awt = bwt; break;
+		default: nvUnreachable();
+		}
+
+		// weigh the components
+		err *= awt;
+	}
+
+	return err * err;
+}
+
+float Utils::premult(float r, float a)
+{
+	// note that the args are really integers stored in floats
+	int R = int(r), A = int(a);
+
+	nvAssert ((R==r) && (A==a));
+
+	return float((R*A + 127)/255);
+}
+
+static void premult4(Vector4& rgba)
+{
+	rgba.x = Utils::premult(rgba.x, rgba.w);
+	rgba.y = Utils::premult(rgba.y, rgba.w);
+	rgba.z = Utils::premult(rgba.z, rgba.w);
+}
+
+static void premult3(Vector3& rgb, float a)
+{
+	rgb.x = Utils::premult(rgb.x, a);
+	rgb.y = Utils::premult(rgb.y, a);
+	rgb.z = Utils::premult(rgb.z, a);
+}
+
+float Utils::metric4premult(Vector4::Arg a, Vector4::Arg b)
+{
+	Vector4 pma = a, pmb = b;
+
+	premult4(pma);
+	premult4(pmb);
+
+	Vector4 err = pma - pmb;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// weigh the components
+		err.x *= rwt;
+		err.y *= gwt;
+		err.z *= bwt;
+	}
+
+	return lengthSquared(err);
+}
+
+float Utils::metric3premult_alphaout(Vector3::Arg rgb0, float a0, Vector3::Arg rgb1, float a1)
+{
+	Vector3 pma = rgb0, pmb = rgb1;
+
+	premult3(pma, a0);
+	premult3(pmb, a1);
+
+	Vector3 err = pma - pmb;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// weigh the components
+		err.x *= rwt;
+		err.y *= gwt;
+		err.z *= bwt;
+	}
+
+	return lengthSquared(err);
+}
+
+float Utils::metric3premult_alphain(Vector3::Arg rgb0, Vector3::Arg rgb1, int rotatemode)
+{
+	Vector3 pma = rgb0, pmb = rgb1;
+
+	switch(rotatemode)
+	{
+	case ROTATEMODE_RGBA_RGBA:
+		// this function isn't supposed to be called for this rotatemode
+		nvUnreachable();
+		break;
+	case ROTATEMODE_RGBA_AGBR:
+		pma.y = premult(pma.y, pma.x);
+		pma.z = premult(pma.z, pma.x);
+		pmb.y = premult(pmb.y, pmb.x);
+		pmb.z = premult(pmb.z, pmb.x);
+		break;
+	case ROTATEMODE_RGBA_RABG:
+		pma.x = premult(pma.x, pma.y);
+		pma.z = premult(pma.z, pma.y);
+		pmb.x = premult(pmb.x, pmb.y);
+		pmb.z = premult(pmb.z, pmb.y);
+		break;
+	case ROTATEMODE_RGBA_RGAB:
+		pma.x = premult(pma.x, pma.z);
+		pma.y = premult(pma.y, pma.z);
+		pmb.x = premult(pmb.x, pmb.z);
+		pmb.y = premult(pmb.y, pmb.z);
+		break;
+	default: nvUnreachable();
+	}
+
+	Vector3 err = pma - pmb;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// weigh the components
+		err.x *= rwt;
+		err.y *= gwt;
+		err.z *= bwt;
+	}
+
+	return lengthSquared(err);
+}
+
+float Utils::metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode)
+{
+	float err = premult(rgb0, a0) - premult(rgb1, a1);
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt, awt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else if (AVPCL::flag_nonuniform_ati)
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// adjust weights based on rotatemode
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break;
+		case ROTATEMODE_RGBA_AGBR: awt = rwt; break;
+		case ROTATEMODE_RGBA_RABG: awt = gwt; break;
+		case ROTATEMODE_RGBA_RGAB: awt = bwt; break;
+		default: nvUnreachable();
+		}
+
+		// weigh the components
+		err *= awt;
+	}
+
+	return err * err;
+}
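
The two weight tables at the top of this file drive Utils::lerp() when USE_ZOH_INTERP is defined. A quick standalone check (not library code) shows that each pair of complementary weights sums to 64, so the table-driven form is the familiar ((64 - w)*a + w*b + 32) >> 6 interpolation; the last line works one concrete value.

    // Standalone check of the denom7/denom15 weight tables copied from above.
    #include <cstdio>

    int main()
    {
        static const int w7[]  = {0, 9, 18, 27, 37, 46, 55, 64};
        static const int w15[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};

        for (int i = 0; i <= 7; ++i)
            if (w7[7 - i] + w7[i] != 64) printf("denom7 weight mismatch at %d\n", i);
        for (int i = 0; i <= 15; ++i)
            if (w15[15 - i] + w15[i] != 64) printf("denom15 weight mismatch at %d\n", i);

        // example: interpolate a=0, b=255 at index 3 of 7 -> (0*37 + 255*27 + 32) >> 6 == 108
        int a = 0, b = 255, i = 3;
        printf("lerp(0,255,3/7) = %d\n", (a * w7[7 - i] + b * w7[i] + 32) >> 6);
        return 0;
    }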

+ 61 - 0
3rdparty/nvtt/bc7/avpcl_utils.h

@@ -0,0 +1,61 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// utility class holding common routines
+#ifndef _AVPCL_UTILS_H
+#define _AVPCL_UTILS_H
+
+#include "nvmath/Vector.h"
+
+namespace AVPCL {
+
+inline int SIGN_EXTEND(int x, int nb) { return ((((x)&(1<<((nb)-1)))?((~0)<<(nb)):0)|(x)); }
+
+static const int INDEXMODE_BITS				= 1;		// 2 different index modes
+static const int NINDEXMODES				= (1<<(INDEXMODE_BITS));
+static const int INDEXMODE_ALPHA_IS_3BITS	= 0;
+static const int INDEXMODE_ALPHA_IS_2BITS	= 1;
+
+static const int ROTATEMODE_BITS		= 2;		// 4 different rotate modes
+static const int NROTATEMODES			= (1<<(ROTATEMODE_BITS));
+static const int ROTATEMODE_RGBA_RGBA	= 0;
+static const int ROTATEMODE_RGBA_AGBR	= 1;
+static const int ROTATEMODE_RGBA_RABG	= 2;
+static const int ROTATEMODE_RGBA_RGAB	= 3;
+
+class Utils
+{
+public:
+	// error metrics
+	static float metric4(nv::Vector4::Arg a, nv::Vector4::Arg b);
+	static float metric3(nv::Vector3::Arg a, nv::Vector3::Arg b, int rotatemode);
+	static float metric1(float a, float b, int rotatemode);
+
+	static float metric4premult(nv::Vector4::Arg rgba0, nv::Vector4::Arg rgba1);
+	static float metric3premult_alphaout(nv::Vector3::Arg rgb0, float a0, nv::Vector3::Arg rgb1, float a1);
+	static float metric3premult_alphain(nv::Vector3::Arg rgb0, nv::Vector3::Arg rgb1, int rotatemode);
+	static float metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode);
+
+	static float premult(float r, float a);
+
+	// quantization and unquantization
+	static int unquantize(int q, int prec);
+	static int quantize(float value, int prec);
+
+	// lerping
+	static int lerp(int a, int b, int i, int bias, int denom);
+	static nv::Vector4 lerp(nv::Vector4::Arg a, nv::Vector4::Arg b, int i, int bias, int denom);
+};
+
+}
+
+#endif
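
SIGN_EXTEND above widens an nb-bit two's-complement value to a full int. A few concrete values as a sketch (not library code), assuming the 3rdparty/nvtt/bc7 directory is on the include path:

    // Small sketch exercising AVPCL::SIGN_EXTEND from avpcl_utils.h.
    #include <cassert>
    #include "avpcl_utils.h"   // assumption: bc7 directory is on the include path

    void sign_extend_sketch()
    {
        assert(AVPCL::SIGN_EXTEND(0x0F, 5) == 15);    // high bit clear: value is unchanged
        assert(AVPCL::SIGN_EXTEND(0x1F, 5) == -1);    // 5-bit 11111 sign-extends to -1
        assert(AVPCL::SIGN_EXTEND(0x10, 5) == -16);   // 5-bit 10000 sign-extends to -16
    }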

+ 76 - 0
3rdparty/nvtt/bc7/bits.h

@@ -0,0 +1,76 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_BITS_H
+#define _AVPCL_BITS_H
+
+// read/write a bitstream
+
+#include "nvcore/Debug.h"
+
+namespace AVPCL {
+
+class Bits
+{
+public:
+
+	Bits(char *data, int maxdatabits) { nvAssert (data && maxdatabits > 0); bptr = bend = 0; bits = data; maxbits = maxdatabits; readonly = 0;}
+	Bits(const char *data, int availdatabits) { nvAssert (data && availdatabits > 0); bptr = 0; bend = availdatabits; cbits = data; maxbits = availdatabits; readonly = 1;}
+
+	void write(int value, int nbits) {
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		for (int i=0; i<nbits; ++i)
+			writeone(value>>i);
+	}
+	int read(int nbits) { 
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		int out = 0;
+		for (int i=0; i<nbits; ++i)
+			out |= readone() << i;
+		return out;
+	}
+	int getptr() { return bptr; }
+	void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; }
+	int getsize() { return bend; }
+
+private:
+	int	bptr;		// next bit to read
+	int bend;		// last written bit + 1
+	char *bits;		// ptr to user bit stream
+	const char *cbits;	// ptr to const user bit stream
+	int maxbits;	// max size of user bit stream
+	char readonly;	// 1 if this is a read-only stream
+
+	int readone() {
+		nvAssert (bptr < bend);
+		if (bptr >= bend) return 0;
+		int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7));
+		++bptr;
+		return bit != 0;
+	}
+	void writeone(int bit) {
+		nvAssert (!readonly); // "Writing a read-only bit stream"
+		nvAssert (bptr < maxbits);
+		if (bptr >= maxbits) return;
+		if (bit&1)
+			bits[bptr>>3] |= 1 << (bptr & 7);
+		else
+			bits[bptr>>3] &= ~(1 << (bptr & 7));
+		if (bptr++ >= bend) bend = bptr;
+	}
+};
+
+}
+
+#endif
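
The Bits class above reads and writes an LSB-first bit stream over a caller-owned buffer; a writable stream is built from a char* and a read-only stream from a const char*. A minimal round-trip sketch (not library code; the field widths are arbitrary, not any real mode's layout):

    // Round-trip sketch for AVPCL::Bits.
    #include <cstring>
    #include <cassert>
    #include "bits.h"   // assumption: bc7 directory is on the include path

    void bits_roundtrip_sketch()
    {
        char block[16] = {};                     // one 128-bit block's worth of storage

        AVPCL::Bits out(block, 128);             // writable stream, capacity 128 bits
        out.write(0x40, 7);                      // an arbitrary 7-bit field
        out.write(13, 6);                        // an arbitrary 6-bit field
        out.write(21, 5);                        // an arbitrary 5-bit field
        assert(out.getptr() == 18);              // 7 + 6 + 5 bits written so far

        const char *cblock = block;              // bind to the read-only constructor
        AVPCL::Bits in(cblock, 128);
        assert(in.read(7) == 0x40);
        assert(in.read(6) == 13);
        assert(in.read(5) == 21);
    }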

+ 81 - 0
3rdparty/nvtt/bc7/endpts.h

@@ -0,0 +1,81 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_ENDPTS_H
+#define _AVPCL_ENDPTS_H
+
+// endpoint definitions and routines to search through endpoint space
+
+#include "nvmath/Vector.h"
+
+namespace AVPCL {
+
+static const int NCHANNELS_RGB	= 3;
+static const int NCHANNELS_RGBA	= 4;
+static const int CHANNEL_R		= 0;
+static const int CHANNEL_G		= 1;
+static const int CHANNEL_B		= 2;
+static const int CHANNEL_A		= 3;
+
+struct FltEndpts
+{
+	nv::Vector4	A;
+	nv::Vector4	B;
+};
+
+struct IntEndptsRGB
+{
+	int		A[NCHANNELS_RGB];
+	int		B[NCHANNELS_RGB];
+};
+
+struct IntEndptsRGB_1
+{
+	int		A[NCHANNELS_RGB];
+	int		B[NCHANNELS_RGB];
+	int		lsb;				// shared lsb for A and B
+};
+
+struct IntEndptsRGB_2
+{
+	int		A[NCHANNELS_RGB];
+	int		B[NCHANNELS_RGB];
+	int		a_lsb;				// lsb for A
+	int		b_lsb;				// lsb for B
+};
+
+
+struct IntEndptsRGBA
+{
+	int		A[NCHANNELS_RGBA];
+	int		B[NCHANNELS_RGBA];
+};
+
+struct IntEndptsRGBA_2
+{
+	int		A[NCHANNELS_RGBA];
+	int		B[NCHANNELS_RGBA];
+	int		a_lsb;				// lsb for A
+	int		b_lsb;				// lsb for B
+};
+
+struct IntEndptsRGBA_2a
+{
+	int		A[NCHANNELS_RGBA];
+	int		B[NCHANNELS_RGBA];
+	int		a_lsb;				// lsb for RGB channels of A
+	int		b_lsb;				// lsb for RGB channels of B
+};
+
+}
+
+#endif

+ 132 - 0
3rdparty/nvtt/bc7/shapes_three.h

@@ -0,0 +1,132 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef	_AVPCL_SHAPES_THREE_H
+#define _AVPCL_SHAPES_THREE_H
+
+// shapes for 3 regions
+
+#define NREGIONS 3
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+static int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   0, 2, 2, 2,   
+0, 0, 1, 1,   0, 0, 1, 1,   2, 0, 0, 1,   0, 0, 2, 2,   
+0, 2, 2, 1,   2, 2, 1, 1,   2, 2, 1, 1,   0, 0, 1, 1,   
+2, 2, 2, 2,   2, 2, 2, 1,   2, 2, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 2, 2,   0, 0, 1, 1,   
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 2, 2,   0, 0, 1, 1,   
+1, 1, 2, 2,   0, 0, 2, 2,   1, 1, 1, 1,   2, 2, 1, 1,   
+1, 1, 2, 2,   0, 0, 2, 2,   1, 1, 1, 1,   2, 2, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 2,   
+0, 0, 0, 0,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 1, 2,   
+1, 1, 1, 1,   1, 1, 1, 1,   2, 2, 2, 2,   0, 0, 1, 2,   
+2, 2, 2, 2,   2, 2, 2, 2,   2, 2, 2, 2,   0, 0, 1, 2,   
+
+0, 1, 1, 2,   0, 1, 2, 2,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 2,   0, 1, 2, 2,   0, 1, 1, 2,   2, 0, 0, 1,   
+0, 1, 1, 2,   0, 1, 2, 2,   1, 1, 2, 2,   2, 2, 0, 0,   
+0, 1, 1, 2,   0, 1, 2, 2,   1, 2, 2, 2,   2, 2, 2, 0,   
+
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 0,   0, 0, 2, 2,   
+0, 0, 1, 1,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 2, 2,   
+0, 1, 1, 2,   2, 0, 0, 1,   1, 1, 2, 2,   0, 0, 2, 2,   
+1, 1, 2, 2,   2, 2, 0, 0,   1, 1, 2, 2,   1, 1, 1, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   1, 1, 0, 0,   
+0, 2, 2, 2,   2, 2, 2, 1,   0, 1, 2, 2,   2, 2, 1, 0,   
+0, 2, 2, 2,   2, 2, 2, 1,   0, 1, 2, 2,   2, 2, 1, 0,   
+
+0, 1, 2, 2,   0, 0, 1, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 2, 2,   0, 0, 1, 2,   1, 2, 2, 1,   0, 1, 1, 0,   
+0, 0, 1, 1,   1, 1, 2, 2,   1, 2, 2, 1,   1, 2, 2, 1,   
+0, 0, 0, 0,   2, 2, 2, 2,   0, 1, 1, 0,   1, 2, 2, 1,   
+
+0, 0, 2, 2,   0, 1, 1, 0,   0, 0, 1, 1,   0, 0, 0, 0,   
+1, 1, 0, 2,   0, 1, 1, 0,   0, 1, 2, 2,   2, 0, 0, 0,   
+1, 1, 0, 2,   2, 0, 0, 2,   0, 1, 2, 2,   2, 2, 1, 1,   
+0, 0, 2, 2,   2, 2, 2, 2,   0, 0, 1, 1,   2, 2, 2, 1,   
+
+0, 0, 0, 0,   0, 2, 2, 2,   0, 0, 1, 1,   0, 1, 2, 0,   
+0, 0, 0, 2,   0, 0, 2, 2,   0, 0, 1, 2,   0, 1, 2, 0,   
+1, 1, 2, 2,   0, 0, 1, 2,   0, 0, 2, 2,   0, 1, 2, 0,   
+1, 2, 2, 2,   0, 0, 1, 1,   0, 2, 2, 2,   0, 1, 2, 0,   
+
+0, 0, 0, 0,   0, 1, 2, 0,   0, 1, 2, 0,   0, 0, 1, 1,   
+1, 1, 1, 1,   1, 2, 0, 1,   2, 0, 1, 2,   2, 2, 0, 0,   
+2, 2, 2, 2,   2, 0, 1, 2,   1, 2, 0, 1,   1, 1, 2, 2,   
+0, 0, 0, 0,   0, 1, 2, 0,   0, 1, 2, 0,   0, 0, 1, 1,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 0, 0, 0,   0, 0, 2, 2,   
+1, 1, 2, 2,   0, 1, 0, 1,   0, 0, 0, 0,   1, 1, 2, 2,   
+2, 2, 0, 0,   2, 2, 2, 2,   2, 1, 2, 1,   0, 0, 2, 2,   
+0, 0, 1, 1,   2, 2, 2, 2,   2, 1, 2, 1,   1, 1, 2, 2,   
+
+0, 0, 2, 2,   0, 2, 2, 0,   0, 1, 0, 1,   0, 0, 0, 0,   
+0, 0, 1, 1,   1, 2, 2, 1,   2, 2, 2, 2,   2, 1, 2, 1,   
+0, 0, 2, 2,   0, 2, 2, 0,   2, 2, 2, 2,   2, 1, 2, 1,   
+0, 0, 1, 1,   1, 2, 2, 1,   0, 1, 0, 1,   2, 1, 2, 1,   
+
+0, 1, 0, 1,   0, 2, 2, 2,   0, 0, 0, 2,   0, 0, 0, 0,   
+0, 1, 0, 1,   0, 1, 1, 1,   1, 1, 1, 2,   2, 1, 1, 2,   
+0, 1, 0, 1,   0, 2, 2, 2,   0, 0, 0, 2,   2, 1, 1, 2,   
+2, 2, 2, 2,   0, 1, 1, 1,   1, 1, 1, 2,   2, 1, 1, 2,   
+
+0, 2, 2, 2,   0, 0, 0, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   1, 1, 1, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   1, 1, 1, 2,   0, 1, 1, 0,   2, 1, 1, 2,   
+0, 2, 2, 2,   0, 0, 0, 2,   2, 2, 2, 2,   2, 1, 1, 2,   
+
+0, 1, 1, 0,   0, 0, 2, 2,   0, 0, 2, 2,   0, 0, 0, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 0, 0,   
+2, 2, 2, 2,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 0, 0,   
+2, 2, 2, 2,   0, 0, 2, 2,   0, 0, 2, 2,   2, 1, 1, 2,   
+
+0, 0, 0, 2,   0, 2, 2, 2,   0, 1, 0, 1,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 2, 2, 2,   2, 2, 2, 2,   2, 0, 1, 1,   
+0, 0, 0, 2,   0, 2, 2, 2,   2, 2, 2, 2,   2, 2, 0, 1,   
+0, 0, 0, 1,   1, 2, 2, 2,   2, 2, 2, 2,   2, 2, 2, 0,
+};
+
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
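+
+// Layout note (editorial): the table above packs four shapes per 64-int block.
+// Shape si occupies the 4-wide column (si & 3) within row block (si >> 2), and
+// REGION(x, y, si) returns which of the NREGIONS subsets pixel (x, y) belongs
+// to. For example, REGION(0, 0, 0) == 0 and REGION(3, 0, 3) == 2 in the first
+// row of shapes above.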
+
+static int shapeindex_to_compressed_indices[NSHAPES*3] = 
+{
+	0, 3,15,  0, 3, 8,  0,15, 8,  0,15, 3,
+	0, 8,15,  0, 3,15,  0,15, 3,  0,15, 8,
+	0, 8,15,  0, 8,15,  0, 6,15,  0, 6,15,
+	0, 6,15,  0, 5,15,  0, 3,15,  0, 3, 8,
+
+	0, 3,15,  0, 3, 8,  0, 8,15,  0,15, 3,
+	0, 3,15,  0, 3, 8,  0, 6,15,  0,10, 8,
+	0, 5, 3,  0, 8,15,  0, 8, 6,  0, 6,10,
+	0, 8,15,  0, 5,15,  0,15,10,  0,15, 8,
+
+	0, 8,15,  0,15, 3,  0, 3,15,  0, 5,10,
+	0, 6,10,  0,10, 8,  0, 8, 9,  0,15,10,
+	0,15, 6,  0, 3,15,  0,15, 8,  0, 5,15,
+	0,15, 3,  0,15, 6,  0,15, 6,  0,15, 8,
+
+	0, 3,15,  0,15, 3,  0, 5,15,  0, 5,15,
+	0, 5,15,  0, 8,15,  0, 5,15,  0,10,15,
+	0, 5,15,  0,10,15,  0, 8,15,  0,13,15,
+	0,15, 3,  0,12,15,  0, 3,15,  0, 3, 8
+
+};
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*3+(region)]
+
+#endif

+ 133 - 0
3rdparty/nvtt/bc7/shapes_two.h

@@ -0,0 +1,133 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_SHAPES_TWO_H
+#define _AVPCL_SHAPES_TWO_H
+
+// shapes for two regions
+
+#define NREGIONS 2
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+static int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   0, 0, 0, 1,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   1, 1, 1, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 0,   0, 1, 1, 1,   
+1, 0, 0, 0,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 1, 1,   
+1, 1, 1, 0,   0, 0, 0, 0,   1, 0, 0, 0,   0, 0, 0, 1,   
+1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 0, 0, 0,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 0, 0,   1, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   0, 0, 0, 1,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 0,   1, 1, 0, 0,   0, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 1,   1, 0, 0, 1,   
+1, 1, 1, 0,   1, 1, 1, 1,   1, 0, 0, 0,   1, 0, 0, 1,   
+1, 0, 0, 0,   0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   0, 0, 1, 1,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   0, 0, 1, 1,   
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   1, 1, 0, 0,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 1, 1, 0,   0, 1, 0, 1,   
+1, 1, 0, 0,   0, 1, 0, 1,   1, 0, 0, 1,   1, 0, 1, 0,   
+0, 0, 1, 1,   1, 0, 1, 0,   0, 1, 1, 0,   1, 0, 1, 0,   
+1, 1, 0, 0,   1, 0, 1, 0,   1, 0, 0, 1,   0, 1, 0, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   1, 0, 1, 1,   
+1, 1, 0, 0,   1, 1, 0, 0,   0, 1, 0, 0,   1, 1, 0, 1,   
+1, 1, 1, 0,   1, 0, 0, 0,   1, 1, 0, 0,   1, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 0, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   0, 1, 1, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   0, 0, 0, 0,   
+
+0, 1, 0, 0,   0, 0, 1, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+1, 1, 1, 0,   0, 1, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 1, 0, 0,   0, 0, 1, 0,   0, 1, 1, 1,   1, 1, 1, 0,   
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 0,   0, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 1, 1,   
+1, 1, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   1, 1, 0, 0,   
+0, 0, 1, 1,   1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   
+
+0, 1, 1, 0,   0, 1, 1, 0,   0, 1, 1, 1,   0, 0, 0, 1,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 1, 1, 0,   1, 0, 0, 0,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 0, 0, 0,   1, 1, 1, 0,   
+1, 0, 0, 1,   1, 0, 0, 1,   0, 0, 0, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+1, 1, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 0,   0, 1, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 1, 1, 1,   
+
+};
+
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
+
+static int shapeindex_to_compressed_indices[NSHAPES*2] = 
+{
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+
+	0,15,  0, 2,  0, 8,  0, 2,
+	0, 2,  0, 8,  0, 8,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 8,  0, 8,  0, 2,  0, 2,
+
+	0,15,  0,15,  0, 6,  0, 8,
+	0, 2,  0, 8,  0,15,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 2,  0,15,  0,15,  0, 6,
+
+	0, 6,  0, 2,  0, 6,  0, 8,
+	0,15,  0,15,  0, 2,  0, 2,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0, 2,  0, 2,  0,15
+
+};
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*2+(region)]
+
+#endif

+ 41 - 0
3rdparty/nvtt/bc7/tile.h

@@ -0,0 +1,41 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_TILE_H
+#define _AVPCL_TILE_H
+
+#include "nvmath/Vector.h"
+#include <math.h>
+#include "avpcl_utils.h"
+
+namespace AVPCL {
+
+// extract a tile of pixels from an array
+
+class Tile
+{
+public:
+	static const int TILE_H = 4;
+	static const int TILE_W = 4;
+	static const int TILE_TOTAL = TILE_H * TILE_W;
+	nv::Vector4 data[TILE_H][TILE_W];
+    float importance_map[TILE_H][TILE_W];
+	int	size_x, size_y;			// actual size of tile
+
+	Tile() {};
+	~Tile(){};
+	Tile(int xs, int ys) {size_x = xs; size_y = ys;}
+};
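+
+// Usage sketch (illustrative only, not part of the original library): fill a
+// full 4x4 tile before handing it to a compressor. Assumes the four-component
+// Vector4 constructor from nvmath; r, g, b, a are hypothetical inputs.
+//
+//   Tile t(4, 4);
+//   for (int y = 0; y < t.size_y; ++y)
+//       for (int x = 0; x < t.size_x; ++x) {
+//           t.data[y][x] = nv::Vector4(r, g, b, a);
+//           t.importance_map[y][x] = 1.0f;
+//       }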
+
+}
+
+#endif

+ 437 - 0
3rdparty/nvtt/nvcore/Array.inl

@@ -0,0 +1,437 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_ARRAY_INL
+#define NV_CORE_ARRAY_INL
+
+#include "array.h"
+
+#include "stream.h"
+#include "utils.h" // swap
+
+#include <string.h>	// memmove
+#include <new> // for placement new
+
+
+
+namespace nv 
+{
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::append()
+    {
+        uint old_size = m_size;
+        uint new_size = m_size + 1;
+
+        setArraySize(new_size);
+
+        construct_range(m_buffer, new_size, old_size);
+
+        return m_buffer[old_size]; // Return reference to last element.
+    }
+
+    // Push an element at the end of the vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::push_back( const T & val )
+    {
+#if 1
+        nvDebugCheck(&val < m_buffer || &val >= m_buffer+m_size);
+
+        uint old_size = m_size;
+        uint new_size = m_size + 1;
+
+        setArraySize(new_size);
+
+        construct_range(m_buffer, new_size, old_size, val);
+#else
+        uint new_size = m_size + 1;
+
+        if (new_size > m_capacity)
+        {
+            // @@ Is there any way to avoid this copy?
+            // @@ Can we create a copy without side effects? Ie. without calls to constructor/destructor. Use alloca + memcpy?
+            // @@ Assert instead of copy?
+            const T copy(val);	// create a copy in case value is inside of this array.
+
+            setArraySize(new_size);
+
+            new (m_buffer+new_size-1) T(copy);
+        }
+        else
+        {
+            m_size = new_size;
+            new(m_buffer+new_size-1) T(val);
+        }
+#endif // 0/1
+    }
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::pushBack( const T & val )
+    {
+        push_back(val);
+    }
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::append( const T & val )
+    {
+        push_back(val);
+        return *this;
+    }
+
+    // Qt-like push operator.
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::operator<< ( T & t )
+    {
+        push_back(t);
+        return *this;
+    }
+
+    // Pop the element at the end of the vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::pop_back()
+    {
+        nvDebugCheck( m_size > 0 );
+        resize( m_size - 1 );
+    }
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::popBack(uint count)
+    {
+        nvDebugCheck(m_size >= count);
+        resize(m_size - count);
+    }
+
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::popFront(uint count)
+    {
+        nvDebugCheck(m_size >= count);
+        //resize(m_size - count);
+
+        if (m_size == count) {
+            clear();
+        }
+        else {
+            destroy_range(m_buffer, 0, count);
+
+            memmove(m_buffer, m_buffer + count, sizeof(T) * (m_size - count));
+
+            m_size -= count;
+        }
+
+    }
+
+
+    // Get back element.
+    template <typename T>
+    NV_FORCEINLINE const T & Array<T>::back() const
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[m_size-1];
+    }
+
+    // Get back element.
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::back()
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[m_size-1];
+    }
+
+    // Get front element.
+    template <typename T>
+    NV_FORCEINLINE const T & Array<T>::front() const
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[0];
+    }
+
+    // Get front element.
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::front()
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[0];
+    }
+
+    // Check if the given element is contained in the array.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::contains(const T & e) const
+    {
+        return find(e, NULL);
+    }
+
+    // Return true if element found.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::find(const T & element, uint * indexPtr) const
+    {
+        return find(element, 0, m_size, indexPtr);
+    }
+
+    // Return true if element found within the given range.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::find(const T & element, uint begin, uint end, uint * indexPtr) const
+    {
+        return ::nv::find(element, m_buffer, begin, end, indexPtr);
+    }
+
+
+    // Remove the element at the given index. This is an expensive operation!
+    template <typename T>
+    void Array<T>::removeAt(uint index)
+    {
+        nvDebugCheck(index >= 0 && index < m_size);
+
+        if (m_size == 1) {
+            clear();
+        }
+        else {
+            m_buffer[index].~T();
+
+            memmove(m_buffer+index, m_buffer+index+1, sizeof(T) * (m_size - 1 - index));
+            m_size--;
+        }
+    }
+
+    // Remove the first instance of the given element.
+    template <typename T>
+    bool Array<T>::remove(const T & element)
+    {
+        uint index;
+        if (find(element, &index)) {
+            removeAt(index);
+            return true;
+        }
+        return false;
+    }
+
+    // Insert the given element at the given index shifting all the elements up.
+    template <typename T>
+    void Array<T>::insertAt(uint index, const T & val/*=T()*/)
+    {
+        nvDebugCheck( index >= 0 && index <= m_size );
+
+        setArraySize(m_size + 1);
+
+        if (index < m_size - 1) {
+            memmove(m_buffer+index+1, m_buffer+index, sizeof(T) * (m_size - 1 - index));
+        }
+
+        // Copy-construct into the newly opened slot.
+        new(m_buffer+index) T(val);
+    }
+
+    // Append the given data to our vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::append(const Array<T> & other)
+    {
+        append(other.m_buffer, other.m_size);
+    }
+
+    // Append the given data to our vector.
+    template <typename T>
+    void Array<T>::append(const T other[], uint count)
+    {
+        if (count > 0) {
+            const uint old_size = m_size;
+
+            setArraySize(m_size + count);
+
+            for (uint i = 0; i < count; i++ ) {
+                new(m_buffer + old_size + i) T(other[i]);
+            }
+        }
+    }
+
+
+    // Remove the given element by replacing it with the last one.
+    template <typename T> 
+    void Array<T>::replaceWithLast(uint index)
+    {
+        nvDebugCheck( index < m_size );
+        nv::swap(m_buffer[index], back());      // @@ Is this OK when index == size-1?
+        (m_buffer+m_size-1)->~T();
+        m_size--;
+    }
+
+    // Resize the vector preserving existing elements.
+    template <typename T> 
+    void Array<T>::resize(uint new_size)
+    {
+        uint old_size = m_size;
+
+        // Destruct old elements (if we're shrinking).
+        destroy_range(m_buffer, new_size, old_size);
+
+        setArraySize(new_size);
+
+        // Call default constructors
+        construct_range(m_buffer, new_size, old_size);
+    }
+
+
+    // Resize the vector preserving existing elements and initializing the
+    // new ones with the given value.
+    template <typename T> 
+    void Array<T>::resize(uint new_size, const T & elem)
+    {
+        nvDebugCheck(&elem < m_buffer || &elem > m_buffer+m_size);
+
+        uint old_size = m_size;
+
+        // Destruct old elements (if we're shrinking).
+        destroy_range(m_buffer, new_size, old_size);
+
+        setArraySize(new_size);
+
+        // Call copy constructors
+        construct_range(m_buffer, new_size, old_size, elem);
+    }
+
+    // Fill array with the given value.
+    template <typename T>
+    void Array<T>::fill(const T & elem)
+    {
+        fill(m_buffer, m_size, elem);
+    }
+
+    // Clear the buffer.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::clear()
+    {
+        nvDebugCheck(isValidPtr(m_buffer));
+
+        // Destruct old elements
+        destroy_range(m_buffer, 0, m_size);
+
+        m_size = 0;
+    }
+
+    // Shrink the allocated vector.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::shrink()
+    {
+        if (m_size < m_capacity) {
+            setArrayCapacity(m_size);
+        }
+    }
+
+    // Preallocate space.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::reserve(uint desired_size)
+    {
+        if (desired_size > m_capacity) {
+            setArrayCapacity(desired_size);
+        }
+    }
+
+    // Copy elements to this array. Resizes it if needed.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::copy(const T * data, uint count)
+    {
+#if 1   // Simpler, but maybe not as efficient.
+        destroy_range(m_buffer, 0, m_size);
+
+        setArraySize(count);
+
+        construct_range(m_buffer, count, 0, data);
+#else
+        const uint old_size = m_size;
+
+        destroy_range(m_buffer, count, old_size);
+
+        setArraySize(count);
+
+        copy_range(m_buffer, data, old_size);
+
+        construct_range(m_buffer, count, old_size, data);
+#endif
+    }
+
+    // Assignment operator.
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::operator=( const Array<T> & a )
+    {
+        copy(a.m_buffer, a.m_size);
+        return *this;
+    }
+
+    // Release ownership of allocated memory and returns pointer to it.
+    template <typename T>
+    T * Array<T>::release() {
+        T * tmp = m_buffer;
+        m_buffer = NULL;
+        m_capacity = 0;
+        m_size = 0;
+        return tmp;
+    }
+
+
+
+    // Change array size.
+    template <typename T> 
+    inline void Array<T>::setArraySize(uint new_size) {
+        m_size = new_size;
+
+        if (new_size > m_capacity) {
+            uint new_buffer_size;
+            if (m_capacity == 0) {
+                // first allocation is exact
+                new_buffer_size = new_size;
+            }
+            else {
+                // following allocations grow array by 25%
+                new_buffer_size = new_size + (new_size >> 2);
+            }
+
+            setArrayCapacity( new_buffer_size );
+        }
+    }
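+
+    // Worked example (editorial): with the 25% growth rule above, the first
+    // append into an empty array allocates exactly 1 element; growing a
+    // 16-element array to 17 reallocates to 17 + (17 >> 2) = 21 elements.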
+
+    // Change array capacity.
+    template <typename T> 
+    inline void Array<T>::setArrayCapacity(uint new_capacity) {
+        nvDebugCheck(new_capacity >= m_size);
+
+        if (new_capacity == 0) {
+            // free the buffer.
+            if (m_buffer != NULL) {
+                free<T>(m_buffer);
+                m_buffer = NULL;
+            }
+        }
+        else {
+            // realloc the buffer
+            m_buffer = realloc<T>(m_buffer, new_capacity);
+        }
+
+        m_capacity = new_capacity;
+    }
+
+    // Array serialization.
+    template <typename Typ> 
+    inline Stream & operator<< ( Stream & s, Array<Typ> & p )
+    {
+        if (s.isLoading()) {
+            uint size;
+            s << size;
+            p.resize( size );
+        }
+        else {
+            s << p.m_size;
+        }
+
+        for (uint i = 0; i < p.m_size; i++) {
+            s << p.m_buffer[i];
+        }
+
+        return s;
+    }
+
+    // Swap the members of the two given vectors.
+    template <typename Typ>
+    inline void swap(Array<Typ> & a, Array<Typ> & b)
+    {
+        nv::swap(a.m_buffer, b.m_buffer);
+        nv::swap(a.m_capacity, b.m_capacity);
+        nv::swap(a.m_size, b.m_size);
+    }
+
+
+} // nv namespace
+
+#endif // NV_CORE_ARRAY_INL

+ 216 - 0
3rdparty/nvtt/nvcore/Debug.h

@@ -0,0 +1,216 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_DEBUG_H
+#define NV_CORE_DEBUG_H
+
+#include "nvcore.h"
+
+#include <stdarg.h> // va_list
+
+
+// Make sure we are using our assert.
+#undef assert
+
+#define NV_ABORT_DEBUG      1
+#define NV_ABORT_IGNORE     2
+#define NV_ABORT_EXIT       3
+
+#define nvNoAssert(exp) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    (void)sizeof(exp); \
+    NV_MULTI_LINE_MACRO_END
+
+#if NV_NO_ASSERT
+
+#   define nvAssert(exp) nvNoAssert(exp)
+#   define nvCheck(exp) nvNoAssert(exp)
+#   define nvDebugAssert(exp) nvNoAssert(exp)
+#   define nvDebugCheck(exp) nvNoAssert(exp)
+#   define nvDebugBreak() nvNoAssert(0)
+
+#else // NV_NO_ASSERT
+
+#   if NV_CC_MSVC
+        // @@ Does this work in msvc-6 and earlier?
+#       define nvDebugBreak()       __debugbreak()
+//#       define nvDebugBreak()        __asm { int 3 }
+#   elif NV_OS_ORBIS
+#       define nvDebugBreak()       __debugbreak()
+#   elif NV_CC_GNUC
+#       define nvDebugBreak()       __builtin_trap()
+#   else
+#       error "No nvDebugBreak()!"
+#   endif
+
+/*
+#   elif NV_CC_GNUC || NV_CPU_PPC && NV_OS_DARWIN
+        // @@ Use __builtin_trap() on GCC
+#       define nvDebugBreak()       __asm__ volatile ("trap")
+#   elif (NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64) && NV_OS_DARWIN
+#       define nvDebugBreak()       __asm__ volatile ("int3")
+#   elif NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64
+#       define nvDebugBreak()       __asm__ ( "int %0" : :"I"(3) )
+#   else
+#       include <signal.h>
+#       define nvDebugBreak()       raise(SIGTRAP)
+#   endif
+*/
+
+#define nvDebugBreakOnce() \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    static bool firstTime = true; \
+    if (firstTime) { firstTime = false; nvDebugBreak(); } \
+    NV_MULTI_LINE_MACRO_END
+
+#define nvAssertMacro(exp) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    if (!(exp)) { \
+        if (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) { \
+            nvDebugBreak(); \
+        } \
+    } \
+    NV_MULTI_LINE_MACRO_END
+
+// GCC, LLVM need "##" before the __VA_ARGS__, MSVC doesn't care
+#define nvAssertMacroWithIgnoreAll(exp,...) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+        static bool ignoreAll = false; \
+        if (!ignoreAll && !(exp)) { \
+            int result = nvAbort(#exp, __FILE__, __LINE__, __FUNC__, ##__VA_ARGS__); \
+            if (result == NV_ABORT_DEBUG) { \
+                nvDebugBreak(); \
+            } else if (result == NV_ABORT_IGNORE) { \
+                ignoreAll = true; \
+            } \
+        } \
+    NV_MULTI_LINE_MACRO_END
+
+// Interesting assert macro from Insomniac:
+// http://www.gdcvault.com/play/1015319/Developing-Imperfect-Software-How-to
+// Used as follows:
+// if (nvCheck(i < count)) {
+//     normal path
+// } else {
+//     fixup code.
+// }
+// This style of macro could be combined with __builtin_expect to let the compiler know failure is unlikely.
+#define nvCheckMacro(exp) \
+    (\
+        (exp) ? true : ( \
+            (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) ? (nvDebugBreak(), true) : ( false ) \
+        ) \
+    )
+
+
+#define nvAssert(exp)    nvAssertMacro(exp)
+#define nvCheck(exp)     nvAssertMacro(exp)
+
+#if defined(_DEBUG)
+#   define nvDebugAssert(exp)   nvAssertMacro(exp)
+#   define nvDebugCheck(exp)    nvAssertMacro(exp)
+#else // _DEBUG
+#   define nvDebugAssert(exp)   nvNoAssert(exp)
+#   define nvDebugCheck(exp)    nvNoAssert(exp)
+#endif // _DEBUG
+
+#endif // NV_NO_ASSERT
+
+// Use nvAssume for very simple expresions only: nvAssume(0), nvAssume(value == true), etc.
+/*#if !defined(_DEBUG)
+#   if NV_CC_MSVC
+#       define nvAssume(exp)    __assume(exp)
+#   else
+#       define nvAssume(exp)    nvCheck(exp)
+#   endif
+#else
+#   define nvAssume(exp)    nvCheck(exp)
+#endif*/
+
+#if defined(_DEBUG)
+#  if NV_CC_MSVC
+#   define nvUnreachable() nvAssert(0 && "unreachable"); __assume(0)
+#  else
+#   define nvUnreachable() nvAssert(0 && "unreachable"); __builtin_unreachable()
+#  endif
+#else
+#  if NV_CC_MSVC
+#   define nvUnreachable() __assume(0)
+#  else
+#   define nvUnreachable() __builtin_unreachable()
+#  endif
+#endif
+
+
+#define nvError(x)      nvAbort(x, __FILE__, __LINE__, __FUNC__)
+#define nvWarning(x)    nvDebugPrint("*** Warning %s/%d: %s\n", __FILE__, __LINE__, (x))
+
+#ifndef NV_DEBUG_PRINT
+#define NV_DEBUG_PRINT 1 //defined(_DEBUG)
+#endif
+
+#if NV_DEBUG_PRINT
+#define nvDebug(...)    nvDebugPrint(__VA_ARGS__)
+#else
+#if NV_CC_MSVC
+#define nvDebug(...)    __noop(__VA_ARGS__)
+#else
+#define nvDebug(...)    ((void)0) // Non-msvc platforms do not evaluate arguments?
+#endif
+#endif
+
+
+NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL, const char * msg = NULL, ...) __attribute__((format (printf, 5, 6)));
+NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2)));
+
+namespace nv
+{
+    inline bool isValidPtr(const void * ptr) {
+    #if NV_CPU_X86_64
+        if (ptr == NULL) return true;
+        if (reinterpret_cast<uint64>(ptr) < 0x10000ULL) return false;
+        if (reinterpret_cast<uint64>(ptr) >= 0x000007FFFFFEFFFFULL) return false;
+    #else
+	    if (reinterpret_cast<uint32>(ptr) == 0xcccccccc) return false;
+	    if (reinterpret_cast<uint32>(ptr) == 0xcdcdcdcd) return false;
+	    if (reinterpret_cast<uint32>(ptr) == 0xdddddddd) return false;
+	    if (reinterpret_cast<uint32>(ptr) == 0xffffffff) return false;
+    #endif
+        return true;
+    }
+
+    // Message handler interface.
+    struct MessageHandler {
+        virtual void log(const char * str, va_list arg) = 0;
+        virtual ~MessageHandler() {}
+    };
+
+    // Assert handler interface.
+    struct AssertHandler {
+        virtual int assertion(const char *exp, const char *file, int line, const char *func, const char *msg, va_list arg) = 0;
+        virtual ~AssertHandler() {}
+    };
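+
+    // Usage sketch (illustrative, not part of the original library): route
+    // nvDebug output through a custom handler. Assumes <stdio.h> for vprintf.
+    //
+    //   struct StdoutHandler : public nv::MessageHandler {
+    //       virtual void log(const char * str, va_list arg) { vprintf(str, arg); }
+    //   };
+    //   static StdoutHandler s_handler;
+    //   nv::debug::setMessageHandler(&s_handler);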
+
+
+    namespace debug
+    {
+        NVCORE_API void dumpInfo();
+        NVCORE_API void dumpCallstack( MessageHandler *messageHandler, int callstackLevelsToSkip = 0 );
+
+        NVCORE_API void setMessageHandler( MessageHandler * messageHandler );
+        NVCORE_API void resetMessageHandler();
+
+        NVCORE_API void setAssertHandler( AssertHandler * assertHanlder );
+        NVCORE_API void resetAssertHandler();
+
+        NVCORE_API void enableSigHandler(bool interactive);
+        NVCORE_API void disableSigHandler();
+
+        NVCORE_API bool isDebuggerPresent();
+        NVCORE_API bool attachToDebugger();
+
+        NVCORE_API void terminate(int code);
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_DEBUG_H

+ 181 - 0
3rdparty/nvtt/nvcore/array.h

@@ -0,0 +1,181 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_ARRAY_H
+#define NV_CORE_ARRAY_H
+
+/*
+This array class requires the elements to be relocatable; it uses memmove and realloc. Ideally I should be
+using swap, but I honestly don't care. The only thing that you should be aware of is that internal pointers
+are not supported.
+
+Note also that push_back and resize do not support inserting elements that are already in the same
+container. This is forbidden to prevent an extra copy.
+*/
+
+
+#include "memory.h"
+#include "debug.h"
+#include "foreach.h" // pseudoindex
+
+
+namespace nv 
+{
+    class Stream;
+
+    /**
+    * Replacement for std::vector that is easier to debug and provides
+    * some nice foreach enumerators. 
+    */
+    template<typename T>
+    class NVCORE_CLASS Array {
+    public:
+        typedef uint size_type;
+
+        // Default constructor.
+        NV_FORCEINLINE Array() : m_buffer(NULL), m_capacity(0), m_size(0) {}
+
+        // Copy constructor.
+        NV_FORCEINLINE Array(const Array & a) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            copy(a.m_buffer, a.m_size);
+        }
+
+        // Constructor that initializes the vector with the given elements.
+        NV_FORCEINLINE Array(const T * ptr, uint num) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            copy(ptr, num);
+        }
+
+        // Allocate array.
+        NV_FORCEINLINE explicit Array(uint capacity) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            setArrayCapacity(capacity);
+        }
+
+        // Destructor.
+        NV_FORCEINLINE ~Array() {
+            clear();
+            free<T>(m_buffer);
+        }
+
+
+        /// Const element access.
+        NV_FORCEINLINE const T & operator[]( uint index ) const
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+        NV_FORCEINLINE const T & at( uint index ) const
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+
+        /// Element access.
+        NV_FORCEINLINE T & operator[] ( uint index )
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+        NV_FORCEINLINE T & at( uint index )
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+
+        /// Get vector size.
+        NV_FORCEINLINE uint size() const { return m_size; }
+
+        /// Get vector size.
+        NV_FORCEINLINE uint count() const { return m_size; }
+
+        /// Get vector capacity.
+        NV_FORCEINLINE uint capacity() const { return m_capacity; }
+
+        /// Get const vector pointer.
+        NV_FORCEINLINE const T * buffer() const { return m_buffer; }
+
+        /// Get vector pointer.
+        NV_FORCEINLINE T * buffer() { return m_buffer; }
+
+        /// Provide begin/end pointers for C++11 range-based for loops.
+        NV_FORCEINLINE T * begin() { return m_buffer; }
+        NV_FORCEINLINE T * end() { return m_buffer + m_size; }
+        NV_FORCEINLINE const T * begin() const { return m_buffer; }
+        NV_FORCEINLINE const T * end() const { return m_buffer + m_size; }
+
+        /// Is vector empty.
+        NV_FORCEINLINE bool isEmpty() const { return m_size == 0; }
+
+        /// Is a null vector.
+        NV_FORCEINLINE bool isNull() const { return m_buffer == NULL; }
+
+
+        T & append();
+        void push_back( const T & val );
+        void pushBack( const T & val );
+        Array<T> & append( const T & val );
+        Array<T> & operator<< ( T & t );
+        void pop_back();
+        void popBack(uint count = 1);
+        void popFront(uint count = 1);
+        const T & back() const;
+        T & back();
+        const T & front() const;
+        T & front();
+        bool contains(const T & e) const;
+        bool find(const T & element, uint * indexPtr) const;
+        bool find(const T & element, uint begin, uint end, uint * indexPtr) const;
+        void removeAt(uint index);
+        bool remove(const T & element);
+        void insertAt(uint index, const T & val = T());
+        void append(const Array<T> & other);
+        void append(const T other[], uint count);
+        void replaceWithLast(uint index);
+        void resize(uint new_size);
+        void resize(uint new_size, const T & elem);
+        void fill(const T & elem);
+        void clear();
+        void shrink();
+        void reserve(uint desired_size);
+        void copy(const T * data, uint count);
+        Array<T> & operator=( const Array<T> & a );
+        T * release();
+
+
+        // Array enumerator.
+        typedef uint PseudoIndex;
+
+        NV_FORCEINLINE PseudoIndex start() const { return 0; }
+        NV_FORCEINLINE bool isDone(const PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); return i == this->m_size; }
+        NV_FORCEINLINE void advance(PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); i++; }
+
+#if NV_CC_MSVC
+        NV_FORCEINLINE T & operator[]( const PseudoIndexWrapper & i ) {
+            return m_buffer[i(this)];
+        }
+        NV_FORCEINLINE const T & operator[]( const PseudoIndexWrapper & i ) const {
+            return m_buffer[i(this)];
+        }
+#endif
+
+        // Friends.
+        template <typename Typ> 
+        friend Stream & operator<< ( Stream & s, Array<Typ> & p );
+
+        template <typename Typ>
+        friend void swap(Array<Typ> & a, Array<Typ> & b);
+
+
+    protected:
+
+        void setArraySize(uint new_size);
+        void setArrayCapacity(uint new_capacity);
+
+        T * m_buffer;
+        uint m_capacity;
+        uint m_size;
+
+    };
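+
+    // Usage sketch (illustrative, not part of the original library):
+    //
+    //   nv::Array<int> values;
+    //   values.reserve(16);
+    //   values.pushBack(7);
+    //   values.append(42);
+    //   foreach (i, values) {          // PseudoIndex-based enumeration
+    //       nvDebug("%d\n", values[i]);
+    //   }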
+
+
+} // nv namespace
+
+#endif // NV_CORE_ARRAY_H

+ 53 - 0
3rdparty/nvtt/nvcore/defsgnucdarwin.h

@@ -0,0 +1,53 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
+#include <stddef.h> // operator new, size_t, NULL
+
+// Function linkage
+#define DLL_IMPORT
+#if __GNUC__ >= 4
+#	define DLL_EXPORT __attribute__((visibility("default")))
+#	define DLL_EXPORT_CLASS DLL_EXPORT
+#else
+#	define DLL_EXPORT
+#	define DLL_EXPORT_CLASS
+#endif
+
+// Function calling modes
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall))
+#define NV_FORCEINLINE	__attribute__((always_inline)) inline
+#define NV_DEPRECATED   __attribute__((deprecated))
+#define NV_THREAD_LOCAL //ACS: there's no "__thread" or equivalent on iOS/OSX
+
+#if __GNUC__ > 2
+#define NV_PURE     __attribute__((pure))
+#define NV_CONST    __attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly.
+#if __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict    __restrict__

+ 59 - 0
3rdparty/nvtt/nvcore/defsgnuclinux.h

@@ -0,0 +1,59 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
+#include <stddef.h> // operator new, size_t, NULL
+
+// Function linkage
+#define DLL_IMPORT
+#if __GNUC__ >= 4
+#   define DLL_EXPORT   __attribute__((visibility("default")))
+#   define DLL_EXPORT_CLASS DLL_EXPORT
+#else
+#   define DLL_EXPORT
+#   define DLL_EXPORT_CLASS
+#endif
+
+// Function calling modes
+#if NV_CPU_X86
+#   define NV_CDECL     __attribute__((cdecl))
+#   define NV_STDCALL   __attribute__((stdcall))
+#else
+#   define NV_CDECL 
+#   define NV_STDCALL
+#endif
+
+#define NV_FASTCALL     __attribute__((fastcall))
+//#if __GNUC__ > 3
+// It seems that GCC does not assume always_inline implies inline. I think this depends on the GCC version :(
+#define NV_FORCEINLINE  inline __attribute__((always_inline))
+//#else
+// Some compilers complain that inline and always_inline are redundant.
+//#define NV_FORCEINLINE  __attribute__((always_inline))
+//#endif
+#define NV_DEPRECATED   __attribute__((deprecated))
+#define NV_THREAD_LOCAL __thread 
+
+#if __GNUC__ > 2
+#define NV_PURE     __attribute__((pure))
+#define NV_CONST    __attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly.
+#if __STDC_VERSION__ < 199901L
+#   if __GNUC__ >= 2
+#       define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__
+#   else
+#       define __FUNC__ "<unknown>"
+#   endif
+#else
+#   define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict    __restrict__

+ 65 - 0
3rdparty/nvtt/nvcore/defsgnucwin32.h

@@ -0,0 +1,65 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+//#include <cstddef> // size_t, NULL
+
+// Function linkage
+#define DLL_IMPORT	__declspec(dllimport)
+#define DLL_EXPORT	__declspec(dllexport)
+#define DLL_EXPORT_CLASS DLL_EXPORT
+
+// Function calling modes
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall))
+#define NV_FORCEINLINE	__attribute__((always_inline))
+#define NV_DEPRECATED   __attribute__((deprecated))
+
+#if __GNUC__ > 2
+#define NV_PURE		__attribute__((pure))
+#define NV_CONST	__attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly.
+#if __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict	__restrict__
+
+/*
+// Type definitions
+typedef unsigned char		uint8;
+typedef signed char			int8;
+
+typedef unsigned short		uint16;
+typedef signed short		int16;
+
+typedef unsigned int		uint32;
+typedef signed int			int32;
+
+typedef unsigned long long	uint64;
+typedef signed long long	int64;
+
+// Aliases
+typedef uint32				uint;
+*/
+

+ 94 - 0
3rdparty/nvtt/nvcore/defsvcwin32.h

@@ -0,0 +1,94 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+// Function linkage
+#define DLL_IMPORT __declspec(dllimport)
+#define DLL_EXPORT __declspec(dllexport)
+#define DLL_EXPORT_CLASS DLL_EXPORT
+
+// Function calling modes
+#define NV_CDECL        __cdecl
+#define NV_STDCALL      __stdcall
+#define NV_FASTCALL     __fastcall
+#define NV_DEPRECATED
+
+#define NV_PURE
+#define NV_CONST
+
+// Set standard function names.
+#if _MSC_VER < 1900
+#   define snprintf _snprintf
+#endif
+#if _MSC_VER < 1500
+#   define vsnprintf _vsnprintf
+#endif
+#if _MSC_VER < 1700
+#   define strtoll _strtoi64
+#   define strtoull _strtoui64
+#endif
+#define chdir _chdir
+#define getcwd _getcwd 
+
+#if _MSC_VER < 1800 // va_copy was added in VS 2013 (_MSC_VER 1800).
+#define va_copy(a, b) (a) = (b)
+#endif
+
+#if !defined restrict
+#define restrict
+#endif
+
+// Ignore gcc attributes.
+#define __attribute__(X)
+
+#if !defined __FUNC__
+#define __FUNC__ __FUNCTION__ 
+#endif
+
+#define NV_NOINLINE __declspec(noinline)
+#define NV_FORCEINLINE __forceinline
+
+#define NV_THREAD_LOCAL __declspec(thread)
+
+/*
+// Type definitions
+typedef unsigned char       uint8;
+typedef signed char         int8;
+
+typedef unsigned short      uint16;
+typedef signed short        int16;
+
+typedef unsigned int        uint32;
+typedef signed int          int32;
+
+typedef unsigned __int64    uint64;
+typedef signed __int64      int64;
+
+// Aliases
+typedef uint32              uint;
+*/
+
+// Unwanted VC++ warnings to disable.
+/*
+#pragma warning(disable : 4244)     // conversion to float, possible loss of data
+#pragma warning(disable : 4245)     // conversion from 'enum ' to 'unsigned long', signed/unsigned mismatch
+#pragma warning(disable : 4100)     // unreferenced formal parameter
+#pragma warning(disable : 4514)     // unreferenced inline function has been removed
+#pragma warning(disable : 4710)     // inline function not expanded
+#pragma warning(disable : 4127)     // Conditional expression is constant
+#pragma warning(disable : 4305)     // truncation from 'const double' to 'float'
+#pragma warning(disable : 4505)     // unreferenced local function has been removed
+
+#pragma warning(disable : 4702)     // unreachable code in inline expanded function
+#pragma warning(disable : 4711)     // function selected for automatic inlining
+#pragma warning(disable : 4725)     // Pentium fdiv bug
+
+#pragma warning(disable : 4786)     // Identifier was truncated and cannot be debugged.
+
+#pragma warning(disable : 4675)     // resolved overload was found by argument-dependent lookup
+*/
+
+#pragma warning(1 : 4705)     // Report unused local variables.
+#pragma warning(1 : 4555)     // Expression has no effect.

+ 68 - 0
3rdparty/nvtt/nvcore/foreach.h

@@ -0,0 +1,68 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#pragma once
+#ifndef NV_CORE_FOREACH_H
+#define NV_CORE_FOREACH_H
+
+/*
+These foreach macros are very non-standard and somewhat confusing, but I like them.
+*/
+
+#include "nvcore.h"
+
+#if NV_CC_GNUC // If typeof or decltype is available:
+#if !NV_CC_CPP11
+#   define NV_DECLTYPE typeof // Use the non-standard typeof extension, which behaves like C++11 decltype.
+#else
+#   define NV_DECLTYPE decltype
+#endif
+
+/*
+Ideally we would like to write this:
+
+#define NV_FOREACH(i, container) \
+    for(NV_DECLTYPE(container)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i))
+
+But gcc versions prior to 4.7 required an intermediate type. See:
+https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6709
+*/
+
+#define NV_FOREACH(i, container) \
+    typedef NV_DECLTYPE(container) NV_STRING_JOIN2(cont,__LINE__); \
+    for(NV_STRING_JOIN2(cont,__LINE__)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i))
+
+#else // If typeof not available:
+
+#include <new> // placement new
+
+struct PseudoIndexWrapper {
+    template <typename T>
+    PseudoIndexWrapper(const T & container) {
+        nvStaticCheck(sizeof(typename T::PseudoIndex) <= sizeof(memory));
+        new (memory) typename T::PseudoIndex(container.start());
+    }
+    // PseudoIndex cannot have a dtor!
+
+    template <typename T> typename T::PseudoIndex & operator()(const T * /*container*/) {
+        return *reinterpret_cast<typename T::PseudoIndex *>(memory);
+    }
+    template <typename T> const typename T::PseudoIndex & operator()(const T * /*container*/) const {
+        return *reinterpret_cast<const typename T::PseudoIndex *>(memory);
+    }
+
+    uint8 memory[4];	// Increase the size if we have bigger enumerators.
+};
+
+#define NV_FOREACH(i, container) \
+    for(PseudoIndexWrapper i(container); !(container).isDone(i(&(container))); (container).advance(i(&(container))))
+
+#endif
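+
+// Usage sketch (illustrative): NV_FOREACH expands to a typedef plus a for
+// statement, so it must appear where a full statement is allowed.
+//
+//   nv::Array<float> weights;  // any container exposing the PseudoIndex protocol
+//   float sum = 0.0f;
+//   NV_FOREACH(i, weights) {
+//       sum += weights[i];
+//   }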
+
+// Declare foreach keyword.
+#if !defined NV_NO_USE_KEYWORDS
+#   define foreach NV_FOREACH
+#   define foreach_index NV_FOREACH
+#endif
+
+
+#endif // NV_CORE_FOREACH_H

+ 83 - 0
3rdparty/nvtt/nvcore/hash.h

@@ -0,0 +1,83 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#pragma once
+#ifndef NV_CORE_HASH_H
+#define NV_CORE_HASH_H
+
+#include "nvcore.h"
+
+namespace nv
+{
+    inline uint sdbmHash(const void * data_in, uint size, uint h = 5381)
+    {
+        const uint8 * data = (const uint8 *) data_in;
+        uint i = 0;
+        while (i < size) {
+            h = (h << 16) + (h << 6) - h + (uint) data[i++];
+        }
+        return h;
+    }
+
+    // Note that this hash does not handle NaN properly.
+    inline uint sdbmFloatHash(const float * f, uint count, uint h = 5381)
+    {
+        for (uint i = 0; i < count; i++) {
+            //nvDebugCheck(nv::isFinite(*f));
+            union { float f; uint32 i; } x = { f[i] };
+            if (x.i == 0x80000000) x.i = 0;
+            h = sdbmHash(&x, 4, h);
+        }
+        return h;
+    }
+
+
+    template <typename T>
+    inline uint hash(const T & t, uint h = 5381)
+    {
+        return sdbmHash(&t, sizeof(T), h);
+    }
+
+    template <>
+    inline uint hash(const float & f, uint h)
+    {
+        return sdbmFloatHash(&f, 1, h);
+    }
+
+
+    // Functors for hash table:
+    template <typename Key> struct Hash 
+    {
+        uint operator()(const Key & k) const {
+            return hash(k);
+        }
+    };
+
+    template <typename Key> struct Equal
+    {
+        bool operator()(const Key & k0, const Key & k1) const {
+            return k0 == k1;
+        }
+    };
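+
+    // Usage sketch (illustrative, not part of the original library): these
+    // functors are what a hash table implementation would instantiate for its
+    // key type. bucketCount is hypothetical.
+    //
+    //   nv::Hash<uint>  hasher;
+    //   nv::Equal<uint> equal;
+    //   uint bucket = hasher(1234u) % bucketCount;
+    //   bool same   = equal(1234u, 1234u);   // true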
+
+
+    // @@ Move to Utils.h?
+    template <typename T1, typename T2>
+    struct Pair {
+        T1 first;
+        T2 second;
+    };
+
+    template <typename T1, typename T2>
+    bool operator==(const Pair<T1,T2> & p0, const Pair<T1,T2> & p1) {
+        return p0.first == p1.first && p0.second == p1.second;
+    }
+
+    template <typename T1, typename T2>
+    uint hash(const Pair<T1,T2> & p, uint h = 5381) {
+        return hash(p.second, hash(p.first));
+    }
+
+
+} // nv namespace
+
+#endif // NV_CORE_HASH_H

+ 29 - 0
3rdparty/nvtt/nvcore/memory.h

@@ -0,0 +1,29 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_MEMORY_H
+#define NV_CORE_MEMORY_H
+
+#include "nvcore.h"
+
+#include <string.h> // memset, used by zero() below
+
+namespace nv {
+
+    // C++ helpers.
+    template <typename T> NV_FORCEINLINE T * malloc(size_t count) {
+        return (T *)::malloc(sizeof(T) * count);
+    }
+
+    template <typename T> NV_FORCEINLINE T * realloc(T * ptr, size_t count) {
+        return (T *)::realloc(ptr, sizeof(T) * count);
+    }
+
+    template <typename T> NV_FORCEINLINE void free(const T * ptr) {
+        ::free((void *)ptr);
+    }
+
+    template <typename T> NV_FORCEINLINE void zero(T & data) {
+        memset(&data, 0, sizeof(T));
+    }
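+
+    // Usage sketch (illustrative, not part of the original library): the typed
+    // helpers wrap the C allocator, so they pair with each other rather than
+    // with new/delete.
+    //
+    //   float * samples = nv::malloc<float>(256);
+    //   samples = nv::realloc<float>(samples, 512);
+    //   nv::free(samples);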
+
+} // nv namespace
+
+#endif // NV_CORE_MEMORY_H

+ 299 - 0
3rdparty/nvtt/nvcore/nvcore.h

@@ -0,0 +1,299 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_H
+#define NV_CORE_H
+
+// Function linkage
+#if NVCORE_SHARED
+#ifdef NVCORE_EXPORTS
+#define NVCORE_API DLL_EXPORT
+#define NVCORE_CLASS DLL_EXPORT_CLASS
+#else
+#define NVCORE_API DLL_IMPORT
+#define NVCORE_CLASS DLL_IMPORT
+#endif
+#else // NVCORE_SHARED
+#define NVCORE_API
+#define NVCORE_CLASS
+#endif // NVCORE_SHARED
+
+
+// Platform definitions
+#include "posh.h"
+
+// OS:
+// NV_OS_WIN32
+// NV_OS_WIN64
+// NV_OS_MINGW
+// NV_OS_CYGWIN
+// NV_OS_LINUX
+// NV_OS_UNIX
+// NV_OS_DARWIN
+// NV_OS_XBOX
+// NV_OS_ORBIS
+// NV_OS_IOS
+
+#define NV_OS_STRING POSH_OS_STRING
+
+#if defined POSH_OS_LINUX
+#   define NV_OS_LINUX 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_ORBIS
+#   define NV_OS_ORBIS 1
+#elif defined POSH_OS_FREEBSD
+#   define NV_OS_FREEBSD 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_OPENBSD
+#   define NV_OS_OPENBSD 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_CYGWIN32
+#   define NV_OS_CYGWIN 1
+#elif defined POSH_OS_MINGW
+#   define NV_OS_MINGW 1
+#   define NV_OS_WIN32 1
+#elif defined POSH_OS_OSX
+#   define NV_OS_DARWIN 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_IOS
+#   define NV_OS_DARWIN 1 //ACS should we keep this on IOS?
+#   define NV_OS_UNIX 1
+#   define NV_OS_IOS 1
+#elif defined POSH_OS_UNIX
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_WIN64
+#   define NV_OS_WIN32 1
+#   define NV_OS_WIN64 1
+#elif defined POSH_OS_WIN32
+#   define NV_OS_WIN32 1
+#elif defined POSH_OS_XBOX
+#   define NV_OS_XBOX 1
+#else
+#   error "Unsupported OS"
+#endif
+
+
+// Threading:
+// some platforms don't implement __thread or similar for thread-local-storage
+#if NV_OS_UNIX || NV_OS_ORBIS || NV_OS_IOS //ACStodoIOS darwin instead of ios?
+#   define NV_OS_USE_PTHREAD 1
+#   if NV_OS_DARWIN || NV_OS_IOS
+#       define NV_OS_HAS_TLS_QUALIFIER 0
+#   else
+#       define NV_OS_HAS_TLS_QUALIFIER 1
+#   endif
+#else
+#   define NV_OS_USE_PTHREAD 0
+#   define NV_OS_HAS_TLS_QUALIFIER 1
+#endif
+
+
+// CPUs:
+// NV_CPU_X86
+// NV_CPU_X86_64
+// NV_CPU_PPC
+// NV_CPU_ARM
+// NV_CPU_AARCH64
+
+#define NV_CPU_STRING   POSH_CPU_STRING
+
+#if defined POSH_CPU_X86_64
+//#   define NV_CPU_X86 1
+#   define NV_CPU_X86_64 1
+#elif defined POSH_CPU_X86
+#   define NV_CPU_X86 1
+#elif defined POSH_CPU_PPC
+#   define NV_CPU_PPC 1
+#elif defined POSH_CPU_STRONGARM
+#   define NV_CPU_ARM 1
+#elif defined POSH_CPU_AARCH64
+#   define NV_CPU_AARCH64 1
+#else
+#   error "Unsupported CPU"
+#endif
+
+
+// Compiler:
+// NV_CC_GNUC
+// NV_CC_MSVC
+// NV_CC_CLANG
+
+#if defined POSH_COMPILER_CLANG
+#   define NV_CC_CLANG  1
+#   define NV_CC_GNUC   1    // Clang is compatible with GCC.
+#   define NV_CC_STRING "clang"
+#elif defined POSH_COMPILER_GCC
+#   define NV_CC_GNUC   1
+#   define NV_CC_STRING "gcc"
+#elif defined POSH_COMPILER_MSVC
+#   define NV_CC_MSVC   1
+#   define NV_CC_STRING "msvc"
+#else
+#   error "Unsupported compiler"
+#endif
+
+#if NV_CC_MSVC
+#define NV_CC_CPP11 (__cplusplus > 199711L || _MSC_VER >= 1800) // Visual Studio 2013 has all the features we use, but doesn't advertise full C++11 support yet.
+#else
+// @@ IC: This works in CLANG, what about GCC?
+// @@ ES: Doesn't work in gcc. These 3 features are available in GCC >= 4.4.
+#ifdef __clang__
+#define NV_CC_CPP11 (__has_feature(cxx_deleted_functions) && __has_feature(cxx_rvalue_references) && __has_feature(cxx_static_assert))
+#elif defined __GNUC__ 
+#define NV_CC_CPP11 ( __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
+#endif
+#endif
+
+// Endiannes:
+#define NV_LITTLE_ENDIAN    POSH_LITTLE_ENDIAN
+#define NV_BIG_ENDIAN       POSH_BIG_ENDIAN
+#define NV_ENDIAN_STRING    POSH_ENDIAN_STRING
+
+
+// Define the right printf prefix for size_t arguments:
+#if POSH_64BIT_POINTER
+#  define NV_SIZET_PRINTF_PREFIX POSH_I64_PRINTF_PREFIX
+#else
+#  define NV_SIZET_PRINTF_PREFIX
+#endif
+
+
+// Type definitions:
+typedef posh_u8_t   uint8;
+typedef posh_i8_t   int8;
+
+typedef posh_u16_t  uint16;
+typedef posh_i16_t  int16;
+
+typedef posh_u32_t  uint32;
+typedef posh_i32_t  int32;
+
+typedef posh_u64_t  uint64;
+typedef posh_i64_t  int64;
+
+// Aliases
+typedef uint32      uint;
+
+
+// Version string:
+#define NV_VERSION_STRING \
+    NV_OS_STRING "/" NV_CC_STRING "/" NV_CPU_STRING"/" \
+    NV_ENDIAN_STRING"-endian - " __DATE__ "-" __TIME__
+
+
+// Disable copy constructor and assignment operator. 
+#if NV_CC_CPP11
+#define NV_FORBID_COPY(C) \
+    C( const C & ) = delete; \
+    C &operator=( const C & ) = delete
+#else
+#define NV_FORBID_COPY(C) \
+    private: \
+    C( const C & ); \
+    C &operator=( const C & )
+#endif
+
+// Disable dynamic allocation on the heap. 
+// See Prohibiting Heap-Based Objects in More Effective C++.
+#define NV_FORBID_HEAPALLOC() \
+    private: \
+    void *operator new(size_t size); \
+    void *operator new[](size_t size)
+    //static void *operator new(size_t size); \
+    //static void *operator new[](size_t size);
+
+// String concatenation macros.
+#define NV_STRING_JOIN2(arg1, arg2) NV_DO_STRING_JOIN2(arg1, arg2)
+#define NV_DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2
+#define NV_STRING_JOIN3(arg1, arg2, arg3) NV_DO_STRING_JOIN3(arg1, arg2, arg3)
+#define NV_DO_STRING_JOIN3(arg1, arg2, arg3) arg1 ## arg2 ## arg3
+#define NV_STRING2(x) #x
+#define NV_STRING(x) NV_STRING2(x)
+
+#if NV_CC_MSVC
+#define NV_MULTI_LINE_MACRO_BEGIN do {  
+#define NV_MULTI_LINE_MACRO_END \
+    __pragma(warning(push)) \
+    __pragma(warning(disable:4127)) \
+    } while(false) \
+    __pragma(warning(pop))  
+#else
+#define NV_MULTI_LINE_MACRO_BEGIN do {
+#define NV_MULTI_LINE_MACRO_END } while(false)
+#endif
+
+#if NV_CC_CPP11
+#define nvStaticCheck(x) static_assert((x), "Static assert "#x" failed")
+#else
+#define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x)]
+#endif
+#define NV_COMPILER_CHECK(x) nvStaticCheck(x)   // I like this name best.
+
+// Make sure type definitions are fine.
+NV_COMPILER_CHECK(sizeof(int8) == 1);
+NV_COMPILER_CHECK(sizeof(uint8) == 1);
+NV_COMPILER_CHECK(sizeof(int16) == 2);
+NV_COMPILER_CHECK(sizeof(uint16) == 2);
+NV_COMPILER_CHECK(sizeof(int32) == 4);
+NV_COMPILER_CHECK(sizeof(uint32) == 4);
+NV_COMPILER_CHECK(sizeof(int64) == 8);
+NV_COMPILER_CHECK(sizeof(uint64) == 8);
+
+
+#define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
+
+#if 0 // Disabled in The Witness.
+#if NV_CC_MSVC
+#define NV_MESSAGE(x) message(__FILE__ "(" NV_STRING(__LINE__) ") : " x)
+#else
+#define NV_MESSAGE(x) message(x)
+#endif
+#else
+#define NV_MESSAGE(x) 
+#endif
+
+
+// Startup initialization macro.
+#define NV_AT_STARTUP(some_code) \
+    namespace { \
+        static struct NV_STRING_JOIN2(AtStartup_, __LINE__) { \
+            NV_STRING_JOIN2(AtStartup_, __LINE__)() { some_code; } \
+        } \
+        NV_STRING_JOIN3(AtStartup_, __LINE__, Instance); \
+    }
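+
+// Usage sketch (illustrative): runs the given code before main() by
+// constructing a file-scope static object.
+//
+//   NV_AT_STARTUP(nvDebug("nvcore initialized\n"));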
+
+// Indicate to the compiler that the parameter is not used, to suppress compiler warnings.
+#define NV_UNUSED(a) ((a)=(a))
+
+// Null index. @@ Move this somewhere else... it's only used by nvmesh.
+//const unsigned int NIL = unsigned int(~0);
+//#define NIL uint(~0)
+
+// Null pointer.
+#ifndef NULL
+#define NULL 0
+#endif
+
+// Platform includes
+#if NV_CC_MSVC
+#   if NV_OS_WIN32
+#       include "DefsVcWin32.h"
+#   elif NV_OS_XBOX
+#       include "DefsVcXBox.h"
+#   else
+#       error "MSVC: Platform not supported"
+#   endif
+#elif NV_CC_GNUC
+#   if NV_OS_LINUX
+#       include "DefsGnucLinux.h"
+#   elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD
+#       include "DefsGnucDarwin.h"
+#   elif NV_OS_MINGW
+#       include "DefsGnucWin32.h"
+#   elif NV_OS_CYGWIN
+#       error "GCC: Cygwin not supported"
+#   else
+#       error "GCC: Platform not supported"
+#   endif
+#endif
+
+#endif // NV_CORE_H

+ 1030 - 0
3rdparty/nvtt/nvcore/posh.h

@@ -0,0 +1,1030 @@
+/**
+@file posh.h
+@author Brian Hook
+@version 1.3.001
+
+Header file for POSH, the Portable Open Source Harness project.
+
+NOTE: Unlike most header files, this one is designed to be included
+multiple times, which is why it does not have the @#ifndef/@#define
+preamble.
+
+POSH relies on environment specified preprocessor symbols in order
+to infer as much as possible about the target OS/architecture and
+the host compiler capabilities.
+
+NOTE: POSH is simple and focused. It attempts to provide basic
+functionality and information, but it does NOT attempt to emulate
+missing functionality.  I am also not willing to make POSH dirty
+and hackish to support truly ancient and/or outmoded and/or bizarre
+technologies such as non-ANSI compilers, systems with non-IEEE
+floating point formats, segmented 16-bit operating systems, etc.
+
+Please refer to the accompanying HTML documentation or visit
+http://www.poshlib.org for more information on how to use POSH.
+
+LICENSE:
+
+Copyright (c) 2004, Brian Hook
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    * The names of this package's contributors may not
+      be used to endorse or promote products derived from this
+      software without specific prior written permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+REVISION:
+
+I've been lax about revision histories, so this starts at, um, 1.3.001.
+Sorry for any inconveniences.
+
+1.3.001 - 2/23/2006 - Incorporated fix for bug reported by Bill Cary,
+                      where I was not detecting Visual Studio
+                      compilation on x86-64 systems.  Added check for
+                      _M_X64 which should fix that.
+
+*/
+/*
+I have yet to find an authoritative reference on preprocessor
+symbols, but so far this is what I've gleaned:
+
+GNU GCC/G++:
+   - __GNUC__: GNU C version
+   - __GNUG__: GNU C++ compiler
+   - __sun__ : on Sun platforms
+   - __svr4__: on Solaris and other SysV R4 platforms
+   - __mips__: on MIPS processor platforms
+   - __sparc_v9__: on Sparc 64-bit CPUs
+   - __sparcv9: 64-bit Solaris
+   - __MIPSEL__: mips processor, compiled for little endian
+   - __MIPSEB__: mips processor, compiled for big endian
+   - _R5900: MIPS/Sony/Toshiba R5900 (PS2)
+   - mc68000: 68K
+   - m68000: 68K
+   - m68k: 68K
+   - __palmos__: PalmOS
+
+Intel C/C++ Compiler:
+   - __ECC      : compiler version, IA64 only
+   - __EDG__
+   - __ELF__
+   - __GXX_ABI_VERSION
+   - __i386     : IA-32 only
+   - __i386__   : IA-32 only
+   - i386       : IA-32 only
+   - __ia64     : IA-64 only
+   - __ia64__   : IA-64 only
+   - ia64       : IA-64 only
+   - __ICC      : IA-32 only
+   - __INTEL_COMPILER : IA-32 or IA-64, newer versions only
+
+Apple's C/C++ Compiler for OS X:
+   - __APPLE_CC__
+   - __APPLE__
+   - __BIG_ENDIAN__
+   - __APPLE__
+   - __ppc__
+   - __MACH__
+
+DJGPP:
+   - __MSDOS__
+   - __unix__
+   - __unix
+   - __GNUC__
+   - __GO32
+   - DJGPP
+   - __i386, __i386, i386
+
+Cray's C compiler:
+   - _ADDR64: if 64-bit pointers
+   - _UNICOS: 
+   - __unix:
+
+SGI's CC compiler predefines the following (and more) with -ansi:
+   - __sgi
+   - __unix
+   - __host_mips
+   - _SYSTYPE_SVR4
+   - __mips
+   - _MIPSEB
+   - anyone know if there is a predefined symbol for the compiler?!
+
+MinGW:
+   - as GnuC but also defines _WIN32, __WIN32, WIN32, _X86_, __i386, __i386__, and several others
+   - __MINGW32__
+
+Cygwin:
+   - as Gnu C, but also
+   - __unix__
+   - __CYGWIN32__
+
+Microsoft Visual Studio predefines the following:
+   - _MSC_VER
+   - _WIN32: on Win32
+   - _M_IX86 (on x86 systems)
+   - _M_X64: on x86-64 systems
+   - _M_ALPHA (on DEC AXP systems)
+   - _SH3: WinCE, Hitachi SH-3
+   - _MIPS: WinCE, MIPS
+   - _ARM: WinCE, ARM
+
+Sun's C Compiler:
+   - sun and _sun
+   - unix and _unix
+   - sparc and _sparc (SPARC systems only)
+   - i386 and _i386 (x86 systems only)
+   - __SVR4 (Solaris only)
+   - __sparcv9: 64-bit solaris
+   - __SUNPRO_C
+   - _LP64: defined in 64-bit LP64 mode, but only if <sys/types.h> is included
+
+Borland C/C++ predefines the following:
+   - __BORLANDC__:
+
+DEC/Compaq C/C++ on Alpha:
+   - __alpha
+   - __arch64__
+   - __unix__ (on Tru64 Unix)
+   - __osf__
+   - __DECC
+   - __DECCXX (C++ compilation)
+   - __DECC_VER
+   - __DECCXX_VER
+
+IBM's AIX compiler:
+   - __64BIT__ if 64-bit mode
+   - _AIX
+   - __IBMC__: C compiler version
+   - __IBMCPP__: C++ compiler version
+   - _LONG_LONG: compiler allows long long
+
+Watcom:
+   - __WATCOMC__
+   - __DOS__ : if targeting DOS
+   - __386__ : if 32-bit support
+   - __WIN32__ : if targeting 32-bit Windows
+
+HP-UX C/C++ Compiler:
+   - __hpux
+   - __unix
+   - __hppa (on PA-RISC)
+   - __LP64__: if compiled in 64-bit mode
+
+Metrowerks:
+   - __MWERKS__
+   - __powerpc__
+   - _powerc
+   - __MC68K__
+   - macintosh when compiling for MacOS
+   - __INTEL__ for x86 targets
+   - __POWERPC__
+
+LLVM:
+   - __llvm__
+   - __clang__
+*/
+
+/*
+** ----------------------------------------------------------------------------
+** Include <limits.h> optionally
+** ----------------------------------------------------------------------------
+*/
+#ifdef POSH_USE_LIMITS_H
+#  include <limits.h>
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Determine compilation environment
+** ----------------------------------------------------------------------------
+*/
+#if defined __ECC || defined __ICC || defined __INTEL_COMPILER
+#  define POSH_COMPILER_STRING "Intel C/C++"
+#  define POSH_COMPILER_INTEL 1
+#endif
+
+#if ( defined __host_mips || defined __sgi ) && !defined __GNUC__
+#  define POSH_COMPILER_STRING    "MIPSpro C/C++"
+#  define POSH_COMPILER_MIPSPRO 1 
+#endif
+
+#if defined __hpux && !defined __GNUC__
+#  define POSH_COMPILER_STRING "HP-UX CC"
+#  define POSH_COMPILER_HPCC 1 
+#endif
+
+#if defined __clang__
+#  define POSH_COMPILER_STRING "Clang"
+#  define POSH_COMPILER_CLANG 1
+#endif
+
+#if defined __GNUC__ && !defined __clang__
+#  define POSH_COMPILER_STRING "Gnu GCC"
+#  define POSH_COMPILER_GCC 1
+#endif
+
+#if defined __APPLE_CC__
+   /* we don't define the compiler string here, let it be GNU */
+#  define POSH_COMPILER_APPLECC 1
+#endif
+
+#if defined __IBMC__ || defined __IBMCPP__
+#  define POSH_COMPILER_STRING "IBM C/C++"
+#  define POSH_COMPILER_IBM 1
+#endif
+
+#if defined _MSC_VER
+#  define POSH_COMPILER_STRING "Microsoft Visual C++"
+#  define POSH_COMPILER_MSVC 1
+#endif
+
+#if defined __SUNPRO_C
+#  define POSH_COMPILER_STRING "Sun Pro" 
+#  define POSH_COMPILER_SUN 1
+#endif
+
+#if defined __BORLANDC__
+#  define POSH_COMPILER_STRING "Borland C/C++"
+#  define POSH_COMPILER_BORLAND 1
+#endif
+
+#if defined __MWERKS__
+#  define POSH_COMPILER_STRING     "MetroWerks CodeWarrior"
+#  define POSH_COMPILER_METROWERKS 1
+#endif
+
+#if defined __DECC || defined __DECCXX
+#  define POSH_COMPILER_STRING "Compaq/DEC C/C++"
+#  define POSH_COMPILER_DEC 1
+#endif
+
+#if defined __WATCOMC__
+#  define POSH_COMPILER_STRING "Watcom C/C++"
+#  define POSH_COMPILER_WATCOM 1
+#endif
+
+#if !defined POSH_COMPILER_STRING
+#  define POSH_COMPILER_STRING "Unknown compiler"
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Determine target operating system
+** ----------------------------------------------------------------------------
+*/
+#if defined linux || defined __linux__
+#  define POSH_OS_LINUX 1 
+#  define POSH_OS_STRING "Linux"
+#endif
+
+#if defined __FreeBSD__
+#  define POSH_OS_FREEBSD 1 
+#  define POSH_OS_STRING "FreeBSD"
+#endif
+
+#if defined __OpenBSD__
+#  define POSH_OS_OPENBSD 1
+#  define POSH_OS_STRING "OpenBSD"
+#endif
+
+#if defined __CYGWIN32__
+#  define POSH_OS_CYGWIN32 1
+#  define POSH_OS_STRING "Cygwin"
+#endif
+
+#if defined GEKKO
+#  define POSH_OS_GAMECUBE
+#  define __powerpc__
+#  define POSH_OS_STRING "GameCube"
+#endif
+
+#if defined __MINGW32__
+#  define POSH_OS_MINGW 1
+#  define POSH_OS_STRING "MinGW"
+#endif
+
+#if defined GO32 && defined DJGPP && defined __MSDOS__ 
+#  define POSH_OS_GO32 1
+#  define POSH_OS_STRING "GO32/MS-DOS"
+#endif
+
+/* NOTE: make sure you use /bt=DOS if compiling for 32-bit DOS,
+   otherwise Watcom assumes host=target */
+#if defined __WATCOMC__  && defined __386__ && defined __DOS__
+#  define POSH_OS_DOS32 1
+#  define POSH_OS_STRING "DOS/32-bit"
+#endif
+
+#if defined _UNICOS
+#  define POSH_OS_UNICOS 1
+#  define POSH_OS_STRING "UNICOS"
+#endif
+
+#if ( defined __MWERKS__ && defined __powerc && !defined macintosh ) || defined __APPLE_CC__ || defined macosx
+#  define POSH_OS_OSX 1
+#  define POSH_OS_STRING "MacOS X"
+#endif
+
+#if defined __sun__ || defined sun || defined __sun || defined __solaris__
+#  if defined __SVR4 || defined __svr4__ || defined __solaris__
+#     define POSH_OS_STRING "Solaris"
+#     define POSH_OS_SOLARIS 1
+#  endif
+#  if !defined POSH_OS_STRING
+#     define POSH_OS_STRING "SunOS"
+#     define POSH_OS_SUNOS 1
+#  endif
+#endif
+
+#if defined __sgi__ || defined sgi || defined __sgi
+#  define POSH_OS_IRIX 1
+#  define POSH_OS_STRING "Irix"
+#endif
+
+#if defined __hpux__ || defined __hpux
+#  define POSH_OS_HPUX 1
+#  define POSH_OS_STRING "HP-UX"
+#endif
+
+#if defined _AIX
+#  define POSH_OS_AIX 1
+#  define POSH_OS_STRING "AIX"
+#endif
+
+#if ( defined __alpha && defined __osf__ )
+#  define POSH_OS_TRU64 1
+#  define POSH_OS_STRING "Tru64"
+#endif
+
+#if defined __BEOS__ || defined __beos__
+#  define POSH_OS_BEOS 1
+#  define POSH_OS_STRING "BeOS"
+#endif
+
+#if defined amiga || defined amigados || defined AMIGA || defined _AMIGA
+#  define POSH_OS_AMIGA 1
+#  define POSH_OS_STRING "Amiga"
+#endif
+
+#if defined __unix__
+#  define POSH_OS_UNIX 1 
+#  if !defined POSH_OS_STRING
+#     define POSH_OS_STRING "Unix-like(generic)"
+#  endif
+#endif
+
+#if defined _WIN32_WCE
+#  define POSH_OS_WINCE 1
+#  define POSH_OS_STRING "Windows CE"
+#endif
+
+#if defined _XBOX || defined _XBOX_VER
+#  define POSH_OS_XBOX 1
+#  define POSH_OS_STRING "XBOX"
+#endif
+
+#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__
+#  define POSH_OS_WIN32 1
+#  if !defined POSH_OS_XBOX
+#     if defined _WIN64
+#        define POSH_OS_WIN64 1
+#        define POSH_OS_STRING "Win64"
+#     else
+#        if !defined POSH_OS_STRING
+#           define POSH_OS_STRING "Win32"
+#        endif
+#     endif
+#  endif
+#endif
+
+#if defined __palmos__
+#  define POSH_OS_PALM 1
+#  define POSH_OS_STRING "PalmOS"
+#endif
+
+#if defined THINK_C || defined macintosh
+#  define POSH_OS_MACOS 1
+#  define POSH_OS_STRING "MacOS"
+#endif
+
+/*
+** -----------------------------------------------------------------------------
+** Determine target CPU
+** -----------------------------------------------------------------------------
+*/
+
+#if defined GEKKO
+#  define POSH_CPU_PPC750 1
+#  define POSH_CPU_STRING "IBM PowerPC 750 (NGC)"
+#endif
+
+#if defined mc68000 || defined m68k || defined __MC68K__ || defined m68000
+#  define POSH_CPU_68K 1
+#  define POSH_CPU_STRING "MC68000"
+#endif
+
+#if defined __PPC__ || defined __POWERPC__  || defined powerpc || defined _POWER || defined __ppc__ || defined __powerpc__ || defined _M_PPC
+#  define POSH_CPU_PPC 1
+#  if !defined POSH_CPU_STRING
+#    if defined __powerpc64__
+#       define POSH_CPU_STRING "PowerPC64"
+#    else
+#       define POSH_CPU_STRING "PowerPC"
+#    endif
+#  endif
+#endif
+
+#if defined _CRAYT3E || defined _CRAYMPP
+#  define POSH_CPU_CRAYT3E 1 /* target processor is a DEC Alpha 21164 used in a Cray T3E*/
+#  define POSH_CPU_STRING "Cray T3E (Alpha 21164)"
+#endif
+
+#if defined CRAY || defined _CRAY && !defined _CRAYT3E
+#  error Non-AXP Cray systems not supported
+#endif
+
+#if defined _SH3
+#  define POSH_CPU_SH3 1
+#  define POSH_CPU_STRING "Hitachi SH-3"
+#endif
+
+#if defined __sh4__ || defined __SH4__
+#  define POSH_CPU_SH3 1
+#  define POSH_CPU_SH4 1
+#  define POSH_CPU_STRING "Hitachi SH-4"
+#endif
+
+#if defined __sparc__ || defined __sparc
+#  if defined __arch64__ || defined __sparcv9 || defined __sparc_v9__
+#     define POSH_CPU_SPARC64 1 
+#     define POSH_CPU_STRING "Sparc/64"
+#  else
+#     define POSH_CPU_STRING "Sparc/32"
+#  endif
+#  define POSH_CPU_SPARC 1
+#endif
+
+#if defined ARM || defined __arm__ || defined _ARM
+#  define POSH_CPU_STRONGARM 1
+#  define POSH_CPU_STRING "ARM"
+#endif
+
+#if defined __aarch64__
+#  define POSH_CPU_AARCH64 1
+#  define POSH_CPU_STRING "ARM64"
+#endif
+
+#if defined mips || defined __mips__ || defined __MIPS__ || defined _MIPS
+#  define POSH_CPU_MIPS 1 
+#  if defined _R5900
+#    define POSH_CPU_STRING "MIPS R5900 (PS2)"
+#  else
+#    define POSH_CPU_STRING "MIPS"
+#  endif
+#endif
+
+#if defined __ia64 || defined _M_IA64 || defined __ia64__ 
+#  define POSH_CPU_IA64 1
+#  define POSH_CPU_STRING "IA64"
+#endif
+
+#if defined __X86__ || defined __i386__ || defined i386 || defined _M_IX86 || defined __386__ || defined __x86_64__ || defined _M_X64
+#  define POSH_CPU_X86 1
+#  if defined __x86_64__ || defined _M_X64
+#     define POSH_CPU_X86_64 1 
+#  endif
+#  if defined POSH_CPU_X86_64
+#     define POSH_CPU_STRING "AMD x86-64"
+#  else
+#     define POSH_CPU_STRING "Intel 386+"
+#  endif
+#endif
+
+#if defined __alpha || defined alpha || defined _M_ALPHA || defined __alpha__
+#  define POSH_CPU_AXP 1
+#  define POSH_CPU_STRING "AXP"
+#endif
+
+#if defined __hppa || defined hppa
+#  define POSH_CPU_HPPA 1
+#  define POSH_CPU_STRING "PA-RISC"
+#endif
+
+#if !defined POSH_CPU_STRING
+#  error POSH cannot determine target CPU
+#  define POSH_CPU_STRING "Unknown" /* this is here for Doxygen's benefit */
+#endif
+
+/*
+** -----------------------------------------------------------------------------
+** Attempt to autodetect building for embedded on Sony PS2
+** -----------------------------------------------------------------------------
+*/
+#if !defined POSH_OS_STRING
+#  if !defined FORCE_DOXYGEN
+#    define POSH_OS_EMBEDDED 1 
+#  endif
+#  if defined _R5900
+#     define POSH_OS_STRING "Sony PS2(embedded)"
+#  else
+#     define POSH_OS_STRING "Embedded/Unknown"
+#  endif
+#endif
+
+/*
+** ---------------------------------------------------------------------------
+** Handle cdecl, stdcall, fastcall, etc.
+** ---------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_X86 && !defined POSH_CPU_X86_64
+#  if defined __GNUC__
+#     define POSH_CDECL __attribute__((cdecl))
+#     define POSH_STDCALL __attribute__((stdcall))
+#     define POSH_FASTCALL __attribute__((fastcall))
+#  elif ( defined _MSC_VER || defined __WATCOMC__ || defined __BORLANDC__ || defined __MWERKS__ )
+#     define POSH_CDECL    __cdecl
+#     define POSH_STDCALL  __stdcall
+#     define POSH_FASTCALL __fastcall
+#  endif
+#else
+#  define POSH_CDECL    
+#  define POSH_STDCALL  
+#  define POSH_FASTCALL 
+#endif
+
+/*
+** ---------------------------------------------------------------------------
+** Define POSH_IMPORTEXPORT signature based on POSH_DLL and POSH_BUILDING_LIB
+** ---------------------------------------------------------------------------
+*/
+
+/*
+** We undefine this so that multiple inclusions will work
+*/
+#if defined POSH_IMPORTEXPORT
+#  undef POSH_IMPORTEXPORT
+#endif
+
+#if defined POSH_DLL
+#   if defined POSH_OS_WIN32
+#      if defined _MSC_VER 
+#         if ( _MSC_VER >= 800 )
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __declspec( dllexport )
+#            else
+#               define POSH_IMPORTEXPORT __declspec( dllimport )
+#            endif
+#         else
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __export
+#            else
+#               define POSH_IMPORTEXPORT 
+#            endif
+#         endif
+#      endif  /* defined _MSC_VER */
+#      if defined __BORLANDC__
+#         if ( __BORLANDC__ >= 0x500 )
+#            if defined POSH_BUILDING_LIB 
+#               define POSH_IMPORTEXPORT __declspec( dllexport )
+#            else
+#               define POSH_IMPORTEXPORT __declspec( dllimport )
+#            endif
+#         else
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __export
+#            else
+#               define POSH_IMPORTEXPORT 
+#            endif
+#         endif
+#      endif /* defined __BORLANDC__ */
+       /* for all other compilers, we're just making a blanket assumption */
+#      if defined __GNUC__ || defined __WATCOMC__ || defined __MWERKS__
+#         if defined POSH_BUILDING_LIB
+#            define POSH_IMPORTEXPORT __declspec( dllexport )
+#         else
+#            define POSH_IMPORTEXPORT __declspec( dllimport )
+#         endif
+#      endif /* all other compilers */
+#      if !defined POSH_IMPORTEXPORT
+#         error Building DLLs not supported on this compiler ([email protected] if you know how)
+#      endif
+#   endif /* defined POSH_OS_WIN32 */
+#endif
+
+/* On pretty much everything else, we can thankfully just ignore this */
+#if !defined POSH_IMPORTEXPORT
+#  define POSH_IMPORTEXPORT
+#endif
+
+#if defined FORCE_DOXYGEN
+#  define POSH_DLL    
+#  define POSH_BUILDING_LIB
+#  undef POSH_DLL
+#  undef POSH_BUILDING_LIB
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** (Re)define POSH_PUBLIC_API export signature 
+** ----------------------------------------------------------------------------
+*/
+#ifdef POSH_PUBLIC_API
+#  undef POSH_PUBLIC_API
+#endif
+
+#if ( ( defined _MSC_VER ) && ( _MSC_VER < 800 ) ) || ( defined __BORLANDC__ && ( __BORLANDC__ < 0x500 ) )
+#  define POSH_PUBLIC_API(rtype) extern rtype POSH_IMPORTEXPORT 
+#else
+#  define POSH_PUBLIC_API(rtype) extern POSH_IMPORTEXPORT rtype
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Try to infer endianness.  Basically we just go through the CPUs we know are
+** little endian, and assume anything that isn't one of those is big endian.
+** As a sanity check, we also do this with operating systems we know are
+** little endian, such as Windows.  Some processors are bi-endian, such as 
+** the MIPS series, so we have to be careful about those.
+** ----------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_X86 || defined POSH_CPU_AXP || defined POSH_CPU_STRONGARM || defined POSH_CPU_AARCH64 || defined POSH_OS_WIN32 || defined POSH_OS_WINCE || defined __MIPSEL__
+#  define POSH_ENDIAN_STRING "little"
+#  define POSH_LITTLE_ENDIAN 1
+#else
+#  define POSH_ENDIAN_STRING "big"
+#  define POSH_BIG_ENDIAN 1
+#endif
+
+#if defined FORCE_DOXYGEN
+#  define POSH_LITTLE_ENDIAN
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Cross-platform compile time assertion macro
+** ----------------------------------------------------------------------------
+*/
+#define POSH_COMPILE_TIME_ASSERT(name, x) typedef int _POSH_dummy_ ## name[(x) ? 1 : -1 ]
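The macro relies on the fact that a false condition produces an array type of size -1, which no conforming compiler accepts, so the check fails at compile time rather than at run time. A short illustration (the assertion names are made up for the example; the first line assumes a platform with 16-bit shorts, which posh.h itself assumes below):

    /* Passes: expands to  typedef int _POSH_dummy_short_is_2_bytes[1];  */
    POSH_COMPILE_TIME_ASSERT(short_is_2_bytes, sizeof(short) == 2);

    /* Would not compile: the array size would be -1.
    POSH_COMPILE_TIME_ASSERT(short_is_8_bytes, sizeof(short) == 8);
    */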
+
+/*
+** ----------------------------------------------------------------------------
+** 64-bit Integer
+**
+** We don't require 64-bit support, nor do we emulate its functionality, we
+** simply export it if it's available.  Since we can't count on <limits.h>
+** for 64-bit support, we ignore the POSH_USE_LIMITS_H directive.
+** ----------------------------------------------------------------------------
+*/
+#if defined ( __LP64__ ) || defined ( __powerpc64__ ) || defined POSH_CPU_SPARC64
+#  define POSH_64BIT_INTEGER 1
+typedef long posh_i64_t; 
+typedef unsigned long posh_u64_t;
+#  define POSH_I64( x ) ((posh_i64_t)x)
+#  define POSH_U64( x ) ((posh_u64_t)x)
+#  define POSH_I64_PRINTF_PREFIX "l"
+#elif defined _MSC_VER || defined __BORLANDC__ || defined __WATCOMC__ || ( defined __alpha && defined __DECC )
+#  define POSH_64BIT_INTEGER 1
+typedef __int64 posh_i64_t;
+typedef unsigned __int64 posh_u64_t;
+#  define POSH_I64( x ) ((posh_i64_t)(x##i64))
+#  define POSH_U64( x ) ((posh_u64_t)(x##ui64))
+#  define POSH_I64_PRINTF_PREFIX "I64"
+#elif defined __GNUC__ || defined __MWERKS__ || defined __SUNPRO_C || defined __SUNPRO_CC || defined __APPLE_CC__ || defined POSH_OS_IRIX || defined _LONG_LONG || defined _CRAYC
+#  define POSH_64BIT_INTEGER 1
+typedef long long posh_i64_t;
+typedef unsigned long long posh_u64_t;
+#  define POSH_U64( x ) ((posh_u64_t)(x##LL))
+#  define POSH_I64( x ) ((posh_i64_t)(x##LL))
+#  define POSH_I64_PRINTF_PREFIX "ll"
+#endif
+
+/* hack */
+/*#ifdef __MINGW32__
+#undef POSH_I64
+#undef POSH_U64
+#undef POSH_I64_PRINTF_PREFIX
+#define POSH_I64( x ) ((posh_i64_t)x)
+#define POSH_U64( x ) ((posh_u64_t)x)
+#define POSH_I64_PRINTF_PREFIX "I64"
+#endif*/
+
+#ifdef FORCE_DOXYGEN
+typedef long long posh_i64_t;
+typedef unsigned long posh_u64_t;
+#  define POSH_64BIT_INTEGER
+#  define POSH_I64_PRINTF_PREFIX
+#  define POSH_I64(x)
+#  define POSH_U64(x)
+#endif
+
+/** Minimum value for a 64-bit signed integer */
+#define POSH_I64_MIN  POSH_I64(0x8000000000000000)
+/** Maximum value for a 64-bit signed integer */
+#define POSH_I64_MAX  POSH_I64(0x7FFFFFFFFFFFFFFF)
+/** Minimum value for a 64-bit unsigned integer */
+#define POSH_U64_MIN  POSH_U64(0)
+/** Maximum value for a 64-bit unsigned integer */
+#define POSH_U64_MAX  POSH_U64(0xFFFFFFFFFFFFFFFF)
+
+/* ----------------------------------------------------------------------------
+** Basic Sized Types
+**
+** These types are expected to be EXACTLY sized so you can use them for
+** serialization.
+** ----------------------------------------------------------------------------
+*/
+#define POSH_FALSE 0 
+#define POSH_TRUE  1 
+
+typedef int            posh_bool_t;
+typedef unsigned char  posh_byte_t;
+
+/* NOTE: These assume that CHAR_BIT is 8!! */
+typedef unsigned char  posh_u8_t;
+typedef signed char    posh_i8_t;
+
+#if defined POSH_USE_LIMITS_H
+#  if CHAR_BIT > 8
+#    error This machine uses 9-bit characters.  This is a warning, you can comment this out now.
+#  endif /* CHAR_BIT > 8 */
+
+/* 16-bit */
+#  if ( USHRT_MAX == 65535 ) 
+   typedef unsigned short posh_u16_t;
+   typedef short          posh_i16_t;
+#  else
+   /* Yes, in theory there could still be a 16-bit character type and shorts are
+      32-bits in size...if you find such an architecture, let me know =P */
+#    error No 16-bit type found
+#  endif
+
+/* 32-bit */
+#  if ( INT_MAX == 2147483647 )
+  typedef unsigned       posh_u32_t;
+  typedef int            posh_i32_t;
+#  elif ( LONG_MAX == 2147483647 )
+  typedef unsigned long  posh_u32_t;
+  typedef long           posh_i32_t;
+#  else
+#    error No 32-bit type found
+#  endif
+
+#else /* POSH_USE_LIMITS_H */
+
+  typedef unsigned short posh_u16_t;
+  typedef short          posh_i16_t;
+
+#  if !defined POSH_OS_PALM
+  typedef unsigned       posh_u32_t;
+  typedef int            posh_i32_t;
+#  else
+  typedef unsigned long  posh_u32_t;
+  typedef long           posh_i32_t;
+#  endif
+#endif
+
+/** Minimum value for a byte */
+#define POSH_BYTE_MIN    0
+/** Maximum value for an 8-bit unsigned value */
+#define POSH_BYTE_MAX    255
+/** Minimum value for a 16-bit signed value */
+#define POSH_I16_MIN     ( ( posh_i16_t ) 0x8000 )
+/** Maximum value for a 16-bit signed value */
+#define POSH_I16_MAX     ( ( posh_i16_t ) 0x7FFF ) 
+/** Minimum value for a 16-bit unsigned value */
+#define POSH_U16_MIN     0
+/** Maximum value for a 16-bit unsigned value */
+#define POSH_U16_MAX     ( ( posh_u16_t ) 0xFFFF )
+/** Minimum value for a 32-bit signed value */
+#define POSH_I32_MIN     ( ( posh_i32_t ) 0x80000000 )
+/** Maximum value for a 32-bit signed value */
+#define POSH_I32_MAX     ( ( posh_i32_t ) 0x7FFFFFFF )
+/** Minimum value for a 32-bit unsigned value */
+#define POSH_U32_MIN     0
+/** Maximum value for a 32-bit unsigned value */
+#define POSH_U32_MAX     ( ( posh_u32_t ) 0xFFFFFFFF )
+
+/*
+** ----------------------------------------------------------------------------
+** Sanity checks on expected sizes
+** ----------------------------------------------------------------------------
+*/
+#if !defined FORCE_DOXYGEN
+
+POSH_COMPILE_TIME_ASSERT(posh_byte_t, sizeof(posh_byte_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_u8_t, sizeof(posh_u8_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_i8_t, sizeof(posh_i8_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_u16_t, sizeof(posh_u16_t) == 2);
+POSH_COMPILE_TIME_ASSERT(posh_i16_t, sizeof(posh_i16_t) == 2);
+POSH_COMPILE_TIME_ASSERT(posh_u32_t, sizeof(posh_u32_t) == 4);
+POSH_COMPILE_TIME_ASSERT(posh_i32_t, sizeof(posh_i32_t) == 4);
+
+#if !defined POSH_NO_FLOAT
+   POSH_COMPILE_TIME_ASSERT(posh_testfloat_t, sizeof(float)==4 );
+   POSH_COMPILE_TIME_ASSERT(posh_testdouble_t, sizeof(double)==8);
+#endif
+
+#if defined POSH_64BIT_INTEGER
+   POSH_COMPILE_TIME_ASSERT(posh_u64_t, sizeof(posh_u64_t) == 8);
+   POSH_COMPILE_TIME_ASSERT(posh_i64_t, sizeof(posh_i64_t) == 8);
+#endif
+
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** 64-bit pointer support
+** ----------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_AXP && ( defined POSH_OS_TRU64 || defined POSH_OS_LINUX )
+#  define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_CPU_X86_64 && defined POSH_OS_LINUX
+#  define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_CPU_SPARC64 || defined POSH_OS_WIN64 || defined __64BIT__ || defined __LP64 || defined _LP64 || defined __LP64__ || defined _ADDR64 || defined _CRAYC
+#   define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_64BIT_POINTER
+   POSH_COMPILE_TIME_ASSERT( posh_64bit_pointer, sizeof( void * ) == 8 );
+#elif !defined FORCE_DOXYGEN
+/* if this assertion is hit then you're on a system that either has 64-bit
+   addressing and we didn't catch it, or you're on a system with 16-bit
+   pointers.  In the latter case, POSH doesn't actually care, we're just
+   triggering this assertion to make sure you're aware of the situation,
+   so feel free to delete it.
+
+   If this assertion is triggered on a known 32 or 64-bit platform, 
+   please let us know ([email protected]) */
+   POSH_COMPILE_TIME_ASSERT( posh_32bit_pointer, sizeof( void * ) == 4 );
+#endif
+
+#if defined FORCE_DOXYGEN
+#  define POSH_64BIT_POINTER
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** POSH Utility Functions
+**
+** These are optional POSH utility functions that are not required if you don't
+** need anything except static checking of your host and target environment.
+** 
+** These functions are NOT wrapped with POSH_PUBLIC_API because I didn't want
+** to enforce their export if your own library is only using them internally.
+** ----------------------------------------------------------------------------
+*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+const char *POSH_GetArchString( void );
+
+#if !defined POSH_NO_FLOAT
+
+posh_u32_t  POSH_LittleFloatBits( float f );
+posh_u32_t  POSH_BigFloatBits( float f );
+float       POSH_FloatFromLittleBits( posh_u32_t bits );
+float       POSH_FloatFromBigBits( posh_u32_t bits );
+
+void        POSH_DoubleBits( double d, posh_byte_t dst[ 8 ] );
+double      POSH_DoubleFromBits( const posh_byte_t src[ 8 ] );
+
+/* unimplemented
+float      *POSH_WriteFloatToLittle( void *dst, float f );
+float      *POSH_WriteFloatToBig( void *dst, float f );
+float       POSH_ReadFloatFromLittle( const void *src );
+float       POSH_ReadFloatFromBig( const void *src );
+
+double     *POSH_WriteDoubleToLittle( void *dst, double d );
+double     *POSH_WriteDoubleToBig( void *dst, double d );
+double      POSH_ReadDoubleFromLittle( const void *src );
+double      POSH_ReadDoubleFromBig( const void *src );
+*/
+#endif /* !defined POSH_NO_FLOAT */
+
+#if defined FORCE_DOXYGEN
+#  define POSH_NO_FLOAT
+#  undef  POSH_NO_FLOAT
+#endif
+
+extern posh_u16_t  POSH_SwapU16( posh_u16_t u );
+extern posh_i16_t  POSH_SwapI16( posh_i16_t u );
+extern posh_u32_t  POSH_SwapU32( posh_u32_t u );
+extern posh_i32_t  POSH_SwapI32( posh_i32_t u );
+
+#if defined POSH_64BIT_INTEGER
+
+extern posh_u64_t  POSH_SwapU64( posh_u64_t u );
+extern posh_i64_t  POSH_SwapI64( posh_i64_t u );
+
+#endif /*POSH_64BIT_INTEGER */
+
+extern posh_u16_t *POSH_WriteU16ToLittle( void *dst, posh_u16_t value );
+extern posh_i16_t *POSH_WriteI16ToLittle( void *dst, posh_i16_t value );
+extern posh_u32_t *POSH_WriteU32ToLittle( void *dst, posh_u32_t value );
+extern posh_i32_t *POSH_WriteI32ToLittle( void *dst, posh_i32_t value );
+
+extern posh_u16_t *POSH_WriteU16ToBig( void *dst, posh_u16_t value );
+extern posh_i16_t *POSH_WriteI16ToBig( void *dst, posh_i16_t value );
+extern posh_u32_t *POSH_WriteU32ToBig( void *dst, posh_u32_t value );
+extern posh_i32_t *POSH_WriteI32ToBig( void *dst, posh_i32_t value );
+
+extern posh_u16_t  POSH_ReadU16FromLittle( const void *src );
+extern posh_i16_t  POSH_ReadI16FromLittle( const void *src );
+extern posh_u32_t  POSH_ReadU32FromLittle( const void *src );
+extern posh_i32_t  POSH_ReadI32FromLittle( const void *src );
+
+extern posh_u16_t  POSH_ReadU16FromBig( const void *src );
+extern posh_i16_t  POSH_ReadI16FromBig( const void *src );
+extern posh_u32_t  POSH_ReadU32FromBig( const void *src );
+extern posh_i32_t  POSH_ReadI32FromBig( const void *src );
+
+#if defined POSH_64BIT_INTEGER
+extern posh_u64_t *POSH_WriteU64ToLittle( void *dst, posh_u64_t value );
+extern posh_i64_t *POSH_WriteI64ToLittle( void *dst, posh_i64_t value );
+extern posh_u64_t *POSH_WriteU64ToBig( void *dst, posh_u64_t value );
+extern posh_i64_t *POSH_WriteI64ToBig( void *dst, posh_i64_t value );
+
+extern posh_u64_t  POSH_ReadU64FromLittle( const void *src );
+extern posh_i64_t  POSH_ReadI64FromLittle( const void *src );
+extern posh_u64_t  POSH_ReadU64FromBig( const void *src );
+extern posh_i64_t  POSH_ReadI64FromBig( const void *src );
+#endif /* POSH_64BIT_INTEGER */
+
+#if defined POSH_LITTLE_ENDIAN
+
+#  define POSH_LittleU16(x) (x)
+#  define POSH_LittleU32(x) (x)
+#  define POSH_LittleI16(x) (x)
+#  define POSH_LittleI32(x) (x)
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_LittleU64(x) (x)
+#    define POSH_LittleI64(x) (x)
+#  endif /* defined POSH_64BIT_INTEGER */
+
+#  define POSH_BigU16(x) POSH_SwapU16(x)
+#  define POSH_BigU32(x) POSH_SwapU32(x)
+#  define POSH_BigI16(x) POSH_SwapI16(x)
+#  define POSH_BigI32(x) POSH_SwapI32(x)
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_BigU64(x) POSH_SwapU64(x)
+#    define POSH_BigI64(x) POSH_SwapI64(x)
+#  endif /* defined POSH_64BIT_INTEGER */
+
+#else
+
+#  define POSH_BigU16(x) (x)
+#  define POSH_BigU32(x) (x)
+#  define POSH_BigI16(x) (x)
+#  define POSH_BigI32(x) (x)
+
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_BigU64(x) (x)
+#    define POSH_BigI64(x) (x)
+#  endif /* POSH_64BIT_INTEGER */
+
+#  define POSH_LittleU16(x) POSH_SwapU16(x)
+#  define POSH_LittleU32(x) POSH_SwapU32(x)
+#  define POSH_LittleI16(x) POSH_SwapI16(x)
+#  define POSH_LittleI32(x) POSH_SwapI32(x)
+
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_LittleU64(x) POSH_SwapU64(x)
+#    define POSH_LittleI64(x) POSH_SwapI64(x)
+#  endif /* POSH_64BIT_INTEGER */
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
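Everything above is pure compile-time detection, so a translation unit can report what POSH inferred without linking against any implementation file. A minimal sketch, assuming only this header:

    #include <stdio.h>
    #include "posh.h"

    int main(void)
    {
        /* All of these strings are #defined above from predefined compiler macros. */
        printf("compiler: %s\n", POSH_COMPILER_STRING);
        printf("os:       %s\n", POSH_OS_STRING);
        printf("cpu:      %s\n", POSH_CPU_STRING);
        printf("endian:   %s\n", POSH_ENDIAN_STRING);
    #if defined POSH_64BIT_INTEGER
        printf("i64 max:  %" POSH_I64_PRINTF_PREFIX "d\n", POSH_I64_MAX);
    #endif
        return 0;
    }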
+
+

+ 459 - 0
3rdparty/nvtt/nvcore/stdstream.h

@@ -0,0 +1,459 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#include "nvcore.h"
+#include "stream.h"
+#include "array.h"
+
+#include <stdio.h> // fopen
+#include <string.h> // memcpy
+
+namespace nv
+{
+
+    // Portable version of fopen.
+    inline FILE * fileOpen(const char * fileName, const char * mode)
+    {
+        nvCheck(fileName != NULL);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+        FILE * fp;
+        if (fopen_s(&fp, fileName, mode) == 0) {
+            return fp;
+        }
+        return NULL;
+#else
+        return fopen(fileName, mode);
+#endif
+    }
+
+
+    /// Base stdio stream.
+    class NVCORE_CLASS StdStream : public Stream
+    {
+        NV_FORBID_COPY(StdStream);
+    public:
+
+        /// Ctor.
+        StdStream( FILE * fp, bool autoclose ) : m_fp(fp), m_autoclose(autoclose) { }
+
+        /// Dtor. 
+        virtual ~StdStream()
+        {
+            if( m_fp != NULL && m_autoclose ) {
+#if NV_OS_WIN32
+                _fclose_nolock( m_fp );
+#else
+                fclose( m_fp );
+#endif
+            }
+        }
+
+
+        /** @name Stream implementation. */
+        //@{
+        virtual void seek( uint pos )
+        {
+            nvDebugCheck(m_fp != NULL);
+            nvDebugCheck(pos <= size());
+#if NV_OS_WIN32
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            fseek(m_fp, pos, SEEK_SET);
+#endif
+        }
+
+        virtual uint tell() const
+        {
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return _ftell_nolock(m_fp);
+#else
+            return (uint)ftell(m_fp);
+#endif
+        }
+
+        virtual uint size() const
+        {
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            uint pos = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, 0, SEEK_END);
+            uint end = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            uint pos = (uint)ftell(m_fp);
+            fseek(m_fp, 0, SEEK_END);
+            uint end = (uint)ftell(m_fp);
+            fseek(m_fp, pos, SEEK_SET);
+#endif
+            return end;
+        }
+
+        virtual bool isError() const
+        {
+            return m_fp == NULL || ferror( m_fp ) != 0;
+        }
+
+        virtual void clearError()
+        {
+            nvDebugCheck(m_fp != NULL);
+            clearerr(m_fp);
+        }
+
+        // @@ The original implementation used feof, which only returns true after an attempt to read *past* the end of the stream.
+        // That is, after reading the last byte of a file, isAtEnd would still return false even though the stream pointer is at the file end. That is not the intent, and it was inconsistent with MemoryStream, so this
+        // implementation uses ftell and fseek to determine our location within the file.
+        virtual bool isAtEnd() const
+        {
+            if (m_fp == NULL) return true;
+            //nvDebugCheck(m_fp != NULL);
+            //return feof( m_fp ) != 0;
+#if NV_OS_WIN32
+            uint pos = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, 0, SEEK_END);
+            uint end = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            uint pos = (uint)ftell(m_fp);
+            fseek(m_fp, 0, SEEK_END);
+            uint end = (uint)ftell(m_fp);
+            fseek(m_fp, pos, SEEK_SET);
+#endif
+            return pos == end;
+        }
+
+        /// Always true.
+        virtual bool isSeekable() const { return true; }
+        //@}
+
+    protected:
+
+        FILE * m_fp;
+        bool m_autoclose;
+
+    };
+
+
+    /// Standard output stream.
+    class NVCORE_CLASS StdOutputStream : public StdStream
+    {
+        NV_FORBID_COPY(StdOutputStream);
+    public:
+
+        /// Construct stream by file name.
+        StdOutputStream( const char * name ) : StdStream(fileOpen(name, "wb"), /*autoclose=*/true) { }
+
+        /// Construct stream by file handle.
+        StdOutputStream( FILE * fp, bool autoclose ) : StdStream(fp, autoclose)
+        {
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Write data.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return (uint)_fwrite_nolock(data, 1, len, m_fp);
+#elif NV_OS_LINUX
+            return (uint)fwrite_unlocked(data, 1, len, m_fp);
+#elif NV_OS_DARWIN
+            // @@ No error checking, always returns len.
+            for (uint i = 0; i < len; i++) {
+                putc_unlocked(((char *)data)[i], m_fp);
+            }
+            return len;
+#else
+            return (uint)fwrite(data, 1, len, m_fp);
+#endif
+        }
+
+        virtual bool isLoading() const
+        {
+            return false;
+        }
+
+        virtual bool isSaving() const
+        {
+            return true;
+        }
+        //@}
+
+    };
+
+
+    /// Standard input stream.
+    class NVCORE_CLASS StdInputStream : public StdStream
+    {
+        NV_FORBID_COPY(StdInputStream);
+    public:
+
+        /// Construct stream by file name.
+        StdInputStream( const char * name ) : StdStream(fileOpen(name, "rb"), /*autoclose=*/true) { }
+
+        /// Construct stream by file handle.
+        StdInputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose)
+        {
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Read data.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return (uint)_fread_nolock(data, 1, len, m_fp);
+#elif NV_OS_LINUX
+            return (uint)fread_unlocked(data, 1, len, m_fp);
+#elif NV_OS_DARWIN
+            // @@ No error checking, always returns len.
+            for (uint i = 0; i < len; i++) {
+                ((char *)data)[i] = getc_unlocked(m_fp);
+            }
+            return len;
+#else
+            return (uint)fread(data, 1, len, m_fp);
+#endif
+            
+        }
+
+        virtual bool isLoading() const
+        {
+            return true;
+        }
+
+        virtual bool isSaving() const
+        {
+            return false;
+        }
+        //@}
+    };
+
+
+
+    /// Memory input stream.
+    class NVCORE_CLASS MemoryInputStream : public Stream
+    {
+        NV_FORBID_COPY(MemoryInputStream);
+    public:
+
+        /// Ctor.
+        MemoryInputStream( const uint8 * mem, uint size ) : m_mem(mem), m_ptr(mem), m_size(size) { }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Read data.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(!isError());
+
+            uint left = m_size - tell();
+            if (len > left) len = left;
+
+            memcpy( data, m_ptr, len );
+            m_ptr += len;
+
+            return len;
+        }
+
+        virtual void seek( uint pos )
+        {
+            nvDebugCheck(!isError());
+            m_ptr = m_mem + pos;
+            nvDebugCheck(!isError());
+        }
+
+        virtual uint tell() const
+        {
+            nvDebugCheck(m_ptr >= m_mem);
+            return uint(m_ptr - m_mem);
+        }
+
+        virtual uint size() const
+        {
+            return m_size;
+        }
+
+        virtual bool isError() const
+        {
+            return m_mem == NULL || m_ptr > m_mem + m_size || m_ptr < m_mem;
+        }
+
+        virtual void clearError()
+        {
+            // Nothing to do.
+        }
+
+        virtual bool isAtEnd() const
+        {
+            return m_ptr == m_mem + m_size;
+        }
+
+        /// Always true.
+        virtual bool isSeekable() const
+        {
+            return true;
+        }
+
+        virtual bool isLoading() const
+        {
+            return true;
+        }
+
+        virtual bool isSaving() const
+        {
+            return false;
+        }
+        //@}
+
+        const uint8 * ptr() const { return m_ptr; }
+
+
+    private:
+
+        const uint8 * m_mem;
+        const uint8 * m_ptr;
+        uint m_size;
+
+    };
+
+
+    /// Buffer output stream.
+    class NVCORE_CLASS BufferOutputStream : public Stream
+    {
+        NV_FORBID_COPY(BufferOutputStream);
+    public:
+
+        BufferOutputStream(Array<uint8> & buffer) : m_buffer(buffer) { }
+
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            m_buffer.append((uint8 *)data, len);
+            return len;
+        }
+
+        virtual void seek( uint /*pos*/ ) { /*Not implemented*/ }
+        virtual uint tell() const { return m_buffer.size(); }
+        virtual uint size() const { return m_buffer.size(); }
+
+        virtual bool isError() const { return false; }
+        virtual void clearError() {}
+
+        virtual bool isAtEnd() const { return true; }
+        virtual bool isSeekable() const { return false; }
+        virtual bool isLoading() const { return false; }
+        virtual bool isSaving() const { return true; }
+
+    private:
+        Array<uint8> & m_buffer;
+    };
+
+
+    /// Protected input stream.
+    class NVCORE_CLASS ProtectedStream : public Stream
+    {
+        NV_FORBID_COPY(ProtectedStream);
+    public:
+
+        /// Ctor.
+        ProtectedStream( Stream & s ) : m_s(&s), m_autodelete(false)
+        { 
+        }
+
+        /// Ctor.
+        ProtectedStream( Stream * s, bool autodelete = true ) : 
+        m_s(s), m_autodelete(autodelete) 
+        {
+            nvDebugCheck(m_s != NULL);
+        }
+
+        /// Dtor.
+        virtual ~ProtectedStream()
+        {
+            if( m_autodelete ) {
+                delete m_s;
+            }
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Read data.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            len = m_s->serialize( data, len );
+
+            if( m_s->isError() ) {
+                throw;
+            }
+
+            return len;
+        }
+
+        virtual void seek( uint pos )
+        {
+            m_s->seek( pos );
+
+            if( m_s->isError() ) {
+                throw;
+            }
+        }
+
+        virtual uint tell() const
+        {
+            return m_s->tell();
+        }
+
+        virtual uint size() const
+        {
+            return m_s->size();
+        }
+
+        virtual bool isError() const
+        {
+            return m_s->isError();
+        }
+
+        virtual void clearError()
+        {
+            m_s->clearError();
+        }
+
+        virtual bool isAtEnd() const
+        {
+            return m_s->isAtEnd();
+        }
+
+        virtual bool isSeekable() const
+        {
+            return m_s->isSeekable();
+        }
+
+        virtual bool isLoading() const
+        {
+            return m_s->isLoading();
+        }
+
+        virtual bool isSaving() const
+        {
+            return m_s->isSaving();
+        }
+        //@}
+
+
+    private:
+
+        Stream * const m_s;
+        bool const m_autodelete;
+
+    };
+
+} // nv namespace
+
+
+//#endif // NV_CORE_STDSTREAM_H
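A short round trip through the classes above, writing to an in-memory buffer instead of a file; a sketch that assumes the accompanying nvcore headers (in particular array.h's buffer()/size() accessors) are on the include path:

    #include "stdstream.h"

    int main()
    {
        using namespace nv;

        // Write four bytes into a growable buffer.
        Array<uint8> buffer;
        BufferOutputStream out(buffer);
        uint8 magic[4] = { 'N', 'V', 'T', 'T' };
        out.serialize(magic, 4);

        // Read them back through a MemoryInputStream.
        MemoryInputStream in(buffer.buffer(), buffer.size());
        uint8 check[4] = { 0, 0, 0, 0 };
        in.serialize(check, 4);

        return (check[3] == 'T' && in.isAtEnd()) ? 0 : 1;
    }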

+ 163 - 0
3rdparty/nvtt/nvcore/stream.h

@@ -0,0 +1,163 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_STREAM_H
+#define NV_CORE_STREAM_H
+
+#include "nvcore.h"
+#include "debug.h"
+
+namespace nv
+{
+
+    /// Base stream class.
+    class NVCORE_CLASS Stream {
+    public:
+
+        enum ByteOrder {
+            LittleEndian = false,
+            BigEndian = true,
+        };
+
+        /// Get the byte order of the system.
+        static ByteOrder getSystemByteOrder() { 
+#if NV_LITTLE_ENDIAN
+            return LittleEndian;
+#else
+            return BigEndian;
+#endif
+        }
+
+
+        /// Ctor.
+        Stream() : m_byteOrder(LittleEndian) { }
+
+        /// Virtual destructor.
+        virtual ~Stream() {}
+
+        /// Set byte order.
+        void setByteOrder(ByteOrder bo) { m_byteOrder = bo; }
+
+        /// Get byte order.
+        ByteOrder byteOrder() const { return m_byteOrder; }
+
+
+        /// Serialize the given data.
+        virtual uint serialize( void * data, uint len ) = 0;
+
+        /// Move to the given position in the archive.
+        virtual void seek( uint pos ) = 0;
+
+        /// Return the current position in the archive.
+        virtual uint tell() const = 0;
+
+        /// Return the current size of the archive.
+        virtual uint size() const = 0;
+
+        /// Determine if there has been any error.
+        virtual bool isError() const = 0;
+
+        /// Clear errors.
+        virtual void clearError() = 0;
+
+        /// Return true if the stream is at the end.
+        virtual bool isAtEnd() const = 0;
+
+        /// Return true if the stream is seekable.
+        virtual bool isSeekable() const = 0;
+
+        /// Return true if this is an input stream.
+        virtual bool isLoading() const = 0;
+
+        /// Return true if this is an output stream.
+        virtual bool isSaving() const = 0;
+
+
+        void advance(uint offset) { seek(tell() + offset); }
+
+
+        // friends	
+        friend Stream & operator<<( Stream & s, bool & c ) {
+#if NV_OS_DARWIN && !NV_CC_CPP11
+            nvStaticCheck(sizeof(bool) == 4);
+            uint8 b = c ? 1 : 0;
+            s.serialize( &b, 1 );
+            c = (b == 1);
+#else
+            nvStaticCheck(sizeof(bool) == 1);
+            s.serialize( &c, 1 );
+#endif
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, char & c ) {
+            nvStaticCheck(sizeof(char) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, uint8 & c ) {
+            nvStaticCheck(sizeof(uint8) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, int8 & c ) {
+            nvStaticCheck(sizeof(int8) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, uint16 & c ) {
+            nvStaticCheck(sizeof(uint16) == 2);
+            return s.byteOrderSerialize( &c, 2 );
+        }
+        friend Stream & operator<<( Stream & s, int16 & c ) {
+            nvStaticCheck(sizeof(int16) == 2);
+            return s.byteOrderSerialize( &c, 2 );
+        }
+        friend Stream & operator<<( Stream & s, uint32 & c ) {
+            nvStaticCheck(sizeof(uint32) == 4);
+            return s.byteOrderSerialize( &c, 4 );
+        }
+        friend Stream & operator<<( Stream & s, int32 & c ) {
+            nvStaticCheck(sizeof(int32) == 4);
+            return s.byteOrderSerialize( &c, 4 );
+        }
+        friend Stream & operator<<( Stream & s, uint64 & c ) {
+            nvStaticCheck(sizeof(uint64) == 8);
+            return s.byteOrderSerialize( &c, 8 );
+        }
+        friend Stream & operator<<( Stream & s, int64 & c ) {
+            nvStaticCheck(sizeof(int64) == 8);
+            return s.byteOrderSerialize( &c, 8 );
+        }
+        friend Stream & operator<<( Stream & s, float & c ) {
+            nvStaticCheck(sizeof(float) == 4);
+            return s.byteOrderSerialize( &c, 4 );
+        }
+        friend Stream & operator<<( Stream & s, double & c ) {
+            nvStaticCheck(sizeof(double) == 8);
+            return s.byteOrderSerialize( &c, 8 );
+        }
+
+    protected:
+
+        /// Serialize in the stream byte order.
+        Stream & byteOrderSerialize( void * v, uint len ) {
+            if( m_byteOrder == getSystemByteOrder() ) {
+                serialize( v, len );
+            }
+            else {
+                for( uint i = len; i > 0; i-- ) {
+                    serialize( (uint8 *)v + i - 1, 1 );
+                }
+            }
+            return *this;
+        }
+
+
+    private:
+
+        ByteOrder m_byteOrder;
+
+    };
+
+} // nv namespace
+
+#endif // NV_CORE_STREAM_H
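byteOrderSerialize is what the operator<< overloads funnel through: when the stream's byte order differs from the host's, each value is serialized one byte at a time in reverse. A sketch of reading a big-endian uint32 through a MemoryInputStream (defined in stdstream.h earlier in this commit):

    #include "stdstream.h"   // MemoryInputStream; also pulls in stream.h

    int main()
    {
        using namespace nv;

        // 0x01020304 stored big-endian.
        const uint8 bytes[4] = { 0x01, 0x02, 0x03, 0x04 };

        MemoryInputStream in(bytes, 4);
        in.setByteOrder(Stream::BigEndian);

        uint32 value = 0;
        in << value;    // swapped per byte on a little-endian host, read directly on a big-endian one

        return value == 0x01020304 ? 0 : 1;
    }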

+ 429 - 0
3rdparty/nvtt/nvcore/strlib.h

@@ -0,0 +1,429 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_STRING_H
+#define NV_CORE_STRING_H
+
+#include "debug.h"
+#include "hash.h" // hash
+
+//#include <string.h> // strlen, etc.
+
+#if NV_OS_WIN32
+#define NV_PATH_SEPARATOR '\\'
+#else
+#define NV_PATH_SEPARATOR '/'
+#endif
+
+namespace nv
+{
+
+    NVCORE_API uint strHash(const char * str, uint h) NV_PURE;
+
+    /// String hash based on Bernstein's hash.
+    inline uint strHash(const char * data, uint h = 5381)
+    {
+        uint i = 0;
+        while(data[i] != 0) {
+            h = (33 * h) ^ uint(data[i]);
+            i++;
+        }
+        return h;
+    }
+
+    template <> struct Hash<const char *> {
+        uint operator()(const char * str) const { return strHash(str); }
+    };
+
+    NVCORE_API uint strLen(const char * str) NV_PURE;                       // Asserts on NULL strings.
+
+    NVCORE_API int strDiff(const char * s1, const char * s2) NV_PURE;       // Asserts on NULL strings.
+    NVCORE_API int strCaseDiff(const char * s1, const char * s2) NV_PURE;   // Asserts on NULL strings.
+    NVCORE_API bool strEqual(const char * s1, const char * s2) NV_PURE;     // Accepts NULL strings.
+    NVCORE_API bool strCaseEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings.
+
+    template <> struct Equal<const char *> {
+        bool operator()(const char * a, const char * b) const { return strEqual(a, b); }
+    };
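strHash is fully inline, so it can be evaluated without a separate implementation file, and the Hash specialization above lets raw C strings serve as keys in nv's hashed containers. A tiny sketch, assuming the nvcore headers are on the include path:

    #include "strlib.h"

    int main()
    {
        using namespace nv;

        uint a = strHash("bc6h");      // Bernstein hash with the default 5381 seed

        Hash<const char *> hasher;     // functor form used by the hashed containers
        return (hasher("bc6h") == a) ? 0 : 1;
    }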
+
+    NVCORE_API bool strBeginsWith(const char * dst, const char * prefix) NV_PURE;
+    NVCORE_API bool strEndsWith(const char * dst, const char * suffix) NV_PURE;
+
+
+    NVCORE_API void strCpy(char * dst, uint size, const char * src);
+    NVCORE_API void strCpy(char * dst, uint size, const char * src, uint len);
+    NVCORE_API void strCat(char * dst, uint size, const char * src);
+
+    NVCORE_API const char * strSkipWhiteSpace(const char * str);
+    NVCORE_API char * strSkipWhiteSpace(char * str);
+
+    NVCORE_API bool strMatch(const char * str, const char * pat) NV_PURE;
+
+    NVCORE_API bool isNumber(const char * str) NV_PURE;
+
+    /* @@ Implement these two functions and modify StringBuilder to use them?
+    NVCORE_API void strFormat(const char * dst, const char * fmt, ...);
+    NVCORE_API void strFormatList(const char * dst, const char * fmt, va_list arg);
+
+    template <size_t count> void strFormatSafe(char (&buffer)[count], const char *fmt, ...) __attribute__((format (printf, 2, 3)));
+    template <size_t count> void strFormatSafe(char (&buffer)[count], const char *fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        strFormatList(buffer, count, fmt, args);
+        va_end(args);
+    }
+    template <size_t count> void strFormatListSafe(char (&buffer)[count], const char *fmt, va_list arg) {
+        va_list tmp;
+        va_copy(tmp, args);
+        strFormatList(buffer, count, fmt, tmp);
+        va_end(tmp);
+    }*/
+
+    template <int count> void strCpySafe(char (&buffer)[count], const char *src) {
+        strCpy(buffer, count, src);
+    }
+
+    template <int count> void strCatSafe(char (&buffer)[count], const char * src) {
+        strCat(buffer, count, src);
+    }
+
+
+
+    /// String builder.
+    class NVCORE_CLASS StringBuilder
+    {
+    public:
+
+        StringBuilder();
+        explicit StringBuilder( uint size_hint );
+        StringBuilder(const char * str);
+        StringBuilder(const char * str, uint len);
+        StringBuilder(const StringBuilder & other);
+
+        ~StringBuilder();
+
+        StringBuilder & format( const char * format, ... ) __attribute__((format (printf, 2, 3)));
+        StringBuilder & formatList( const char * format, va_list arg );
+
+        StringBuilder & append(const char * str);
+		StringBuilder & append(const char * str, uint len);
+        StringBuilder & appendFormat(const char * format, ...) __attribute__((format (printf, 2, 3)));
+        StringBuilder & appendFormatList(const char * format, va_list arg);
+
+        StringBuilder & appendSpace(uint n);
+
+        StringBuilder & number( int i, int base = 10 );
+        StringBuilder & number( uint i, int base = 10 );
+
+        StringBuilder & reserve(uint size_hint);
+        StringBuilder & copy(const char * str);
+        StringBuilder & copy(const char * str, uint len);
+        StringBuilder & copy(const StringBuilder & str);
+
+        StringBuilder & toLower();
+        StringBuilder & toUpper();
+
+        bool endsWith(const char * str) const;
+        bool beginsWith(const char * str) const;
+
+        char * reverseFind(char c);
+
+        void reset();
+        bool isNull() const { return m_size == 0; }
+
+        // const char * accessors
+        //operator const char * () const { return m_str; }
+        //operator char * () { return m_str; }
+        const char * str() const { return m_str; }
+        char * str() { return m_str; }
+
+        char * release();
+
+        /// Implement value semantics.
+        StringBuilder & operator=( const StringBuilder & s ) {
+            return copy(s);
+        }
+
+        /// Implement value semantics.
+        StringBuilder & operator=( const char * s ) {
+            return copy(s);
+        }
+
+        /// Equal operator.
+        bool operator==( const StringBuilder & s ) const {
+            return strMatch(s.m_str, m_str);
+        }
+
+        /// Return the exact length.
+        uint length() const { return isNull() ? 0 : strLen(m_str); }
+
+        /// Return the size of the string container.
+        uint capacity() const { return m_size; }
+
+        /// Return the hash of the string.
+        uint hash() const { return isNull() ? 0 : strHash(m_str); }
+
+        // Swap strings.
+        friend void swap(StringBuilder & a, StringBuilder & b);
+
+    protected:
+
+        /// Size of the string container.
+        uint m_size;
+
+        /// String.
+        char * m_str;
+
+    };
+
+
+    /// Path string. @@ This should be called PathBuilder.
+    class NVCORE_CLASS Path : public StringBuilder
+    {
+    public:
+        Path() : StringBuilder() {}
+        explicit Path(int size_hint) : StringBuilder(size_hint) {}
+        Path(const char * str) : StringBuilder(str) {}
+        Path(const Path & path) : StringBuilder(path) {}
+
+        const char * fileName() const;
+        const char * extension() const;
+
+        void translatePath(char pathSeparator = NV_PATH_SEPARATOR);
+
+        void appendSeparator(char pathSeparator = NV_PATH_SEPARATOR);
+
+        void stripFileName();
+        void stripExtension();
+
+        // statics
+        NVCORE_API static char separator();
+        NVCORE_API static const char * fileName(const char *);
+        NVCORE_API static const char * extension(const char *);
+
+        NVCORE_API static void translatePath(char * path, char pathSeparator = NV_PATH_SEPARATOR);
+    };
+
+
+    /// String class.
+    class NVCORE_CLASS String
+    {
+    public:
+
+        /// Constructs a null string. @sa isNull()
+        String()
+        {
+            data = NULL;
+        }
+
+        /// Constructs a shared copy of str.
+        String(const String & str)
+        {
+            data = str.data;
+            if (data != NULL) addRef();
+        }
+
+        /// Constructs a shared string from a standard string.
+        String(const char * str)
+        {
+            setString(str);
+        }
+
+        /// Constructs a shared string from a standard string.
+        String(const char * str, int length)
+        {
+            setString(str, length);
+        }
+
+        /// Constructs a shared string from a StringBuilder.
+        String(const StringBuilder & str)
+        {
+            setString(str);
+        }
+
+        /// Dtor.
+        ~String()
+        {
+            release();
+        }
+
+        String clone() const;
+
+        /// Release the current string and allocate a new one.
+        const String & operator=( const char * str )
+        {
+            release();
+            setString( str );
+            return *this;
+        }
+
+        /// Release the current string and allocate a new one.
+        const String & operator=( const StringBuilder & str )
+        {
+            release();
+            setString( str );
+            return *this;
+        }
+
+        /// Implement value semantics.
+        String & operator=( const String & str )
+        {
+            if (str.data != data)
+            {
+                release();
+                data = str.data;
+                addRef();
+            }
+            return *this;
+        }
+
+        /// Equal operator.
+        bool operator==( const String & str ) const
+        {
+            return strMatch(str.data, data);
+        }
+
+        /// Equal operator.
+        bool operator==( const char * str ) const
+        {
+            return strMatch(str, data);
+        }
+
+        /// Not equal operator.
+        bool operator!=( const String & str ) const
+        {
+            return !strMatch(str.data, data);
+        }
+
+        /// Not equal operator.
+        bool operator!=( const char * str ) const
+        {
+            return !strMatch(str, data);
+        }
+
+        /// Returns true if this string is the null string.
+        bool isNull() const { return data == NULL; }
+
+        /// Return the exact length.
+        uint length() const { nvDebugCheck(data != NULL); return strLen(data); }
+
+        /// Return the hash of the string.
+        uint hash() const { nvDebugCheck(data != NULL); return strHash(data); }
+
+        /// const char * cast operator.
+        operator const char * () const { return data; }
+
+        /// Get string pointer.
+        const char * str() const { return data; }
+
+
+    private:
+
+        // Add reference count.
+        void addRef();
+
+        // Decrease reference count.
+        void release();
+
+        uint16 getRefCount() const
+        {
+            nvDebugCheck(data != NULL);
+            return *reinterpret_cast<const uint16 *>(data - 2);
+        }
+
+        void setRefCount(uint16 count) {
+            nvDebugCheck(data != NULL);
+            nvCheck(count < 0xFFFF);
+            *reinterpret_cast<uint16 *>(const_cast<char *>(data - 2)) = uint16(count);
+        }
+
+        void setData(const char * str) {
+            data = str + 2;
+        }
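+
+        // Shared string layout, as implied by setData/setRefCount above: a uint16
+        // reference count is stored immediately before the characters, so 'data'
+        // points two bytes into the allocation and the count lives at data - 2.
+        //
+        //     [uint16 refCount][chars ... '\0']
+        //                       ^-- data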
+
+        void allocString(const char * str)
+        {
+            allocString(str, strLen(str));
+        }
+
+        void allocString(const char * str, uint length);
+
+        void setString(const char * str);
+        void setString(const char * str, uint length);
+        void setString(const StringBuilder & str);
+
+        // Swap strings.
+        friend void swap(String & a, String & b);
+
+    private:
+
+        const char * data;
+
+    };
+
+    template <> struct Hash<String> {
+        uint operator()(const String & str) const { return str.hash(); }
+    };
+
+
+    // Like AutoPtr, but for const char strings.
+    class AutoString
+    {
+        NV_FORBID_COPY(AutoString);
+        NV_FORBID_HEAPALLOC();
+    public:
+
+        // Ctor.
+        AutoString(const char * p = NULL) : m_ptr(p) { }
+
+#if NV_CC_CPP11
+        // Move ctor.
+        AutoString(AutoString && ap) : m_ptr(ap.m_ptr) { ap.m_ptr = NULL; }
+#endif
+        
+        // Dtor. Deletes owned pointer.
+        ~AutoString() {
+            delete [] m_ptr;
+            m_ptr = NULL;
+        }
+
+        // Delete owned pointer and assign new one.
+        void operator=(const char * p) {
+            if (p != m_ptr) 
+            {
+                delete [] m_ptr;
+                m_ptr = p;
+            }
+        }
+
+        // Get pointer.
+        const char * ptr() const { return m_ptr; }
+        operator const char *() const { return m_ptr; }
+
+        // Relinquishes ownership of the underlying pointer and returns it.
+        const char * release() {
+            const char * tmp = m_ptr;
+            m_ptr = NULL;
+            return tmp;
+        }
+
+        // comparison operators.
+        friend bool operator == (const AutoString & ap, const char * const p) {
+            return (ap.ptr() == p);
+        }
+        friend bool operator != (const AutoString & ap, const char * const p) {
+            return (ap.ptr() != p);
+        }
+        friend bool operator == (const char * const p, const AutoString & ap) {
+            return (ap.ptr() == p);
+        }
+        friend bool operator != (const char * const p, const AutoString & ap) {
+            return (ap.ptr() != p);
+        }
+
+    private:
+        const char * m_ptr;
+    };
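+
+    // Usage sketch (illustrative; assumes 'copy' is a string allocated with new[]):
+    //
+    //     AutoString s(copy);          // takes ownership
+    //     doSomethingWith(s.ptr());
+    //                                  // delete[]'d automatically when s goes out of scope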
+
+} // nv namespace
+
+#endif // NV_CORE_STRING_H

+ 281 - 0
3rdparty/nvtt/nvcore/utils.h

@@ -0,0 +1,281 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_UTILS_H
+#define NV_CORE_UTILS_H
+
+#include "debug.h" // nvdebugcheck
+
+#include <new> // for placement new
+
+
+// Just in case. Grrr.
+#undef min
+#undef max
+
+#define NV_INT8_MIN    (-128)
+#define NV_INT8_MAX    127
+#define NV_UINT8_MAX    255
+#define NV_INT16_MIN    (-32767-1)
+#define NV_INT16_MAX    32767
+#define NV_UINT16_MAX   0xffff
+#define NV_INT32_MIN    (-2147483647-1)
+#define NV_INT32_MAX    2147483647
+#define NV_UINT32_MAX   0xffffffff
+#define NV_INT64_MAX    POSH_I64(9223372036854775807)
+#define NV_INT64_MIN    (-POSH_I64(9223372036854775807)-1)
+#define NV_UINT64_MAX   POSH_U64(0xffffffffffffffff)
+
+#define NV_HALF_MAX     65504.0F
+#define NV_FLOAT_MAX    3.402823466e+38F
+
+#define NV_INTEGER_TO_FLOAT_MAX  16777217     // 2^24 + 1. Every integer of smaller magnitude can be stored exactly in a 32bit float.
+
+
+namespace nv
+{
+    // Less error prone than casting. From CB:
+    // http://cbloomrants.blogspot.com/2011/06/06-17-11-c-casting-is-devil.html
+
+    // These intentionally look like casts.
+
+    // uint32 casts:
+    template <typename T> inline uint32 U32(T x) { return x; }
+    template <> inline uint32 U32<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT32_MAX); return (uint32)x; }
+    template <> inline uint32 U32<int64>(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT32_MAX); return (uint32)x; }
+    //template <> inline uint32 U32<uint32>(uint32 x) { return x; }
+    template <> inline uint32 U32<int32>(int32 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+    //template <> inline uint32 U32<uint16>(uint16 x) { return x; }
+    template <> inline uint32 U32<int16>(int16 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+    //template <> inline uint32 U32<uint8>(uint8 x) { return x; }
+    template <> inline uint32 U32<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+
+    // int32 casts:
+    template <typename T> inline int32 I32(T x) { return x; }
+    template <> inline int32 I32<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
+    template <> inline int32 I32<int64>(int64 x) { nvDebugCheck(x >= NV_INT32_MIN && x <= NV_UINT32_MAX); return (int32)x; }
+    template <> inline int32 I32<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
+    //template <> inline int32 I32<int32>(int32 x) { return x; }
+    //template <> inline int32 I32<uint16>(uint16 x) { return x; }
+    //template <> inline int32 I32<int16>(int16 x) { return x; }
+    //template <> inline int32 I32<uint8>(uint8 x) { return x; }
+    //template <> inline int32 I32<int8>(int8 x) { return x; }
+
+    // uint16 casts:
+    template <typename T> inline uint16 U16(T x) { return x; }
+    template <> inline uint16 U16<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16<int64>(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16<uint32>(uint32 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16<int32>(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; }
+    //template <> inline uint16 U16<uint16>(uint16 x) { return x; }
+    template <> inline uint16 U16<int16>(int16 x) { nvDebugCheck(x >= 0); return (uint16)x; }
+    //template <> inline uint16 U16<uint8>(uint8 x) { return x; }
+    template <> inline uint16 U16<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint16)x; }
+
+    // int16 casts:
+    template <typename T> inline int16 I16(T x) { return x; }
+    template <> inline int16 I16<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<int64>(int64 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_UINT16_MAX); return (int16)x; }
+    template <> inline int16 I16<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<int32>(int32 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_UINT16_MAX); return (int16)x; }
+    template <> inline int16 I16<uint16>(uint16 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    //template <> inline int16 I16<int16>(int16 x) { return x; }
+    //template <> inline int16 I16<uint8>(uint8 x) { return x; }
+    //template <> inline int16 I16<int8>(int8 x) { return x; }
+
+    // uint8 casts:
+    template <typename T> inline uint8 U8(T x) { return x; }
+    template <> inline uint8 U8<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<int64>(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<uint32>(uint32 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<int32>(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<uint16>(uint16 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<int16>(int16 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    //template <> inline uint8 U8<uint8>(uint8 x) { return x; }
+    template <> inline uint8 U8<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint8)x; }
+    //template <> inline uint8 U8<float>(int8 x) { nvDebugCheck(x >= 0.0f && x <= 255.0f); return (uint8)x; }
+
+    // int8 casts:
+    template <typename T> inline int8 I8(T x) { return x; }
+    template <> inline int8 I8<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int64>(int64 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int32>(int32 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint16>(uint16 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int16>(int16 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint8>(uint8 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    //template <> inline int8 I8<int8>(int8 x) { return x; }
+
+    // float casts:
+    template <typename T> inline float F32(T x) { return x; }
+    template <> inline float F32<uint64>(uint64 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32<int64>(int64 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32<uint32>(uint32 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32<int32>(int32 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    // The compiler should not complain about these conversions:
+    //template <> inline float F32<uint16>(uint16 x) { nvDebugCheck(return (float)x; }
+    //template <> inline float F32<int16>(int16 x) { nvDebugCheck(return (float)x; }
+    //template <> inline float F32<uint8>(uint8 x) { nvDebugCheck(return (float)x; }
+    //template <> inline float F32<int8>(int8 x) { nvDebugCheck(return (float)x; }
+
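+    // Usage sketch for the checked casts above (illustrative; the variables are made up):
+    //
+    //     uint64 byteCount = ...;
+    //     uint32 count32 = U32(byteCount);   // asserts in debug builds if byteCount > NV_UINT32_MAX
+    //     int32 offset = I32(count32);       // asserts if count32 > NV_INT32_MAX
+    //
+    // When asserts are disabled these behave like plain casts.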
+
+    /// Swap two values.
+    template <typename T> 
+    inline void swap(T & a, T & b)
+    {
+        T temp(a);
+        a = b; 
+        b = temp;
+    }
+
+    /// Return the maximum of the two arguments. For floating point values, it returns the second value if the first is NaN.
+    template <typename T> 
+    //inline const T & max(const T & a, const T & b)
+    inline T max(const T & a, const T & b)
+    {
+        return (b < a) ? a : b;
+    }
+
+	/// Return the maximum of the four arguments.
+	template <typename T> 
+	//inline const T & max4(const T & a, const T & b, const T & c)
+	inline T max4(const T & a, const T & b, const T & c, const T & d)
+	{
+		return max(max(a, b), max(c, d));
+	}
+
+    /// Return the maximum of the three arguments.
+    template <typename T> 
+    //inline const T & max3(const T & a, const T & b, const T & c)
+    inline T max3(const T & a, const T & b, const T & c)
+    {
+        return max(a, max(b, c));
+    }
+
+    /// Return the minimum of two values.
+    template <typename T> 
+    //inline const T & min(const T & a, const T & b)
+    inline T min(const T & a, const T & b)
+    {
+        return (a < b) ? a : b;
+    }
+
+    /// Return the minimum of the three arguments.
+    template <typename T> 
+    //inline const T & min3(const T & a, const T & b, const T & c)
+    inline T min3(const T & a, const T & b, const T & c)
+    {
+        return min(a, min(b, c));
+    }
+
+    /// Clamp between two values.
+    template <typename T> 
+    //inline const T & clamp(const T & x, const T & a, const T & b)
+    inline T clamp(const T & x, const T & a, const T & b)
+    {
+        return min(max(x, a), b);
+    }
+
+    /** Return the next power of two. 
+    * @see http://graphics.stanford.edu/~seander/bithacks.html
+    * @warning Behaviour for 0 is undefined.
+    * @note isPowerOfTwo(x) == true -> nextPowerOfTwo(x) == x
+    * @note nextPowerOfTwo(x) = 2 << log2(x-1)
+    */
+    inline uint nextPowerOfTwo( uint x )
+    {
+        nvDebugCheck( x != 0 );
+#if 1	// On modern CPUs this is supposed to be as fast as using the bsr instruction.
+        x--;
+        x |= x >> 1;
+        x |= x >> 2;
+        x |= x >> 4;
+        x |= x >> 8;
+        x |= x >> 16;
+        return x+1;	
+#else
+        uint p = 1;
+        while( x > p ) {
+            p += p;
+        }
+        return p;
+#endif
+    }
+
+    /// Return true if @a n is a power of two.
+    inline bool isPowerOfTwo( uint n )
+    {
+        return (n & (n-1)) == 0;
+    }
+
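+    // Worked examples (illustrative):
+    //
+    //     nextPowerOfTwo(17) == 32      nextPowerOfTwo(32) == 32
+    //     isPowerOfTwo(48)   == false   isPowerOfTwo(0)    == true (degenerate case)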
+
+    // @@ Move this to utils?
+    /// Delete all the elements of a container.
+    template <typename T>
+    void deleteAll(T & container)
+    {
+        for (typename T::PseudoIndex i = container.start(); !container.isDone(i); container.advance(i))
+        {
+            delete container[i];
+        }
+    }
+
+
+
+    // @@ Specialize these methods for numeric, pointer, and pod types.
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size) {
+        for (uint i = old_size; i < new_size; i++) {
+            new(ptr+i) T; // placement new
+        }
+    }
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T & elem) {
+        for (uint i = old_size; i < new_size; i++) {
+            new(ptr+i) T(elem); // placement new
+        }
+    }
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T * src) {
+        for (uint i = old_size; i < new_size; i++) {
+            new(ptr+i) T(src[i]); // placement new
+        }
+    }
+
+    template <typename T>
+    void destroy_range(T * restrict ptr, uint new_size, uint old_size) {
+        for (uint i = new_size; i < old_size; i++) {
+            (ptr+i)->~T(); // Explicit call to the destructor
+        }
+    }
+
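+    // Usage sketch for the range helpers above (illustrative): growing a raw buffer
+    // from old_size to new_size constructed elements, then shrinking it back.
+    //
+    //     construct_range(ptr, new_size, old_size);   // placement-new elements [old_size, new_size)
+    //     destroy_range(ptr, old_size, new_size);     // destroy elements [old_size, new_size) again
+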
+    template <typename T>
+    void fill(T * restrict dst, uint count, const T & value) {
+        for (uint i = 0; i < count; i++) {
+            dst[i] = value;
+        }
+    }
+
+    template <typename T>
+    void copy_range(T * restrict dst, const T * restrict src, uint count) {
+        for (uint i = 0; i < count; i++) {
+            dst[i] = src[i];
+        }
+    }
+
+    template <typename T>
+    bool find(const T & element, const T * restrict ptr, uint begin, uint end, uint * index) {
+        for (uint i = begin; i < end; i++) {
+            if (ptr[i] == element) {
+                if (index != NULL) *index = i;
+                return true;
+            }
+        }
+        return false;
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_UTILS_H

+ 921 - 0
3rdparty/nvtt/nvmath/Vector.inl

@@ -0,0 +1,921 @@
+// This code is in the public domain -- [email protected]
+
+#ifndef NV_MATH_VECTOR_INL
+#define NV_MATH_VECTOR_INL
+
+#include "vector.h"
+#include "nvcore/utils.h" // min, max
+#include "nvcore/hash.h" // hash
+
+namespace nv
+{
+
+    // Helpers to convert vector types. Assume T has x,y members and 2 argument constructor.
+    //template <typename T> T to(Vector2::Arg v) { return T(v.x, v.y); }
+
+    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
+    //template <typename T> T to(Vector3::Arg v) { return T(v.x, v.y, v.z); }
+
+    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
+    //template <typename T> T to(Vector4::Arg v) { return T(v.x, v.y, v.z, v.w); }
+
+
+    // Vector2
+    inline Vector2::Vector2() {}
+    inline Vector2::Vector2(float f) : x(f), y(f) {}
+    inline Vector2::Vector2(float x, float y) : x(x), y(y) {}
+    inline Vector2::Vector2(Vector2::Arg v) : x(v.x), y(v.y) {}
+
+    inline const Vector2 & Vector2::operator=(Vector2::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        return *this;
+    }
+
+    inline const float * Vector2::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector2::set(float x, float y)
+    {
+        this->x = x;
+        this->y = y;
+    }
+
+    inline Vector2 Vector2::operator-() const
+    {
+        return Vector2(-x, -y);
+    }
+
+    inline void Vector2::operator+=(Vector2::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+    }
+
+    inline void Vector2::operator-=(Vector2::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+    }
+
+    inline void Vector2::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+    }
+
+    inline void Vector2::operator*=(Vector2::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+    }
+
+    inline bool operator==(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x == b.x && a.y == b.y; 
+    }
+    inline bool operator!=(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x != b.x || a.y != b.y; 
+    }
+
+
+    // Vector3
+    inline Vector3::Vector3() {}
+    inline Vector3::Vector3(float f) : x(f), y(f), z(f) {}
+    inline Vector3::Vector3(float x, float y, float z) : x(x), y(y), z(z) {}
+    inline Vector3::Vector3(Vector2::Arg v, float z) : x(v.x), y(v.y), z(z) {}
+    inline Vector3::Vector3(Vector3::Arg v) : x(v.x), y(v.y), z(v.z) {}
+
+    inline const Vector3 & Vector3::operator=(Vector3::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        return *this;
+    }
+
+
+    inline Vector2 Vector3::xy() const
+    {
+        return Vector2(x, y);
+    }
+
+    inline const float * Vector3::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector3::set(float x, float y, float z)
+    {
+        this->x = x;
+        this->y = y;
+        this->z = z;
+    }
+
+    inline Vector3 Vector3::operator-() const
+    {
+        return Vector3(-x, -y, -z);
+    }
+
+    inline void Vector3::operator+=(Vector3::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+        z += v.z;
+    }
+
+    inline void Vector3::operator-=(Vector3::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+        z -= v.z;
+    }
+
+    inline void Vector3::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+        z *= s;
+    }
+
+    inline void Vector3::operator/=(float s)
+    {
+        float is = 1.0f / s;
+        x *= is;
+        y *= is;
+        z *= is;
+    }
+
+    inline void Vector3::operator*=(Vector3::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+        z *= v.z;
+    }
+
+    inline void Vector3::operator/=(Vector3::Arg v)
+    {
+        x /= v.x;
+        y /= v.y;
+        z /= v.z;
+    }
+
+    inline bool operator==(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x == b.x && a.y == b.y && a.z == b.z; 
+    }
+    inline bool operator!=(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x != b.x || a.y != b.y || a.z != b.z; 
+    }
+
+
+    // Vector4
+    inline Vector4::Vector4() {}
+    inline Vector4::Vector4(float f) : x(f), y(f), z(f), w(f) {}
+    inline Vector4::Vector4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, float z, float w) : x(v.x), y(v.y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {}
+    inline Vector4::Vector4(Vector3::Arg v, float w) : x(v.x), y(v.y), z(v.z), w(w) {}
+    inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+
+    inline const Vector4 & Vector4::operator=(const Vector4 & v)
+    {
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        w = v.w;
+        return *this;
+    }
+
+    inline Vector2 Vector4::xy() const
+    {
+        return Vector2(x, y);
+    }
+
+    inline Vector2 Vector4::zw() const
+    {
+        return Vector2(z, w);
+    }
+
+    inline Vector3 Vector4::xyz() const
+    {
+        return Vector3(x, y, z);
+    }
+
+    inline const float * Vector4::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector4::set(float x, float y, float z, float w)
+    {
+        this->x = x;
+        this->y = y;
+        this->z = z;
+        this->w = w;
+    }
+
+    inline Vector4 Vector4::operator-() const
+    {
+        return Vector4(-x, -y, -z, -w);
+    }
+
+    inline void Vector4::operator+=(Vector4::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+        z += v.z;
+        w += v.w;
+    }
+
+    inline void Vector4::operator-=(Vector4::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+        z -= v.z;
+        w -= v.w;
+    }
+
+    inline void Vector4::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+        z *= s;
+        w *= s;
+    }
+
+    inline void Vector4::operator/=(float s)
+    {
+        x /= s;
+        y /= s;
+        z /= s;
+        w /= s;
+    }
+
+    inline void Vector4::operator*=(Vector4::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+        z *= v.z;
+        w *= v.w;
+    }
+
+    inline void Vector4::operator/=(Vector4::Arg v)
+    {
+        x /= v.x;
+        y /= v.y;
+        z /= v.z;
+        w /= v.w;
+    }
+
+    inline bool operator==(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; 
+    }
+    inline bool operator!=(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; 
+    }
+
+
+
+    // Functions
+
+
+    // Vector2
+
+    inline Vector2 add(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(a.x + b.x, a.y + b.y);
+    }
+    inline Vector2 operator+(Vector2::Arg a, Vector2::Arg b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector2 sub(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(a.x - b.x, a.y - b.y);
+    }
+    inline Vector2 operator-(Vector2::Arg a, Vector2::Arg b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector2 scale(Vector2::Arg v, float s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 scale(Vector2::Arg v, Vector2::Arg s)
+    {
+        return Vector2(v.x * s.x, v.y * s.y);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v1, Vector2::Arg v2)
+    {
+        return Vector2(v1.x*v2.x, v1.y*v2.y);
+    }
+
+    inline Vector2 operator*(float s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y);
+    }
+
+    inline float dot(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x * b.x + a.y * b.y;
+    }
+
+    inline float lengthSquared(Vector2::Arg v)
+    {
+        return v.x * v.x + v.y * v.y;
+    }
+
+    inline float length(Vector2::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float distance(Vector2::Arg a, Vector2::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float inverseLength(Vector2::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector2::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector2 normalize(Vector2::Arg v, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        NV_UNUSED(epsilon);
+        nvDebugCheck(!isZero(l, epsilon));
+        Vector2 n = scale(v, 1.0f / l);
+        nvDebugCheck(isNormalized(n));
+        return n;
+    }
+
+    inline Vector2 normalizeSafe(Vector2::Arg v, Vector2::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        if (isZero(l, epsilon)) {
+            return fallback;
+        }
+        return scale(v, 1.0f / l);
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector2 normalizeFast(Vector2::Arg v)
+    {
+        const float very_small_float = 1.0e-037f;
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
+    inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon);
+    }
+
+    inline Vector2 min(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(min(a.x, b.x), min(a.y, b.y));
+    }
+
+    inline Vector2 max(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(max(a.x, b.x), max(a.y, b.y));
+    }
+
+    inline Vector2 clamp(Vector2::Arg v, float min, float max)
+    {
+        return Vector2(clamp(v.x, min, max), clamp(v.y, min, max));
+    }
+
+    inline Vector2 saturate(Vector2::Arg v)
+    {
+        return Vector2(saturate(v.x), saturate(v.y));
+    }
+
+    inline bool isFinite(Vector2::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y);
+    }
+
+    inline Vector2 validate(Vector2::Arg v, Vector2::Arg fallback = Vector2(0.0f))
+    {
+        if (!isFinite(v)) return fallback;
+        Vector2 vf = v;
+        nv::floatCleanup(vf.component, 2);
+        return vf;
+    }
+
+    // Note, this is the area scaled by 2!
+    inline float triangleArea(Vector2::Arg v0, Vector2::Arg v1)
+    {
+	    return (v0.x * v1.y - v0.y * v1.x); // * 0.5f;
+    }
+    inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
+    {
+        // IC: While it may be appealing to use the following expression:
+        //return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y); // * 0.5f;
+
+        // That's actually a terrible idea. Small triangles far from the origin can end up producing fairly large floating point
+        // numbers and the result becomes very unstable and dependent on the order of the factors.
+
+        // Instead, it's preferable to subtract the vertices first, and multiply the resulting small values together. The result
+        // in this case is always much more accurate (as long as the triangle is small) and less dependent on the location of
+        // the triangle.
+
+        //return ((a.x - c.x) * (b.y - c.y) - (a.y - c.y) * (b.x - c.x)); // * 0.5f;
+        return triangleArea(a-c, b-c);
+    }
+
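+    // For example (illustrative): with a = (1e5f, 1e5f), b = (1e5f + 1.0f, 1e5f), c = (1e5f, 1e5f + 1.0f)
+    // the doubled area is exactly 1, but the expanded form sums terms of magnitude ~1e10, where one float
+    // ulp is already ~1024, so the naive result can be off by thousands. Subtracting c first keeps every
+    // factor small and the computation exact.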
+
+    template <>
+    inline uint hash(const Vector2 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 2, h);
+    }
+
+
+
+    // Vector3
+
+    inline Vector3 add(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+    inline Vector3 add(Vector3::Arg a, float b)
+    {
+        return Vector3(a.x + b, a.y + b, a.z + b);
+    }
+    inline Vector3 operator+(Vector3::Arg a, Vector3::Arg b)
+    {
+        return add(a, b);
+    }
+    inline Vector3 operator+(Vector3::Arg a, float b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector3 sub(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
+    }
+    inline Vector3 sub(Vector3::Arg a, float b)
+    {
+        return Vector3(a.x - b, a.y - b, a.z - b);
+    }
+    inline Vector3 operator-(Vector3::Arg a, Vector3::Arg b)
+    {
+        return sub(a, b);
+    }
+    inline Vector3 operator-(Vector3::Arg a, float b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector3 cross(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, float s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, Vector3::Arg s)
+    {
+        return Vector3(v.x * s.x, v.y * s.y, v.z * s.z);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(float s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, Vector3::Arg s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, float s)
+    {
+        return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s);
+    }*/
+
+    inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector3(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z);
+    }
+
+    inline float dot(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z;
+    }
+
+    inline float lengthSquared(Vector3::Arg v)
+    {
+        return v.x * v.x + v.y * v.y + v.z * v.z;
+    }
+
+    inline float length(Vector3::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float distance(Vector3::Arg a, Vector3::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float distanceSquared(Vector3::Arg a, Vector3::Arg b)
+    {
+        return lengthSquared(a - b);
+    }
+
+    inline float inverseLength(Vector3::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector3::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector3 normalize(Vector3::Arg v, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        NV_UNUSED(epsilon);
+        nvDebugCheck(!isZero(l, epsilon));
+        Vector3 n = scale(v, 1.0f / l);
+        nvDebugCheck(isNormalized(n));
+        return n;
+    }
+
+    inline Vector3 normalizeSafe(Vector3::Arg v, Vector3::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        if (isZero(l, epsilon)) {
+            return fallback;
+        }
+        return scale(v, 1.0f / l);
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector3 normalizeFast(Vector3::Arg v)
+    {
+        const float very_small_float = 1.0e-037f;
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
+    inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon);
+    }
+
+    inline Vector3 min(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+    }
+
+    inline Vector3 max(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+    }
+
+    inline Vector3 clamp(Vector3::Arg v, float min, float max)
+    {
+        return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max));
+    }
+
+    inline Vector3 saturate(Vector3::Arg v)
+    {
+        return Vector3(saturate(v.x), saturate(v.y), saturate(v.z));
+    }
+
+    inline Vector3 floor(Vector3::Arg v)
+    {
+        return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
+    }
+
+    inline Vector3 ceil(Vector3::Arg v)
+    {
+        return Vector3(ceilf(v.x), ceilf(v.y), ceilf(v.z));
+    }
+
+    inline bool isFinite(Vector3::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z);
+    }
+
+    inline Vector3 validate(Vector3::Arg v, Vector3::Arg fallback = Vector3(0.0f))
+    {
+        if (!isFinite(v)) return fallback;
+        Vector3 vf = v;
+        nv::floatCleanup(vf.component, 3);
+        return vf;
+    }
+
+    inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n)
+    {
+	    return v - (2 * dot(v, n)) * n;
+    }
+
+    template <>
+    inline uint hash(const Vector3 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 3, h);
+    }
+
+
+    // Vector4
+
+    inline Vector4 add(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+    }
+    inline Vector4 operator+(Vector4::Arg a, Vector4::Arg b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector4 sub(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+    }
+    inline Vector4 operator-(Vector4::Arg a, Vector4::Arg b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, float s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, Vector4::Arg s)
+    {
+        return Vector4(v.x * s.x, v.y * s.y, v.z * s.z, v.w * s.w);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(float s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, Vector4::Arg s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
+    {
+        return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s);
+    }*/
+
+    inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w);
+    }
+
+    inline float dot(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+    }
+
+    inline float lengthSquared(Vector4::Arg v)
+    {
+        return v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w;
+    }
+
+    inline float length(Vector4::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float inverseLength(Vector4::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector4::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector4 normalize(Vector4::Arg v, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        NV_UNUSED(epsilon);
+        nvDebugCheck(!isZero(l, epsilon));
+        Vector4 n = scale(v, 1.0f / l);
+        nvDebugCheck(isNormalized(n));
+        return n;
+    }
+
+    inline Vector4 normalizeSafe(Vector4::Arg v, Vector4::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        if (isZero(l, epsilon)) {
+            return fallback;
+        }
+        return scale(v, 1.0f / l);
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector4 normalizeFast(Vector4::Arg v)
+    {
+        const float very_small_float = 1.0e-037f;
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
+    inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon);
+    }
+
+    inline Vector4 min(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+    }
+
+    inline Vector4 max(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+    }
+
+    inline Vector4 clamp(Vector4::Arg v, float min, float max)
+    {
+        return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max));
+    }
+
+    inline Vector4 saturate(Vector4::Arg v)
+    {
+        return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w));
+    }
+
+    inline bool isFinite(Vector4::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);
+    }
+
+    inline Vector4 validate(Vector4::Arg v, Vector4::Arg fallback = Vector4(0.0f))
+    {
+        if (!isFinite(v)) return fallback;
+        Vector4 vf = v;
+        nv::floatCleanup(vf.component, 4);
+        return vf;
+    }
+
+    template <>
+    inline uint hash(const Vector4 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 4, h);
+    }
+
+
+#if NV_OS_IOS // LLVM is not happy with implicit conversion of immediate constants to float
+
+    //int:
+
+    inline Vector2 scale(Vector2::Arg v, int s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(int s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, int s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(int s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, int s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(int s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    //double:
+
+    inline Vector3 operator*(Vector3::Arg v, double s)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator*(double s, Vector3::Arg v)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, double s)
+    {
+        return scale(v, 1.f/((float)s));
+    }    
+        
+#endif //NV_OS_IOS
+
+} // nv namespace
+
+#endif // NV_MATH_VECTOR_INL

+ 1200 - 0
3rdparty/nvtt/nvmath/fitting.cpp

@@ -0,0 +1,1200 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#include "fitting.h"
+#include "vector.inl"
+#include "plane.inl"
+#include "matrix.inl"
+
+#include "nvcore/array.inl"
+#include "nvcore/utils.h" // max, swap
+
+using namespace nv;
+
+// @@ Move to EigenSolver.h
+
+// @@ We should be able to do something cheaper...
+static Vector3 estimatePrincipalComponent(const float * __restrict matrix)
+{
+	const Vector3 row0(matrix[0], matrix[1], matrix[2]);
+	const Vector3 row1(matrix[1], matrix[3], matrix[4]);
+	const Vector3 row2(matrix[2], matrix[4], matrix[5]);
+
+	float r0 = lengthSquared(row0);
+	float r1 = lengthSquared(row1);
+	float r2 = lengthSquared(row2);
+
+	if (r0 > r1 && r0 > r2) return row0;
+	if (r1 > r2) return row1;
+	return row2;
+}
+
+
+static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix)
+{
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
+    {
+        return Vector3(0.0f);
+    }
+
+    Vector3 v = estimatePrincipalComponent(matrix);
+
+    const int NUM = 8;
+    for (int i = 0; i < NUM; i++)
+    {
+        float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
+        float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
+        float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
+
+        float norm = max(max(x, y), z);
+
+        v = Vector3(x, y, z) / norm;
+    }
+
+    return v;
+}
+
+
+Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points)
+{
+    Vector3 centroid(0.0f);
+
+    for (int i = 0; i < n; i++)
+    {
+        centroid += points[i];
+    }
+    centroid /= float(n);
+
+    return centroid;
+}
+
+Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
+{
+    Vector3 centroid(0.0f);
+    float total = 0.0f;
+
+    for (int i = 0; i < n; i++)
+    {
+        total += weights[i];
+        centroid += weights[i]*points[i];
+    }
+    centroid /= total;
+
+    return centroid;
+}
+
+Vector4 nv::Fit::computeCentroid(int n, const Vector4 *__restrict points)
+{
+    Vector4 centroid(0.0f);
+
+    for (int i = 0; i < n; i++)
+    {
+        centroid += points[i];
+    }
+    centroid /= float(n);
+
+    return centroid;
+}
+
+Vector4 nv::Fit::computeCentroid(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric)
+{
+    Vector4 centroid(0.0f);
+    float total = 0.0f;
+
+    for (int i = 0; i < n; i++)
+    {
+        total += weights[i];
+        centroid += weights[i]*points[i];
+    }
+    centroid /= total;
+
+    return centroid;
+}
+
+
+
+Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, float *__restrict covariance)
+{
+    // compute the centroid
+    Vector3 centroid = computeCentroid(n, points);
+
+    // compute covariance matrix
+    for (int i = 0; i < 6; i++)
+    {
+        covariance[i] = 0.0f;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        Vector3 v = points[i] - centroid;
+
+        covariance[0] += v.x * v.x;
+        covariance[1] += v.x * v.y;
+        covariance[2] += v.x * v.z;
+        covariance[3] += v.y * v.y;
+        covariance[4] += v.y * v.z;
+        covariance[5] += v.z * v.z;
+    }
+
+    return centroid;
+}
+
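+// The six entries written above pack the upper triangle of the symmetric 3x3 covariance
+// matrix in row-major order, which is the layout eigenSolveSymmetric3 expects:
+//
+//     | cov[0] cov[1] cov[2] |
+//     |        cov[3] cov[4] |
+//     |               cov[5] |
+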
+Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, float *__restrict covariance)
+{
+    // compute the centroid
+    Vector3 centroid = computeCentroid(n, points, weights, metric);
+
+    // compute covariance matrix
+    for (int i = 0; i < 6; i++)
+    {
+        covariance[i] = 0.0f;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        Vector3 a = (points[i] - centroid) * metric;
+        Vector3 b = weights[i]*a;
+
+        covariance[0] += a.x * b.x;
+        covariance[1] += a.x * b.y;
+        covariance[2] += a.x * b.z;
+        covariance[3] += a.y * b.y;
+        covariance[4] += a.y * b.z;
+        covariance[5] += a.z * b.z;
+    }
+
+    return centroid;
+}
+
+Vector4 nv::Fit::computeCovariance(int n, const Vector4 *__restrict points, float *__restrict covariance)
+{
+    // compute the centroid
+    Vector4 centroid = computeCentroid(n, points);
+
+    // compute covariance matrix
+    for (int i = 0; i < 10; i++)
+    {
+        covariance[i] = 0.0f;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        Vector4 v = points[i] - centroid;
+
+        covariance[0] += v.x * v.x;
+        covariance[1] += v.x * v.y;
+        covariance[2] += v.x * v.z;
+        covariance[3] += v.x * v.w;
+
+		covariance[4] += v.y * v.y;
+        covariance[5] += v.y * v.z;
+        covariance[6] += v.y * v.w;
+
+		covariance[7] += v.z * v.z;
+		covariance[8] += v.z * v.w;
+
+		covariance[9] += v.w * v.w;
+	}
+
+    return centroid;
+}
+
+Vector4 nv::Fit::computeCovariance(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric, float *__restrict covariance)
+{
+    // compute the centroid
+    Vector4 centroid = computeCentroid(n, points, weights, metric);
+
+    // compute covariance matrix
+    for (int i = 0; i < 10; i++)
+    {
+        covariance[i] = 0.0f;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        Vector4 a = (points[i] - centroid) * metric;
+        Vector4 b = weights[i]*a;
+
+        covariance[0] += a.x * b.x;
+        covariance[1] += a.x * b.y;
+        covariance[2] += a.x * b.z;
+        covariance[3] += a.x * b.w;
+
+		covariance[4] += a.y * b.y;
+        covariance[5] += a.y * b.z;
+        covariance[6] += a.y * b.w;
+
+		covariance[7] += a.z * b.z;
+		covariance[8] += a.z * b.w;
+
+		covariance[9] += a.w * b.w;
+    }
+
+    return centroid;
+}
+
+
+
+Vector3 nv::Fit::computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points)
+{
+    float matrix[6];
+    computeCovariance(n, points, matrix);
+
+    return firstEigenVector_PowerMethod(matrix);
+}
+
+Vector3 nv::Fit::computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
+{
+    float matrix[6];
+    computeCovariance(n, points, weights, metric, matrix);
+
+    return firstEigenVector_PowerMethod(matrix);
+}
+
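+// Usage sketch (illustrative): the power-method variant above returns the dominant axis of a
+// point cloud, e.g. to seed the endpoint direction for a block of colors before refinement:
+//
+//     Vector3 axis = Fit::computePrincipalComponent_PowerMethod(count, colors);
+//     // project the colors onto 'axis' around their centroid to pick initial endpoints.
+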
+
+
+static inline Vector3 firstEigenVector_EigenSolver3(const float *__restrict matrix)
+{
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
+    {
+        return Vector3(0.0f);
+    }
+
+    float eigenValues[3];
+    Vector3 eigenVectors[3];
+	if (!nv::Fit::eigenSolveSymmetric3(matrix, eigenValues, eigenVectors))
+	{
+		return Vector3(0.0f);
+	}
+
+	return eigenVectors[0];
+}
+
+Vector3 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector3 *__restrict points)
+{
+    float matrix[6];
+    computeCovariance(n, points, matrix);
+
+    return firstEigenVector_EigenSolver3(matrix);
+}
+
+Vector3 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
+{
+    float matrix[6];
+    computeCovariance(n, points, weights, metric, matrix);
+
+    return firstEigenVector_EigenSolver3(matrix);
+}
+
+
+
+static inline Vector4 firstEigenVector_EigenSolver4(const float *__restrict matrix)
+{
+    if (matrix[0] == 0 && matrix[4] == 0 && matrix[7] == 0 && matrix[9] == 0)
+    {
+        return Vector4(0.0f);
+    }
+
+    float eigenValues[4];
+    Vector4 eigenVectors[4];
+	if (!nv::Fit::eigenSolveSymmetric4(matrix, eigenValues, eigenVectors))
+	{
+		return Vector4(0.0f);
+	}
+
+	return eigenVectors[0];
+}
+
+Vector4 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector4 *__restrict points)
+{
+    float matrix[10];
+    computeCovariance(n, points, matrix);
+
+    return firstEigenVector_EigenSolver4(matrix);
+}
+
+Vector4 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric)
+{
+    float matrix[10];
+    computeCovariance(n, points, weights, metric, matrix);
+
+    return firstEigenVector_EigenSolver4(matrix);
+}
+
+
+
+void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R);
+
+Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict points)
+{
+	// Store the points in an n x n matrix
+    Array<float> Q; Q.resize(n*n, 0.0f);
+	for (int i = 0; i < n; ++i)
+	{
+		Q[i*n+0] = points[i].x;
+		Q[i*n+1] = points[i].y;
+		Q[i*n+2] = points[i].z;
+	}
+
+	// Alloc space for the SVD outputs
+    Array<float> diag; diag.resize(n, 0.0f);
+    Array<float> R; R.resize(n*n, 0.0f);
+
+	ArvoSVD(n, n, &Q[0], &diag[0], &R[0]);
+
+	// Get the principal component
+	return Vector3(R[0], R[1], R[2]);
+}
+
+Vector4 nv::Fit::computePrincipalComponent_SVD(int n, const Vector4 *__restrict points)
+{
+	// Store the points in an n x n matrix
+    Array<float> Q; Q.resize(n*n, 0.0f);
+	for (int i = 0; i < n; ++i)
+	{
+		Q[i*n+0] = points[i].x;
+		Q[i*n+1] = points[i].y;
+		Q[i*n+2] = points[i].z;
+		Q[i*n+3] = points[i].w;
+	}
+
+	// Alloc space for the SVD outputs
+    Array<float> diag; diag.resize(n, 0.0f);
+    Array<float> R; R.resize(n*n, 0.0f);
+
+	ArvoSVD(n, n, &Q[0], &diag[0], &R[0]);
+
+	// Get the principal component
+	return Vector4(R[0], R[1], R[2], R[3]);
+}
+
+
+
+Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points)
+{
+    // compute the centroid and covariance
+    float matrix[6];
+    Vector3 centroid = computeCovariance(n, points, matrix);
+
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
+    {
+        // If no plane defined, then return a horizontal plane.
+        return Plane(Vector3(0, 0, 1), centroid);
+    }
+
+    float eigenValues[3];
+    Vector3 eigenVectors[3];
+    if (!eigenSolveSymmetric3(matrix, eigenValues, eigenVectors)) {
+        // If no plane defined, then return a horizontal plane.
+        return Plane(Vector3(0, 0, 1), centroid);
+    }
+
+    return Plane(eigenVectors[2], centroid);
+}
+
+bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON*/)
+{
+    // compute the centroid and covariance
+    float matrix[6];
+    computeCovariance(n, points, matrix);
+
+    float eigenValues[3];
+    Vector3 eigenVectors[3];
+    if (!eigenSolveSymmetric3(matrix, eigenValues, eigenVectors)) {
+        return false;
+    }
+
+    return eigenValues[2] < epsilon;
+}
+
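+// Usage sketch (illustrative):
+//
+//     Plane p = Fit::bestPlane(count, points);    // normal = least-variance eigenvector
+//     if (Fit::isPlanar(count, points)) {
+//         // the smallest covariance eigenvalue is below NV_EPSILON, i.e. the points have
+//         // essentially no spread along the plane normal.
+//     }
+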
+
+
+// Tridiagonal solver from Charles Bloom. 
+// Householder transforms followed by QL decomposition. 
+// Seems to be based on the code from Numerical Recipes in C.
+
+static void EigenSolver3_Tridiagonal(float mat[3][3], float * diag, float * subd);
+static bool EigenSolver3_QLAlgorithm(float mat[3][3], float * diag, float * subd);
+
+bool nv::Fit::eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3])
+{
+    nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
+
+    float subd[3];
+    float diag[3];
+    float work[3][3];
+
+    work[0][0] = matrix[0];
+    work[0][1] = work[1][0] = matrix[1];
+    work[0][2] = work[2][0] = matrix[2];
+    work[1][1] = matrix[3];
+    work[1][2] = work[2][1] = matrix[4];
+    work[2][2] = matrix[5];
+
+    EigenSolver3_Tridiagonal(work, diag, subd);
+    if (!EigenSolver3_QLAlgorithm(work, diag, subd))
+    {
+        for (int i = 0; i < 3; i++) {
+            eigenValues[i] = 0;
+            eigenVectors[i] = Vector3(0);
+        }
+        return false;
+    }
+
+    for (int i = 0; i < 3; i++) {
+        eigenValues[i] = (float)diag[i];
+    }
+
+    // eigenvectors are the columns; make them the rows :
+
+    for (int i=0; i < 3; i++)
+    {
+        for (int j = 0; j < 3; j++)
+        {
+            eigenVectors[j].component[i] = (float) work[i][j];
+        }
+    }
+
+    // shuffle to sort by singular value :
+    if (eigenValues[2] > eigenValues[0] && eigenValues[2] > eigenValues[1])
+    {
+        swap(eigenValues[0], eigenValues[2]);
+        swap(eigenVectors[0], eigenVectors[2]);
+    }
+    if (eigenValues[1] > eigenValues[0])
+    {
+        swap(eigenValues[0], eigenValues[1]);
+        swap(eigenVectors[0], eigenVectors[1]);
+    }
+    if (eigenValues[2] > eigenValues[1])
+    {
+        swap(eigenValues[1], eigenValues[2]);
+        swap(eigenVectors[1], eigenVectors[2]);
+    }
+
+    nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2]);
+    nvDebugCheck(eigenValues[1] >= eigenValues[2]);
+
+    return true;
+}
+
+static void EigenSolver3_Tridiagonal(float mat[3][3], float * diag, float * subd)
+{
+    // Householder reduction T = Q^t M Q
+    //   Input:   
+    //     mat, symmetric 3x3 matrix M
+    //   Output:  
+    //     mat, orthogonal matrix Q
+    //     diag, diagonal entries of T
+    //     subd, subdiagonal entries of T (T is symmetric)
+    const float epsilon = 1e-08f;
+
+    float a = mat[0][0];
+    float b = mat[0][1];
+    float c = mat[0][2];
+    float d = mat[1][1];
+    float e = mat[1][2];
+    float f = mat[2][2];
+
+    diag[0] = a;
+    subd[2] = 0.f;
+    if (fabsf(c) >= epsilon)
+    {
+        const float ell = sqrtf(b*b+c*c);
+        b /= ell;
+        c /= ell;
+        const float q = 2*b*e+c*(f-d);
+        diag[1] = d+c*q;
+        diag[2] = f-c*q;
+        subd[0] = ell;
+        subd[1] = e-b*q;
+        mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
+        mat[1][0] = 0; mat[1][1] = b; mat[1][2] = c;
+        mat[2][0] = 0; mat[2][1] = c; mat[2][2] = -b;
+    }
+    else
+    {
+        diag[1] = d;
+        diag[2] = f;
+        subd[0] = b;
+        subd[1] = e;
+        mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
+        mat[1][0] = 0; mat[1][1] = 1; mat[1][2] = 0;
+        mat[2][0] = 0; mat[2][1] = 0; mat[2][2] = 1;
+    }
+}
+
+static bool EigenSolver3_QLAlgorithm(float mat[3][3], float * diag, float * subd)
+{
+    // QL iteration with implicit shifting to reduce matrix from tridiagonal
+    // to diagonal
+    const int maxiter = 32;
+
+    for (int ell = 0; ell < 3; ell++)
+    {
+        int iter;
+        for (iter = 0; iter < maxiter; iter++)
+        {
+            int m;
+            for (m = ell; m <= 1; m++)
+            {
+                float dd = fabsf(diag[m]) + fabsf(diag[m+1]);
+                if ( fabsf(subd[m]) + dd == dd )
+                    break;
+            }
+            if ( m == ell )
+                break;
+
+            float g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
+            float r = sqrtf(g*g+1);
+            if ( g < 0 )
+                g = diag[m]-diag[ell]+subd[ell]/(g-r);
+            else
+                g = diag[m]-diag[ell]+subd[ell]/(g+r);
+            float s = 1, c = 1, p = 0;
+            for (int i = m-1; i >= ell; i--)
+            {
+                float f = s*subd[i], b = c*subd[i];
+                if ( fabsf(f) >= fabsf(g) )
+                {
+                    c = g/f;
+                    r = sqrtf(c*c+1);
+                    subd[i+1] = f*r;
+                    c *= (s = 1/r);
+                }
+                else
+                {
+                    s = f/g;
+                    r = sqrtf(s*s+1);
+                    subd[i+1] = g*r;
+                    s *= (c = 1/r);
+                }
+                g = diag[i+1]-p;
+                r = (diag[i]-g)*s+2*b*c;
+                p = s*r;
+                diag[i+1] = g+p;
+                g = c*r-b;
+
+                for (int k = 0; k < 3; k++)
+                {
+                    f = mat[k][i+1];
+                    mat[k][i+1] = s*mat[k][i]+c*f;
+                    mat[k][i] = c*mat[k][i]-s*f;
+                }
+            }
+            diag[ell] -= p;
+            subd[ell] = g;
+            subd[m] = 0;
+        }
+
+        if ( iter == maxiter )
+            // should not get here under normal circumstances
+            return false;
+    }
+
+    return true;
+}
+
+
+
+// Tridiagonal solver for 4x4 symmetric matrices.
+
+static void EigenSolver4_Tridiagonal(float mat[4][4], float * diag, float * subd);
+static bool EigenSolver4_QLAlgorithm(float mat[4][4], float * diag, float * subd);
+
+bool nv::Fit::eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4])
+{
+    nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
+
+    float subd[4];
+    float diag[4];
+    float work[4][4];
+
+    work[0][0] = matrix[0];
+    work[0][1] = work[1][0] = matrix[1];
+    work[0][2] = work[2][0] = matrix[2];
+    work[0][3] = work[3][0] = matrix[3];
+    work[1][1] = matrix[4];
+    work[1][2] = work[2][1] = matrix[5];
+    work[1][3] = work[3][1] = matrix[6];
+    work[2][2] = matrix[7];
+    work[2][3] = work[3][2] = matrix[8];
+    work[3][3] = matrix[9];
+
+    EigenSolver4_Tridiagonal(work, diag, subd);
+    if (!EigenSolver4_QLAlgorithm(work, diag, subd))
+    {
+        for (int i = 0; i < 4; i++) {
+            eigenValues[i] = 0;
+            eigenVectors[i] = Vector4(0);
+        }
+        return false;
+    }
+
+    for (int i = 0; i < 4; i++) {
+        eigenValues[i] = (float)diag[i];
+    }
+
+    // eigenvectors are the columns; make them the rows
+
+    for (int i = 0; i < 4; i++)
+    {
+        for (int j = 0; j < 4; j++)
+        {
+            eigenVectors[j].component[i] = (float) work[i][j];
+        }
+    }
+
+    // sort by eigenvalue (descending)
+
+	for (int i = 0; i < 3; ++i)
+	{
+		for (int j = i+1; j < 4; ++j)
+		{
+			if (eigenValues[j] > eigenValues[i])
+			{
+				swap(eigenValues[i], eigenValues[j]);
+				swap(eigenVectors[i], eigenVectors[j]);
+			}
+		}
+	}
+
+    nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2] && eigenValues[0] >= eigenValues[3]);
+    nvDebugCheck(eigenValues[1] >= eigenValues[2] && eigenValues[1] >= eigenValues[3]);
+    nvDebugCheck(eigenValues[2] >= eigenValues[3]);
+
+    return true;
+}
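+
+// Usage sketch (illustrative, not part of the original source): the packed input to
+// eigenSolveSymmetric4 is the upper triangle of the symmetric 4x4 matrix in row-major
+// order, matching the unpacking above:
+//
+//   float cov[10];   // { m00, m01, m02, m03, m11, m12, m13, m22, m23, m33 }
+//   float values[4];
+//   Vector4 vectors[4];
+//   if (nv::Fit::eigenSolveSymmetric4(cov, values, vectors)) {
+//       // values[0] is the largest eigenvalue and vectors[0] its eigenvector.
+//   }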
+
+inline float signNonzero(float x)
+{
+	return (x >= 0.0f) ? 1.0f : -1.0f;
+}
+
+static void EigenSolver4_Tridiagonal(float mat[4][4], float * diag, float * subd)
+{
+    // Householder reduction T = Q^t M Q
+    //   Input:   
+    //     mat, symmetric 4x4 matrix M
+    //   Output:  
+    //     mat, orthogonal matrix Q
+    //     diag, diagonal entries of T
+    //     subd, subdiagonal entries of T (T is symmetric)
+
+	static const int n = 4;
+
+	// Set epsilon relative to size of elements in matrix
+	static const float relEpsilon = 1e-6f;
+	float maxElement = 0.0f;
+	for (int i = 0; i < n; ++i)
+		for (int j = 0; j < n; ++j)
+			maxElement = max(maxElement, fabsf(mat[i][j]));
+	float epsilon = relEpsilon * maxElement;
+
+	// Iterative algorithm, works for any size of matrix but might be slower than
+	// a closed-form solution for symmetric 4x4 matrices.  Based on this article:
+	// http://en.wikipedia.org/wiki/Householder_transformation#Tridiagonalization
+
+	Matrix A, Q(identity);
+	memcpy(&A, mat, sizeof(float)*n*n);
+
+	// We proceed from left to right, making the off-tridiagonal entries zero in
+	// one column of the matrix at a time.
+	for (int k = 0; k < n - 2; ++k)
+	{
+		float sum = 0.0f;
+		for (int j = k+1; j < n; ++j)
+			sum += A(j,k)*A(j,k);
+		float alpha = -signNonzero(A(k+1,k)) * sqrtf(sum);
+		float r = sqrtf(0.5f * (alpha*alpha - A(k+1,k)*alpha));
+
+		// If r is zero, skip this column - already in tridiagonal form
+		if (fabsf(r) < epsilon)
+			continue;
+
+		float v[n] = {};
+		v[k+1] = 0.5f * (A(k+1,k) - alpha) / r;
+		for (int j = k+2; j < n; ++j)
+			v[j] = 0.5f * A(j,k) / r;
+
+		Matrix P(identity);
+		for (int i = 0; i < n; ++i)
+			for (int j = 0; j < n; ++j)
+				P(i,j) -= 2.0f * v[i] * v[j];
+
+		A = mul(mul(P, A), P);
+		Q = mul(Q, P);
+	}
+
+	nvDebugCheck(fabsf(A(2,0)) < epsilon);
+	nvDebugCheck(fabsf(A(0,2)) < epsilon);
+	nvDebugCheck(fabsf(A(3,0)) < epsilon);
+	nvDebugCheck(fabsf(A(0,3)) < epsilon);
+	nvDebugCheck(fabsf(A(3,1)) < epsilon);
+	nvDebugCheck(fabsf(A(1,3)) < epsilon);
+
+	for (int i = 0; i < n; ++i)
+		diag[i] = A(i,i);
+	for (int i = 0; i < n - 1; ++i)
+		subd[i] = A(i+1,i);
+	subd[n-1] = 0.0f;
+
+	memcpy(mat, &Q, sizeof(float)*n*n);
+}
+
+static bool EigenSolver4_QLAlgorithm(float mat[4][4], float * diag, float * subd)
+{
+    // QL iteration with implicit shifting to reduce matrix from tridiagonal
+    // to diagonal
+    const int maxiter = 32;
+
+    for (int ell = 0; ell < 4; ell++)
+    {
+        int iter;
+        for (iter = 0; iter < maxiter; iter++)
+        {
+            int m;
+            for (m = ell; m < 3; m++)
+            {
+                float dd = fabsf(diag[m]) + fabsf(diag[m+1]);
+                if ( fabsf(subd[m]) + dd == dd )
+                    break;
+            }
+            if ( m == ell )
+                break;
+
+            float g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
+            float r = sqrtf(g*g+1);
+            if ( g < 0 )
+                g = diag[m]-diag[ell]+subd[ell]/(g-r);
+            else
+                g = diag[m]-diag[ell]+subd[ell]/(g+r);
+            float s = 1, c = 1, p = 0;
+            for (int i = m-1; i >= ell; i--)
+            {
+                float f = s*subd[i], b = c*subd[i];
+                if ( fabsf(f) >= fabsf(g) )
+                {
+                    c = g/f;
+                    r = sqrtf(c*c+1);
+                    subd[i+1] = f*r;
+                    c *= (s = 1/r);
+                }
+                else
+                {
+                    s = f/g;
+                    r = sqrtf(s*s+1);
+                    subd[i+1] = g*r;
+                    s *= (c = 1/r);
+                }
+                g = diag[i+1]-p;
+                r = (diag[i]-g)*s+2*b*c;
+                p = s*r;
+                diag[i+1] = g+p;
+                g = c*r-b;
+
+                for (int k = 0; k < 4; k++)
+                {
+                    f = mat[k][i+1];
+                    mat[k][i+1] = s*mat[k][i]+c*f;
+                    mat[k][i] = c*mat[k][i]-s*f;
+                }
+            }
+            diag[ell] -= p;
+            subd[ell] = g;
+            subd[m] = 0;
+        }
+
+        if ( iter == maxiter )
+            // should not get here under normal circumstances
+            return false;
+    }
+
+    return true;
+}
+
+
+
+int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster)
+{
+    // Compute principal component.
+    float matrix[6];
+    Vector3 centroid = computeCovariance(n, points, weights, metric, matrix);
+    Vector3 principal = firstEigenVector_PowerMethod(matrix);
+
+    // Pick initial solution.
+    int mini, maxi;
+    mini = maxi = 0;
+
+    float mindps, maxdps;
+    mindps = maxdps = dot(points[0] - centroid, principal);
+
+    for (int i = 1; i < n; ++i)
+    {
+        float dps = dot(points[i] - centroid, principal);
+
+        if (dps < mindps) {
+            mindps = dps;
+            mini = i;
+        }
+        else if (dps > maxdps) {
+            maxdps = dps;
+            maxi = i;
+        }
+    }
+
+    cluster[0] = centroid + mindps * principal;
+    cluster[1] = centroid + maxdps * principal;
+    cluster[2] = (2.0f * cluster[0] + cluster[1]) / 3.0f;
+    cluster[3] = (2.0f * cluster[1] + cluster[0]) / 3.0f;
+
+    // Now we have to iteratively refine the clusters.
+    while (true)
+    {
+        Vector3 newCluster[4] = { Vector3(0.0f), Vector3(0.0f), Vector3(0.0f), Vector3(0.0f) };
+        float total[4] = {0, 0, 0, 0};
+
+        for (int i = 0; i < n; ++i)
+        {
+            // Find nearest cluster.
+            int nearest = 0;
+            float mindist = FLT_MAX;
+            for (int j = 0; j < 4; j++)
+            {
+                float dist = lengthSquared((cluster[j] - points[i]) * metric);
+                if (dist < mindist)
+                {
+                    mindist = dist;
+                    nearest = j;
+                }
+            }
+
+            newCluster[nearest] += weights[i] * points[i];
+            total[nearest] += weights[i];
+        }
+
+        for (int j = 0; j < 4; j++)
+        {
+            if (total[j] != 0)
+                newCluster[j] /= total[j];
+        }
+
+        if (equal(cluster[0], newCluster[0]) && equal(cluster[1], newCluster[1]) && 
+            equal(cluster[2], newCluster[2]) && equal(cluster[3], newCluster[3]))
+        {
+            return (total[0] != 0) + (total[1] != 0) + (total[2] != 0) + (total[3] != 0);
+        }
+
+        cluster[0] = newCluster[0];
+        cluster[1] = newCluster[1];
+        cluster[2] = newCluster[2];
+        cluster[3] = newCluster[3];
+
+        // Sort clusters by weight.
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = i; j > 0 && total[j] > total[j - 1]; j--)
+            {
+                swap( total[j], total[j - 1] );
+                swap( cluster[j], cluster[j - 1] );
+            }
+        }
+    }
+}
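+
+// Usage sketch (illustrative, not part of the original source): compute4Means seeds four
+// cluster centers along the principal axis and refines them with Lloyd-style iterations
+// until the centers stop moving.
+//
+//   Vector3 clusters[4];
+//   int count = nv::Fit::compute4Means(n, points, weights, Vector3(1.0f), clusters);
+//   // count is the number of non-empty clusters, in [1, 4]; clusters are sorted by total weight.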
+
+
+
+// Adaptation of James Arvo's SVD code, as found in ZOH.
+
+inline float Sqr(float x) { return x*x; }
+
+inline float svd_pythag( float a, float b )
+{
+	float at = fabsf(a);
+	float bt = fabsf(b);
+	if( at > bt )
+		return at * sqrtf( 1.0f + Sqr( bt / at ) );
+	else if( bt > 0.0f )
+		return bt * sqrtf( 1.0f + Sqr( at / bt ) );
+	else return 0.0f;
+}
+
+inline float SameSign( float a, float b ) 
+{
+	float t;
+	if( b >= 0.0f ) t = fabsf( a );
+	else t = -fabsf( a );
+	return t;
+}
+
+void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R)
+{
+	static const int MaxIterations = 30;
+
+	int    i, j, k, l, p, q, iter;
+	float  c, f, h, s, x, y, z;
+	float  norm  = 0.0f;
+	float  g     = 0.0f;
+	float  scale = 0.0f;
+
+    Array<float> temp; temp.resize(cols, 0.0f);
+
+	for( i = 0; i < cols; i++ ) 
+	{
+		temp[i] = scale * g;
+		scale   = 0.0f;
+		g       = 0.0f;
+		s       = 0.0f;
+		l       = i + 1;
+
+		if( i < rows )
+		{
+			for( k = i; k < rows; k++ ) scale += fabsf( Q[k*cols+i] );
+			if( scale != 0.0f ) 
+			{
+				for( k = i; k < rows; k++ ) 
+				{
+					Q[k*cols+i] /= scale;
+					s += Sqr( Q[k*cols+i] );
+				}
+				f = Q[i*cols+i];
+				g = -SameSign( sqrtf(s), f );
+				h = f * g - s;
+				Q[i*cols+i] = f - g;
+				if( i != cols - 1 )
+				{
+					for( j = l; j < cols; j++ ) 
+					{
+						s = 0.0f;
+						for( k = i; k < rows; k++ ) s += Q[k*cols+i] * Q[k*cols+j];
+						f = s / h;
+						for( k = i; k < rows; k++ ) Q[k*cols+j] += f * Q[k*cols+i];
+					}
+				}
+				for( k = i; k < rows; k++ ) Q[k*cols+i] *= scale;
+			}
+		}
+
+		diag[i] = scale * g;
+		g       = 0.0f;
+		s       = 0.0f;
+		scale   = 0.0f;
+
+		if( i < rows && i != cols - 1 ) 
+		{
+			for( k = l; k < cols; k++ ) scale += fabsf( Q[i*cols+k] );
+			if( scale != 0.0f ) 
+			{
+				for( k = l; k < cols; k++ ) 
+				{
+					Q[i*cols+k] /= scale;
+					s += Sqr( Q[i*cols+k] );
+				}
+				f = Q[i*cols+l];
+				g = -SameSign( sqrtf(s), f );
+				h = f * g - s;
+				Q[i*cols+l] = f - g;
+				for( k = l; k < cols; k++ ) temp[k] = Q[i*cols+k] / h;
+				if( i != rows - 1 ) 
+				{
+					for( j = l; j < rows; j++ ) 
+					{
+						s = 0.0f;
+						for( k = l; k < cols; k++ ) s += Q[j*cols+k] * Q[i*cols+k];
+						for( k = l; k < cols; k++ ) Q[j*cols+k] += s * temp[k];
+					}
+				}
+				for( k = l; k < cols; k++ ) Q[i*cols+k] *= scale;
+			}
+		}
+		norm = max( norm, fabsf( diag[i] ) + fabsf( temp[i] ) );
+	}
+
+
+	for( i = cols - 1; i >= 0; i-- ) 
+	{
+		if( i < cols - 1 ) 
+		{
+			if( g != 0.0f ) 
+			{
+				for( j = l; j < cols; j++ ) R[i*cols+j] = ( Q[i*cols+j] / Q[i*cols+l] ) / g;
+				for( j = l; j < cols; j++ ) 
+				{
+					s = 0.0f;
+					for( k = l; k < cols; k++ ) s += Q[i*cols+k] * R[j*cols+k];
+					for( k = l; k < cols; k++ ) R[j*cols+k] += s * R[i*cols+k];
+				}
+			}
+			for( j = l; j < cols; j++ ) 
+			{
+				R[i*cols+j] = 0.0f;
+				R[j*cols+i] = 0.0f;
+			}
+		}
+		R[i*cols+i] = 1.0f;
+		g = temp[i];
+		l = i;
+	}
+
+
+	for( i = cols - 1; i >= 0; i-- ) 
+	{
+		l = i + 1;
+		g = diag[i];
+		if( i < cols - 1 ) for( j = l; j < cols; j++ ) Q[i*cols+j] = 0.0f;
+		if( g != 0.0f ) 
+		{
+			g = 1.0f / g;
+			if( i != cols - 1 ) 
+			{
+				for( j = l; j < cols; j++ ) 
+				{
+					s = 0.0f;
+					for( k = l; k < rows; k++ ) s += Q[k*cols+i] * Q[k*cols+j];
+					f = ( s / Q[i*cols+i] ) * g;
+					for( k = i; k < rows; k++ ) Q[k*cols+j] += f * Q[k*cols+i];
+				}
+			}
+			for( j = i; j < rows; j++ ) Q[j*cols+i] *= g;
+		} 
+		else 
+		{
+			for( j = i; j < rows; j++ ) Q[j*cols+i] = 0.0f;
+		}
+		Q[i*cols+i] += 1.0f;
+	}
+
+
+	for( k = cols - 1; k >= 0; k-- ) 
+	{
+		for( iter = 1; iter <= MaxIterations; iter++ ) 
+		{
+			int jump;
+
+			for( l = k; l >= 0; l-- )
+			{
+				q = l - 1;
+				if( fabsf( temp[l] ) + norm == norm ) { jump = 1; break; }
+				if( fabsf( diag[q] ) + norm == norm ) { jump = 0; break; }
+			}
+
+			if( !jump )
+			{
+				c = 0.0f;
+				s = 1.0f;
+				for( i = l; i <= k; i++ )
+				{
+					f = s * temp[i];
+					temp[i] *= c;
+					if( fabsf( f ) + norm == norm ) break;
+					g = diag[i];
+					h = svd_pythag( f, g );
+					diag[i] = h;
+					h = 1.0f / h;
+					c = g * h;
+					s = -f * h;
+					for( j = 0; j < rows; j++ ) 
+					{
+						y = Q[j*cols+q];
+						z = Q[j*cols+i];
+						Q[j*cols+q] = y * c + z * s;
+						Q[j*cols+i] = z * c - y * s;
+					}
+				}
+			}
+
+			z = diag[k];
+			if( l == k ) 
+			{
+				if( z < 0.0f ) 
+				{
+					diag[k] = -z;
+					for( j = 0; j < cols; j++ ) R[k*cols+j] *= -1.0f; 
+				}
+				break;
+			}
+			if( iter >= MaxIterations ) return;
+			x = diag[l];
+			q = k - 1;
+			y = diag[q];
+			g = temp[q];
+			h = temp[k];
+			f = ( ( y - z ) * ( y + z ) + ( g - h ) * ( g + h ) ) / ( 2.0f * h * y );
+			g = svd_pythag( f, 1.0f );
+			f = ( ( x - z ) * ( x + z ) + h * ( ( y / ( f + SameSign( g, f ) ) ) - h ) ) / x;
+			c = 1.0f;
+			s = 1.0f;
+			for( j = l; j <= q; j++ ) 
+			{
+				i = j + 1;
+				g = temp[i];
+				y = diag[i];
+				h = s * g;
+				g = c * g;
+				z = svd_pythag( f, h );
+				temp[j] = z;
+				c = f / z;
+				s = h / z;
+				f = x * c + g * s;
+				g = g * c - x * s;
+				h = y * s;
+				y = y * c;
+				for( p = 0; p < cols; p++ ) 
+				{
+					x = R[j*cols+p];
+					z = R[i*cols+p];
+					R[j*cols+p] = x * c + z * s;
+					R[i*cols+p] = z * c - x * s;
+				}
+				z = svd_pythag( f, h );
+				diag[j] = z;
+				if( z != 0.0f ) 
+				{
+					z = 1.0f / z;
+					c = f * z;
+					s = h * z;
+				}
+				f = c * g + s * y;
+				x = c * y - s * g;
+				for( p = 0; p < rows; p++ ) 
+				{
+					y = Q[p*cols+j];
+					z = Q[p*cols+i];
+					Q[p*cols+j] = y * c + z * s;
+					Q[p*cols+i] = z * c - y * s;
+				}
+			}
+			temp[l] = 0.0f;
+			temp[k] = f;
+			diag[k] = x;
+		}
+	}
+
+	// Sort the singular values into descending order.
+
+	for( i = 0; i < cols - 1; i++ )
+	{
+		float biggest = diag[i];  // Biggest singular value so far.
+		int   bindex  = i;        // The row/col it occurred in.
+		for( j = i + 1; j < cols; j++ )
+		{
+			if( diag[j] > biggest ) 
+			{
+				biggest = diag[j];
+				bindex  = j;
+			}            
+		}
+		if( bindex != i )  // Need to swap rows and columns.
+		{
+			// Swap columns in Q.
+			for (int j = 0; j < rows; ++j)
+				swap(Q[j*cols+i], Q[j*cols+bindex]);
+
+			// Swap rows in R.
+			for (int j = 0; j < cols; ++j)
+				swap(R[i*cols+j], R[bindex*cols+j]);
+
+			// Swap elements in diag.
+			swap(diag[i], diag[bindex]);
+		}
+	}
+}
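+
+// Note (illustrative, not part of the original source): ArvoSVD factors the rows x cols input
+// in place. On return, the columns of Q hold the left singular vectors, diag holds the cols
+// singular values in descending order, and the rows of R hold the corresponding right singular
+// vectors, so the input matrix M is reconstructed as M = Q * diag(diag) * R.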

+ 49 - 0
3rdparty/nvtt/nvmath/fitting.h

@@ -0,0 +1,49 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_MATH_FITTING_H
+#define NV_MATH_FITTING_H
+
+#include "vector.h"
+#include "plane.h"
+
+namespace nv
+{
+    namespace Fit
+    {
+        Vector3 computeCentroid(int n, const Vector3 * points);
+        Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+        Vector4 computeCentroid(int n, const Vector4 * points);
+        Vector4 computeCentroid(int n, const Vector4 * points, const float * weights, const Vector4 & metric);
+
+        Vector3 computeCovariance(int n, const Vector3 * points, float * covariance);
+        Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, const Vector3 & metric, float * covariance);
+
+        Vector4 computeCovariance(int n, const Vector4 * points, float * covariance);
+        Vector4 computeCovariance(int n, const Vector4 * points, const float * weights, const Vector4 & metric, float * covariance);
+
+        Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points);
+        Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+        Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points);
+        Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+        Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points);
+        Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points, const float * weights, const Vector4 & metric);
+
+        Vector3 computePrincipalComponent_SVD(int n, const Vector3 * points);
+        Vector4 computePrincipalComponent_SVD(int n, const Vector4 * points);
+
+        Plane bestPlane(int n, const Vector3 * points);
+        bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON);
+
+        bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]);
+        bool eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4]);
+
+        // Returns number of clusters [1-4].
+        int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster);
+    }
+
+} // nv namespace
+
+#endif // NV_MATH_FITTING_H

+ 112 - 0
3rdparty/nvtt/nvmath/matrix.h

@@ -0,0 +1,112 @@
+// This code is in the public domain -- [email protected]
+
+#ifndef NV_MATH_MATRIX_H
+#define NV_MATH_MATRIX_H
+
+#include "vector.h"
+
+// - Matrices are stored in memory in *column-major* order.
+// - Points are to be thought of as column vectors.
+// - Transformation of a point p by a matrix M is: p' = M * p
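+//
+// Illustrative indexing sketch (not part of the original header): with column-major
+// storage, element (row, col) of the 4x4 Matrix lives at m_data[col * 4 + row].
+// For example:
+//
+//   Matrix m(identity);
+//   m(0, 3) = 5.0f;           // writes m_data[12]; (0,3) is the x translation term
+//   Vector4 t = m.column(3);  // (5, 0, 0, 1)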
+
+namespace nv
+{
+    enum identity_t { identity };
+
+    // 3x3 matrix.
+    class NVMATH_CLASS Matrix3
+    {
+    public:
+        Matrix3();
+        explicit Matrix3(float f);
+        explicit Matrix3(identity_t);
+        Matrix3(const Matrix3 & m);
+        Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2);
+
+        float data(uint idx) const;
+        float & data(uint idx);
+        float get(uint row, uint col) const;
+        float operator()(uint row, uint col) const;
+        float & operator()(uint row, uint col);
+
+        Vector3 row(uint i) const;
+        Vector3 column(uint i) const;
+
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator+=(const Matrix3 & m);
+        void operator-=(const Matrix3 & m);
+
+        void scale(float s);
+        void scale(Vector3::Arg s);
+        float determinant() const;
+
+    private:
+        float m_data[9];
+    };
+
+    // Solve equation system using LU decomposition and back-substitution.
+    extern bool solveLU(const Matrix3 & m, const Vector3 & b, Vector3 * x);
+
+    // Solve equation system using Cramer's inverse.
+    extern bool solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x);
+
+
+    // 4x4 matrix.
+    class NVMATH_CLASS Matrix
+    {
+    public:
+        typedef Matrix const & Arg;
+
+        Matrix();
+        explicit Matrix(float f);
+        explicit Matrix(identity_t);
+        Matrix(const Matrix3 & m);
+        Matrix(const Matrix & m);
+        Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
+        //explicit Matrix(const float m[]);	// m is assumed to contain 16 elements
+
+        float data(uint idx) const;
+        float & data(uint idx);
+        float get(uint row, uint col) const;
+        float operator()(uint row, uint col) const;
+        float & operator()(uint row, uint col);
+        const float * ptr() const;
+
+        Vector4 row(uint i) const;
+        Vector4 column(uint i) const;
+
+        void zero();
+        void identity();
+
+        void scale(float s);
+        void scale(Vector3::Arg s);
+        void translate(Vector3::Arg t);
+        void rotate(float theta, float v0, float v1, float v2);
+        float determinant() const;
+
+        void operator+=(const Matrix & m);
+        void operator-=(const Matrix & m);
+
+        void apply(Matrix::Arg m);
+
+    private:
+        float m_data[16];
+    };
+
+    // Solve equation system using LU decomposition and back-substitution.
+    extern bool solveLU(const Matrix & A, const Vector4 & b, Vector4 * x);
+
+    // Solve equation system using Cramer's inverse.
+    extern bool solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x);
+
+    // Compute inverse using LU decomposition.
+    extern Matrix inverseLU(const Matrix & m);
+
+    // Compute inverse using Gaussian elimination and partial pivoting.
+    extern Matrix inverse(const Matrix & m);
+    extern Matrix3 inverse(const Matrix3 & m);
+
+} // nv namespace
+
+#endif // NV_MATH_MATRIX_H

+ 1274 - 0
3rdparty/nvtt/nvmath/matrix.inl

@@ -0,0 +1,1274 @@
+// This code is in the public domain -- [email protected]
+
+#pragma once
+#ifndef NV_MATH_MATRIX_INL
+#define NV_MATH_MATRIX_INL
+
+#include "Matrix.h"
+
+namespace nv
+{
+    inline Matrix3::Matrix3() {}
+    
+    inline Matrix3::Matrix3(float f)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] = f;
+        }
+    }
+
+    inline Matrix3::Matrix3(identity_t)
+    {
+        for(int i = 0; i < 3; i++) {
+            for(int j = 0; j < 3; j++) {
+                m_data[3*j+i] = (i == j) ? 1.0f : 0.0f;
+            }
+        }
+    }
+
+    inline Matrix3::Matrix3(const Matrix3 & m)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] = m.m_data[i];
+        }
+    }
+    
+    inline Matrix3::Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2)
+    {
+        m_data[0] = v0.x; m_data[1] = v0.y; m_data[2] = v0.z;
+        m_data[3] = v1.x; m_data[4] = v1.y; m_data[5] = v1.z;
+        m_data[6] = v2.x; m_data[7] = v2.y; m_data[8] = v2.z;
+    }
+
+    inline float Matrix3::data(uint idx) const
+    {
+        nvDebugCheck(idx < 9);
+        return m_data[idx];
+    }
+    inline float & Matrix3::data(uint idx)
+    {
+        nvDebugCheck(idx < 9);
+        return m_data[idx];
+    }
+    inline float Matrix3::get(uint row, uint col) const
+    {
+        nvDebugCheck(row < 3 && col < 3);
+        return m_data[col * 3 + row];
+    }
+    inline float Matrix3::operator()(uint row, uint col) const
+    {
+        nvDebugCheck(row < 3 && col < 3);
+        return m_data[col * 3 + row];
+    }
+    inline float & Matrix3::operator()(uint row, uint col)
+    {
+        nvDebugCheck(row < 3 && col < 3);
+        return m_data[col * 3 + row];
+    }
+
+    inline Vector3 Matrix3::row(uint i) const
+    {
+        nvDebugCheck(i < 3);
+        return Vector3(get(i, 0), get(i, 1), get(i, 2));
+    }
+    inline Vector3 Matrix3::column(uint i) const
+    {
+        nvDebugCheck(i < 3);
+        return Vector3(get(0, i), get(1, i), get(2, i));
+    }
+
+    inline void Matrix3::operator*=(float s)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] *= s;
+        }
+    }
+
+    inline void Matrix3::operator/=(float s)
+    {
+        float is = 1.0f /s;
+        for(int i = 0; i < 9; i++) {
+            m_data[i] *= is;
+        }
+    }
+
+    inline void Matrix3::operator+=(const Matrix3 & m)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] += m.m_data[i];
+        }
+    }
+
+    inline void Matrix3::operator-=(const Matrix3 & m)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] -= m.m_data[i];
+        }
+    }
+
+    inline Matrix3 operator+(const Matrix3 & a, const Matrix3 & b)
+    {
+        Matrix3 m = a;
+        m += b;
+        return m;
+    }
+
+    inline Matrix3 operator-(const Matrix3 & a, const Matrix3 & b)
+    {
+        Matrix3 m = a;
+        m -= b;
+        return m;
+    }
+
+    inline Matrix3 operator*(const Matrix3 & a, float s)
+    {
+        Matrix3 m = a;
+        m *= s;
+        return m;
+    }
+
+    inline Matrix3 operator*(float s, const Matrix3 & a)
+    {
+        Matrix3 m = a;
+        m *= s;
+        return m;
+    }
+
+    inline Matrix3 operator/(const Matrix3 & a, float s)
+    {
+        Matrix3 m = a;
+        m /= s;
+        return m;
+    }
+
+    inline Matrix3 mul(const Matrix3 & a, const Matrix3 & b)
+    {
+        Matrix3 m;
+
+        for(int i = 0; i < 3; i++) {
+            const float ai0 = a(i,0), ai1 = a(i,1), ai2 = a(i,2);
+            m(i, 0) = ai0 * b(0,0) + ai1 * b(1,0) + ai2 * b(2,0);
+            m(i, 1) = ai0 * b(0,1) + ai1 * b(1,1) + ai2 * b(2,1);
+            m(i, 2) = ai0 * b(0,2) + ai1 * b(1,2) + ai2 * b(2,2);
+        }
+
+        return m;
+    }
+
+    inline Matrix3 operator*(const Matrix3 & a, const Matrix3 & b)
+    {
+        return mul(a, b);
+    }
+
+    // Transform the given 3d vector with the given matrix.
+    inline Vector3 transform(const Matrix3 & m, const Vector3 & p)
+    {
+        return Vector3(
+            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2),
+            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2),
+            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2));
+    }
+
+    inline void Matrix3::scale(float s)
+    {
+        for (int i = 0; i < 9; i++) {
+            m_data[i] *= s;
+        }
+    }
+
+    inline void Matrix3::scale(Vector3::Arg s)
+    {
+        m_data[0] *= s.x; m_data[1] *= s.x; m_data[2] *= s.x;
+        m_data[3] *= s.y; m_data[4] *= s.y; m_data[5] *= s.y;
+        m_data[6] *= s.z; m_data[7] *= s.z; m_data[8] *= s.z;
+    }
+
+    inline float Matrix3::determinant() const
+    {
+        return 
+            get(0,0) * get(1,1) * get(2,2) + 
+            get(0,1) * get(1,2) * get(2,0) + 
+            get(0,2) * get(1,0) * get(2,1) -
+            get(0,2) * get(1,1) * get(2,0) - 
+            get(0,1) * get(1,0) * get(2,2) -
+            get(0,0) * get(1,2) * get(2,1);
+    }
+
+    // Inverse using Cramer's rule.
+    inline Matrix3 inverseCramer(const Matrix3 & m)
+    {
+        const float det = m.determinant();
+        if (equal(det, 0.0f, 0.0f)) {
+            return Matrix3(0);
+        }
+
+        Matrix3 r;
+
+        r.data(0) =  - m.data(5) * m.data(7) + m.data(4) * m.data(8);
+        r.data(1) =  + m.data(5) * m.data(6) - m.data(3) * m.data(8);
+        r.data(2) =  - m.data(4) * m.data(6) + m.data(3) * m.data(7);
+
+        r.data(3) =  + m.data(2) * m.data(7) - m.data(1) * m.data(8);
+        r.data(4) =  - m.data(2) * m.data(6) + m.data(0) * m.data(8);
+        r.data(5) =  + m.data(1) * m.data(6) - m.data(0) * m.data(7);
+
+        r.data(6) =  - m.data(2) * m.data(4) + m.data(1) * m.data(5);
+        r.data(7) =  + m.data(2) * m.data(3) - m.data(0) * m.data(5);
+        r.data(8) =  - m.data(1) * m.data(3) + m.data(0) * m.data(4);
+
+        r.scale(1.0f / det);
+
+        return r;
+    }
+
+
+
+    inline Matrix::Matrix()
+    {
+    }
+
+    inline Matrix::Matrix(float f)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] = f;
+        }
+    }
+
+    inline Matrix::Matrix(identity_t)
+    {
+        for(int i = 0; i < 4; i++) {
+            for(int j = 0; j < 4; j++) {
+                m_data[4*j+i] = (i == j) ? 1.0f : 0.0f;
+            }
+        }
+    }
+
+    inline Matrix::Matrix(const Matrix & m)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] = m.m_data[i];
+        }
+    }
+
+    inline Matrix::Matrix(const Matrix3 & m)
+    {
+        for(int i = 0; i < 3; i++) {
+            for(int j = 0; j < 3; j++) {
+                operator()(i, j) = m.get(i, j);
+            }
+        }
+        for(int i = 0; i < 4; i++) {
+            operator()(3, i) = 0;
+            operator()(i, 3) = 0;
+        }
+        operator()(3, 3) = 1.0f; // keep the homogeneous corner at 1 so the embedded 3x3 stays a valid affine transform
+    }
+
+    inline Matrix::Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3)
+    {
+        m_data[ 0] = v0.x; m_data[ 1] = v0.y; m_data[ 2] = v0.z; m_data[ 3] = v0.w;
+        m_data[ 4] = v1.x; m_data[ 5] = v1.y; m_data[ 6] = v1.z; m_data[ 7] = v1.w;
+        m_data[ 8] = v2.x; m_data[ 9] = v2.y; m_data[10] = v2.z; m_data[11] = v2.w;
+        m_data[12] = v3.x; m_data[13] = v3.y; m_data[14] = v3.z; m_data[15] = v3.w;
+    }
+
+    /*inline Matrix::Matrix(const float m[])
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] = m[i];
+        }
+    }*/
+
+
+    // Accessors
+    inline float Matrix::data(uint idx) const
+    {
+        nvDebugCheck(idx < 16);
+        return m_data[idx];
+    }
+    inline float & Matrix::data(uint idx)
+    {
+        nvDebugCheck(idx < 16);
+        return m_data[idx];
+    }
+    inline float Matrix::get(uint row, uint col) const
+    {
+        nvDebugCheck(row < 4 && col < 4);
+        return m_data[col * 4 + row];
+    }
+    inline float Matrix::operator()(uint row, uint col) const
+    {
+        nvDebugCheck(row < 4 && col < 4);
+        return m_data[col * 4 + row];
+    }
+    inline float & Matrix::operator()(uint row, uint col)
+    {
+        nvDebugCheck(row < 4 && col < 4);
+        return m_data[col * 4 + row];
+    }
+
+    inline const float * Matrix::ptr() const
+    {
+        return m_data;
+    }
+
+    inline Vector4 Matrix::row(uint i) const
+    {
+        nvDebugCheck(i < 4);
+        return Vector4(get(i, 0), get(i, 1), get(i, 2), get(i, 3));
+    }
+
+    inline Vector4 Matrix::column(uint i) const
+    {
+        nvDebugCheck(i < 4);
+        return Vector4(get(0, i), get(1, i), get(2, i), get(3, i));
+    }
+
+    inline void Matrix::zero()
+    {
+        m_data[0] = 0; m_data[1] = 0; m_data[2] = 0; m_data[3] = 0;
+        m_data[4] = 0; m_data[5] = 0; m_data[6] = 0; m_data[7] = 0;
+        m_data[8] = 0; m_data[9] = 0; m_data[10] = 0; m_data[11] = 0;
+        m_data[12] = 0; m_data[13] = 0; m_data[14] = 0; m_data[15] = 0;
+    }
+
+    inline void Matrix::identity()
+    {
+        m_data[0] = 1; m_data[1] = 0; m_data[2] = 0; m_data[3] = 0;
+        m_data[4] = 0; m_data[5] = 1; m_data[6] = 0; m_data[7] = 0;
+        m_data[8] = 0; m_data[9] = 0; m_data[10] = 1; m_data[11] = 0;
+        m_data[12] = 0; m_data[13] = 0; m_data[14] = 0; m_data[15] = 1;
+    }
+
+    // Apply scale.
+    inline void Matrix::scale(float s)
+    {
+        m_data[0] *= s; m_data[1] *= s; m_data[2] *= s; m_data[3] *= s;
+        m_data[4] *= s; m_data[5] *= s; m_data[6] *= s; m_data[7] *= s;
+        m_data[8] *= s; m_data[9] *= s; m_data[10] *= s; m_data[11] *= s;
+        m_data[12] *= s; m_data[13] *= s; m_data[14] *= s; m_data[15] *= s;
+    }
+
+    // Apply scale.
+    inline void Matrix::scale(Vector3::Arg s)
+    {
+        m_data[0] *= s.x; m_data[1] *= s.x; m_data[2] *= s.x; m_data[3] *= s.x;
+        m_data[4] *= s.y; m_data[5] *= s.y; m_data[6] *= s.y; m_data[7] *= s.y;
+        m_data[8] *= s.z; m_data[9] *= s.z; m_data[10] *= s.z; m_data[11] *= s.z;
+    }
+
+    // Apply translation.
+    inline void Matrix::translate(Vector3::Arg t)
+    {
+        m_data[12] = m_data[0] * t.x + m_data[4] * t.y + m_data[8]  * t.z + m_data[12];
+        m_data[13] = m_data[1] * t.x + m_data[5] * t.y + m_data[9]  * t.z + m_data[13];
+        m_data[14] = m_data[2] * t.x + m_data[6] * t.y + m_data[10] * t.z + m_data[14];
+        m_data[15] = m_data[3] * t.x + m_data[7] * t.y + m_data[11] * t.z + m_data[15];
+    }
+
+    Matrix rotation(float theta, float v0, float v1, float v2);
+
+    // Apply rotation.
+    inline void Matrix::rotate(float theta, float v0, float v1, float v2)
+    {
+        Matrix R(rotation(theta, v0, v1, v2));
+        apply(R);
+    }
+
+    // Apply transform.
+    inline void Matrix::apply(Matrix::Arg m)
+    {
+        nvDebugCheck(this != &m);
+
+        for(int i = 0; i < 4; i++) {
+            const float ai0 = get(i,0), ai1 = get(i,1), ai2 = get(i,2), ai3 = get(i,3);
+            m_data[0 + i] = ai0 * m(0,0) + ai1 * m(1,0) + ai2 * m(2,0) + ai3 * m(3,0);
+            m_data[4 + i] = ai0 * m(0,1) + ai1 * m(1,1) + ai2 * m(2,1) + ai3 * m(3,1);
+            m_data[8 + i] = ai0 * m(0,2) + ai1 * m(1,2) + ai2 * m(2,2) + ai3 * m(3,2);
+            m_data[12+ i] = ai0 * m(0,3) + ai1 * m(1,3) + ai2 * m(2,3) + ai3 * m(3,3);
+        }
+    }
+
+    // Get scale matrix.
+    inline Matrix scale(Vector3::Arg s)
+    {
+        Matrix m(identity);
+        m(0,0) = s.x;
+        m(1,1) = s.y;
+        m(2,2) = s.z;
+        return m;
+    }
+
+    // Get scale matrix.
+    inline Matrix scale(float s)
+    {
+        Matrix m(identity);
+        m(0,0) = m(1,1) = m(2,2) = s;
+        return m;
+    }
+
+    // Get translation matrix.
+    inline Matrix translation(Vector3::Arg t)
+    {
+        Matrix m(identity);
+        m(0,3) = t.x;
+        m(1,3) = t.y;
+        m(2,3) = t.z;
+        return m;
+    }
+
+    // Get rotation matrix.
+    inline Matrix rotation(float theta, float v0, float v1, float v2)
+    {
+        float cost = cosf(theta);
+        float sint = sinf(theta);
+
+        Matrix m(identity);
+
+        if( 1 == v0 && 0 == v1 && 0 == v2 ) {
+            m(1,1) = cost; m(2,1) = -sint;
+            m(1,2) = sint; m(2,2) = cost;
+        }
+        else if( 0 == v0  && 1 == v1 && 0 == v2 ) {
+            m(0,0) = cost; m(2,0) = sint;
+            m(0,2) = -sint; m(2,2) = cost;
+        }
+        else if( 0 == v0 && 0 == v1 && 1 == v2 ) {
+            m(0,0) = cost; m(1,0) = -sint;
+            m(0,1) = sint; m(1,1) = cost;
+        } 
+        else {
+            float a2, b2, c2;
+            a2 = v0 * v0;
+            b2 = v1 * v1;
+            c2 = v2 * v2;
+
+            float iscale = 1.0f / sqrtf(a2 + b2 + c2);
+            v0 *= iscale;
+            v1 *= iscale;
+            v2 *= iscale;
+
+            float abm, acm, bcm;
+            float mcos, asin, bsin, csin;
+            mcos = 1.0f - cost;
+            abm = v0 * v1 * mcos;
+            acm = v0 * v2 * mcos;
+            bcm = v1 * v2 * mcos;
+            asin = v0 * sint;
+            bsin = v1 * sint;
+            csin = v2 * sint;
+            m(0,0) = a2 * mcos + cost;
+            m(1,0) = abm - csin;
+            m(2,0) = acm + bsin;
+            m(0,1) = abm + csin;
+            m(1,1) = b2 * mcos + cost;
+            m(2,1) = bcm - asin;
+            m(0,2) = acm - bsin;
+            m(1,2) = bcm + asin;
+            m(2,2) = c2 * mcos + cost;
+        }
+        return m;
+    }
+
+    //Matrix rotation(float yaw, float pitch, float roll);
+    //Matrix skew(float angle, Vector3::Arg v1, Vector3::Arg v2);
+
+    // Get frustum matrix.
+    inline Matrix frustum(float xmin, float xmax, float ymin, float ymax, float zNear, float zFar)
+    {
+        Matrix m(0.0f);
+
+        float doubleznear = 2.0f * zNear;
+        float one_deltax = 1.0f / (xmax - xmin);
+        float one_deltay = 1.0f / (ymax - ymin);
+        float one_deltaz = 1.0f / (zFar - zNear);
+
+        m(0,0) = doubleznear * one_deltax;
+        m(1,1) = doubleznear * one_deltay;
+        m(0,2) = (xmax + xmin) * one_deltax;
+        m(1,2) = (ymax + ymin) * one_deltay;
+        m(2,2) = -(zFar + zNear) * one_deltaz;
+        m(3,2) = -1.0f;
+        m(2,3) = -(zFar * doubleznear) * one_deltaz;
+
+        return m;
+    }
+
+    // Get inverse frustum matrix.
+    inline Matrix frustumInverse(float xmin, float xmax, float ymin, float ymax, float zNear, float zFar)
+    {
+        Matrix m(0.0f);
+
+        float one_doubleznear = 1.0f / (2.0f * zNear);
+        float one_doubleznearzfar = 1.0f / (2.0f * zNear * zFar);
+
+        m(0,0) = (xmax - xmin) * one_doubleznear;
+        m(0,3) = (xmax + xmin) * one_doubleznear;
+        m(1,1) = (ymax - ymin) * one_doubleznear;
+        m(1,3) = (ymax + ymin) * one_doubleznear;
+        m(2,3) = -1;
+        m(3,2) = -(zFar - zNear) * one_doubleznearzfar;
+        m(3,3) = (zFar + zNear) * one_doubleznearzfar;
+
+        return m;
+    }
+
+    // Get infinite frustum matrix.
+    inline Matrix frustum(float xmin, float xmax, float ymin, float ymax, float zNear)
+    {
+        Matrix m(0.0f);
+
+        float doubleznear = 2.0f * zNear;
+        float one_deltax = 1.0f / (xmax - xmin);
+        float one_deltay = 1.0f / (ymax - ymin);
+        float nudge = 1.0f; // 0.999f;
+
+        m(0,0) = doubleznear * one_deltax;
+        m(1,1) = doubleznear * one_deltay;
+        m(0,2) = (xmax + xmin) * one_deltax;
+        m(1,2) = (ymax + ymin) * one_deltay;
+        m(2,2) = -1.0f * nudge;
+        m(3,2) = -1.0f;
+        m(2,3) = -doubleznear * nudge;
+
+        return m;
+    }
+
+    // Get perspective matrix.
+    inline Matrix perspective(float fovy, float aspect, float zNear, float zFar)
+    {
+        float xmax = zNear * tanf(fovy / 2);
+        float xmin = -xmax;
+
+        float ymax = xmax / aspect;
+        float ymin = -ymax;
+
+        return frustum(xmin, xmax, ymin, ymax, zNear, zFar);	
+    }
+
+    // Get inverse perspective matrix.
+    inline Matrix perspectiveInverse(float fovy, float aspect, float zNear, float zFar)
+    {
+        float xmax = zNear * tanf(fovy / 2);
+        float xmin = -xmax;
+
+        float ymax = xmax / aspect;
+        float ymin = -ymax;
+
+        return frustumInverse(xmin, xmax, ymin, ymax, zNear, zFar);	
+    }
+
+    // Get infinite perspective matrix.
+    inline Matrix perspective(float fovy, float aspect, float zNear)
+    {
+        float x = zNear * tanf(fovy / 2);
+        float y = x / aspect;
+        return frustum( -x, x, -y, y, zNear );	
+    }
+
+    // Get matrix determinant.
+    inline float Matrix::determinant() const
+    {
+        return 
+            m_data[3] * m_data[6] * m_data[ 9] * m_data[12] - m_data[2] * m_data[7] * m_data[ 9] * m_data[12] - m_data[3] * m_data[5] * m_data[10] * m_data[12] + m_data[1] * m_data[7] * m_data[10] * m_data[12] +
+            m_data[2] * m_data[5] * m_data[11] * m_data[12] - m_data[1] * m_data[6] * m_data[11] * m_data[12] - m_data[3] * m_data[6] * m_data[ 8] * m_data[13] + m_data[2] * m_data[7] * m_data[ 8] * m_data[13] +
+            m_data[3] * m_data[4] * m_data[10] * m_data[13] - m_data[0] * m_data[7] * m_data[10] * m_data[13] - m_data[2] * m_data[4] * m_data[11] * m_data[13] + m_data[0] * m_data[6] * m_data[11] * m_data[13] +
+            m_data[3] * m_data[5] * m_data[ 8] * m_data[14] - m_data[1] * m_data[7] * m_data[ 8] * m_data[14] - m_data[3] * m_data[4] * m_data[ 9] * m_data[14] + m_data[0] * m_data[7] * m_data[ 9] * m_data[14] +
+            m_data[1] * m_data[4] * m_data[11] * m_data[14] - m_data[0] * m_data[5] * m_data[11] * m_data[14] - m_data[2] * m_data[5] * m_data[ 8] * m_data[15] + m_data[1] * m_data[6] * m_data[ 8] * m_data[15] +
+            m_data[2] * m_data[4] * m_data[ 9] * m_data[15] - m_data[0] * m_data[6] * m_data[ 9] * m_data[15] - m_data[1] * m_data[4] * m_data[10] * m_data[15] + m_data[0] * m_data[5] * m_data[10] * m_data[15];
+    }
+
+    inline Matrix transpose(Matrix::Arg m)
+    {
+        Matrix r;
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                r(i, j) = m(j, i);
+            }
+        }
+        return r;
+    }
+
+    // Inverse using Cramer's rule.
+    inline Matrix inverseCramer(Matrix::Arg m)
+    {
+        Matrix r;
+        r.data( 0) = m.data(6)*m.data(11)*m.data(13) - m.data(7)*m.data(10)*m.data(13) + m.data(7)*m.data(9)*m.data(14) - m.data(5)*m.data(11)*m.data(14) - m.data(6)*m.data(9)*m.data(15) + m.data(5)*m.data(10)*m.data(15);
+        r.data( 1) = m.data(3)*m.data(10)*m.data(13) - m.data(2)*m.data(11)*m.data(13) - m.data(3)*m.data(9)*m.data(14) + m.data(1)*m.data(11)*m.data(14) + m.data(2)*m.data(9)*m.data(15) - m.data(1)*m.data(10)*m.data(15);
+        r.data( 2) = m.data(2)*m.data( 7)*m.data(13) - m.data(3)*m.data( 6)*m.data(13) + m.data(3)*m.data(5)*m.data(14) - m.data(1)*m.data( 7)*m.data(14) - m.data(2)*m.data(5)*m.data(15) + m.data(1)*m.data( 6)*m.data(15);
+        r.data( 3) = m.data(3)*m.data( 6)*m.data( 9) - m.data(2)*m.data( 7)*m.data( 9) - m.data(3)*m.data(5)*m.data(10) + m.data(1)*m.data( 7)*m.data(10) + m.data(2)*m.data(5)*m.data(11) - m.data(1)*m.data( 6)*m.data(11);
+        r.data( 4) = m.data(7)*m.data(10)*m.data(12) - m.data(6)*m.data(11)*m.data(12) - m.data(7)*m.data(8)*m.data(14) + m.data(4)*m.data(11)*m.data(14) + m.data(6)*m.data(8)*m.data(15) - m.data(4)*m.data(10)*m.data(15);
+        r.data( 5) = m.data(2)*m.data(11)*m.data(12) - m.data(3)*m.data(10)*m.data(12) + m.data(3)*m.data(8)*m.data(14) - m.data(0)*m.data(11)*m.data(14) - m.data(2)*m.data(8)*m.data(15) + m.data(0)*m.data(10)*m.data(15);
+        r.data( 6) = m.data(3)*m.data( 6)*m.data(12) - m.data(2)*m.data( 7)*m.data(12) - m.data(3)*m.data(4)*m.data(14) + m.data(0)*m.data( 7)*m.data(14) + m.data(2)*m.data(4)*m.data(15) - m.data(0)*m.data( 6)*m.data(15);
+        r.data( 7) = m.data(2)*m.data( 7)*m.data( 8) - m.data(3)*m.data( 6)*m.data( 8) + m.data(3)*m.data(4)*m.data(10) - m.data(0)*m.data( 7)*m.data(10) - m.data(2)*m.data(4)*m.data(11) + m.data(0)*m.data( 6)*m.data(11);
+        r.data( 8) = m.data(5)*m.data(11)*m.data(12) - m.data(7)*m.data( 9)*m.data(12) + m.data(7)*m.data(8)*m.data(13) - m.data(4)*m.data(11)*m.data(13) - m.data(5)*m.data(8)*m.data(15) + m.data(4)*m.data( 9)*m.data(15);
+        r.data( 9) = m.data(3)*m.data( 9)*m.data(12) - m.data(1)*m.data(11)*m.data(12) - m.data(3)*m.data(8)*m.data(13) + m.data(0)*m.data(11)*m.data(13) + m.data(1)*m.data(8)*m.data(15) - m.data(0)*m.data( 9)*m.data(15);
+        r.data(10) = m.data(1)*m.data( 7)*m.data(12) - m.data(3)*m.data( 5)*m.data(12) + m.data(3)*m.data(4)*m.data(13) - m.data(0)*m.data( 7)*m.data(13) - m.data(1)*m.data(4)*m.data(15) + m.data(0)*m.data( 5)*m.data(15);
+        r.data(11) = m.data(3)*m.data( 5)*m.data( 8) - m.data(1)*m.data( 7)*m.data( 8) - m.data(3)*m.data(4)*m.data( 9) + m.data(0)*m.data( 7)*m.data( 9) + m.data(1)*m.data(4)*m.data(11) - m.data(0)*m.data( 5)*m.data(11);
+        r.data(12) = m.data(6)*m.data( 9)*m.data(12) - m.data(5)*m.data(10)*m.data(12) - m.data(6)*m.data(8)*m.data(13) + m.data(4)*m.data(10)*m.data(13) + m.data(5)*m.data(8)*m.data(14) - m.data(4)*m.data( 9)*m.data(14);
+        r.data(13) = m.data(1)*m.data(10)*m.data(12) - m.data(2)*m.data( 9)*m.data(12) + m.data(2)*m.data(8)*m.data(13) - m.data(0)*m.data(10)*m.data(13) - m.data(1)*m.data(8)*m.data(14) + m.data(0)*m.data( 9)*m.data(14);
+        r.data(14) = m.data(2)*m.data( 5)*m.data(12) - m.data(1)*m.data( 6)*m.data(12) - m.data(2)*m.data(4)*m.data(13) + m.data(0)*m.data( 6)*m.data(13) + m.data(1)*m.data(4)*m.data(14) - m.data(0)*m.data( 5)*m.data(14);
+        r.data(15) = m.data(1)*m.data( 6)*m.data( 8) - m.data(2)*m.data( 5)*m.data( 8) + m.data(2)*m.data(4)*m.data( 9) - m.data(0)*m.data( 6)*m.data( 9) - m.data(1)*m.data(4)*m.data(10) + m.data(0)*m.data( 5)*m.data(10);
+        r.scale(1.0f / m.determinant());
+        return r;
+    }
+
+    inline Matrix isometryInverse(Matrix::Arg m)
+    {
+        Matrix r(identity);
+
+        // transposed 3x3 upper left matrix
+        for (int i = 0; i < 3; i++)
+        {
+            for (int j = 0; j < 3; j++)
+            {
+                r(i, j) = m(j, i);
+            }
+        }
+
+        // translate by the negative offsets
+        r.translate(-Vector3(m.data(12), m.data(13), m.data(14)));
+
+        return r;
+    }
+
+    // Transform the given 3d point with the given matrix.
+    inline Vector3 transformPoint(Matrix::Arg m, Vector3::Arg p)
+    {
+        return Vector3(
+            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + m(0,3),
+            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + m(1,3),
+            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + m(2,3));
+    }
+
+    // Transform the given 3d vector with the given matrix.
+    inline Vector3 transformVector(Matrix::Arg m, Vector3::Arg p)
+    {
+        return Vector3(
+            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2),
+            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2),
+            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2));
+    }
+
+    // Transform the given 4d vector with the given matrix.
+    inline Vector4 transform(Matrix::Arg m, Vector4::Arg p)
+    {
+        return Vector4(
+            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + p.w * m(0,3),
+            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + p.w * m(1,3),
+            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + p.w * m(2,3),
+            p.x * m(3,0) + p.y * m(3,1) + p.z * m(3,2) + p.w * m(3,3));
+    }
+
+    inline Matrix mul(Matrix::Arg a, Matrix::Arg b)
+    {
+        // @@ Is this the right order? mul(a, b) = b * a
+        Matrix m = a;
+        m.apply(b);
+        return m;
+    }
+
+    inline void Matrix::operator+=(const Matrix & m)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] += m.m_data[i];
+        }
+    }
+
+    inline void Matrix::operator-=(const Matrix & m)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] -= m.m_data[i];
+        }
+    }
+
+    inline Matrix operator+(const Matrix & a, const Matrix & b)
+    {
+        Matrix m = a;
+        m += b;
+        return m;
+    }
+
+    inline Matrix operator-(const Matrix & a, const Matrix & b)
+    {
+        Matrix m = a;
+        m -= b;
+        return m;
+    }
+
+
+} // nv namespace
+
+
+#if 0 // old code.
+/** @name Special matrices. */
+//@{
+/** Generate a translation matrix. */
+void TranslationMatrix(const Vec3 & v) {
+    data[0] = 1; data[1] = 0; data[2] = 0; data[3] = 0;
+    data[4] = 0; data[5] = 1; data[6] = 0; data[7] = 0;
+    data[8] = 0; data[9] = 0; data[10] = 1; data[11] = 0;
+    data[12] = v.x; data[13] = v.y; data[14] = v.z; data[15] = 1;
+}
+
+/** Rotate theta degrees around v. */
+void RotationMatrix( float theta, float v0, float v1, float v2 ) {
+    float cost = cos(theta);
+    float sint = sin(theta);
+
+    if( 1 == v0 && 0 == v1 && 0 == v2 ) {
+        data[0] = 1.0f;	data[1] = 0.0f;	data[2] = 0.0f;	data[3] = 0.0f;
+        data[4] = 0.0f;	data[5] = cost;	data[6] = -sint;data[7] = 0.0f;
+        data[8] = 0.0f;	data[9] = sint;	data[10] = cost;data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    }
+    else if( 0 == v0  && 1 == v1 && 0 == v2 ) {
+        data[0] = cost;	data[1] = 0.0f;	data[2] = sint;	data[3] = 0.0f;
+        data[4] = 0.0f;	data[5] = 1.0f;	data[6] = 0.0f;	data[7] = 0.0f;
+        data[8] = -sint;data[9] = 0.0f;data[10] = cost;	data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    }
+    else if( 0 == v0 && 0 == v1 && 1 == v2 ) {
+        data[0] = cost;	data[1] = -sint;data[2] = 0.0f;	data[3] = 0.0f;
+        data[4] = sint; data[5] = cost;	data[6] = 0.0f;	data[7] = 0.0f;
+        data[8] = 0.0f;	data[9] = 0.0f;	data[10] = 1.0f;data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    } 
+    else {
+        //we need scale a,b,c to unit length.
+        float a2, b2, c2;
+        a2 = v0 * v0;
+        b2 = v1 * v1;
+        c2 = v2 * v2;
+
+        float iscale = 1.0f / sqrtf(a2 + b2 + c2);
+        v0 *= iscale;
+        v1 *= iscale;
+        v2 *= iscale;
+
+        float abm, acm, bcm;
+        float mcos, asin, bsin, csin;
+        mcos = 1.0f - cost;
+        abm = v0 * v1 * mcos;
+        acm = v0 * v2 * mcos;
+        bcm = v1 * v2 * mcos;
+        asin = v0 * sint;
+        bsin = v1 * sint;
+        csin = v2 * sint;
+        data[0] = a2 * mcos + cost;
+        data[1] = abm - csin;
+        data[2] = acm + bsin;
+        data[3] = abm + csin;
+        data[4] = 0.0f;
+        data[5] = b2 * mcos + cost;
+        data[6] = bcm - asin;
+        data[7] = acm - bsin;
+        data[8] = 0.0f;
+        data[9] = bcm + asin;
+        data[10] = c2 * mcos + cost;
+        data[11] = 0.0f;
+        data[12] = 0.0f;
+        data[13] = 0.0f;
+        data[14] = 0.0f;
+        data[15] = 1.0f;
+    }
+}
+
+/*
+void SkewMatrix(float angle, const Vec3 & v1, const Vec3 & v2) {
+v1.Normalize();
+v2.Normalize();
+
+Vec3 v3;
+v3.Cross(v1, v2);
+v3.Normalize();
+
+// Get skew factor.
+float costheta = Vec3DotProduct(v1, v2);
+float sintheta = Real.Sqrt(1 - costheta * costheta);
+float skew = tan(Trig.DegreesToRadians(angle) + acos(sintheta)) * sintheta - costheta;
+
+// Build orthonormal matrix.
+v1 = FXVector3.Cross(v3, v2);
+v1.Normalize();
+
+Matrix R = Matrix::Identity;
+R[0, 0] = v3.X; // Not sure this is in the correct order...
+R[1, 0] = v3.Y;
+R[2, 0] = v3.Z;
+R[0, 1] = v1.X;
+R[1, 1] = v1.Y;
+R[2, 1] = v1.Z;
+R[0, 2] = v2.X;
+R[1, 2] = v2.Y;
+R[2, 2] = v2.Z;
+
+// Build skew matrix.
+Matrix S = Matrix::Identity;
+S[2, 1] = -skew;
+
+// Return skew transform.
+return R * S * R.Transpose;	// Not sure this is in the correct order...
+}
+*/
+
+/**
+* Generate rotation matrix for the euler angles. This is the same as computing
+* 3 rotation matrices and multiplying them together in our custom order.
+*
+* @todo Have to recompute this code for our new convention.
+**/
+void RotationMatrix( float yaw, float pitch, float roll ) {
+    float sy = sin(yaw+ToRadian(90));
+    float cy = cos(yaw+ToRadian(90));
+    float sp = sin(pitch-ToRadian(90));
+    float cp = cos(pitch-ToRadian(90));
+    float sr = sin(roll);
+    float cr = cos(roll);
+
+    data[0] = cr*cy + sr*sp*sy;
+    data[1] = cp*sy;
+    data[2] = -sr*cy + cr*sp*sy;
+    data[3] = 0;
+
+    data[4] = -cr*sy + sr*sp*cy;
+    data[5] = cp*cy;
+    data[6] = sr*sy + cr*sp*cy;
+    data[7] = 0;
+
+    data[8] = sr*cp;
+    data[9] = -sp;
+    data[10] = cr*cp;
+    data[11] = 0;
+
+    data[12] = 0;
+    data[13] = 0;
+    data[14] = 0;
+    data[15] = 1;
+}
+
+/** Create a frustum matrix with the far plane at the infinity. */
+void Frustum( float xmin, float xmax, float ymin, float ymax, float zNear, float zFar ) {
+    float one_deltax, one_deltay, one_deltaz, doubleznear;
+
+    doubleznear = 2.0f * zNear;
+    one_deltax = 1.0f / (xmax - xmin);
+    one_deltay = 1.0f / (ymax - ymin);
+    one_deltaz = 1.0f / (zFar - zNear);
+
+    data[0] = (float)(doubleznear * one_deltax);
+    data[1] = 0.0f;
+    data[2] = 0.0f;
+    data[3] = 0.0f;
+    data[4] = 0.0f;
+    data[5] = (float)(doubleznear * one_deltay);
+    data[6] = 0.f;
+    data[7] = 0.f;
+    data[8] = (float)((xmax + xmin) * one_deltax);
+    data[9] = (float)((ymax + ymin) * one_deltay);
+    data[10] = (float)(-(zFar + zNear) * one_deltaz);
+    data[11] = -1.f;
+    data[12] = 0.f;
+    data[13] = 0.f;
+    data[14] = (float)(-(zFar * doubleznear) * one_deltaz);
+    data[15] = 0.f;
+}
+
+/** Create a frustum matrix with the far plane at the infinity. */
+void FrustumInf( float xmin, float xmax, float ymin, float ymax, float zNear ) {
+    float one_deltax, one_deltay, doubleznear, nudge;
+
+    doubleznear = 2.0f * zNear;
+    one_deltax = 1.0f / (xmax - xmin);
+    one_deltay = 1.0f / (ymax - ymin);
+    nudge = 1.0; // 0.999;
+
+    data[0] = doubleznear * one_deltax;
+    data[1] = 0.0f;
+    data[2] = 0.0f;
+    data[3] = 0.0f;
+
+    data[4] = 0.0f;
+    data[5] = doubleznear * one_deltay;
+    data[6] = 0.f;
+    data[7] = 0.f;
+
+    data[8] = (xmax + xmin) * one_deltax;
+    data[9] = (ymax + ymin) * one_deltay;
+    data[10] = -1.0f * nudge;
+    data[11] = -1.0f;
+
+    data[12] = 0.f;
+    data[13] = 0.f;
+    data[14] = -doubleznear * nudge;
+    data[15] = 0.f;
+}
+
+/** Create an inverse frustum matrix with the far plane at the infinity. */
+void FrustumInfInv( float left, float right, float bottom, float top, float zNear ) {
+    // this matrix is wrong (not really tested); I think it should be transposed.
+    data[0] = (right - left) / (2 * zNear);
+    data[1] = 0;
+    data[2] = 0;
+    data[3] = (right + left) / (2 * zNear);
+    data[4] = 0;
+    data[5] = (top - bottom) / (2 * zNear);
+    data[6] = 0;
+    data[7] = (top + bottom) / (2 * zNear);
+    data[8] = 0;
+    data[9] = 0;
+    data[10] = 0;
+    data[11] = -1;
+    data[12] = 0;
+    data[13] = 0;
+    data[14] = -1 / (2 * zNear);
+    data[15] = 1 / (2 * zNear);
+}
+
+/** Create an homogeneous projection matrix. */
+void Perspective( float fov, float aspect, float zNear, float zFar ) {
+    float xmin, xmax, ymin, ymax;
+
+    xmax = zNear * tan( fov/2 );
+    xmin = -xmax;
+
+    ymax = xmax / aspect;
+    ymin = -ymax;
+
+    Frustum(xmin, xmax, ymin, ymax, zNear, zFar);
+}
+
+/** Create a projection matrix with the far plane at the infinity. */
+void PerspectiveInf( float fov, float aspect, float zNear ) {
+    float x = zNear * tan( fov/2 );
+    float y = x / aspect;
+    FrustumInf( -x, x, -y, y, zNear );
+}
+
+/** Create an inverse projection matrix with far plane at the infinity. */
+void PerspectiveInfInv( float fov, float aspect, float zNear ) {
+    float x = zNear * tan( fov/2 );
+    float y = x / aspect;
+    FrustumInfInv( -x, x, -y, y, zNear );
+}
+
+/** Build bone matrix from quatertion and offset. */
+void BoneMatrix(const Quat & q, const Vec3 & offset) {
+    float x2, y2, z2, xx, xy, xz, yy, yz, zz, wx, wy, wz;
+
+    // calculate coefficients
+    x2 = q.x + q.x;
+    y2 = q.y + q.y;
+    z2 = q.z + q.z;
+
+    xx = q.x * x2;   xy = q.x * y2;   xz = q.x * z2;
+    yy = q.y * y2;   yz = q.y * z2;   zz = q.z * z2;
+    wx = q.w * x2;   wy = q.w * y2;   wz = q.w * z2;
+
+    data[0] = 1.0f - (yy + zz); 	
+    data[1] = xy - wz;
+    data[2] = xz + wy;		
+    data[3] = 0.0f;
+
+    data[4] = xy + wz;		
+    data[5] = 1.0f - (xx + zz);
+    data[6] = yz - wx;		
+    data[7] = 0.0f;
+
+    data[8] = xz - wy;		
+    data[9] = yz + wx;
+    data[10] = 1.0f - (xx + yy);		
+    data[11] = 0.0f;
+
+    data[12] = offset.x;
+    data[13] = offset.y;
+    data[14] = offset.z;			
+    data[15] = 1.0f;
+}
+
+//@}
+
+
+/** @name Transformations: */
+//@{
+
+/** Apply a general scale. */
+void Scale( float x, float y, float z ) {
+    data[0] *= x;	data[4] *= y;	data[8]  *= z;
+    data[1] *= x;	data[5] *= y;	data[9]  *= z;
+    data[2] *= x;	data[6] *= y;	data[10] *= z;
+    data[3] *= x;	data[7] *= y;	data[11] *= z;
+}
+
+/** Apply a rotation of theta degrees around the axis v*/
+void Rotate( float theta, const Vec3 & v ) {
+    Matrix b;
+    b.RotationMatrix( theta, v[0], v[1], v[2] );
+    Multiply4x3( b );
+}
+
+/** Apply a rotation of theta degrees around the axis v*/
+void Rotate( float theta, float v0, float v1, float v2 ) {
+    Matrix b;
+    b.RotationMatrix( theta, v0, v1, v2 );
+    Multiply4x3( b );
+}
+
+/**
+* Translate the matrix by t. This is the same as multiplying by a
+* translation matrix with the given offset.
+* this = T * this
+*/
+void Translate( const Vec3 &t ) {
+    data[12] = data[0] * t.x + data[4] * t.y + data[8]  * t.z + data[12];
+    data[13] = data[1] * t.x + data[5] * t.y + data[9]  * t.z + data[13];
+    data[14] = data[2] * t.x + data[6] * t.y + data[10] * t.z + data[14];
+    data[15] = data[3] * t.x + data[7] * t.y + data[11] * t.z + data[15];
+}
+
+/** 
+* Translate the matrix by x, y, z. This is the same as multiplying by a 
+* translation matrix with the given offsets.
+*/
+void Translate( float x, float y, float z ) {
+    data[12] = data[0] * x + data[4] * y + data[8]  * z + data[12];
+    data[13] = data[1] * x + data[5] * y + data[9]  * z + data[13];
+    data[14] = data[2] * x + data[6] * y + data[10] * z + data[14];
+    data[15] = data[3] * x + data[7] * y + data[11] * z + data[15];
+}
+
+/** Compute the transposed matrix. */
+void Transpose() {
+    piSwap(data[1], data[4]);
+    piSwap(data[2], data[8]);
+    piSwap(data[6], data[9]);
+    piSwap(data[3], data[12]);
+    piSwap(data[7], data[13]);
+    piSwap(data[11], data[14]);
+}
+
+/** Compute the inverse of a rigid-body/isometry/orthonormal matrix. */
+void IsometryInverse() {
+    // transposed 3x3 upper left matrix
+    piSwap(data[1], data[4]);
+    piSwap(data[2], data[8]);
+    piSwap(data[6], data[9]);
+
+    // translate by the negative offsets
+    Vec3 v(-data[12], -data[13], -data[14]);
+    data[12] = data[13] = data[14] = 0;
+    Translate(v);
+}
+
+/** Compute the inverse of the affine portion of this matrix. */
+void AffineInverse() {
+    data[12] = data[13] = data[14] = 0;
+    Transpose();
+}
+//@}
+
+/** @name Matrix operations: */
+//@{
+
+/** Return the determinant of this matrix. */
+float Determinant() const {
+    return	data[0] * data[5] * data[10] * data[15] + 
+        data[1] * data[6] * data[11] * data[12] +
+        data[2] * data[7] * data[ 8] * data[13] +
+        data[3] * data[4] * data[ 9] * data[14] -
+        data[3] * data[6] * data[ 9] * data[12] -
+        data[2] * data[5] * data[ 8] * data[15] -
+        data[1] * data[4] * data[11] * data[14] -
+        data[0] * data[7] * data[10] * data[12];
+}
+
+
+/** Standard matrix product: this *= B. */
+void Multiply4x4( const Matrix & restrict B ) {
+    Multiply4x4(*this, B);
+}
+
+/** Standard matrix product: this = A * B. this != B*/
+void Multiply4x4( const Matrix & A, const Matrix & restrict B ) {
+    piDebugCheck(this != &B);
+
+    for(int i = 0; i < 4; i++) {
+        const float ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3);
+        GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
+        GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
+        GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
+        GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
+    }
+
+    /* Unrolled but does not allow this == A
+    data[0] = A.data[0] * B.data[0] + A.data[4] * B.data[1] + A.data[8] * B.data[2] + A.data[12] * B.data[3];
+    data[1] = A.data[1] * B.data[0] + A.data[5] * B.data[1] + A.data[9] * B.data[2] + A.data[13] * B.data[3];
+    data[2] = A.data[2] * B.data[0] + A.data[6] * B.data[1] + A.data[10] * B.data[2] + A.data[14] * B.data[3];
+    data[3] = A.data[3] * B.data[0] + A.data[7] * B.data[1] + A.data[11] * B.data[2] + A.data[15] * B.data[3];
+    data[4] = A.data[0] * B.data[4] + A.data[4] * B.data[5] + A.data[8] * B.data[6] + A.data[12] * B.data[7];
+    data[5] = A.data[1] * B.data[4] + A.data[5] * B.data[5] + A.data[9] * B.data[6] + A.data[13] * B.data[7];
+    data[6] = A.data[2] * B.data[4] + A.data[6] * B.data[5] + A.data[10] * B.data[6] + A.data[14] * B.data[7];
+    data[7] = A.data[3] * B.data[4] + A.data[7] * B.data[5] + A.data[11] * B.data[6] + A.data[15] * B.data[7];
+    data[8] = A.data[0] * B.data[8] + A.data[4] * B.data[9] + A.data[8] * B.data[10] + A.data[12] * B.data[11];
+    data[9] = A.data[1] * B.data[8] + A.data[5] * B.data[9] + A.data[9] * B.data[10] + A.data[13] * B.data[11];
+    data[10]= A.data[2] * B.data[8] + A.data[6] * B.data[9] + A.data[10] * B.data[10] + A.data[14] * B.data[11];
+    data[11]= A.data[3] * B.data[8] + A.data[7] * B.data[9] + A.data[11] * B.data[10] + A.data[15] * B.data[11];
+    data[12]= A.data[0] * B.data[12] + A.data[4] * B.data[13] + A.data[8] * B.data[14] + A.data[12] * B.data[15];
+    data[13]= A.data[1] * B.data[12] + A.data[5] * B.data[13] + A.data[9] * B.data[14] + A.data[13] * B.data[15];
+    data[14]= A.data[2] * B.data[12] + A.data[6] * B.data[13] + A.data[10] * B.data[14] + A.data[14] * B.data[15];
+    data[15]= A.data[3] * B.data[12] + A.data[7] * B.data[13] + A.data[11] * B.data[14] + A.data[15] * B.data[15];
+    */
+}
+
+/** Standard matrix product: this *= B. */
+void Multiply4x3( const Matrix & restrict B ) {
+    Multiply4x3(*this, B);
+}
+
+/** Standard product of matrices, where the last row is [0 0 0 1]. */
+void Multiply4x3( const Matrix & A, const Matrix & restrict B ) {
+    piDebugCheck(this != &B);
+
+    for(int i = 0; i < 3; i++) {
+        const float ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3);
+        GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
+        GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
+        GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
+        GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
+    }
+    data[3] = 0.0f; data[7] = 0.0f; data[11] = 0.0f; data[15] = 1.0f;
+
+    /* Unrolled but does not allow this == A
+    data[0] = a.data[0] * b.data[0] + a.data[4] * b.data[1] + a.data[8] * b.data[2] + a.data[12] * b.data[3];
+    data[1] = a.data[1] * b.data[0] + a.data[5] * b.data[1] + a.data[9] * b.data[2] + a.data[13] * b.data[3];
+    data[2] = a.data[2] * b.data[0] + a.data[6] * b.data[1] + a.data[10] * b.data[2] + a.data[14] * b.data[3];
+    data[3] = 0.0f;
+    data[4] = a.data[0] * b.data[4] + a.data[4] * b.data[5] + a.data[8] * b.data[6] + a.data[12] * b.data[7];
+    data[5] = a.data[1] * b.data[4] + a.data[5] * b.data[5] + a.data[9] * b.data[6] + a.data[13] * b.data[7];
+    data[6] = a.data[2] * b.data[4] + a.data[6] * b.data[5] + a.data[10] * b.data[6] + a.data[14] * b.data[7];
+    data[7] = 0.0f;
+    data[8] = a.data[0] * b.data[8] + a.data[4] * b.data[9] + a.data[8] * b.data[10] + a.data[12] * b.data[11];
+    data[9] = a.data[1] * b.data[8] + a.data[5] * b.data[9] + a.data[9] * b.data[10] + a.data[13] * b.data[11];
+    data[10]= a.data[2] * b.data[8] + a.data[6] * b.data[9] + a.data[10] * b.data[10] + a.data[14] * b.data[11];
+    data[11]= 0.0f;
+    data[12]= a.data[0] * b.data[12] + a.data[4] * b.data[13] + a.data[8] * b.data[14] + a.data[12] * b.data[15];
+    data[13]= a.data[1] * b.data[12] + a.data[5] * b.data[13] + a.data[9] * b.data[14] + a.data[13] * b.data[15];
+    data[14]= a.data[2] * b.data[12] + a.data[6] * b.data[13] + a.data[10] * b.data[14] + a.data[14] * b.data[15];
+    data[15]= 1.0f;
+    */
+}
+//@}
+
+
+/** @name Vector operations: */
+//@{
+
+/** Transform 3d vector (w=0). */
+void TransformVec3(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10];
+}
+/** Transform 3d vector by the transpose (w=0). */
+void TransformVec3T(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[1] + orig.z * data[2];
+    dest->y = orig.x * data[4] + orig.y * data[5] + orig.z * data[6];
+    dest->z = orig.x * data[8] + orig.y * data[9] + orig.z * data[10];
+}
+
+/** Transform a 3d homogeneous vector, where the fourth coordinate is assumed to be 1. */
+void TransformPoint(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+}
+
+/** Transform a point, normalize it, and return w. */
+float TransformPointAndNormalize(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    float w;
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    w = 1 / (orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]);
+    *dest *= w;
+    return w;
+}
+
+/** Transform a point and return w. */
+float TransformPointReturnW(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    return orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15];
+}
+
+/** Transform a normalized 3d point by a 4d matrix and return the resulting 4d vector. */
+void TransformVec4(const Vec3 & orig, Vec4 * dest) const {
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    dest->w = orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15];
+}
+//@}
+
+/** @name Matrix analysis. */
+//@{
+
+/** Get the ZYZ euler angles from the matrix. Assumes the matrix is orthonormal. */
+void GetEulerAnglesZYZ(float * s, float * t, float * r) const {
+    if( GetElem(2,2) < 1.0f ) {
+        if( GetElem(2,2) > -1.0f ) {
+            // 	cs*ct*cr-ss*sr 		-ss*ct*cr-cs*sr		st*cr
+            //	cs*ct*sr+ss*cr		-ss*ct*sr+cs*cr		st*sr
+            //	-cs*st				ss*st				ct
+            *s = atan2(GetElem(1,2), -GetElem(0,2));
+            *t = acos(GetElem(2,2));
+            *r = atan2(GetElem(2,1), GetElem(2,0));		
+        }
+        else {
+            // 	-c(s-r)	 	s(s-r)		0
+            //	s(s-r)		c(s-r)		0
+            //	0			0			-1
+            *s = atan2(GetElem(0, 1), -GetElem(0, 0)); // = s-r
+            *t = PI;
+            *r = 0;
+        }
+    }
+    else {
+        // 	c(s+r)		-s(s+r)		0
+        //	s(s+r)		c(s+r)		0
+        //	0			0			1
+        *s = atan2(GetElem(0, 1), GetElem(0, 0)); // = s+r
+        *t = 0;
+        *r = 0;
+    }
+}
+
+//@}
+
+MATHLIB_API friend PiStream & operator<< ( PiStream & s, Matrix & m );
+
+/** Print to debug output. */
+void Print() const {
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[0], data[4], data[8], data[12] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[1], data[5], data[9], data[13] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[2], data[6], data[10], data[14] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[3], data[7], data[11], data[15] );
+}
+
+
+public:
+
+    float data[16];
+
+};
+#endif
+
+
+#endif // NV_MATH_MATRIX_INL
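
A quick standalone sketch (not part of the library) of the storage convention the Matrix helpers above rely on: the 16 floats are column-major, with the translation in data[12..14], which is exactly how TransformPoint indexes them.

#include <cstdio>

// Mirrors Matrix::TransformPoint above: column-major data, translation in data[12..14].
static void transformPoint(const float data[16], const float in[3], float out[3])
{
    out[0] = in[0]*data[0] + in[1]*data[4] + in[2]*data[8]  + data[12];
    out[1] = in[0]*data[1] + in[1]*data[5] + in[2]*data[9]  + data[13];
    out[2] = in[0]*data[2] + in[1]*data[6] + in[2]*data[10] + data[14];
}

int main()
{
    // Identity rotation with a translation of (10, 20, 30) stored in the last column.
    const float m[16] = { 1,0,0,0,  0,1,0,0,  0,0,1,0,  10,20,30,1 };
    const float p[3]  = { 1, 2, 3 };
    float q[3];
    transformPoint(m, p, q);
    printf("%g %g %g\n", q[0], q[1], q[2]); // prints: 11 22 33
    return 0;
}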

+ 56 - 0
3rdparty/nvtt/nvmath/nvmath.h

@@ -0,0 +1,56 @@
+// This code is in the public domain -- [email protected]
+
+#ifndef NV_MATH_H
+#define NV_MATH_H
+
+#include <math.h>
+#include <float.h>  // finite, isnan
+
+#include "nvcore/utils.h"   // max, clamp
+
+#define NVMATH_API
+#define NVMATH_CLASS
+
+#define PI                  float(3.1415926535897932384626433833)
+#define NV_EPSILON          (0.0001f)
+#define NV_NORMAL_EPSILON   (0.001f)
+
+namespace nv
+{
+    inline float toRadian(float degree) { return degree * (PI / 180.0f); }
+    inline float toDegree(float radian) { return radian * (180.0f / PI); }
+
+    // Robust floating point comparisons:
+    // http://realtimecollisiondetection.net/blog/?p=89
+    inline bool equal(const float f0, const float f1, const float epsilon = NV_EPSILON)
+    {
+        //return fabs(f0-f1) <= epsilon;
+        return fabs(f0-f1) <= epsilon * max3(1.0f, fabsf(f0), fabsf(f1));
+    }
+
+    inline bool isZero(const float f, const float epsilon = NV_EPSILON)
+    {
+        return fabsf(f) <= epsilon;
+    }
+
+    inline bool isFinite(const float f)
+    {
+        return _finite(f) != 0;
+    }
+
+    // Eliminates negative zeros from a float array.
+    inline void floatCleanup(float * fp, int n)
+    {
+        for (int i = 0; i < n; i++) {
+            //nvDebugCheck(isFinite(fp[i]));
+            union { float f; uint32 i; } x = { fp[i] };
+            if (x.i == 0x80000000) fp[i] = 0.0f;
+        }
+    }
+
+    inline float saturate(float f) {
+        return clamp(f, 0.0f, 1.0f);
+    }
+}
+
+#endif // NV_MATH_H
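
The equal() helper above scales the tolerance by the larger operand magnitude, following the linked realtimecollisiondetection.net post. A standalone sketch of the same formula (using std::max in place of the library's max3) shows why a plain absolute epsilon fails for large values:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Relative comparison, same formula as nv::equal() above.
static bool equalRelative(float a, float b, float epsilon = 0.0001f)
{
    return std::fabs(a - b) <= epsilon * std::max({ 1.0f, std::fabs(a), std::fabs(b) });
}

int main()
{
    const float a = 1000000.0f;
    const float b = 1000000.0625f; // one float ULP away from a
    printf("absolute epsilon: %d\n", (int)(std::fabs(a - b) <= 0.0001f)); // 0: rejected
    printf("relative epsilon: %d\n", (int)equalRelative(a, b));           // 1: accepted
    return 0;
}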

+ 40 - 0
3rdparty/nvtt/nvmath/plane.h

@@ -0,0 +1,40 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_MATH_PLANE_H
+#define NV_MATH_PLANE_H
+
+#include "nvmath.h"
+#include "vector.h"
+
+namespace nv
+{
+    class Matrix;
+
+    class NVMATH_CLASS Plane
+    {
+    public:
+        Plane();
+        Plane(float x, float y, float z, float w);
+        Plane(const Vector4 & v);
+        Plane(const Vector3 & v, float d);
+        Plane(const Vector3 & normal, const Vector3 & point);
+        Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2);
+
+        const Plane & operator=(const Plane & v);
+
+        Vector3 vector() const;
+        float offset() const;
+
+        void operator*=(float s);
+
+        Vector4 v;
+    };
+
+    Plane transformPlane(const Matrix &, const Plane &);
+
+    Vector3 planeIntersection(const Plane & a, const Plane & b, const Plane & c);
+
+
+} // nv namespace
+
+#endif // NV_MATH_PLANE_H

+ 49 - 0
3rdparty/nvtt/nvmath/plane.inl

@@ -0,0 +1,49 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#pragma once
+#ifndef NV_MATH_PLANE_INL
+#define NV_MATH_PLANE_INL
+
+#include "Plane.h"
+#include "Vector.inl"
+
+namespace nv
+{
+    inline Plane::Plane() {}
+    inline Plane::Plane(float x, float y, float z, float w) : v(x, y, z, w) {}
+    inline Plane::Plane(const Vector4 & v) : v(v) {}
+    inline Plane::Plane(const Vector3 & v, float d) : v(v, d) {}
+    inline Plane::Plane(const Vector3 & normal, const Vector3 & point) : v(normal, -dot(normal, point)) {}
+    inline Plane::Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2) {
+        Vector3 n = cross(v1-v0, v2-v0);
+        float d = -dot(n, v0);
+        v = Vector4(n, d);
+    }
+
+    inline const Plane & Plane::operator=(const Plane & p) { v = p.v; return *this; }
+
+    inline Vector3 Plane::vector() const { return v.xyz(); }
+    inline float Plane::offset() const { return v.w; }
+
+    // Normalize plane.
+    inline Plane normalize(const Plane & plane, float epsilon = NV_EPSILON)
+    {
+        const float len = length(plane.vector());
+        const float inv = isZero(len, epsilon) ? 0 : 1.0f / len;
+        return Plane(plane.v * inv);
+    }
+
+    // Get the signed distance from the given point to this plane.
+    inline float distance(const Plane & plane, const Vector3 & point)
+    {
+        return dot(plane.vector(), point) + plane.offset();
+    }
+
+    inline void Plane::operator*=(float s)
+    {
+        v *= s;
+    }
+
+} // nv namespace
+
+#endif // NV_MATH_PLANE_INL
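
The three-point constructor and distance() above define the plane as dot(n, p) + d = 0, with n = cross(v1 - v0, v2 - v0) and d = -dot(n, v0). A standalone numeric sketch of those formulas (plain structs, no nv types):

#include <cmath>
#include <cstdio>

struct V3 { float x, y, z; };

static V3    sub  (V3 a, V3 b) { return { a.x - b.x, a.y - b.y, a.z - b.z }; }
static V3    cross(V3 a, V3 b) { return { a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x }; }
static float dot  (V3 a, V3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }

int main()
{
    // Three points on the plane z = 1.
    V3 v0{0, 0, 1}, v1{1, 0, 1}, v2{0, 1, 1};
    V3 n = cross(sub(v1, v0), sub(v2, v0)); // (0, 0, 1)
    float d = -dot(n, v0);                  // -1
    float len = std::sqrt(dot(n, n));       // |n|, used to normalize as in normalize() above

    V3 p{5, -3, 4};
    float dist = (dot(n, p) + d) / len;     // signed distance, as in distance() above
    printf("distance: %g\n", dist);         // prints: 3
    return 0;
}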

+ 148 - 0
3rdparty/nvtt/nvmath/vector.h

@@ -0,0 +1,148 @@
+// This code is in the public domain -- [email protected]
+
+#ifndef NV_MATH_VECTOR_H
+#define NV_MATH_VECTOR_H
+
+#include "nvmath.h"
+
+namespace nv
+{
+    class NVMATH_CLASS Vector2
+    {
+    public:
+        typedef Vector2 const & Arg;
+
+        Vector2();
+        explicit Vector2(float f);
+        Vector2(float x, float y);
+        Vector2(Vector2::Arg v);
+
+        //template <typename T> explicit Vector2(const T & v) : x(v.x), y(v.y) {}
+        //template <typename T> operator T() const { return T(x, y); }
+
+        const Vector2 & operator=(Vector2::Arg v);
+
+        const float * ptr() const;
+
+        void set(float x, float y);
+
+        Vector2 operator-() const;
+        void operator+=(Vector2::Arg v);
+        void operator-=(Vector2::Arg v);
+        void operator*=(float s);
+        void operator*=(Vector2::Arg v);
+
+        friend bool operator==(Vector2::Arg a, Vector2::Arg b);
+        friend bool operator!=(Vector2::Arg a, Vector2::Arg b);
+
+        union {
+            struct {
+                float x, y;
+            };
+            float component[2];
+        };
+    };
+
+    class NVMATH_CLASS Vector3
+    {
+    public:
+        typedef Vector3 const & Arg;
+
+        Vector3();
+        explicit Vector3(float x);
+        //explicit Vector3(int x) : x(float(x)), y(float(x)), z(float(x)) {}
+        Vector3(float x, float y, float z);
+        Vector3(Vector2::Arg v, float z);
+        Vector3(Vector3::Arg v);
+
+        //template <typename T> explicit Vector3(const T & v) : x(v.x), y(v.y), z(v.z) {}
+        //template <typename T> operator T() const { return T(x, y, z); }
+
+        const Vector3 & operator=(Vector3::Arg v);
+
+        Vector2 xy() const;
+
+        const float * ptr() const;
+
+        void set(float x, float y, float z);
+
+        Vector3 operator-() const;
+        void operator+=(Vector3::Arg v);
+        void operator-=(Vector3::Arg v);
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator*=(Vector3::Arg v);
+        void operator/=(Vector3::Arg v);
+
+        friend bool operator==(Vector3::Arg a, Vector3::Arg b);
+        friend bool operator!=(Vector3::Arg a, Vector3::Arg b);
+
+        union {
+            struct {
+                float x, y, z;
+            };
+            float component[3];
+        };
+    };
+
+    class NVMATH_CLASS Vector4
+    {
+    public:
+        typedef Vector4 const & Arg;
+
+        Vector4();
+        explicit Vector4(float x);
+        Vector4(float x, float y, float z, float w);
+        Vector4(Vector2::Arg v, float z, float w);
+        Vector4(Vector2::Arg v, Vector2::Arg u);
+        Vector4(Vector3::Arg v, float w);
+        Vector4(Vector4::Arg v);
+        //	Vector4(const Quaternion & v);
+
+        //template <typename T> explicit Vector4(const T & v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+        //template <typename T> operator T() const { return T(x, y, z, w); }
+
+        const Vector4 & operator=(Vector4::Arg v);
+
+        Vector2 xy() const;
+        Vector2 zw() const;
+        Vector3 xyz() const;
+
+        const float * ptr() const;
+
+        void set(float x, float y, float z, float w);
+
+        Vector4 operator-() const;
+        void operator+=(Vector4::Arg v);
+        void operator-=(Vector4::Arg v);
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator*=(Vector4::Arg v);
+        void operator/=(Vector4::Arg v);
+
+        friend bool operator==(Vector4::Arg a, Vector4::Arg b);
+        friend bool operator!=(Vector4::Arg a, Vector4::Arg b);
+
+        union {
+            struct {
+                float x, y, z, w;
+            };
+            float component[4];
+        };
+    };
+
+} // nv namespace
+
+// If we had these functions, they would be ambiguous; the compiler would not know which one to pick:
+//template <typename T> Vector2 to(const T & v) { return Vector2(v.x, v.y); }
+//template <typename T> Vector3 to(const T & v) { return Vector3(v.x, v.y, v.z); }
+//template <typename T> Vector4 to(const T & v) { return Vector4(v.x, v.y, v.z, v.z); }
+
+// We could use a cast operator so that we could infer the expected type, but that doesn't work the same way in all compilers and produces horrible error messages.
+
+// Instead we simply have explicit casts:
+template <typename T> T to(const nv::Vector2 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector2)); return T(v.x, v.y); }
+template <typename T> T to(const nv::Vector3 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector3)); return T(v.x, v.y, v.z); }
+template <typename T> T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.w); }
+
+#endif // NV_MATH_VECTOR_H
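
A usage sketch for the explicit to<T>() casts declared above. MyFloat3 is a hypothetical caller-side type, not part of the library, and the include assumes 3rdparty/nvtt is on the include path, as the texturec.lua change below adds.

#include "nvmath/vector.h"

// Hypothetical destination type; it must match nv::Vector3 in size and accept
// three floats in its constructor, since to<T>() returns T(v.x, v.y, v.z).
struct MyFloat3
{
    MyFloat3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
    float x, y, z;
};

void convertExample(const nv::Vector3 & v)
{
    MyFloat3 m = to<MyFloat3>(v); // compile-time size check, then constructs MyFloat3 from x, y, z
    (void)m;
}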

+ 95 - 0
3rdparty/nvtt/nvtt.cpp

@@ -0,0 +1,95 @@
+/*
+ * Copyright 2011-2015 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#include "nvtt.h"
+
+#include <string.h>
+#include <bx/uint32_t.h>
+
+#include "bc6h/zoh.h"
+#include "bc7/avpcl.h"
+#include "nvmath/vector.inl"
+
+NVCORE_API int nvAbort(const char *, const char *, int , const char *, const char *, ...) __attribute__((format (printf, 5, 6)))
+{
+	abort();
+	return 0;
+}
+
+namespace nvtt
+{
+	using namespace nv;
+
+	void compressBC6H(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output)
+	{
+		const uint8_t* src = (const uint8_t*)_input;
+		char* dst = (char*)_output;
+
+		for (uint32_t yy = 0; yy < _height; yy += 4)
+		{
+			for (uint32_t xx = 0; xx < _width; xx += 4)
+			{
+				const Vector4* rgba = (const Vector4*)&src[yy*_stride + xx*sizeof(float)*4];
+
+				ZOH::Utils::FORMAT = ZOH::UNSIGNED_F16;
+				ZOH::Tile zohTile(4, 4);
+
+				memset(zohTile.data, 0, sizeof(zohTile.data) );
+				memset(zohTile.importance_map, 0, sizeof(zohTile.importance_map) );
+
+				for (uint32_t blockY = 0; blockY < 4; ++blockY)
+				{
+					for (uint32_t blockX = 0; blockX < 4; ++blockX)
+					{
+						Vector4 color = rgba[blockY*4 + blockX];
+						uint16 rHalf = bx::halfFromFloat(color.x);
+						uint16 gHalf = bx::halfFromFloat(color.y);
+						uint16 bHalf = bx::halfFromFloat(color.z);
+						zohTile.data[blockY][blockX].x = ZOH::Tile::half2float(rHalf);
+						zohTile.data[blockY][blockX].y = ZOH::Tile::half2float(gHalf);
+						zohTile.data[blockY][blockX].z = ZOH::Tile::half2float(bHalf);
+						zohTile.importance_map[blockY][blockX] = 1.0f;
+					}
+				}
+
+				ZOH::compress(zohTile, &dst[( (yy*_width) + xx)/4 * 16]);
+			}
+		}
+	}
+
+	void compressBC7(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output)
+	{
+		const uint8_t* src = (const uint8_t*)_input;
+		char* dst = (char*)_output;
+
+		for (uint32_t yy = 0; yy < _height; yy += 4)
+		{
+			for (uint32_t xx = 0; xx < _width; xx += 4)
+			{
+				const Vector4* rgba = (const Vector4*)&src[yy*_stride + xx*sizeof(float)*4];
+
+				AVPCL::mode_rgb     = false;
+				AVPCL::flag_premult = false;
+				AVPCL::flag_nonuniform     = false;
+				AVPCL::flag_nonuniform_ati = false;
+
+				AVPCL::Tile avpclTile(4, 4);
+				memset(avpclTile.data, 0, sizeof(avpclTile.data) );
+				for (uint32_t blockY = 0; blockY < 4; ++blockY)
+				{
+					for (uint32_t blockX = 0; blockX < 4; ++blockX)
+					{
+						Vector4 color = rgba[blockY*4 + blockX];
+						avpclTile.data[blockY][blockX] = color * 255.0f;
+						avpclTile.importance_map[blockY][blockX] = 1.0f;
+					}
+				}
+
+				AVPCL::compress(avpclTile, &dst[( (yy*_width) + xx)/4 * 16]);
+			}
+		}
+	}
+
+} //namespace nvtt

+ 13 - 0
3rdparty/nvtt/nvtt.h

@@ -0,0 +1,13 @@
+#ifndef NVTT_H
+#define NVTT_H
+
+#include <stdint.h>
+
+namespace nvtt
+{
+void compressBC6H(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output);
+void compressBC7(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output);
+
+} // namespace nvtt
+
+#endif // NVTT_H
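
A minimal caller sketch for the two entry points declared above; this is an illustration under stated assumptions, not documented usage. Assumed here: the source is float RGBA (16 bytes per pixel), _width and _height are multiples of 4, _stride is the source row pitch in bytes, and BC6H/BC7 output takes 16 bytes per 4x4 block.

#include <stdint.h>
#include <stdlib.h>
#include <nvtt/nvtt.h>

// Compress a float-RGBA image to BC6H. Returns a malloc'd buffer the caller frees.
// Assumption: _width and _height are multiples of 4, _stride is the row pitch in bytes.
void* compressToBC6H(const float* _rgbaF32, uint32_t _width, uint32_t _height)
{
    const uint32_t blockCount = (_width/4) * (_height/4);
    void* output = malloc(blockCount*16); // 16 bytes per compressed 4x4 block

    nvtt::compressBC6H(_rgbaF32, _width, _height, _width*16, output);
    return output;
}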

+ 3 - 0
scripts/texturec.lua

@@ -12,6 +12,7 @@ project "texturec"
 		path.join(BGFX_DIR, "include"),
 		path.join(BGFX_DIR, "src"),
 		path.join(BGFX_DIR, "3rdparty"),
+		path.join(BGFX_DIR, "3rdparty/nvtt"),
 	}
 
 	files {
@@ -20,6 +21,8 @@ project "texturec"
 		path.join(BGFX_DIR, "3rdparty/libsquish/**.h"),
 		path.join(BGFX_DIR, "3rdparty/etc1/**.cpp"),
 		path.join(BGFX_DIR, "3rdparty/etc1/**.h"),
+		path.join(BGFX_DIR, "3rdparty/nvtt/**.cpp"),
+		path.join(BGFX_DIR, "3rdparty/nvtt/**.h"),
 		path.join(BGFX_DIR, "tools/texturec/**.cpp"),
 		path.join(BGFX_DIR, "tools/texturec/**.h"),
 	}

+ 32 - 0
tools/texturec/texturec.cpp

@@ -13,6 +13,7 @@
 #include "image.h"
 #include <libsquish/squish.h>
 #include <etc1/etc1.h>
+#include <nvtt/nvtt.h>
 
 #if 0
 #	define BX_TRACE(_format, ...) fprintf(stderr, "" _format "\n", ##__VA_ARGS__)
@@ -113,6 +114,14 @@ int main(int _argc, const char* _argv[])
 		{
 			format = TextureFormat::ETC1;
 		}
+		else if (0 == bx::stricmp(type, "bc6h") )
+		{
+			format = TextureFormat::BC6H;
+		}
+		else if (0 == bx::stricmp(type, "bc7") )
+		{
+			format = TextureFormat::BC7;
+		}
 	}
 
 	uint32_t size = (uint32_t)bx::getSize(&reader);
@@ -154,10 +163,33 @@ int main(int _argc, const char* _argv[])
 					);
 				break;
 
+			case TextureFormat::BC4:
+			case TextureFormat::BC5:
+				break;
+
+			case TextureFormat::BC6H:
+				nvtt::compressBC6H(rgba, mip.m_width, mip.m_height, 4, output);
+				break;
+
+			case TextureFormat::BC7:
+				nvtt::compressBC7(rgba, mip.m_width, mip.m_height, 4, output);
+				break;
+
 			case TextureFormat::ETC1:
 				etc1_encode_image(rgba, mip.m_width, mip.m_height, 4, mip.m_width*4, output);
 				break;
 
+			case TextureFormat::ETC2:
+			case TextureFormat::ETC2A:
+			case TextureFormat::ETC2A1:
+			case TextureFormat::PTC12:
+			case TextureFormat::PTC14:
+			case TextureFormat::PTC12A:
+			case TextureFormat::PTC14A:
+			case TextureFormat::PTC22:
+			case TextureFormat::PTC24:
+				break;
+
 			default:
 				break;
 			}