Added stripped down NVTT library.

Branimir Karadžić, 10 years ago
parent commit 8ab70bd8cf
56 changed files with 20626 additions and 0 deletions
  1. 3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt (+24, -0)
  2. 3rdparty/nvtt/bc6h/bits.h (+76, -0)
  3. 3rdparty/nvtt/bc6h/shapes_two.h (+133, -0)
  4. 3rdparty/nvtt/bc6h/tile.h (+83, -0)
  5. 3rdparty/nvtt/bc6h/zoh.cpp (+197, -0)
  6. 3rdparty/nvtt/bc6h/zoh.h (+65, -0)
  7. 3rdparty/nvtt/bc6h/zoh_utils.cpp (+324, -0)
  8. 3rdparty/nvtt/bc6h/zoh_utils.h (+73, -0)
  9. 3rdparty/nvtt/bc6h/zohone.cpp (+799, -0)
  10. 3rdparty/nvtt/bc6h/zohtwo.cpp (+883, -0)
  11. 3rdparty/nvtt/bc7/avpcl.cpp (+264, -0)
  12. 3rdparty/nvtt/bc7/avpcl.h (+99, -0)
  13. 3rdparty/nvtt/bc7/avpcl_mode0.cpp (+1066, -0)
  14. 3rdparty/nvtt/bc7/avpcl_mode1.cpp (+1047, -0)
  15. 3rdparty/nvtt/bc7/avpcl_mode2.cpp (+1004, -0)
  16. 3rdparty/nvtt/bc7/avpcl_mode3.cpp (+1059, -0)
  17. 3rdparty/nvtt/bc7/avpcl_mode4.cpp (+1214, -0)
  18. 3rdparty/nvtt/bc7/avpcl_mode5.cpp (+1216, -0)
  19. 3rdparty/nvtt/bc7/avpcl_mode6.cpp (+1055, -0)
  20. 3rdparty/nvtt/bc7/avpcl_mode7.cpp (+1094, -0)
  21. 3rdparty/nvtt/bc7/avpcl_utils.cpp (+389, -0)
  22. 3rdparty/nvtt/bc7/avpcl_utils.h (+61, -0)
  23. 3rdparty/nvtt/bc7/bits.h (+76, -0)
  24. 3rdparty/nvtt/bc7/endpts.h (+81, -0)
  25. 3rdparty/nvtt/bc7/shapes_three.h (+132, -0)
  26. 3rdparty/nvtt/bc7/shapes_two.h (+133, -0)
  27. 3rdparty/nvtt/bc7/tile.h (+41, -0)
  28. 3rdparty/nvtt/nvcore/Array.inl (+437, -0)
  29. 3rdparty/nvtt/nvcore/Debug.h (+216, -0)
  30. 3rdparty/nvtt/nvcore/array.h (+181, -0)
  31. 3rdparty/nvtt/nvcore/defsgnucdarwin.h (+53, -0)
  32. 3rdparty/nvtt/nvcore/defsgnuclinux.h (+59, -0)
  33. 3rdparty/nvtt/nvcore/defsgnucwin32.h (+65, -0)
  34. 3rdparty/nvtt/nvcore/defsvcwin32.h (+94, -0)
  35. 3rdparty/nvtt/nvcore/foreach.h (+68, -0)
  36. 3rdparty/nvtt/nvcore/hash.h (+83, -0)
  37. 3rdparty/nvtt/nvcore/memory.h (+29, -0)
  38. 3rdparty/nvtt/nvcore/nvcore.h (+299, -0)
  39. 3rdparty/nvtt/nvcore/posh.h (+1030, -0)
  40. 3rdparty/nvtt/nvcore/stdstream.h (+459, -0)
  41. 3rdparty/nvtt/nvcore/stream.h (+163, -0)
  42. 3rdparty/nvtt/nvcore/strlib.h (+429, -0)
  43. 3rdparty/nvtt/nvcore/utils.h (+281, -0)
  44. 3rdparty/nvtt/nvmath/Vector.inl (+921, -0)
  45. 3rdparty/nvtt/nvmath/fitting.cpp (+1200, -0)
  46. 3rdparty/nvtt/nvmath/fitting.h (+49, -0)
  47. 3rdparty/nvtt/nvmath/matrix.h (+112, -0)
  48. 3rdparty/nvtt/nvmath/matrix.inl (+1274, -0)
  49. 3rdparty/nvtt/nvmath/nvmath.h (+56, -0)
  50. 3rdparty/nvtt/nvmath/plane.h (+40, -0)
  51. 3rdparty/nvtt/nvmath/plane.inl (+49, -0)
  52. 3rdparty/nvtt/nvmath/vector.h (+148, -0)
  53. 3rdparty/nvtt/nvtt.cpp (+95, -0)
  54. 3rdparty/nvtt/nvtt.h (+13, -0)
  55. scripts/texturec.lua (+3, -0)
  56. tools/texturec/texturec.cpp (+32, -0)

+ 24 - 0
3rdparty/nvtt/NVIDIA_Texture_Tools_LICENSE.txt

@@ -0,0 +1,24 @@
+NVIDIA Texture Tools 2.0 is licensed under the MIT license.
+
+Copyright (c) 2007 NVIDIA Corporation
+
+Permission is hereby granted, free of charge, to any person
+obtaining a copy of this software and associated documentation
+files (the "Software"), to deal in the Software without
+restriction, including without limitation the rights to use,
+copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+OTHER DEALINGS IN THE SOFTWARE.

+ 76 - 0
3rdparty/nvtt/bc6h/bits.h

@@ -0,0 +1,76 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_BITS_H
+#define _ZOH_BITS_H
+
+// read/write a bitstream
+
+#include "nvcore/Debug.h"
+
+namespace ZOH {
+
+class Bits
+{
+public:
+
+	Bits(char *data, int maxdatabits) { nvAssert (data && maxdatabits > 0); bptr = bend = 0; bits = data; maxbits = maxdatabits; readonly = 0;}
+	Bits(const char *data, int availdatabits) { nvAssert (data && availdatabits > 0); bptr = 0; bend = availdatabits; cbits = data; maxbits = availdatabits; readonly = 1;}
+
+	void write(int value, int nbits) {
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		for (int i=0; i<nbits; ++i)
+			writeone(value>>i);
+	}
+	int read(int nbits) { 
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		int out = 0;
+		for (int i=0; i<nbits; ++i)
+			out |= readone() << i;
+		return out;
+	}
+	int getptr() { return bptr; }
+	void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; }
+	int getsize() { return bend; }
+
+private:
+	int	bptr;		// next bit to read
+	int bend;		// last written bit + 1
+	char *bits;		// ptr to user bit stream
+	const char *cbits;	// ptr to const user bit stream
+	int maxbits;	// max size of user bit stream
+	char readonly;	// 1 if this is a read-only stream
+
+	int readone() {
+		nvAssert (bptr < bend);
+		if (bptr >= bend) return 0;
+		int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7));
+		++bptr;
+		return bit != 0;
+	}
+	void writeone(int bit) {
+		nvAssert (!readonly); // "Writing a read-only bit stream"
+		nvAssert (bptr < maxbits);
+		if (bptr >= maxbits) return;
+		if (bit&1)
+			bits[bptr>>3] |= 1 << (bptr & 7);
+		else
+			bits[bptr>>3] &= ~(1 << (bptr & 7));
+		if (bptr++ >= bend) bend = bptr;
+	}
+};
+
+}
+
+#endif
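
For reference, a minimal sketch (not from the NVTT sources) of how this Bits class is driven: fields are written least-significant-bit first into a 128-bit block and read back in the same order. The helper name and field sizes below are illustrative; the 16-byte buffer matches ZOH::BLOCKSIZE.

#include "bc6h/bits.h"

static void bits_roundtrip_sketch()            // hypothetical helper, for illustration only
{
	char block[16] = {};                       // one BC6H block: 16 bytes = 128 bits
	ZOH::Bits out(block, 128);                 // writable stream over the block
	out.write(0x03, 5);                        // e.g. a 5-bit mode field, LSB first
	out.write(1023, 10);                       // e.g. a 10-bit endpoint component

	ZOH::Bits in((const char *)block, 128);    // read-only stream over the same bits
	int mode = in.read(5);                     // == 0x03
	int rw   = in.read(10);                    // == 1023
	(void)mode; (void)rw;
}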

+ 133 - 0
3rdparty/nvtt/bc6h/shapes_two.h

@@ -0,0 +1,133 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_SHAPES_TWO_H
+#define _ZOH_SHAPES_TWO_H
+
+// shapes for two regions
+
+#define NREGIONS 2
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+static const int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   0, 0, 0, 1,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   1, 1, 1, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 0,   0, 1, 1, 1,   
+1, 0, 0, 0,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 1, 1,   
+1, 1, 1, 0,   0, 0, 0, 0,   1, 0, 0, 0,   0, 0, 0, 1,   
+1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 0, 0, 0,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 0, 0,   1, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   0, 0, 0, 1,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 0,   1, 1, 0, 0,   0, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 1,   1, 0, 0, 1,   
+1, 1, 1, 0,   1, 1, 1, 1,   1, 0, 0, 0,   1, 0, 0, 1,   
+1, 0, 0, 0,   0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   0, 0, 1, 1,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   0, 0, 1, 1,   
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   1, 1, 0, 0,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 1, 1, 0,   0, 1, 0, 1,   
+1, 1, 0, 0,   0, 1, 0, 1,   1, 0, 0, 1,   1, 0, 1, 0,   
+0, 0, 1, 1,   1, 0, 1, 0,   0, 1, 1, 0,   1, 0, 1, 0,   
+1, 1, 0, 0,   1, 0, 1, 0,   1, 0, 0, 1,   0, 1, 0, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   1, 0, 1, 1,   
+1, 1, 0, 0,   1, 1, 0, 0,   0, 1, 0, 0,   1, 1, 0, 1,   
+1, 1, 1, 0,   1, 0, 0, 0,   1, 1, 0, 0,   1, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 0, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   0, 1, 1, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   0, 0, 0, 0,   
+
+0, 1, 0, 0,   0, 0, 1, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+1, 1, 1, 0,   0, 1, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 1, 0, 0,   0, 0, 1, 0,   0, 1, 1, 1,   1, 1, 1, 0,   
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 0,   0, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 1, 1,   
+1, 1, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   1, 1, 0, 0,   
+0, 0, 1, 1,   1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   
+
+0, 1, 1, 0,   0, 1, 1, 0,   0, 1, 1, 1,   0, 0, 0, 1,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 1, 1, 0,   1, 0, 0, 0,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 0, 0, 0,   1, 1, 1, 0,   
+1, 0, 0, 1,   1, 0, 0, 1,   0, 0, 0, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+1, 1, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 0,   0, 1, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 1, 1, 1,   
+
+};
+
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
+
+static const int shapeindex_to_compressed_indices[NSHAPES*2] = 
+{
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+
+	0,15,  0, 2,  0, 8,  0, 2,
+	0, 2,  0, 8,  0, 8,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 8,  0, 8,  0, 2,  0, 2,
+
+	0,15,  0,15,  0, 6,  0, 8,
+	0, 2,  0, 8,  0,15,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 2,  0,15,  0,15,  0, 6,
+
+	0, 6,  0, 2,  0, 6,  0, 8,
+	0,15,  0,15,  0, 2,  0, 2,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0, 2,  0, 2,  0,15
+
+};
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*2+(region)]
+
+#endif
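
For reference, how the table above is addressed: each group of four source lines stores four 4x4 partitions side by side, REGION(x,y,si) returns the region (0 or 1) of pixel (x,y) for shape si, and SHAPEINDEX_TO_COMPRESSED_INDICES(si,region) gives the anchor index whose high index bit is omitted from the block. A hypothetical helper (not from the NVTT sources):

#include <stdio.h>
#include "bc6h/shapes_two.h"

static void print_shape(int si)                 // si in [0, NSHAPES)
{
	for (int y = 0; y < 4; ++y)
	{
		for (int x = 0; x < 4; ++x)
			printf("%d ", REGION(x, y, si));    // region of pixel (x,y): 0 or 1
		printf("\n");
	}
	// anchor index for region 1 (region 0 is always anchored at index 0)
	printf("region 1 anchor: %d\n", SHAPEINDEX_TO_COMPRESSED_INDICES(si, 1));
}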

+ 83 - 0
3rdparty/nvtt/bc6h/tile.h

@@ -0,0 +1,83 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_TILE_H
+#define _ZOH_TILE_H
+
+#include "zoh_utils.h"
+#include "nvmath/Vector.h"
+#include <math.h>
+
+namespace ZOH {
+
+//#define	USE_IMPORTANCE_MAP	1		// define this if you want to increase importance of some pixels in tile
+class Tile
+{
+public:
+	// NOTE: this returns the appropriately-clamped BIT PATTERN of the half as an INTEGRAL float value
+	static float half2float(uint16 h)
+	{
+		return (float) Utils::ushort_to_format(h);
+	}
+	// NOTE: this is the inverse of the above operation
+	static uint16 float2half(float f)
+	{
+		return Utils::format_to_ushort((int)f);
+	}
+
+	// look for adjacent pixels that are identical. if there are enough of them, increase their importance
+	void generate_importance_map()
+	{
+		// initialize
+		for (int y=0; y<size_y; ++y)
+		for (int x=0; x<size_x; ++x)
+		{
+			// my importance is increased if I am identical to any of my 4-neighbors
+			importance_map[y][x] = match_4_neighbor(x,y) ? 5.0f : 1.0f;
+		}
+	}
+	bool is_equal(int x, int y, int xn, int yn)
+	{
+		if (xn < 0 || xn >= size_x || yn < 0 || yn >= size_y)
+			return false;
+		return( (data[y][x].x == data[yn][xn].x) &&
+				(data[y][x].y == data[yn][xn].y) &&
+				(data[y][x].z == data[yn][xn].z) );
+	}
+
+#ifdef USE_IMPORTANCE_MAP
+	bool match_4_neighbor(int x, int y)
+	{
+		return is_equal(x,y,x-1,y) || is_equal(x,y,x+1,y) || is_equal(x,y,x,y-1) || is_equal(x,y,x,y+1);
+	}
+#else
+	bool match_4_neighbor(int, int)
+	{
+		return false;
+	}
+#endif
+
+	Tile() {};
+	~Tile(){};
+	Tile(int xs, int ys) {size_x = xs; size_y = ys;}
+
+	static const int TILE_H = 4;
+	static const int TILE_W = 4;
+	static const int TILE_TOTAL = TILE_H * TILE_W;
+    nv::Vector3 data[TILE_H][TILE_W];
+	float importance_map[TILE_H][TILE_W];
+	int	size_x, size_y;			// actual size of tile
+};
+
+}
+
+#endif // _ZOH_TILE_H
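
For reference, the NOTE above is easy to misread: half2float/float2half do not convert between half and float values, they move the clamped half bit pattern in and out of a float. A small sketch (not from the NVTT sources), assuming Utils::FORMAT matches the data being handled:

#include "bc6h/tile.h"

static void half_bits_sketch()                          // hypothetical helper, for illustration only
{
	ZOH::Utils::FORMAT = ZOH::UNSIGNED_F16;
	// 0x3C00 is the half-float encoding of 1.0; half2float returns the clamped
	// bit pattern as an integral float, not the value 1.0f.
	float v          = ZOH::Tile::half2float(0x3C00);   // == 15360.0f
	unsigned short h = ZOH::Tile::float2half(v);        // == 0x3C00
	(void)h;
}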

+ 197 - 0
3rdparty/nvtt/bc6h/zoh.cpp

@@ -0,0 +1,197 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// the zoh compressor and decompressor
+
+#include "tile.h"
+#include "zoh.h"
+
+#include <string.h> // memcpy
+
+using namespace ZOH;
+
+
+bool ZOH::isone(const char *block)
+{
+	char code = block[0] & 0x1F;
+
+	return (code == 0x03 || code == 0x07 || code == 0x0b || code == 0x0f);
+}
+
+void ZOH::compress(const Tile &t, char *block)
+{
+	char oneblock[ZOH::BLOCKSIZE], twoblock[ZOH::BLOCKSIZE];
+
+	float mseone = ZOH::compressone(t, oneblock);
+	float msetwo = ZOH::compresstwo(t, twoblock);
+
+	if (mseone <= msetwo)
+		memcpy(block, oneblock, ZOH::BLOCKSIZE);
+	else
+		memcpy(block, twoblock, ZOH::BLOCKSIZE);
+}
+
+void ZOH::decompress(const char *block, Tile &t)
+{
+	if (ZOH::isone(block))
+		ZOH::decompressone(block, t);
+	else
+		ZOH::decompresstwo(block, t);
+}
+
+/*
+void ZOH::compress(string inf, string zohf)
+{
+	Array2D<Rgba> pixels;
+	int w, h;
+	char block[ZOH::BLOCKSIZE];
+
+	Exr::readRgba(inf, pixels, w, h);
+	FILE *zohfile = fopen(zohf.c_str(), "wb");
+	if (zohfile == NULL) throw "Unable to open .zoh file for write";
+
+	// stuff for progress bar O.o
+	int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
+	int tilecnt = 0;
+	int ndots = 25;
+	int dotcnt = 0;
+	printf("Progress [");
+	for (int i=0; i<ndots;++i) printf(" ");
+	printf("]\rProgress ["); fflush(stdout);
+
+	// convert to tiles and compress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			t.insert(pixels, x, y);
+
+			ZOH::compress(t, block);
+			if (fwrite(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
+				throw "File error on write";
+
+			// progress bar
+			++tilecnt;
+			if (tilecnt > (ntiles * dotcnt)/ndots) { printf("."); fflush(stdout); ++dotcnt; }
+		}
+	}
+
+	printf("]\n");		// advance to next line finally
+
+	if (fclose(zohfile)) throw "Close failed on .zoh file";
+}
+
+static int str2int(std::string s)
+{
+	int thing;
+	std::stringstream str (stringstream::in | stringstream::out);
+	str << s;
+	str >> thing;
+	return thing;
+}
+
+// zoh file name is ...-w-h.zoh, extract width and height
+static void extract(string zohf, int &w, int &h)
+{
+	size_t n = zohf.rfind('.', zohf.length()-1);
+	size_t n1 = zohf.rfind('-', n-1);
+	size_t n2 = zohf.rfind('-', n1-1);
+	string width = zohf.substr(n2+1, n1-n2-1);
+	w = str2int(width);
+	string height = zohf.substr(n1+1, n-n1-1);
+	h = str2int(height);
+}
+
+static int mode_to_prec[] = {
+	10,7,11,10,
+	10,7,11,11,
+	10,7,11,12,
+	10,7,9,16,
+	10,7,8,-1,
+	10,7,8,-1,
+	10,7,8,-1,
+	10,7,6,-1,
+};
+
+static int shapeindexhist[32], modehist[32], prechistone[16], prechisttwo[16], oneregion, tworegions;
+
+static void stats(char block[ZOH::BLOCKSIZE])
+{
+	char mode = block[0] & 0x1F; if ((mode & 0x3) == 0) mode = 0; if ((mode & 0x3) == 1) mode = 1; modehist[mode]++;
+	int prec = mode_to_prec[mode];
+	nvAssert (prec != -1);
+	if (!ZOH::isone(block))
+	{
+		tworegions++;
+		prechisttwo[prec]++;
+		int shapeindex = ((block[0] & 0xe0) >> 5) | ((block[1] & 0x3) << 3);
+		shapeindexhist[shapeindex]++;
+	}
+	else
+	{
+		oneregion++;
+		prechistone[prec]++;
+	}
+}
+
+static void printstats()
+{
+	printf("\nPrecision histogram 10b to 16b one region: "); for (int i=10; i<=16; ++i) printf("%d,", prechistone[i]);
+	printf("\nPrecision histogram 6b to 11b two regions: "); for (int i=6; i<=11; ++i) printf("%d,", prechisttwo[i]);
+	printf("\nMode histogram: "); for (int i=0; i<32; ++i) printf("%d,", modehist[i]);
+	printf("\nShape index histogram: "); for (int i=0; i<32; ++i) printf("%d,", shapeindexhist[i]);
+	printf("\nOne region %5.2f%%  Two regions %5.2f%%", 100.0*oneregion/float(oneregion+tworegions), 100.0*tworegions/float(oneregion+tworegions));
+	printf("\n");
+}
+
+void ZOH::decompress(string zohf, string outf)
+{
+	Array2D<Rgba> pixels;
+	int w, h;
+	char block[ZOH::BLOCKSIZE];
+
+	extract(zohf, w, h);
+	FILE *zohfile = fopen(zohf.c_str(), "rb");
+	if (zohfile == NULL) throw "Unable to open .zoh file for read";
+	pixels.resizeErase(h, w);
+
+	// convert to tiles and decompress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			if (fread(block, sizeof(char), ZOH::BLOCKSIZE, zohfile) != ZOH::BLOCKSIZE)
+				throw "File error on read";
+
+			stats(block);	// collect statistics
+
+			ZOH::decompress(block, t);
+
+			t.extract(pixels, x, y);
+		}
+	}
+	if (fclose(zohfile)) throw "Close failed on .zoh file";
+	Exr::writeRgba(outf, pixels, w, h);
+
+#ifndef EXTERNAL_RELEASE
+	printstats();	// print statistics
+#endif
+}
+*/

+ 65 - 0
3rdparty/nvtt/bc6h/zoh.h

@@ -0,0 +1,65 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+#pragma once
+#ifndef _ZOH_H
+#define _ZOH_H
+
+#include "tile.h"
+
+namespace ZOH {
+
+// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f
+
+static const int NREGIONS_TWO	= 2;
+static const int NREGIONS_ONE	= 1;
+static const int NCHANNELS		= 3;
+
+struct FltEndpts
+{
+    nv::Vector3 A;
+    nv::Vector3 B;
+};
+
+struct IntEndpts
+{
+	int A[NCHANNELS];
+	int B[NCHANNELS];
+};
+
+struct ComprEndpts
+{
+	uint A[NCHANNELS];
+	uint B[NCHANNELS];
+};
+
+static const int BLOCKSIZE=16;
+static const int BITSIZE=128;
+
+void compress(const Tile &t, char *block);
+void decompress(const char *block, Tile &t);
+
+float compressone(const Tile &t, char *block);
+float compresstwo(const Tile &t, char *block);
+void decompressone(const char *block, Tile &t);
+void decompresstwo(const char *block, Tile &t);
+
+float refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block);
+float roughtwo(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_TWO]);
+
+float refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block);
+float roughone(const Tile &tile, int shape, FltEndpts endpts[NREGIONS_ONE]);
+
+bool isone(const char *block);
+
+}
+
+#endif // _ZOH_H
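
For reference, a minimal sketch (not from the NVTT sources) of driving the interface declared above: set Utils::FORMAT, fill a 4x4 tile with half bit patterns via Tile::half2float, and let compress() pick whichever of the one- and two-region encodings gives the lower error. Variable and function names are illustrative.

#include "bc6h/zoh.h"

static void zoh_block_sketch(ZOH::Tile &tile)   // tile.data[][] already filled via Tile::half2float
{
	ZOH::Utils::FORMAT = ZOH::UNSIGNED_F16;     // must match how the tile data was filled
	tile.generate_importance_map();             // all weights 1.0 unless USE_IMPORTANCE_MAP is defined

	char block[ZOH::BLOCKSIZE];                 // 16 bytes = one BC6H block
	ZOH::compress(tile, block);                 // tries one- and two-region fits, keeps the lower error

	ZOH::Tile out(4, 4);
	ZOH::decompress(block, out);                // out.data[][] holds the unquantized bit-pattern values
}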

+ 324 - 0
3rdparty/nvtt/bc6h/zoh_utils.cpp

@@ -0,0 +1,324 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Utility and common routines
+
+#include "zoh_utils.h"
+#include "nvmath/Vector.inl"
+#include <math.h>
+
+using namespace nv;
+using namespace ZOH;
+
+static const int denom7_weights_64[] = {0, 9, 18, 27, 37, 46, 55, 64};										// divided by 64
+static const int denom15_weights_64[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};		// divided by 64
+
+/*static*/ Format Utils::FORMAT;
+
+int Utils::lerp(int a, int b, int i, int denom)
+{
+	nvDebugCheck (denom == 3 || denom == 7 || denom == 15);
+	nvDebugCheck (i >= 0 && i <= denom);
+
+	int round = 32, shift = 6;
+	const int *weights;
+
+	switch(denom)
+	{
+	case 3:		denom *= 5; i *= 5;	// fall through to case 15
+	case 15:	weights = denom15_weights_64; break;
+	case 7:		weights = denom7_weights_64; break;
+	default:	nvDebugCheck(0);
+	}
+
+	return (a*weights[denom-i] +b*weights[i] + round) >> shift;
+}
+
+Vector3 Utils::lerp(const Vector3& a, const Vector3 &b, int i, int denom)
+{
+	nvDebugCheck (denom == 3 || denom == 7 || denom == 15);
+	nvDebugCheck (i >= 0 && i <= denom);
+
+	int shift = 6;
+	const int *weights;
+
+	switch(denom)
+	{
+	case 3:		denom *= 5; i *= 5;	// fall through to case 15
+	case 15:	weights = denom15_weights_64; break;
+	case 7:		weights = denom7_weights_64; break;
+	default:	nvUnreachable();
+	}
+
+	// no need to round these as this is an exact division
+	return (a*float(weights[denom-i]) +b*float(weights[i])) / float(1 << shift);
+}
+
+
+/*
+	For unsigned f16, clamp the input to [0,F16MAX]. Thus u15.
+	For signed f16, clamp the input to [-F16MAX,F16MAX]. Thus s16.
+
+	The conversions proceed as follows:
+
+	unsigned f16: get bits. if high bit set, clamp to 0, else clamp to F16MAX.
+	signed f16: get bits. extract exp+mantissa and clamp to F16MAX. return -value if sign bit was set, else value
+	unsigned int: get bits. return as a positive value.
+	signed int. get bits. return as a value in -32768..32767.
+
+	The inverse conversions are just the inverse of the above.
+*/
+
+// clamp the 3 channels of the input vector to the allowable range based on FORMAT
+// note that each channel is a float storing the allowable range as a bit pattern converted to float
+// that is, for unsigned f16 say, we would clamp each channel to the range [0, F16MAX]
+
+void Utils::clamp(Vector3 &v)
+{
+	for (int i=0; i<3; ++i)
+	{
+		switch(Utils::FORMAT)
+		{
+		case UNSIGNED_F16:
+			if (v.component[i] < 0.0) v.component[i] = 0;
+			else if (v.component[i] > F16MAX) v.component[i] = F16MAX;
+			break;
+
+		case SIGNED_F16:
+			if (v.component[i] < -F16MAX) v.component[i] = -F16MAX;
+			else if (v.component[i] > F16MAX) v.component[i] = F16MAX;
+			break;
+
+		default:
+			nvUnreachable();
+		}
+	}
+}
+
+// convert a u16 value to s17 (represented as an int) based on the format expected
+int Utils::ushort_to_format(unsigned short input)
+{
+	int out, s;
+
+	// clamp to the valid range we are expecting
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		if (input & F16S_MASK) out = 0;
+		else if (input > F16MAX) out = F16MAX;
+		else out = input;
+		break;
+
+	case SIGNED_F16:
+		s = input & F16S_MASK;
+		input &= F16EM_MASK;
+		if (input > F16MAX) out = F16MAX;
+		else out = input;
+		out = s ? -out : out;
+		break;
+	}
+	return out;
+}
+
+// convert a s17 value to u16 based on the format expected
+unsigned short Utils::format_to_ushort(int input)
+{
+	unsigned short out;
+
+	// clamp to the valid range we are expecting
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		nvDebugCheck (input >= 0 && input <= F16MAX);
+		out = input;
+		break;
+
+	case SIGNED_F16:
+		nvDebugCheck (input >= -F16MAX && input <= F16MAX);
+		// convert to sign-magnitude
+		int s;
+		if (input < 0) { s = F16S_MASK; input = -input; }
+		else           { s = 0; }
+		out = s | input;
+		break;
+	}
+	return out;
+}
+
+// quantize the input range into equal-sized bins
+int Utils::quantize(float value, int prec)
+{
+	int q, ivalue, s;
+
+	nvDebugCheck (prec > 1);	// didn't bother to make it work for 1
+
+	value = (float)floor(value + 0.5);
+
+	int bias = (prec > 10) ? ((1<<(prec-1))-1) : 0;	// bias precisions 11..16 to get a more accurate quantization
+
+	switch (Utils::FORMAT)
+	{
+	case UNSIGNED_F16:
+		nvDebugCheck (value >= 0 && value <= F16MAX);
+		ivalue = (int)value;
+		q = ((ivalue << prec) + bias) / (F16MAX+1);
+		nvDebugCheck (q >= 0 && q < (1 << prec));
+		break;
+
+	case SIGNED_F16:
+		nvDebugCheck (value >= -F16MAX && value <= F16MAX);
+		// convert to sign-magnitude
+		ivalue = (int)value;
+		if (ivalue < 0) { s = 1; ivalue = -ivalue; } else s = 0;
+
+		q = ((ivalue << (prec-1)) + bias) / (F16MAX+1);
+		if (s)
+			q = -q;
+		nvDebugCheck (q > -(1 << (prec-1)) && q < (1 << (prec-1)));
+		break;
+	}
+
+	return q;
+}
+
+int Utils::finish_unquantize(int q, int prec)
+{
+	if (Utils::FORMAT == UNSIGNED_F16)
+		return (q * 31) >> 6;										// scale the magnitude by 31/64
+	else if (Utils::FORMAT == SIGNED_F16)
+		return (q < 0) ? -(((-q) * 31) >> 5) : (q * 31) >> 5;		// scale the magnitude by 31/32
+	else
+		return q;
+}
+
+// unquantize each bin to midpoint of original bin range, except
+// for the end bins which we push to an endpoint of the bin range.
+// we do this to ensure we can represent all possible original values.
+// the asymmetric end bins do not affect PSNR for the test images.
+//
+// code this function assuming an arbitrary bit pattern as the encoded block
+int Utils::unquantize(int q, int prec)
+{
+	int unq, s;
+
+	nvDebugCheck (prec > 1);	// not implemented for prec 1
+
+	switch (Utils::FORMAT)
+	{
+	// modify this case to move the multiplication by 31 after interpolation.
+	// Need to use finish_unquantize.
+
+	// since we have 16 bits available, let's unquantize this to 16 bits unsigned
+	// thus the scale factor is [0-7c00)/[0-10000) = 31/64
+	case UNSIGNED_F16:
+		if (prec >= 15) 
+			unq = q;
+		else if (q == 0) 
+			unq = 0;
+		else if (q == ((1<<prec)-1)) 
+			unq = U16MAX;
+		else
+			unq = (q * (U16MAX+1) + (U16MAX+1)/2) >> prec;
+		break;
+
+	// here, let's stick with S16 (no apparent quality benefit from going to S17)
+	// range is (-7c00..7c00)/(-8000..8000) = 31/32
+	case SIGNED_F16:
+		// don't remove this test even though it appears equivalent to the code below
+		// as it isn't -- the code below can overflow for prec = 16
+		if (prec >= 16)
+			unq = q;
+		else
+		{
+			if (q < 0) { s = 1; q = -q; } else s = 0;
+
+			if (q == 0)
+				unq = 0;
+			else if (q >= ((1<<(prec-1))-1))
+				unq = s ? -S16MAX : S16MAX;
+			else
+			{
+				unq = (q * (S16MAX+1) + (S16MAX+1)/2) >> (prec-1);
+				if (s)
+					unq = -unq;
+			}
+		}
+		break;
+	}
+	return unq;
+}
+
+
+
+// pick a norm!
+#define	NORM_EUCLIDEAN 1
+
+float Utils::norm(const Vector3 &a, const Vector3 &b)
+{
+#ifdef	NORM_EUCLIDEAN
+	return lengthSquared(a - b);
+#endif
+#ifdef	NORM_ABS
+	Vector3 err = a - b;
+	return fabs(err.x) + fabs(err.y) + fabs(err.z);
+#endif
+}
+
+// parse <name>[<start>{:<end>}]{,}	
+// the pointer starts here         ^
+// name is 1 or 2 chars and matches field names. start and end are decimal numbers
+void Utils::parse(const char *encoding, int &ptr, Field &field, int &endbit, int &len)
+{
+	if (ptr <= 0) return;
+	--ptr;
+	if (encoding[ptr] == ',') --ptr;
+	nvDebugCheck (encoding[ptr] == ']');
+	--ptr;
+	endbit = 0;
+	int scale = 1;
+	while (encoding[ptr] != ':' && encoding[ptr] != '[')
+	{
+		nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9');
+		endbit += (encoding[ptr--] - '0') * scale;
+		scale *= 10;
+	}
+	int startbit = 0; scale = 1;
+	if (encoding[ptr] == '[')
+		startbit = endbit;
+	else  
+	{
+		ptr--;
+		while (encoding[ptr] != '[')
+		{
+			nvDebugCheck(encoding[ptr] >= '0' && encoding[ptr] <= '9');
+			startbit += (encoding[ptr--] - '0') * scale;
+			scale *= 10;
+		}
+	}
+	len = startbit - endbit + 1;	// startbit>=endbit note
+	--ptr;
+	if (encoding[ptr] == 'm')		field = FIELD_M;
+	else if (encoding[ptr] == 'd')	field = FIELD_D;
+	else {
+		// it's wxyz
+		nvDebugCheck (encoding[ptr] >= 'w' && encoding[ptr] <= 'z');
+		int foo = encoding[ptr--] - 'w';
+		// now it is r g or b
+		if (encoding[ptr] == 'r')		foo += 10;
+		else if (encoding[ptr] == 'g')	foo += 20;
+		else if (encoding[ptr] == 'b')	foo += 30;
+		else nvDebugCheck(0);
+		field = (Field) foo;
+	}
+}
+
+
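
For reference (not from the NVTT sources), a worked example of the weighted lerp above: the weights sum to 64, so the blend is an integer multiply-accumulate followed by a rounding shift by 6.

// What Utils::lerp(a, b, i, denom) computes for a = 0, b = 100, i = 3, denom = 7:
//   weights = denom7_weights_64 = {0, 9, 18, 27, 37, 46, 55, 64}
//   result  = (a*weights[7-3] + b*weights[3] + 32) >> 6
//           = (0*37 + 100*27 + 32) >> 6
//           = 2732 >> 6
//           = 42                          // roughly 3/7 of the way from a to b
int lerp_example = ZOH::Utils::lerp(0, 100, 3, 7);   // == 42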

+ 73 - 0
3rdparty/nvtt/bc6h/zoh_utils.h

@@ -0,0 +1,73 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// utility class holding common routines
+#pragma once
+#ifndef _ZOH_UTILS_H
+#define _ZOH_UTILS_H
+
+#include "nvmath/Vector.h"
+
+namespace ZOH {
+
+inline int SIGN_EXTEND(int x, int nb) { return ((((signed(x))&(1<<((nb)-1)))?((~0)<<(nb)):0)|(signed(x))); }
+
+enum Field {
+    FIELD_M = 1,	// mode
+    FIELD_D = 2,	// distribution/shape
+    FIELD_RW = 10+0, FIELD_RX = 10+1, FIELD_RY = 10+2, FIELD_RZ = 10+3,	// red channel endpoints or deltas
+    FIELD_GW = 20+0, FIELD_GX = 20+1, FIELD_GY = 20+2, FIELD_GZ = 20+3,	// green channel endpoints or deltas
+    FIELD_BW = 30+0, FIELD_BX = 30+1, FIELD_BY = 30+2, FIELD_BZ = 30+3,	// blue channel endpoints or deltas
+};
+
+// some constants
+static const int F16S_MASK	=  0x8000;		// f16 sign mask
+static const int F16EM_MASK	=  0x7fff;		// f16 exp & mantissa mask
+static const int U16MAX		=  0xffff;
+static const int S16MIN		= -0x8000;
+static const int S16MAX		=  0x7fff;
+static const int INT16_MASK	=  0xffff;
+static const int F16MAX		=  0x7bff;		// MAXFLT bit pattern for halfs
+
+enum Format { UNSIGNED_F16, SIGNED_F16 };
+
+class Utils
+{
+public:
+    static Format FORMAT;     // this is a global -- we're either handling signed or unsigned half values
+
+    // error metrics
+    static float norm(const nv::Vector3 &a, const nv::Vector3 &b);
+    static float mpsnr_norm(const nv::Vector3 &a, int exposure, const nv::Vector3 &b);
+
+    // conversion & clamp
+    static int ushort_to_format(unsigned short input);
+    static unsigned short format_to_ushort(int input);
+
+    // clamp to format
+    static void clamp(nv::Vector3 &v);
+
+    // quantization and unquantization
+    static int finish_unquantize(int q, int prec);
+    static int unquantize(int q, int prec);
+    static int quantize(float value, int prec);
+
+    static void parse(const char *encoding, int &ptr, Field & field, int &endbit, int &len);
+
+    // lerping
+    static int lerp(int a, int b, int i, int denom);
+    static nv::Vector3 lerp(const nv::Vector3 & a, const nv::Vector3 & b, int i, int denom);
+};
+
+}
+
+#endif // _ZOH_UTILS_H
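
For reference (not from the NVTT sources), SIGN_EXTEND above turns an nb-bit two's-complement field into a full int; it is what makes the transformed (delta) endpoint fields signed when decoded. Two illustrative values:

int se_a = ZOH::SIGN_EXTEND(0x1F, 5);   // 0b11111 as a 5-bit field -> -1
int se_b = ZOH::SIGN_EXTEND(0x0F, 5);   // 0b01111, high bit clear  -> 15 (unchanged)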

+ 799 - 0
3rdparty/nvtt/bc6h/zohone.cpp

@@ -0,0 +1,799 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// one region zoh compress/decompress code
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+#include "bits.h"
+#include "tile.h"
+#include "zoh.h"
+#include "zoh_utils.h"
+
+#include "nvmath/Vector.inl"
+#include "nvmath/Fitting.h"
+
+#include <string.h> // strlen
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+using namespace ZOH;
+
+#define NINDICES	16
+#define	INDEXBITS	4
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+
+#define	NSHAPES	1
+
+static const int shapes[NSHAPES] =
+{
+    0x0000
+};	// only 1 shape
+
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NDELTA	2
+
+struct Chanpat
+{
+    int prec[NDELTA];		// precision pattern for one channel
+};
+
+struct Pattern
+{
+    Chanpat chan[NCHANNELS];// allow different bit patterns per channel -- but we still want constant precision per channel
+    int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+    int mode;				// associated mode value
+    int modebits;			// number of mode bits
+    const char *encoding;	// verilog description of encoding for this mode
+};
+
+#define MAXMODEBITS	5
+#define	MAXMODES (1<<MAXMODEBITS)
+
+#define	NPATTERNS 4
+
+static const Pattern patterns[NPATTERNS] =
+{
+    16,4,  16,4,  16,4,   1, 0x0f, 5, "bw[10],bw[11],bw[12],bw[13],bw[14],bw[15],bx[3:0],gw[10],gw[11],gw[12],gw[13],gw[14],gw[15],gx[3:0],rw[10],rw[11],rw[12],rw[13],rw[14],rw[15],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    12,8,  12,8,  12,8,   1, 0x0b, 5, "bw[10],bw[11],bx[7:0],gw[10],gw[11],gx[7:0],rw[10],rw[11],rx[7:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,9,  11,9,  11,9,   1, 0x07, 5, "bw[10],bx[8:0],gw[10],gx[8:0],rw[10],rx[8:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    10,10, 10,10, 10,10,  0, 0x03, 5, "bx[9:0],gx[9:0],rx[9:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+};
+
+// mapping of mode to the corresponding index in pattern
+static const int mode_to_pat[MAXMODES] = {
+    -1,-1,-1,
+    3,	// 0x03
+    -1,-1,-1,
+    2,	// 0x07
+    -1,-1,-1,
+    1,	// 0x0b
+    -1,-1,-1,
+    0,	// 0x0f
+    -1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,
+};
+
+#define	R_0(ep)	(ep)[0].A[i]
+#define	R_1(ep)	(ep)[0].B[i]
+#define	MASK(n)	((1<<(n))-1)
+
+// compress endpoints
+static void compress_endpts(const IntEndpts in[NREGIONS_ONE], ComprEndpts out[NREGIONS_ONE], const Pattern &p)
+{
+    if (p.transformed)
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
+            R_1(out) = (R_1(in) - R_0(in)) & MASK(p.chan[i].prec[1]);
+        }
+    }
+    else
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
+            R_1(out) = R_1(in) & MASK(p.chan[i].prec[1]);
+        }
+    }
+}
+
+// decompress endpoints
+static void decompress_endpts(const ComprEndpts in[NREGIONS_ONE], IntEndpts out[NREGIONS_ONE], const Pattern &p)
+{
+    bool issigned = Utils::FORMAT == SIGNED_F16;
+
+    if (p.transformed)
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+            int t;
+            t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+        }
+    }
+    else
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+            R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
+        }
+    }
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_ONE], int prec, IntEndpts q_endpts[NREGIONS_ONE])
+{
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, prec);
+        q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, prec);
+        q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, prec);
+        q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, prec);
+        q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, prec);
+        q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, prec);
+    }
+}
+
+// swap endpoints as needed to ensure that the index at index_one has a 0 high-order bit
+// index_one is 0 at x=0 y=0 and 15 at x=3 y=3 so y = (index >> 2) & 3 and x = index & 3
+static void swap_indices(IntEndpts endpts[NREGIONS_ONE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+    int index_positions[NREGIONS_ONE];
+
+    index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        int x = index_positions[region] & 3;
+        int y = (index_positions[region] >> 2) & 3;
+        nvDebugCheck(REGION(x,y,shapeindex) == region);		// double check the table
+        if (indices[y][x] & HIGH_INDEXBIT)
+        {
+            // high bit is set, swap the endpts and indices for this region
+            int t;
+            for (int i=0; i<NCHANNELS; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+            for (int y = 0; y < Tile::TILE_H; y++)
+                for (int x = 0; x < Tile::TILE_W; x++)
+                    if (REGION(x,y,shapeindex) == region)
+                        indices[y][x] = NINDICES - 1 - indices[y][x];
+        }
+    }
+}
+
+// endpoints fit only if the compression was lossless
+static bool endpts_fit(const IntEndpts orig[NREGIONS_ONE], const ComprEndpts compressed[NREGIONS_ONE], const Pattern &p)
+{
+    IntEndpts uncompressed[NREGIONS_ONE];
+
+    decompress_endpts(compressed, uncompressed, p);
+
+    for (int j=0; j<NREGIONS_ONE; ++j)
+	for (int i=0; i<NCHANNELS; ++i)
+	{
+        if (orig[j].A[i] != uncompressed[j].A[i]) return false;
+        if (orig[j].B[i] != uncompressed[j].B[i]) return false;
+    }
+    return true;
+}
+
+static void write_header(const ComprEndpts endpts[NREGIONS_ONE], const Pattern &p, Bits &out)
+{
+    // interpret the verilog backwards and process it
+    int m = p.mode;
+    int rw = endpts[0].A[0], rx = endpts[0].B[0];
+    int gw = endpts[0].A[1], gx = endpts[0].B[1];
+    int bw = endpts[0].A[2], bx = endpts[0].B[2];
+    int ptr = int(strlen(p.encoding));
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+		// !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+        switch(field)
+        {
+        case FIELD_M:	out.write( m >> endbit, len); break;
+        case FIELD_RW:	out.write(rw >> endbit, len); break;
+        case FIELD_RX:	out.write(rx >> endbit, len); break;
+        case FIELD_GW:	out.write(gw >> endbit, len); break;
+        case FIELD_GX:	out.write(gx >> endbit, len); break;
+        case FIELD_BW:	out.write(bw >> endbit, len); break;
+        case FIELD_BX:	out.write(bx >> endbit, len); break;
+
+        case FIELD_D:
+        case FIELD_RY:
+        case FIELD_RZ:
+        case FIELD_GY:
+        case FIELD_GZ:
+        case FIELD_BY:
+        case FIELD_BZ:
+        default: nvUnreachable();
+        }
+    }
+}
+
+static void read_header(Bits &in, ComprEndpts endpts[NREGIONS_ONE], Pattern &p)
+{
+    // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
+    int mode = in.read(2);
+    if (mode != 0x00 && mode != 0x01)
+        mode = (in.read(3) << 2) | mode;
+
+    int pat_index = mode_to_pat[mode];
+
+    nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
+    nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
+
+    p = patterns[pat_index];
+
+    int d;
+    int rw, rx;
+    int gw, gx;
+    int bw, bx;
+
+    d = 0;
+    rw = rx = 0;
+    gw = gx = 0;
+    bw = bx = 0;
+
+    int ptr = int(strlen(p.encoding));
+
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+		// !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        switch(field)
+        {
+        case FIELD_M:	break;	// already processed so ignore
+        case FIELD_RW:	rw |= in.read(len) << endbit; break;
+        case FIELD_RX:	rx |= in.read(len) << endbit; break;
+        case FIELD_GW:	gw |= in.read(len) << endbit; break;
+        case FIELD_GX:	gx |= in.read(len) << endbit; break;
+        case FIELD_BW:	bw |= in.read(len) << endbit; break;
+        case FIELD_BX:	bx |= in.read(len) << endbit; break;
+
+        case FIELD_D:
+        case FIELD_RY:
+        case FIELD_RZ:
+        case FIELD_GY:
+        case FIELD_GZ:
+        case FIELD_BY:
+        case FIELD_BZ:
+        default: nvUnreachable();
+        }
+    }
+
+    nvDebugCheck (in.getptr() == 128 - 63);
+
+    endpts[0].A[0] = rw; endpts[0].B[0] = rx;
+    endpts[0].A[1] = gw; endpts[0].B[1] = gx;
+    endpts[0].A[2] = bw; endpts[0].B[2] = bx;
+}
+
+// compress index 0
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        int x = POS_TO_X(pos);
+        int y = POS_TO_Y(pos);
+
+        out.write(indices[y][x], INDEXBITS - ((pos == 0) ? 1 : 0));
+    }
+}
+
+static void emit_block(const ComprEndpts endpts[NREGIONS_ONE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+    Bits out(block, ZOH::BITSIZE);
+
+    write_header(endpts, p, out);
+
+    write_indices(indices, shapeindex, out);
+
+    nvDebugCheck(out.getptr() == ZOH::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
+{
+    // scale endpoints
+    int a, b;			// really need a IntVector3...
+
+    a = Utils::unquantize(endpts.A[0], prec);
+    b = Utils::unquantize(endpts.B[0], prec);
+
+    // interpolate
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+
+    a = Utils::unquantize(endpts.A[1], prec);
+    b = Utils::unquantize(endpts.B[1], prec);
+
+    // interpolate
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+
+    a = Utils::unquantize(endpts.A[2], prec);
+    b = Utils::unquantize(endpts.B[2], prec);
+
+    // interpolate
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+}
+
+// position 0 was compressed
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        int x = POS_TO_X(pos);
+        int y = POS_TO_Y(pos);
+
+        indices[y][x]= in.read(INDEXBITS - ((pos == 0) ? 1 : 0));
+    }
+}
+
+void ZOH::decompressone(const char *block, Tile &t)
+{
+    Bits in(block, ZOH::BITSIZE);
+
+    Pattern p;
+    IntEndpts endpts[NREGIONS_ONE];
+    ComprEndpts compr_endpts[NREGIONS_ONE];
+
+    read_header(in, compr_endpts, p);
+    int shapeindex = 0;		// only one shape
+
+    decompress_endpts(compr_endpts, endpts, p);
+
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+    for (int r = 0; r < NREGIONS_ONE; ++r)
+        generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);
+
+    // read indices
+    int indices[Tile::TILE_H][Tile::TILE_W];
+
+    read_indices(in, shapeindex, indices);
+
+    nvDebugCheck(in.getptr() == ZOH::BITSIZE);
+
+    // lookup
+    for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+            t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
+{
+    Vector3 palette[NINDICES];
+    float toterr = 0;
+    Vector3 err;
+
+    generate_palette_quantized(endpts, prec, palette);
+
+    for (int i = 0; i < np; ++i)
+    {
+        float err, besterr;
+
+        besterr = Utils::norm(colors[i], palette[0]) * importance[i];
+
+        for (int j = 1; j < NINDICES && besterr > 0; ++j)
+        {
+            err = Utils::norm(colors[i], palette[j]) * importance[i];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_ONE], int prec, 
+                           int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_ONE])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+    {
+        generate_palette_quantized(endpts[region], prec, &palette[region][0]);
+        toterr[region] = 0;
+    }
+
+    Vector3 err;
+
+    for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+        int region = REGION(x,y,shapeindex);
+        float err, besterr;
+
+        besterr = Utils::norm(tile.data[y][x], palette[region][0]);
+        indices[y][x] = 0;
+
+        for (int i = 1; i < NINDICES && besterr > 0; ++i)
+        {
+            err = Utils::norm(tile.data[y][x], palette[region][i]);
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+            {
+                besterr = err;
+                indices[y][x] = i;
+            }
+        }
+        toterr[region] += besterr;
+    }
+}
+
+static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
+                          float old_err, int do_b)
+{
+    // we have the old endpoints: old_endpts
+    // we have the perturbed endpoints: new_endpts
+    // we have the temporary endpoints: temp_endpts
+
+    IntEndpts temp_endpts;
+    float min_err = old_err;		// start with the best current error
+    int beststep;
+
+    // copy real endpoints so we can perturb them
+    for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
+
+    // do a logarithmic search for the best error for this endpoint (which)
+    for (int step = 1 << (prec-1); step; step >>= 1)
+    {
+        bool improved = false;
+        for (int sign = -1; sign <= 1; sign += 2)
+        {
+            if (do_b == 0)
+            {
+                temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+                if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+                    continue;
+            }
+            else
+            {
+                temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+                if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+                    continue;
+            }
+
+            float err = map_colors(colors, importance, np, temp_endpts, prec);
+
+            if (err < min_err)
+            {
+                improved = true;
+                min_err = err;
+                beststep = sign * step;
+            }
+        }
+        // if this was an improvement, move the endpoint and continue search from there
+        if (improved)
+        {
+            if (do_b == 0)
+                new_endpts.A[ch] += beststep;
+            else
+                new_endpts.B[ch] += beststep;
+        }
+    }
+    return min_err;
+}
+
+static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
+{
+    float opt_err = orig_err;
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        opt_endpts.A[ch] = orig_endpts.A[ch];
+        opt_endpts.B[ch] = orig_endpts.B[ch];
+    }
+    /*
+        err0 = perturb(rgb0, delta0)
+        err1 = perturb(rgb1, delta1)
+        if (err0 < err1)
+            if (err0 >= initial_error) break
+            rgb0 += delta0
+            next = 1
+        else
+            if (err1 >= initial_error) break
+            rgb1 += delta1
+            next = 0
+        initial_err = map()
+        for (;;)
+            err = perturb(next ? rgb1:rgb0, delta)
+            if (err >= initial_err) break
+            next? rgb1 : rgb0 += delta
+            initial_err = err
+	*/
+    IntEndpts new_a, new_b;
+    IntEndpts new_endpt;
+    int do_b;
+
+    // now optimize each channel separately
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        // figure out which endpoint when perturbed gives the most improvement and start there
+        // if we just alternate, we can easily end up in a local minima
+        float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1);	// perturb endpt B
+
+        if (err0 < err1)
+        {
+            if (err0 >= opt_err)
+                continue;
+
+            opt_endpts.A[ch] = new_a.A[ch];
+            opt_err = err0;
+            do_b = 1;		// do B next
+        }
+        else
+        {
+            if (err1 >= opt_err)
+                continue;
+            opt_endpts.B[ch] = new_b.B[ch];
+            opt_err = err1;
+            do_b = 0;		// do A next
+        }
+
+        // now alternate endpoints and keep trying until there is no improvement
+        for (;;)
+        {
+            float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
+            if (err >= opt_err)
+                break;
+            if (do_b == 0)
+                opt_endpts.A[ch] = new_endpt.A[ch];
+            else
+                opt_endpts.B[ch] = new_endpt.B[ch];
+            opt_err = err;
+            do_b = 1 - do_b;	// now move the other endpoint
+        }
+    }
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_ONE], 
+                            const IntEndpts orig_endpts[NREGIONS_ONE], int prec, IntEndpts opt_endpts[NREGIONS_ONE])
+{
+    Vector3 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+    float err = 0;
+
+    for (int region=0; region<NREGIONS_ONE; ++region)
+    {
+        // collect the pixels in the region
+        int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    ++np;
+                }
+            }
+        }
+
+        optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
+    }
+}
+
+/* optimization algorithm
+    for each pattern
+        convert endpoints using pattern precision
+        assign indices and get initial error
+        compress indices (and possibly reorder endpoints)
+        transform endpoints
+        if transformed endpoints fit pattern
+            get original endpoints back
+            optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+            compress new indices
+            transform new endpoints
+            if new endpoints fit pattern AND if error is improved
+                emit compressed block with new data
+            else
+                emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+float ZOH::refineone(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_ONE], char *block)
+{
+    float orig_err[NREGIONS_ONE], opt_err[NREGIONS_ONE], orig_toterr, opt_toterr;
+    IntEndpts orig_endpts[NREGIONS_ONE], opt_endpts[NREGIONS_ONE];
+    ComprEndpts compr_orig[NREGIONS_ONE], compr_opt[NREGIONS_ONE];
+    int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+    for (int sp = 0; sp < NPATTERNS; ++sp)
+    {
+        // precisions for all channels need to be the same
+        for (int i=1; i<NCHANNELS; ++i) nvDebugCheck (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);
+
+        quantize_endpts(endpts, patterns[sp].chan[0].prec[0], orig_endpts);
+        assign_indices(tile, shapeindex_best, orig_endpts, patterns[sp].chan[0].prec[0], orig_indices, orig_err);
+        swap_indices(orig_endpts, orig_indices, shapeindex_best);
+        compress_endpts(orig_endpts, compr_orig, patterns[sp]);
+        if (endpts_fit(orig_endpts, compr_orig, patterns[sp]))
+        {
+            optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, patterns[sp].chan[0].prec[0], opt_endpts);
+            assign_indices(tile, shapeindex_best, opt_endpts, patterns[sp].chan[0].prec[0], opt_indices, opt_err);
+            swap_indices(opt_endpts, opt_indices, shapeindex_best);
+            compress_endpts(opt_endpts, compr_opt, patterns[sp]);
+            orig_toterr = opt_toterr = 0;
+            for (int i=0; i < NREGIONS_ONE; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+
+            if (endpts_fit(opt_endpts, compr_opt, patterns[sp]) && opt_toterr < orig_toterr)
+            {
+                emit_block(compr_opt, shapeindex_best, patterns[sp], opt_indices, block);
+                return opt_toterr;
+            }
+            else
+            {
+                // either it stopped fitting when we optimized it, or there was no improvement
+                // so go back to the unoptimized endpoints which we know will fit
+                emit_block(compr_orig, shapeindex_best, patterns[sp], orig_indices, block);
+                return orig_toterr;
+            }
+        }
+    }
+
+	nvAssert (false); // "No candidate found, should never happen (refineone.)";
+	return FLT_MAX;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_ONE], Vector3 palette[NREGIONS_ONE][NINDICES])
+{
+    for (int region = 0; region < NREGIONS_ONE; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+            palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_ONE])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_ONE][NINDICES];
+
+    generate_palette_unquantized(endpts, palette);
+
+    float toterr = 0;
+    Vector3 err;
+
+    for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+        int region = REGION(x,y,shapeindex);
+        float err, besterr;
+
+        besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
+
+        for (int i = 1; i < NINDICES && besterr > 0; ++i)
+        {
+            err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+float ZOH::roughone(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_ONE])
+{
+    for (int region=0; region<NREGIONS_ONE; ++region)
+    {
+        int np = 0;
+        Vector3 colors[Tile::TILE_TOTAL];
+        Vector3 mean(0,0,0);
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x,y,shapeindex) == region)
+                {
+                    colors[np] = tile.data[y][x];
+                    mean += tile.data[y][x];
+                    ++np;
+                }
+            }
+        }
+
+        // handle simple cases
+        if (np == 0)
+        {
+            Vector3 zero(0,0,0);
+            endpts[region].A = zero;
+            endpts[region].B = zero;
+            continue;
+        }
+        else if (np == 1)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[0];
+            continue;
+        }
+        else if (np == 2)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[1];
+            continue;
+        }
+
+        mean /= float(np);
+
+        Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+        // project each pixel value along the principal direction
+        float minp = FLT_MAX, maxp = -FLT_MAX;
+        for (int i = 0; i < np; i++)
+        {
+            float dp = dot(colors[i]-mean, direction);
+            if (dp < minp) minp = dp;
+            if (dp > maxp) maxp = dp;
+        }
+
+        // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+        endpts[region].A = mean + minp*direction;
+        endpts[region].B = mean + maxp*direction;
+
+        // clamp endpoints
+        // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+        // shape based on endpoints being clamped
+        Utils::clamp(endpts[region].A);
+        Utils::clamp(endpts[region].B);
+    }
+
+    return map_colors(tile, shapeindex, endpts);
+}
+
+float ZOH::compressone(const Tile &t, char *block)
+{
+    int shapeindex_best = 0;
+    FltEndpts endptsbest[NREGIONS_ONE], tempendpts[NREGIONS_ONE];
+    float msebest = FLT_MAX;
+
+    /*
+		collect the mse values that are within 5% of the best values
+		optimize each one and choose the best
+	*/
+    // hack for now -- just use the best value WORK
+    for (int i=0; i<NSHAPES && msebest>0.0; ++i)
+    {
+        float mse = roughone(t, i, tempendpts);
+        if (mse < msebest)
+        {
+            msebest = mse;
+            shapeindex_best = i;
+            memcpy(endptsbest, tempendpts, sizeof(endptsbest));
+        }
+
+    }
+    return refineone(t, shapeindex_best, endptsbest, block);
+}

+ 883 - 0
3rdparty/nvtt/bc6h/zohtwo.cpp

@@ -0,0 +1,883 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// two regions zoh compress/decompress code
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+/* optimization algorithm
+
+	get initial float endpoints
+	convert endpoints using 16 bit precision, transform, and get bit delta. choose likely endpoint compression candidates.
+		note that there will be 1 or 2 candidates; 2 will be chosen when the delta values are close to the max possible.
+	for each EC candidate in order from max precision to smaller precision
+		convert endpoints using the appropriate precision.
+		optimize the endpoints and minimize square error. save the error and index assignments. apply index compression as well.
+			(thus the endpoints and indices are in final form.)
+		transform and get bit delta.
+		if the bit delta fits, exit
+	if we ended up with no candidates somehow, choose the tail set of EC candidates and retry. this should hardly ever happen.
+		add a state variable and an nvDebugCheck to verify we only do this once.
+	convert to bit stream.
+	return the error.
+
+	Global optimization
+		order all tiles based on their errors
+		do something special for high-error tiles
+			the goal here is to try to avoid tiling artifacts. but I think this is a research problem. let's just generate an error image...
+
+	display an image that shows partitioning and precision selected for each tile
+*/
+
+#include "bits.h"
+#include "tile.h"
+#include "zoh.h"
+#include "zoh_utils.h"
+
+#include "nvmath/Fitting.h"
+#include "nvmath/Vector.inl"
+
+#include <string.h> // strlen
+#include <float.h> // FLT_MAX
+
+using namespace nv;
+using namespace ZOH;
+
+#define NINDICES	8
+#define	INDEXBITS	3
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
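+// two-region blocks use 3-bit indices: 8 palette entries per region, interpolated with denominator NINDICES-1 = 7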
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#include "shapes_two.h"
+// use only the first 32 available shapes
+#undef NSHAPES
+#undef SHAPEBITS
+#define NSHAPES 32
+#define SHAPEBITS 5
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NDELTA	4
+
+struct Chanpat
+{
+    int prec[NDELTA];		// precision pattern for one channel
+};
+
+struct Pattern
+{
+    Chanpat chan[NCHANNELS];    // allow different bit patterns per channel -- but we still want constant precision per channel
+    int transformed;            // if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+    int mode;                   // associated mode value
+    int modebits;               // number of mode bits
+    const char *encoding;       // verilog description of encoding for this mode
+};
+
+#define MAXMODEBITS	5
+#define	MAXMODES (1<<MAXMODEBITS)
+
+#define	NPATTERNS 10
+
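+// each row below gives, per channel (r,g,b), the precisions of the four endpoint fields {w,x,y,z},
+// then the transformed flag, the mode value, the mode bit count, and the header bit-layout string;
+// write_header processes the layout string from its end, so the mode bits (listed last) are written first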
+static const Pattern patterns[NPATTERNS] =
+{
+    11,5,5,5,	11,4,4,4,	11,4,4,4,	1,	0x02, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],rw[10],rx[4:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,4,4,4,	11,5,5,5,	11,4,4,4,	1,	0x06, 5, "d[4:0],bz[3],gy[4],rz[3:0],bz[2],bz[0],ry[3:0],by[3:0],bz[1],bw[10],bx[3:0],gz[3:0],gw[10],gx[4:0],gy[3:0],gz[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    11,4,4,4,	11,4,4,4,	11,5,5,5,	1,	0x0a, 5, "d[4:0],bz[3],bz[4],rz[3:0],bz[2:1],ry[3:0],by[3:0],bw[10],bx[4:0],gz[3:0],bz[0],gw[10],gx[3:0],gy[3:0],by[4],rw[10],rx[3:0],bw[9:0],gw[9:0],rw[9:0],m[4:0]",
+    10,5,5,5,	10,5,5,5,	10,5,5,5,	1,	0x00, 2, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bw[9:0],gw[9:0],rw[9:0],bz[4],by[4],gy[4],m[1:0]",
+    9,5,5,5,	9,5,5,5,	9,5,5,5,	1,	0x0e, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bw[8:0],gy[4],gw[8:0],by[4],rw[8:0],m[4:0]",
+    8,6,6,6,	8,5,5,5,	8,5,5,5,	1,	0x12, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bz[1],bx[4:0],gz[3:0],bz[0],gx[4:0],gy[3:0],rx[5:0],bz[4:3],bw[7:0],gy[4],bz[2],gw[7:0],by[4],gz[4],rw[7:0],m[4:0]",
+    8,5,5,5,	8,6,6,6,	8,5,5,5,	1,	0x16, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bz[1],bx[4:0],gz[3:0],gx[5:0],gy[3:0],gz[4],rx[4:0],bz[4],gz[5],bw[7:0],gy[4],gy[5],gw[7:0],by[4],bz[0],rw[7:0],m[4:0]",
+    8,5,5,5,	8,5,5,5,	8,6,6,6,	1,	0x1a, 5, "d[4:0],bz[3],rz[4:0],bz[2],ry[4:0],by[3:0],bx[5:0],gz[3:0],bz[0],gx[4:0],gy[3:0],gz[4],rx[4:0],bz[4],bz[5],bw[7:0],gy[4],by[5],gw[7:0],by[4],bz[1],rw[7:0],m[4:0]",
+    7,6,6,6,	7,6,6,6,	7,6,6,6,	1,	0x01, 2, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],bw[6:0],gy[4],bz[2],by[5],gw[6:0],by[4],bz[1:0],rw[6:0],gz[5:4],gy[5],m[1:0]",
+    6,6,6,6,	6,6,6,6,	6,6,6,6,	0,	0x1e, 5, "d[4:0],rz[5:0],ry[5:0],by[3:0],bx[5:0],gz[3:0],gx[5:0],gy[3:0],rx[5:0],bz[4],bz[5],bz[3],gz[5],bw[5:0],gy[4],bz[2],by[5],gy[5],gw[5:0],by[4],bz[1:0],gz[4],rw[5:0],m[4:0]",
+};
+
+// mapping of mode to the corresponding index in pattern
+// UNUSED ZOH MODES are 0x13, 0x17, 0x1b, 0x1f -- return -2 for these
+static const int mode_to_pat[MAXMODES] = {	
+    3,	// 0x00
+    8,	// 0x01
+    0,	// 0x02
+    -1,-1,-1,
+    1,	// 0x06
+    -1,-1,-1,
+    2,	// 0x0a
+    -1,-1,-1,
+    4,	// 0x0e
+    -1,-1,-1,
+    5,	// 0x12
+    -2,-1,-1,
+    6,	// 0x16
+    -2,-1,-1,
+    7,	// 0x1a
+    -2,-1,-1,
+    9,	// 0x1e
+    -2
+};
+
+#define	R_0(ep)	(ep)[0].A[i]
+#define	R_1(ep)	(ep)[0].B[i]
+#define	R_2(ep)	(ep)[1].A[i]
+#define	R_3(ep)	(ep)[1].B[i]
+#define	MASK(n)	((1<<(n))-1)
+
+// compress endpoints
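+// when the pattern is transformed, endpoint A of region 0 is stored as-is and the other three
+// endpoints are stored as deltas from it, each masked to that field's precision; otherwise all
+// four endpoints are stored directly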
+static void compress_endpts(const IntEndpts in[NREGIONS_TWO], ComprEndpts out[NREGIONS_TWO], const Pattern &p)
+{
+    if (p.transformed)
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
+            R_1(out) = (R_1(in) - R_0(in)) & MASK(p.chan[i].prec[1]);
+            R_2(out) = (R_2(in) - R_0(in)) & MASK(p.chan[i].prec[2]);
+            R_3(out) = (R_3(in) - R_0(in)) & MASK(p.chan[i].prec[3]);
+        }
+    }
+    else
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = R_0(in) & MASK(p.chan[i].prec[0]);
+            R_1(out) = R_1(in) & MASK(p.chan[i].prec[1]);
+            R_2(out) = R_2(in) & MASK(p.chan[i].prec[2]);
+            R_3(out) = R_3(in) & MASK(p.chan[i].prec[3]);
+        }
+    }
+}
+
+// decompress endpoints
+static void decompress_endpts(const ComprEndpts in[NREGIONS_TWO], IntEndpts out[NREGIONS_TWO], const Pattern &p)
+{
+    bool issigned = Utils::FORMAT == SIGNED_F16;
+
+    if (p.transformed)
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+            int t;
+            t = SIGN_EXTEND(R_1(in), p.chan[i].prec[1]);
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_1(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+            t = SIGN_EXTEND(R_2(in), p.chan[i].prec[2]);
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_2(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+            t = SIGN_EXTEND(R_3(in), p.chan[i].prec[3]);
+            t = (t + R_0(in)) & MASK(p.chan[i].prec[0]);
+            R_3(out) = issigned ? SIGN_EXTEND(t,p.chan[i].prec[0]) : t;
+        }
+    }
+    else
+    {
+        for (int i=0; i<NCHANNELS; ++i)
+        {
+            R_0(out) = issigned ? SIGN_EXTEND(R_0(in),p.chan[i].prec[0]) : R_0(in);
+            R_1(out) = issigned ? SIGN_EXTEND(R_1(in),p.chan[i].prec[1]) : R_1(in);
+            R_2(out) = issigned ? SIGN_EXTEND(R_2(in),p.chan[i].prec[2]) : R_2(in);
+            R_3(out) = issigned ? SIGN_EXTEND(R_3(in),p.chan[i].prec[3]) : R_3(in);
+        }
+    }
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_TWO], int prec, IntEndpts q_endpts[NREGIONS_TWO])
+{
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, prec);
+        q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, prec);
+        q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, prec);
+        q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, prec);
+        q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, prec);
+        q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, prec);
+    }
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
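+// (the anchor index of each region is written with one less bit, so its high-order bit must be zero)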
+static void swap_indices(IntEndpts endpts[NREGIONS_TWO], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+        int x = POS_TO_X(position);
+        int y = POS_TO_Y(position);
+        nvDebugCheck(REGION(x,y,shapeindex) == region);		// double check the table
+        if (indices[y][x] & HIGH_INDEXBIT)
+        {
+            // high bit is set, swap the endpts and indices for this region
+            int t;
+            for (int i=0; i<NCHANNELS; ++i)
+            {
+                t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
+            }
+
+            for (int y = 0; y < Tile::TILE_H; y++)
+                for (int x = 0; x < Tile::TILE_W; x++)
+                    if (REGION(x,y,shapeindex) == region)
+                        indices[y][x] = NINDICES - 1 - indices[y][x];
+        }
+    }
+}
+
+// endpoints fit only if the compression was lossless
+static bool endpts_fit(const IntEndpts orig[NREGIONS_TWO], const ComprEndpts compressed[NREGIONS_TWO], const Pattern &p)
+{
+    IntEndpts uncompressed[NREGIONS_TWO];
+
+    decompress_endpts(compressed, uncompressed, p);
+
+    for (int j=0; j<NREGIONS_TWO; ++j)
+    {
+	for (int i=0; i<NCHANNELS; ++i)
+	{
+            if (orig[j].A[i] != uncompressed[j].A[i]) return false;
+            if (orig[j].B[i] != uncompressed[j].B[i]) return false;
+        }
+    }
+    return true;
+}
+
+static void write_header(const ComprEndpts endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, Bits &out)
+{
+    // interpret the verilog backwards and process it
+    int m = p.mode;
+    int d = shapeindex;
+    int rw = endpts[0].A[0], rx = endpts[0].B[0], ry = endpts[1].A[0], rz = endpts[1].B[0];
+    int gw = endpts[0].A[1], gx = endpts[0].B[1], gy = endpts[1].A[1], gz = endpts[1].B[1];
+    int bw = endpts[0].A[2], bx = endpts[0].B[2], by = endpts[1].A[2], bz = endpts[1].B[2];
+    int ptr = int(strlen(p.encoding));
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+		// !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+        switch(field)
+        {
+        case FIELD_M:	out.write( m >> endbit, len); break;
+        case FIELD_D:	out.write( d >> endbit, len); break;
+        case FIELD_RW:	out.write(rw >> endbit, len); break;
+        case FIELD_RX:	out.write(rx >> endbit, len); break;
+        case FIELD_RY:	out.write(ry >> endbit, len); break;
+        case FIELD_RZ:	out.write(rz >> endbit, len); break;
+        case FIELD_GW:	out.write(gw >> endbit, len); break;
+        case FIELD_GX:	out.write(gx >> endbit, len); break;
+        case FIELD_GY:	out.write(gy >> endbit, len); break;
+        case FIELD_GZ:	out.write(gz >> endbit, len); break;
+        case FIELD_BW:	out.write(bw >> endbit, len); break;
+        case FIELD_BX:	out.write(bx >> endbit, len); break;
+        case FIELD_BY:	out.write(by >> endbit, len); break;
+        case FIELD_BZ:	out.write(bz >> endbit, len); break;
+        default: nvUnreachable();
+        }
+    }
+}
+
+static bool read_header(Bits &in, ComprEndpts endpts[NREGIONS_TWO], int &shapeindex, Pattern &p)
+{
+    // reading isn't quite symmetric with writing -- we don't know the encoding until we decode the mode
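+    // the mode field is 2 bits for modes 0x00 and 0x01, and 5 bits for everything else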
+    int mode = in.read(2);
+    if (mode != 0x00 && mode != 0x01)
+        mode = (in.read(3) << 2) | mode;
+
+    int pat_index = mode_to_pat[mode];
+
+    if (pat_index == -2)
+        return false;		// reserved mode found
+
+    nvDebugCheck (pat_index >= 0 && pat_index < NPATTERNS);
+    nvDebugCheck (in.getptr() == patterns[pat_index].modebits);
+
+    p = patterns[pat_index];
+
+    int d;
+    int rw, rx, ry, rz;
+    int gw, gx, gy, gz;
+    int bw, bx, by, bz;
+
+    d = 0;
+    rw = rx = ry = rz = 0;
+    gw = gx = gy = gz = 0;
+    bw = bx = by = bz = 0;
+
+    int ptr = int(strlen(p.encoding));
+
+    while (ptr)
+    {
+        Field field;
+        int endbit, len;
+
+		// !!!UNDONE: get rid of string parsing!!!
+        Utils::parse(p.encoding, ptr, field, endbit, len);
+
+        switch(field)
+        {
+        case FIELD_M:	break;	// already processed so ignore
+        case FIELD_D:	 d |= in.read(len) << endbit; break;
+        case FIELD_RW:	rw |= in.read(len) << endbit; break;
+        case FIELD_RX:	rx |= in.read(len) << endbit; break;
+        case FIELD_RY:	ry |= in.read(len) << endbit; break;
+        case FIELD_RZ:	rz |= in.read(len) << endbit; break;
+        case FIELD_GW:	gw |= in.read(len) << endbit; break;
+        case FIELD_GX:	gx |= in.read(len) << endbit; break;
+        case FIELD_GY:	gy |= in.read(len) << endbit; break;
+        case FIELD_GZ:	gz |= in.read(len) << endbit; break;
+        case FIELD_BW:	bw |= in.read(len) << endbit; break;
+        case FIELD_BX:	bx |= in.read(len) << endbit; break;
+        case FIELD_BY:	by |= in.read(len) << endbit; break;
+        case FIELD_BZ:	bz |= in.read(len) << endbit; break;
+        default: nvUnreachable();
+        }
+    }
+
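+    // header is 82 bits: the 128-bit block minus 46 index bits (16 pixels * 3 bits - 2 anchor bits)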
+    nvDebugCheck (in.getptr() == 128 - 46);
+
+    shapeindex = d;
+    endpts[0].A[0] = rw; endpts[0].B[0] = rx; endpts[1].A[0] = ry; endpts[1].B[0] = rz;
+    endpts[0].A[1] = gw; endpts[0].B[1] = gx; endpts[1].A[1] = gy; endpts[1].B[1] = gz;
+    endpts[0].A[2] = bw; endpts[0].B[2] = bx; endpts[1].A[2] = by; endpts[1].B[2] = bz;
+
+    return true;
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+    int positions[NREGIONS_TWO];
+
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+        positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        int x = POS_TO_X(pos);
+        int y = POS_TO_Y(pos);
+
+        bool match = false;
+
+        for (int r = 0; r < NREGIONS_TWO; ++r)
+            if (positions[r] == pos) { match = true; break; }
+
+        out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+    }
+}
+
+static void emit_block(const ComprEndpts compr_endpts[NREGIONS_TWO], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+    Bits out(block, ZOH::BITSIZE);
+
+    write_header(compr_endpts, shapeindex, p, out);
+
+    write_indices(indices, shapeindex, out);
+
+    nvDebugCheck(out.getptr() == ZOH::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndpts &endpts, int prec, Vector3 palette[NINDICES])
+{
+    // scale endpoints
+    int a, b;			// really need an IntVector3...
+
+    a = Utils::unquantize(endpts.A[0], prec);
+    b = Utils::unquantize(endpts.B[0], prec);
+
+    // interpolate
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].x = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+
+    a = Utils::unquantize(endpts.A[1], prec);
+    b = Utils::unquantize(endpts.B[1], prec);
+
+    // interpolate
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].y = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+
+    a = Utils::unquantize(endpts.A[2], prec);
+    b = Utils::unquantize(endpts.B[2], prec);
+
+    // interpolate
+    for (int i = 0; i < NINDICES; ++i)
+        palette[i].z = float(Utils::finish_unquantize(Utils::lerp(a, b, i, DENOM), prec));
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+    int positions[NREGIONS_TWO];
+
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+        positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+    for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+    {
+        int x = POS_TO_X(pos);
+        int y = POS_TO_Y(pos);
+
+        bool match = false;
+
+        for (int r = 0; r < NREGIONS_TWO; ++r)
+            if (positions[r] == pos) { match = true; break; }
+
+        indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+    }
+}
+
+void ZOH::decompresstwo(const char *block, Tile &t)
+{
+    Bits in(block, ZOH::BITSIZE);
+
+    Pattern p;
+    IntEndpts endpts[NREGIONS_TWO];
+    ComprEndpts compr_endpts[NREGIONS_TWO];
+    int shapeindex;
+
+    if (!read_header(in, compr_endpts, shapeindex, p))
+    {
+        // reserved mode, return all zeroes
+        for (int y = 0; y < Tile::TILE_H; y++)
+            for (int x = 0; x < Tile::TILE_W; x++)
+                t.data[y][x] = Vector3(0.0f);
+
+        return;
+    }
+
+    decompress_endpts(compr_endpts, endpts, p);
+
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+    for (int r = 0; r < NREGIONS_TWO; ++r)
+        generate_palette_quantized(endpts[r], p.chan[0].prec[0], &palette[r][0]);
+
+    int indices[Tile::TILE_H][Tile::TILE_W];
+
+    read_indices(in, shapeindex, indices);
+
+    nvDebugCheck(in.getptr() == ZOH::BITSIZE);
+
+    // lookup
+    for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+        t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector3 colors[], const float importance[], int np, const IntEndpts &endpts, int prec)
+{
+    Vector3 palette[NINDICES];
+    float toterr = 0;
+    Vector3 err;
+
+    generate_palette_quantized(endpts, prec, palette);
+
+    for (int i = 0; i < np; ++i)
+    {
+        float err, besterr;
+
+        besterr = Utils::norm(colors[i], palette[0]) * importance[i];
+
+        for (int j = 1; j < NINDICES && besterr > 0; ++j)
+        {
+            err = Utils::norm(colors[i], palette[j]) * importance[i];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndpts endpts[NREGIONS_TWO], int prec, 
+                           int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_TWO])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+    {
+        generate_palette_quantized(endpts[region], prec, &palette[region][0]);
+        toterr[region] = 0;
+    }
+
+    Vector3 err;
+
+    for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+        int region = REGION(x,y,shapeindex);
+        float err, besterr;
+
+        besterr = Utils::norm(tile.data[y][x], palette[region][0]);
+        indices[y][x] = 0;
+
+        for (int i = 1; i < NINDICES && besterr > 0; ++i)
+        {
+            err = Utils::norm(tile.data[y][x], palette[region][i]);
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+            {
+                besterr = err;
+                indices[y][x] = i;
+            }
+        }
+        toterr[region] += besterr;
+    }
+}
+
+static float perturb_one(const Vector3 colors[], const float importance[], int np, int ch, int prec, const IntEndpts &old_endpts, IntEndpts &new_endpts,
+                          float old_err, int do_b)
+{
+    // we have the old endpoints: old_endpts
+    // we have the perturbed endpoints: new_endpts
+    // we have the temporary endpoints: temp_endpts
+
+    IntEndpts temp_endpts;
+    float min_err = old_err;		// start with the best current error
+    int beststep;
+
+    // copy real endpoints so we can perturb them
+    for (int i=0; i<NCHANNELS; ++i) { temp_endpts.A[i] = new_endpts.A[i] = old_endpts.A[i]; temp_endpts.B[i] = new_endpts.B[i] = old_endpts.B[i]; }
+
+    // do a logarithmic search for the best error for this endpoint (A if do_b == 0, else B)
+    for (int step = 1 << (prec-1); step; step >>= 1)
+    {
+        bool improved = false;
+        for (int sign = -1; sign <= 1; sign += 2)
+        {
+            if (do_b == 0)
+            {
+                temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+                if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+                    continue;
+            }
+            else
+            {
+                temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+                if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+                    continue;
+            }
+
+            float err = map_colors(colors, importance, np, temp_endpts, prec);
+
+            if (err < min_err)
+            {
+                improved = true;
+                min_err = err;
+                beststep = sign * step;
+            }
+        }
+        // if this was an improvement, move the endpoint and continue search from there
+        if (improved)
+        {
+            if (do_b == 0)
+                new_endpts.A[ch] += beststep;
+            else
+                new_endpts.B[ch] += beststep;
+        }
+    }
+    return min_err;
+}
+
+static void optimize_one(const Vector3 colors[], const float importance[], int np, float orig_err, const IntEndpts &orig_endpts, int prec, IntEndpts &opt_endpts)
+{
+    float opt_err = orig_err;
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        opt_endpts.A[ch] = orig_endpts.A[ch];
+        opt_endpts.B[ch] = orig_endpts.B[ch];
+    }
+    /*
+        err0 = perturb(rgb0, delta0)
+        err1 = perturb(rgb1, delta1)
+        if (err0 < err1)
+            if (err0 >= initial_error) break
+            rgb0 += delta0
+            next = 1
+        else
+            if (err1 >= initial_error) break
+            rgb1 += delta1
+            next = 0
+        initial_err = map()
+        for (;;)
+            err = perturb(next ? rgb1:rgb0, delta)
+            if (err >= initial_err) break
+            next? rgb1 : rgb0 += delta
+            initial_err = err
+    */
+    IntEndpts new_a, new_b;
+    IntEndpts new_endpt;
+    int do_b;
+
+    // now optimize each channel separately
+    for (int ch = 0; ch < NCHANNELS; ++ch)
+    {
+        // figure out which endpoint when perturbed gives the most improvement and start there
+        // if we just alternate, we can easily end up in a local minima
+        float err0 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_a, opt_err, 0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_b, opt_err, 1);	// perturb endpt B
+
+        if (err0 < err1)
+        {
+            if (err0 >= opt_err)
+                continue;
+
+            opt_endpts.A[ch] = new_a.A[ch];
+            opt_err = err0;
+            do_b = 1;		// do B next
+        }
+        else
+        {
+            if (err1 >= opt_err)
+                continue;
+            opt_endpts.B[ch] = new_b.B[ch];
+            opt_err = err1;
+            do_b = 0;		// do A next
+        }
+
+        // now alternate endpoints and keep trying until there is no improvement
+        for (;;)
+        {
+            float err = perturb_one(colors, importance, np, ch, prec, opt_endpts, new_endpt, opt_err, do_b);
+            if (err >= opt_err)
+                break;
+            if (do_b == 0)
+                opt_endpts.A[ch] = new_endpt.A[ch];
+            else
+                opt_endpts.B[ch] = new_endpt.B[ch];
+            opt_err = err;
+            do_b = 1 - do_b;	// now move the other endpoint
+        }
+    }
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_TWO], 
+                            const IntEndpts orig_endpts[NREGIONS_TWO], int prec, IntEndpts opt_endpts[NREGIONS_TWO])
+{
+    Vector3 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+    float err = 0;
+
+    for (int region=0; region<NREGIONS_TWO; ++region)
+    {
+        // collect the pixels in the region
+        int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++)
+            for (int x = 0; x < tile.size_x; x++)
+                if (REGION(x,y,shapeindex) == region)
+                {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    ++np;
+                }
+
+        optimize_one(pixels, importance, np, orig_err[region], orig_endpts[region], prec, opt_endpts[region]);
+    }
+}
+
+/* optimization algorithm
+    for each pattern
+        convert endpoints using pattern precision
+        assign indices and get initial error
+        compress indices (and possibly reorder endpoints)
+        transform endpoints
+        if transformed endpoints fit pattern
+            get original endpoints back
+            optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+            compress new indices
+            transform new endpoints
+            if new endpoints fit pattern AND if error is improved
+                emit compressed block with new data
+            else
+                emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+float ZOH::refinetwo(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_TWO], char *block)
+{
+    float orig_err[NREGIONS_TWO], opt_err[NREGIONS_TWO], orig_toterr, opt_toterr;
+    IntEndpts orig_endpts[NREGIONS_TWO], opt_endpts[NREGIONS_TWO];
+    ComprEndpts compr_orig[NREGIONS_TWO], compr_opt[NREGIONS_TWO];
+    int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+    for (int sp = 0; sp < NPATTERNS; ++sp)
+    {
+        // precisions for all channels need to be the same
+        for (int i=1; i<NCHANNELS; ++i) nvDebugCheck (patterns[sp].chan[0].prec[0] == patterns[sp].chan[i].prec[0]);
+
+        quantize_endpts(endpts, patterns[sp].chan[0].prec[0], orig_endpts);
+        assign_indices(tile, shapeindex_best, orig_endpts, patterns[sp].chan[0].prec[0], orig_indices, orig_err);
+        swap_indices(orig_endpts, orig_indices, shapeindex_best);
+        compress_endpts(orig_endpts, compr_orig, patterns[sp]);
+        if (endpts_fit(orig_endpts, compr_orig, patterns[sp]))
+        {
+            optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, patterns[sp].chan[0].prec[0], opt_endpts);
+            assign_indices(tile, shapeindex_best, opt_endpts, patterns[sp].chan[0].prec[0], opt_indices, opt_err);
+            swap_indices(opt_endpts, opt_indices, shapeindex_best);
+            compress_endpts(opt_endpts, compr_opt, patterns[sp]);
+            orig_toterr = opt_toterr = 0;
+            for (int i=0; i < NREGIONS_TWO; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+            if (endpts_fit(opt_endpts, compr_opt, patterns[sp]) && opt_toterr < orig_toterr)
+            {
+                emit_block(compr_opt, shapeindex_best, patterns[sp], opt_indices, block);
+                return opt_toterr;
+            }
+            else
+            {
+                // either it stopped fitting when we optimized it, or there was no improvement
+                // so go back to the unoptimized endpoints which we know will fit
+                emit_block(compr_orig, shapeindex_best, patterns[sp], orig_indices, block);
+                return orig_toterr;
+            }
+        }
+    }
+    nvAssert(false); // throw "No candidate found, should never happen (refinetwo).";
+    return FLT_MAX;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_TWO], Vector3 palette[NREGIONS_TWO][NINDICES])
+{
+    for (int region = 0; region < NREGIONS_TWO; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+            palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_TWO])
+{
+    // build list of possibles
+    Vector3 palette[NREGIONS_TWO][NINDICES];
+
+    generate_palette_unquantized(endpts, palette);
+
+    float toterr = 0;
+    Vector3 err;
+
+    for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+        int region = REGION(x,y,shapeindex);
+        float err, besterr;
+
+        besterr = Utils::norm(tile.data[y][x], palette[region][0]) * tile.importance_map[y][x];
+
+        for (int i = 1; i < NINDICES && besterr > 0; ++i)
+        {
+            err = Utils::norm(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+            if (err > besterr)	// error increased, so we're done searching
+                break;
+            if (err < besterr)
+                besterr = err;
+        }
+        toterr += besterr;
+    }
+    return toterr;
+}
+
+float ZOH::roughtwo(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_TWO])
+{
+    for (int region=0; region<NREGIONS_TWO; ++region)
+    {
+        int np = 0;
+        Vector3 colors[Tile::TILE_TOTAL];
+        Vector3 mean(0,0,0);
+
+        for (int y = 0; y < tile.size_y; y++)
+            for (int x = 0; x < tile.size_x; x++)
+                if (REGION(x,y,shapeindex) == region)
+                {
+                    colors[np] = tile.data[y][x];
+                    mean += tile.data[y][x];
+                    ++np;
+                }
+
+        // handle simple cases
+        if (np == 0)
+        {
+            Vector3 zero(0,0,0);
+            endpts[region].A = zero;
+            endpts[region].B = zero;
+            continue;
+        }
+        else if (np == 1)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[0];
+            continue;
+        }
+        else if (np == 2)
+        {
+            endpts[region].A = colors[0];
+            endpts[region].B = colors[1];
+            continue;
+        }
+
+        mean /= float(np);
+
+        Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+        // project each pixel value along the principal direction
+        float minp = FLT_MAX, maxp = -FLT_MAX;
+        for (int i = 0; i < np; i++)
+        {
+            float dp = dot(colors[i]-mean, direction);
+            if (dp < minp) minp = dp;
+            if (dp > maxp) maxp = dp;
+        }
+
+        // choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+        endpts[region].A = mean + minp*direction;
+        endpts[region].B = mean + maxp*direction;
+
+        // clamp endpoints
+        // the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+        // shape based on endpoints being clamped
+        Utils::clamp(endpts[region].A);
+        Utils::clamp(endpts[region].B);
+    }
+
+    return map_colors(tile, shapeindex, endpts);
+}
+
+float ZOH::compresstwo(const Tile &t, char *block)
+{
+    int shapeindex_best = 0;
+    FltEndpts endptsbest[NREGIONS_TWO], tempendpts[NREGIONS_TWO];
+    float msebest = FLT_MAX;
+
+    /*
+    collect the mse values that are within 5% of the best values
+    optimize each one and choose the best
+    */
+    // hack for now -- just use the best value WORK
+    for (int i=0; i<NSHAPES && msebest>0.0; ++i)
+    {
+        float mse = roughtwo(t, i, tempendpts);
+        if (mse < msebest)
+        {
+            msebest = mse;
+            shapeindex_best = i;
+            memcpy(endptsbest, tempendpts, sizeof(endptsbest));
+        }
+
+    }
+    return refinetwo(t, shapeindex_best, endptsbest, block);
+}
+

+ 264 - 0
3rdparty/nvtt/bc7/avpcl.cpp

@@ -0,0 +1,264 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// the avpcl compressor and decompressor
+
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include <cstring>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+// global flags
+bool AVPCL::flag_premult = false;
+bool AVPCL::flag_nonuniform = false;
+bool AVPCL::flag_nonuniform_ati = false;
+
+// global mode
+bool AVPCL::mode_rgb = false;		// true if image had constant alpha = 255
+
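+// try all eight BC7 modes and keep the candidate block with the lowest error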
+void AVPCL::compress(const Tile &t, char *block)
+{
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	float mse_mode0 = AVPCL::compress_mode0(t, tempblock);		if(mse_mode0 < msebest) { msebest = mse_mode0; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode1 = AVPCL::compress_mode1(t, tempblock);		if(mse_mode1 < msebest) { msebest = mse_mode1; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode2 = AVPCL::compress_mode2(t, tempblock);		if(mse_mode2 < msebest) { msebest = mse_mode2; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode3 = AVPCL::compress_mode3(t, tempblock);		if(mse_mode3 < msebest) { msebest = mse_mode3; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode4 = AVPCL::compress_mode4(t, tempblock);		if(mse_mode4 < msebest) { msebest = mse_mode4; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode5 = AVPCL::compress_mode5(t, tempblock);		if(mse_mode5 < msebest) { msebest = mse_mode5; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode6 = AVPCL::compress_mode6(t, tempblock);		if(mse_mode6 < msebest) { msebest = mse_mode6; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+	float mse_mode7 = AVPCL::compress_mode7(t, tempblock);		if(mse_mode7 < msebest) { msebest = mse_mode7; memcpy(block, tempblock, AVPCL::BLOCKSIZE); }
+		
+	/*if (errfile)
+	{
+		float errs[21];
+		int nerrs = 8;
+		errs[0] = mse_mode0; 
+		errs[1] = mse_mode1; 
+		errs[2] = mse_mode2; 
+		errs[3] = mse_mode3; 
+		errs[4] = mse_mode4; 
+		errs[5] = mse_mode5; 
+		errs[6] = mse_mode6; 
+		errs[7] = mse_mode7;
+		if (fwrite(errs, sizeof(float), nerrs, errfile) != nerrs)
+			throw "Write error on error file";
+	}*/
+}
+
+/*
+static int getbit(char *b, int start)
+{
+	if (start < 0 || start >= 128) return 0; // out of range
+
+	int ix = start >> 3;
+	return (b[ix] & (1 << (start & 7))) != 0;
+}
+
+static int getbits(char *b, int start, int len)
+{
+	int out = 0;
+	for (int i=0; i<len; ++i)
+		out |= getbit(b, start+i) << i;
+	return out;
+}
+
+static void setbit(char *b, int start, int bit)
+{
+	if (start < 0 || start >= 128) return; // out of range
+
+	int ix = start >> 3;
+
+	if (bit & 1)
+		b[ix] |= (1 << (start & 7));
+	else
+		b[ix] &= ~(1 << (start & 7));
+}
+
+static void setbits(char *b, int start, int len, int bits)
+{
+	for (int i=0; i<len; ++i)
+		setbit(b, start+i, bits >> i);
+}
+*/
+
+void AVPCL::decompress(const char *cblock, Tile &t)
+{
+	char block[AVPCL::BLOCKSIZE];
+	memcpy(block, cblock, AVPCL::BLOCKSIZE);
+
+	switch(getmode(block))
+	{
+	case 0:	AVPCL::decompress_mode0(block, t);	break;
+	case 1:	AVPCL::decompress_mode1(block, t);	break;
+	case 2:	AVPCL::decompress_mode2(block, t);	break;
+	case 3:	AVPCL::decompress_mode3(block, t);	break;
+	case 4:	AVPCL::decompress_mode4(block, t);	break;
+	case 5:	AVPCL::decompress_mode5(block, t);	break;
+	case 6:	AVPCL::decompress_mode6(block, t);	break;
+	case 7:	AVPCL::decompress_mode7(block, t);	break;
+	case 8: // return a black tile if you get a reserved mode
+		for (int y=0; y<Tile::TILE_H; ++y)
+			for (int x=0; x<Tile::TILE_W; ++x)
+				t.data[y][x].set(0, 0, 0, 0);
+		break;
+	default: nvUnreachable();
+	}
+}
+
+/*
+void AVPCL::compress(string inf, string avpclf, string errf)
+{
+	Array2D<RGBA> pixels;
+	int w, h;
+	char block[AVPCL::BLOCKSIZE];
+
+	Targa::read(inf, pixels, w, h);
+	FILE *avpclfile = fopen(avpclf.c_str(), "wb");
+	if (avpclfile == NULL) throw "Unable to open .avpcl file for write";
+	FILE *errfile = NULL;
+	if (errf != "")
+	{
+		errfile = fopen(errf.c_str(), "wb");
+		if (errfile == NULL) throw "Unable to open error file for write";
+	}
+
+	// Look at alpha channel and override the premult flag if alpha is constant (but only if premult is set)
+	if (AVPCL::flag_premult)
+	{
+		if (AVPCL::mode_rgb)
+		{
+			AVPCL::flag_premult = false;
+			cout << endl << "NOTE: Source image alpha is constant 255, turning off premultiplied-alpha error metric." << endl << endl;
+		}
+	}
+
+	// stuff for progress bar O.o
+	int ntiles = ((h+Tile::TILE_H-1)/Tile::TILE_H)*((w+Tile::TILE_W-1)/Tile::TILE_W);
+	int tilecnt = 0;
+	clock_t start, prev, cur;
+
+	start = prev = clock();
+
+	// convert to tiles and compress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			if ((tilecnt%100) == 0) { cur = clock(); printf("Progress %d of %d, %5.2f seconds per 100 tiles\r", tilecnt, ntiles, float(cur-prev)/CLOCKS_PER_SEC); fflush(stdout); prev = cur; }
+
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			t.insert(pixels, x, y);
+
+			AVPCL::compress(t, block, errfile);
+			if (fwrite(block, sizeof(char), AVPCL::BLOCKSIZE, avpclfile) != AVPCL::BLOCKSIZE)
+				throw "File error on write";
+
+			// progress bar
+			++tilecnt;
+		}
+	}
+
+	cur = clock();
+	printf("\nTotal time to compress: %.2f seconds\n\n", float(cur-start)/CLOCKS_PER_SEC);		// advance to next line finally
+
+	if (fclose(avpclfile)) throw "Close failed on .avpcl file";
+	if (errfile && fclose(errfile)) throw "Close failed on error file";
+}
+
+static int str2int(std::string s) 
+{
+	int thing;
+	std::stringstream str (stringstream::in | stringstream::out);
+	str << s;
+	str >> thing;
+	return thing;
+}
+
+// avpcl file name is ...-w-h-RGB[A].avpcl, extract width and height
+static void extract(string avpclf, int &w, int &h, bool &mode_rgb)
+{
+	size_t n = avpclf.rfind('.', avpclf.length()-1);
+	size_t n1 = avpclf.rfind('-', n-1);
+	size_t n2 = avpclf.rfind('-', n1-1);
+	size_t n3 = avpclf.rfind('-', n2-1);
+	//	...-wwww-hhhh-RGB[A].avpcl
+	//     ^    ^    ^      ^
+	//     n3   n2   n1     n n3<n2<n1<n
+	string width = avpclf.substr(n3+1, n2-n3-1);
+	w = str2int(width);
+	string height = avpclf.substr(n2+1, n1-n2-1);
+	h = str2int(height);
+	string mode = avpclf.substr(n1+1, n-n1-1);
+	mode_rgb = mode == "RGB";
+}
+
+static int modehist[8];
+
+static void stats(char block[AVPCL::BLOCKSIZE])
+{
+	int m = AVPCL::getmode(block);
+	modehist[m]++;
+}
+
+static void printstats()
+{
+	printf("\nMode histogram: "); for (int i=0; i<8; ++i) { printf("%d,", modehist[i]); }
+	printf("\n");
+}
+
+void AVPCL::decompress(string avpclf, string outf)
+{
+	Array2D<RGBA> pixels;
+	int w, h;
+	char block[AVPCL::BLOCKSIZE];
+
+	extract(avpclf, w, h, AVPCL::mode_rgb);
+	FILE *avpclfile = fopen(avpclf.c_str(), "rb");
+	if (avpclfile == NULL) throw "Unable to open .avpcl file for read";
+	pixels.resizeErase(h, w);
+
+	// convert to tiles and decompress each tile
+	for (int y=0; y<h; y+=Tile::TILE_H)
+	{
+		int ysize = min(Tile::TILE_H, h-y);
+		for (int x=0; x<w; x+=Tile::TILE_W)
+		{
+			int xsize = min(Tile::TILE_W, w-x);
+			Tile t(xsize, ysize);
+
+			if (fread(block, sizeof(char), AVPCL::BLOCKSIZE, avpclfile) != AVPCL::BLOCKSIZE)
+				throw "File error on read";
+
+			stats(block);	// collect statistics
+		
+			AVPCL::decompress(block, t);
+
+			t.extract(pixels, x, y);
+		}
+	}
+	if (fclose(avpclfile)) throw "Close failed on .avpcl file";
+
+	Targa::write(outf, pixels, w, h);
+
+	printstats();	// print statistics
+}
+*/

+ 99 - 0
3rdparty/nvtt/bc7/avpcl.h

@@ -0,0 +1,99 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_H
+#define _AVPCL_H
+
+#include "tile.h"
+#include "bits.h"
+
+#define	DISABLE_EXHAUSTIVE	1	// define this if you don't want to spend a lot of time on exhaustive compression
+#define	USE_ZOH_INTERP		1	// use zoh interpolator, otherwise use exact avpcl interpolators
+#define	USE_ZOH_INTERP_ROUNDED 1	// use the rounded versions!
+
+namespace AVPCL {
+
+static const int NREGIONS_TWO	= 2;
+static const int NREGIONS_THREE	= 3;
+
+static const int BLOCKSIZE=16;
+static const int BITSIZE=128;
+
+// global flags
+extern bool flag_premult;
+extern bool flag_nonuniform;
+extern bool flag_nonuniform_ati;
+
+// global mode
+extern bool mode_rgb;		// true if image had constant alpha = 255
+
+void compress(const Tile &t, char *block);
+void decompress(const char *block, Tile &t);
+
+float compress_mode0(const Tile &t, char *block);
+void decompress_mode0(const char *block, Tile &t);
+
+float compress_mode1(const Tile &t, char *block);
+void decompress_mode1(const char *block, Tile &t);
+
+float compress_mode2(const Tile &t, char *block);
+void decompress_mode2(const char *block, Tile &t);
+
+float compress_mode3(const Tile &t, char *block);
+void decompress_mode3(const char *block, Tile &t);
+
+float compress_mode4(const Tile &t, char *block);
+void decompress_mode4(const char *block, Tile &t);
+
+float compress_mode5(const Tile &t, char *block);
+void decompress_mode5(const char *block, Tile &t);
+
+float compress_mode6(const Tile &t, char *block);
+void decompress_mode6(const char *block, Tile &t);
+
+float compress_mode7(const Tile &t, char *block);
+void decompress_mode7(const char *block, Tile &t);
+
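+// the BC7 mode number m is stored as a unary prefix: m zero bits followed by a one bit;
+// a first byte of all zeros marks a reserved block, reported here as mode 8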
+inline int getmode(Bits &in)
+{
+	int mode = 0;
+
+	if (in.read(1))			mode = 0;
+	else if (in.read(1))	mode = 1;
+	else if (in.read(1))	mode = 2;
+	else if (in.read(1))	mode = 3;
+	else if (in.read(1))	mode = 4;
+	else if (in.read(1))	mode = 5;
+	else if (in.read(1))	mode = 6;
+	else if (in.read(1))	mode = 7;
+	else mode = 8;	// reserved
+	return mode;
+}
+inline int getmode(const char *block)
+{
+	int bits = block[0], mode = 0;
+
+	if (bits & 1) mode = 0;
+	else if ((bits&3) == 2) mode = 1;
+	else if ((bits&7) == 4) mode = 2;
+	else if ((bits & 0xF) == 8) mode = 3;
+	else if ((bits & 0x1F) == 16) mode = 4;
+	else if ((bits & 0x3F) == 32) mode = 5;
+	else if ((bits & 0x7F) == 64) mode = 6;
+	else if ((bits & 0xFF) == 128) mode = 7;
+	else mode = 8;	// reserved
+	return mode;
+}
+
+}
+
+#endif

+ 1066 - 0
3rdparty/nvtt/bc7/avpcl_mode0.cpp

@@ -0,0 +1,1066 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+//  x1		444.1x6 16p 45b (3bi)
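+//  i.e. 3 subsets, RGB 4.4.4 endpoints each with a shared lsb (6 endpoints total),
+//  16 partition choices (4 shape bits), 45 index bits at 3 bits per index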
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+#include "shapes_three.h"
+
+// use only the first 16 available shapes
+#undef NSHAPES
+#undef SHAPEBITS
+#define NSHAPES 16
+#define SHAPEBITS 4
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	8
+#define	INDEXBITS	3
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+    const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red			green			blue			xfm	mode  mb
+	4,4,4,4,4,4,	4,4,4,4,4,4,	4,4,4,4,4,4,	0,	0x1, 1, "",	// really 444.1 x 6
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGB];
+	int endpt_b_prec[NCHANNELS_RGB];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	4,4,4, 4,4,4, 4,4,4, 4,4,4, 4,4,4, 4,4,4, 
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
+
+static void transform_forward(IntEndptsRGB_2 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+static void transform_inverse(IntEndptsRGB_2 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+// endpoints are 555,555; reduce to 444,444 and put the lsb bit majority in compr_bits
+static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_2& compr_endpts)
+{
+	int onescnt;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		onescnt += endpts.A[j] & 1;
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		nvAssert (compr_endpts.A[j] < 16);
+	}
+	compr_endpts.a_lsb = onescnt >= 2;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		onescnt += endpts.B[j] & 1;
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		nvAssert (compr_endpts.B[j] < 16);
+	}
+	compr_endpts.b_lsb = onescnt >= 2;
+}
+
+static void uncompress_one(const IntEndptsRGB_2& compr_endpts, IntEndptsRGB& endpts)
+{
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		endpts.A[j] = (compr_endpts.A[j] << 1) | compr_endpts.a_lsb;
+		endpts.B[j] = (compr_endpts.B[j] << 1) | compr_endpts.b_lsb;
+	}
+}
+
+static void uncompress_endpoints(const IntEndptsRGB_2 compr_endpts[NREGIONS], IntEndptsRGB endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		uncompress_one(compr_endpts[i], endpts[i]);
+}
+
+static void compress_endpoints(const IntEndptsRGB endpts[NREGIONS], IntEndptsRGB_2 compr_endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		compress_one(endpts[i], compr_endpts[i]);
+}
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGB_2 q_endpts[NREGIONS])
+{
+	IntEndptsRGB full_endpts[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		full_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+		full_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]+1);
+		full_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]+1);
+		full_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]+1);
+		full_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]+1);
+		full_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]+1);
+		compress_one(full_endpts[region], q_endpts[region]);
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
+static void swap_indices(IntEndptsRGB_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+		int x = POS_TO_X(position);
+		int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGB; ++i) 
+			{
+				t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
+			}
+			t = endpts[region].a_lsb; endpts[region].a_lsb = endpts[region].b_lsb; endpts[region].b_lsb = t;
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB_2 endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		out.write(endpts[i].a_lsb, 1);
+		out.write(endpts[i].b_lsb, 1);
+	}
+
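+	// header is 83 bits: 1 mode bit + 4 shape bits + 72 endpoint bits (3 regions * 2 endpoints * 3 channels * 4 bits) + 6 lsb bits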
+	nvAssert (out.getptr() == 83);
+}
+
+static void read_header(Bits &in, IntEndptsRGB_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+	
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		endpts[i].a_lsb  = in.read(1);
+		endpts[i].b_lsb  = in.read(1);
+	}
+
+	nvAssert (in.getptr() == 83);
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void emit_block(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGB_2 &endpts_2, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	IntEndptsRGB endpts;
+
+	uncompress_one(endpts_2, endpts);
+
+	// scale endpoints
+	int a, b;			// really need an IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	// constant alpha
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = 255.0f;
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGB_2 endpts[NREGIONS])
+{
+	nvUnreachable();
+}
+
+void AVPCL::decompress_mode0(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGB_2 endpts[NREGIONS];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB_2 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			float err = Utils::metric4(colors[i], palette[j]) * importance[i];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, 
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB_2 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (A or B, selected by do_b)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error, the more time it is worth spending on an exhaustive search.
+// perturb the endpoints over a range of at least -3 to +3.
+// if err > 5000, perturb endpoints over 50% of their precision range
+// if err > 1000, over 25%
+// if err > 200,  over 12.5%
+// if err > 40,   over 6.25%
+// these thresholds assume np = 16 -- below they are scaled as a function of np
+// always ensure endpoint ordering is preserved (so there is no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
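+// (illustrative arithmetic, not part of the original source: with a hypothetical 5-bit endpoint
+// precision, (1 << aprec) is 32, so the four thresholds below give scan half-widths of 16, 8, 4 and 2,
+// and the max() with 3 then widens the smallest case back up to the -3..+3 minimum.)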
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB_2 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB_2 &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
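+	// (clarifying note, not part of the original source: in the code below, the pseudocode's rgb0/rgb1
+	// correspond to opt_endpts.A/B for the current channel, and "perturb" is perturb_one() above.)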
+	IntEndptsRGB_2 new_a, new_b;
+	IntEndptsRGB_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minimum
+        float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	// note this is independent of the search above, so we don't care about the indices it produced;
+	// if a later improvement produces different indices, we simply restart this pass at ch = 0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+// this will return a valid set of endpoints in opt_endpts regardless of whether it improves on orig_endpts or not
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							const IntEndptsRGB_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB_2 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGB_2 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.a_lsb = lsbmode & 1;
+			temp_in.b_lsb = (lsbmode >> 1) & 1;
+
+			// make sure we have a valid error for temp_in
+			// we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts
+			// (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position)
+			float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+			float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+    nvAssert(false); // throw "No candidate found, should never happen (mode avpcl 0).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+// for this mode, we assume alpha = 255 constant and compress only the RGB portion.
+// however, we do the error check against the actual alpha values supplied for the tile.
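+// (clarifying note, not part of the original source: generate_palette_quantized() above forces the
+// palette alpha to 255, while the error metric still operates on the full Vector4, so tiles whose
+// actual alpha differs from 255 are penalized accordingly.)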
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[2];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				if (np < 2) alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*Vector4(direction, 0);
+		endpts[region].B = mean + maxp*Vector4(direction, 0);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	float t = list1[i]; list1[i] = list1[j]; list1[j] = t;
+	int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1;
+}
+
+float AVPCL::compress_mode0(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+

+ 1047 - 0
3rdparty/nvtt/bc7/avpcl_mode1.cpp

@@ -0,0 +1,1047 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x10	(666x2).1 (666x2).1 64p 3bi
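+// (interpretation of the shorthand above, not part of the original source: "x10" denotes the 2-bit
+// mode field (value 0x2 in the pattern table below); "(666x2).1" means a subset's two RGB endpoints
+// get 6 bits per channel plus one shared lsb (p-bit) per subset; "64p" means 64 partitions, i.e. a
+// 6-bit shape index; "3bi" means 3-bit palette indices.)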
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+#include "shapes_two.h"
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	2		// number of different lsb modes per region. since we have one .1 per region, that can have 2 values
+
+#define NINDICES	8
+#define	INDEXBITS	3
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		xfm	mode  mb
+	6,6,6,6,	6,6,6,6,	6,6,6,6,	0,	0x2, 2, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGB];
+	int endpt_b_prec[NCHANNELS_RGB];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	6,6,6, 6,6,6, 6,6,6, 6,6,6,	
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
+
+
+static void transform_forward(IntEndptsRGB_1 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+static void transform_inverse(IntEndptsRGB_1 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+// endpoints are 777,777; reduce to 666,666 and store the majority vote of the dropped lsbs in the shared lsb bit
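+// (illustrative example, not part of the original source: if the six dropped low bits of A.rgb and
+// B.rgb are 1,0,1 and 1,0,0, onescnt is 3, so the shared lsb is set; uncompress_one() then restores
+// that bit to every channel of both endpoints.)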
+static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_1& compr_endpts)
+{
+	int onescnt;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		onescnt += endpts.A[j] & 1;
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		onescnt += endpts.B[j] & 1;
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		nvAssert (compr_endpts.A[j] < 64);
+		nvAssert (compr_endpts.B[j] < 64);
+	}
+	compr_endpts.lsb = onescnt >= 3;
+}
+
+static void uncompress_one(const IntEndptsRGB_1& compr_endpts, IntEndptsRGB& endpts)
+{
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		endpts.A[j] = (compr_endpts.A[j] << 1) | compr_endpts.lsb;
+		endpts.B[j] = (compr_endpts.B[j] << 1) | compr_endpts.lsb;
+	}
+}
+
+static void uncompress_endpoints(const IntEndptsRGB_1 compr_endpts[NREGIONS], IntEndptsRGB endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		uncompress_one(compr_endpts[i], endpts[i]);
+}
+
+static void compress_endpoints(const IntEndptsRGB endpts[NREGIONS], IntEndptsRGB_1 compr_endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		compress_one(endpts[i], compr_endpts[i]);
+}
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGB_1 q_endpts[NREGIONS])
+{
+	IntEndptsRGB full_endpts[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		full_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+		full_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]+1);
+		full_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]+1);
+		full_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]+1);
+		full_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]+1);
+		full_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]+1);
+		compress_one(full_endpts[region], q_endpts[region]);
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
+static void swap_indices(IntEndptsRGB_1 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+		int x = POS_TO_X(position);
+		int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGB; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB_1 endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+
+static void write_header(const IntEndptsRGB_1 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)
+		out.write(endpts[i].lsb, 1);
+
+	nvAssert (out.getptr() == 82);
+}
+
+static void read_header(Bits &in, IntEndptsRGB_1 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)
+		endpts[i].lsb  = in.read(1);
+	
+	nvAssert (in.getptr() == 82);
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void emit_block(const IntEndptsRGB_1 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGB_1 &endpts_1, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	IntEndptsRGB endpts;
+
+	uncompress_one(endpts_1, endpts);
+
+	// scale endpoints
+	int a, b;			// really need an IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// note: don't simplify to a + ((b-a)*i + BIAS)/DENOM as that doesn't work due to the way C handles integer division of negatives
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	// constant alpha
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = 255.0f;
+}
+
+// sign extend but only if it was transformed
+static void sign_extend(Pattern &p, IntEndptsRGB_1 endpts[NREGIONS])
+{
+	nvUnreachable();
+}
+
+void AVPCL::decompress_mode1(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGB_1 endpts[NREGIONS];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB_1 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			float err = Utils::metric4(colors[i], palette[j]) * importance[i];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_1 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB_1 &old_endpts, IntEndptsRGB_1 &new_endpts, 
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB_1 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (A or B, selected by do_b)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error, the more time it is worth spending on an exhaustive search.
+// perturb the endpoints over a range of at least -3 to +3.
+// if err > 5000, perturb endpoints over 50% of their precision range
+// if err > 1000, over 25%
+// if err > 200,  over 12.5%
+// if err > 40,   over 6.25%
+// these thresholds assume np = 16 -- below they are scaled as a function of np
+// always ensure endpoint ordering is preserved (so there is no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGB_1 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB_1 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB_1 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB_1 &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB_1 new_a, new_b;
+	IntEndptsRGB_1 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minimum
+		float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	// note this is independent of the search above, so we don't care about the indices it produced;
+	// if a later improvement produces different indices, we simply restart this pass at ch = 0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							IntEndptsRGB_1 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB_1 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGB_1 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.lsb = lsbmode;
+
+			// make sure we have a valid error for temp_in
+			// we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts
+			// (mapcolors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position)
+            float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+			float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
+
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB_1 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			//nvAssert(opt_toterr <= orig_toterr);
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 1).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			float err = Utils::metric4(tile.data[y][x], palette[region][i]) * tile.importance_map[y][x];
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[2];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				if (np < 2) alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*Vector4(direction, 0);
+		endpts[region].B = mean + maxp*Vector4(direction, 0);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	float t = list1[i]; list1[i] = list1[j]; list1[j] = t;
+	int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1;
+}
+
+float AVPCL::compress_mode1(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
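+	// (illustrative note, not part of the original source: this mode writes a 6-bit shape index,
+	// presumably 64 two-subset shapes, so NSHAPES/4 would refine the 16 most promising shapes found
+	// by the rough pass.)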
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+

+ 1004 - 0
3rdparty/nvtt/bc7/avpcl_mode2.cpp

@@ -0,0 +1,1004 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x100 555x6 64p 2bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+#include "shapes_three.h"
+
+using namespace nv;
+using namespace AVPCL;
+
+#define NINDICES	4
+#define	INDEXBITS	2
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	6
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red			green			blue			xfm	mode  mb
+	5,5,5,5,5,5,	5,5,5,5,5,5,	5,5,5,5,5,5,	0,	0x4, 3, "",
+};
+
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGB];
+	int endpt_b_prec[NCHANNELS_RGB];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS_THREE];
+};
+
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	5,5,5, 5,5,5, 5,5,5, 5,5,5, 5,5,5, 5,5,5, 
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
+
+#define	R_0	ep[0].A[i]
+#define	R_1 ep[0].B[i]
+#define	R_2 ep[1].A[i]
+#define	R_3	ep[1].B[i]
+
+static void transform_forward(IntEndptsRGB ep[NREGIONS])
+{
+	for (int i=0; i<NCHANNELS_RGB; ++i)
+	{
+		R_1 -= R_3; R_2 -= R_3; R_0 -= R_3;
+	}
+}
+
+static void transform_inverse(IntEndptsRGB ep[NREGIONS])
+{
+	for (int i=0; i<NCHANNELS_RGB; ++i)
+	{
+		R_0 += R_3; R_2 += R_3; R_1 += R_3;
+	}
+}
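Worth noting: the R_0..R_3 macros only reach ep[0] and ep[1], and this mode's single pattern is declared with transformed == 0, so neither transform_forward nor transform_inverse is ever invoked on the mode-2 path; they are presumably retained for symmetry with the delta-transformed modes. Where the transform is active, the forward step stores R_0, R_1 and R_2 as signed deltas against R_3 -- e.g. endpoints (10, 12, 9, 11) become (-1, 1, -2, 11) -- which is why decoding first sign-extends and then applies transform_inverse.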
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, IntEndptsRGB q_endpts[NREGIONS_THREE])
+{
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	{
+		q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]);
+		q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]);
+		q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]);
+		q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]);
+		q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]);
+		q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]);
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
+static void swap_indices(IntEndptsRGB endpts[NREGIONS_THREE], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	{
+		int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+		int x = POS_TO_X(position);
+		int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGB; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
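A concrete example of the swap above: with 2-bit indices the high bit is 2, so if a region's anchor pixel lands on index 2 or 3, that region's endpoints are exchanged and every index i in the region is rewritten as 3 - i. The anchor then holds 0 or 1, its high bit is guaranteed to be 0, and write_indices below can drop that bit -- one bit saved per region without changing the decoded colors.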
+
+static bool endpts_fit(IntEndptsRGB endpts[NREGIONS_THREE], const Pattern &p)
+{
+	return true;
+}
+
+
+static void write_header(const IntEndptsRGB endpts[NREGIONS_THREE], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS_THREE; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[i*2+0]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[i*2+1]);
+		}
+	nvAssert (out.getptr() == 99);
+}
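The 99 in the assertion is this mode's fixed header budget; assuming SHAPEBITS == 6 (64 three-region partitions), it works out as

    mode (3) + shapeindex (6) + 3 regions x 2 endpoints x 3 channels x 5 bits (90) = 99 bits,

leaving 16 x 2 - 3 anchor bits = 29 index bits, for a 128-bit (AVPCL::BITSIZE) block in total.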
+
+static void read_header(Bits &in, IntEndptsRGB endpts[NREGIONS_THREE], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+
+	p = patterns[pat_index];
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS_THREE; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[i*2+0]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[i*2+1]);
+		}
+	nvAssert (in.getptr() == 99);
+}
+
+
+// WORK PLACEHOLDER -- keep it simple for now
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	int positions[NREGIONS_THREE];
+
+	for (int r = 0; r < NREGIONS_THREE; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS_THREE; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	int positions[NREGIONS_THREE];
+
+	for (int r = 0; r < NREGIONS_THREE; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS_THREE; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void emit_block(const IntEndptsRGB endpts[NREGIONS_THREE], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGB &endpts, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	// scale endpoints
+	int a, b;			// really need an IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]); 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	// constant alpha
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = 255.0f;
+}
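As a worked example of the palette above, assuming Utils::lerp follows the standard BC7 2-bit interpolation weights {0, 21, 43, 64}, i.e. roughly (a*(64-w) + b*w + 32) >> 6: unquantized endpoints a = 0 and b = 255 yield the four per-channel palette entries {0, 84, 171, 255}, with a constant alpha of 255.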
+
+// sign extend but only if it was transformed
+static void sign_extend(Pattern &p, IntEndptsRGB endpts[NREGIONS_THREE])
+{
+	nvAssert (p.transformed != 0);
+
+	for (int i=0; i<NCHANNELS_RGB; ++i)
+	{
+		// endpts[0].A[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]);	// always positive here
+		endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[1]);
+		endpts[1].A[i] = SIGN_EXTEND(endpts[1].A[i], p.chan[i].nbitsizes[2]);
+		endpts[1].B[i] = SIGN_EXTEND(endpts[1].B[i], p.chan[i].nbitsizes[3]);
+		endpts[2].A[i] = SIGN_EXTEND(endpts[2].A[i], p.chan[i].nbitsizes[4]);
+		endpts[2].B[i] = SIGN_EXTEND(endpts[2].B[i], p.chan[i].nbitsizes[5]);
+	}
+}
+
+void AVPCL::decompress_mode2(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGB endpts[NREGIONS_THREE];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS_THREE][NINDICES];
+	for (int r = 0; r < NREGIONS_THREE; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			float err = Utils::metric4(colors[i], palette[j]) * importance[i];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS_THREE])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS_THREE][NINDICES];
+
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB &old_endpts, IntEndptsRGB &new_endpts, 
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+			float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGB &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
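To put numbers on the thresholds above: with 5-bit endpoints (aprec = bprec = 5) and orig_err above 5000 * np/16, adelta = bdelta = (1 << 5)/2 = 16, so A is scanned over [max(0, A-16), min(31, A+16)] -- at most 33 candidate values -- and likewise for B; at the lowest tier the window shrinks to the +/-3 floor. Keeping the original A <= B (or B <= A) ordering means the two nested loops never evaluate a pair that merely swaps the endpoints.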
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB new_a, new_b;
+	IntEndptsRGB new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS_THREE], 
+							const IntEndptsRGB orig_endpts[NREGIONS_THREE], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB opt_endpts[NREGIONS_THREE])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGB temp_in, temp_out;
+
+	for (int region=0; region<NREGIONS_THREE; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		// make sure we have a valid error for temp_in
+		// we didn't change temp_in, so orig_err[region] is still valid
+		float temp_in_err = orig_err[region];
+
+		// now try to optimize these endpoints
+		float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+		// if we find an improvement, update the best so far and correct the output endpoints and errors
+		if (temp_out_err < best_err)
+		{
+			best_err = temp_out_err;
+			opt_err[region] = temp_out_err;
+			opt_endpts[region] = temp_out;
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS_THREE], char *block)
+{
+	float orig_err[NREGIONS_THREE], opt_err[NREGIONS_THREE], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB orig_endpts[NREGIONS_THREE], opt_endpts[NREGIONS_THREE];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS_THREE; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 2).";
+	return FLT_MAX;
+
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS_THREE], Vector4 palette[NREGIONS_THREE][NINDICES])
+{
+	for (int region = 0; region < NREGIONS_THREE; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS_THREE])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS_THREE][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS_THREE])
+{
+	for (int region=0; region<NREGIONS_THREE; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[2];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				if (np < 2) alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*Vector4(direction, 0);
+		endpts[region].B = mean + maxp*Vector4(direction, 0);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
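The principal direction above comes from Fit::computePrincipalComponent_EigenSolver, which solves a closed-form 3x3 eigen problem. As a minimal sketch of the same idea (illustrative only, not the library's implementation; assumes <math.h>), the dominant eigenvector of the region's color covariance can be approximated with a few power-iteration steps:

    // Estimate the dominant eigenvector of the 3x3 color covariance by power iteration.
    static void principal_direction_sketch(int np, const float colors[][3], float dir[3])
    {
        float mean[3] = {0, 0, 0};
        for (int i = 0; i < np; ++i)
            for (int c = 0; c < 3; ++c)
                mean[c] += colors[i][c] / float(np);

        float cov[3][3] = {};
        for (int i = 0; i < np; ++i)
            for (int r = 0; r < 3; ++r)
                for (int c = 0; c < 3; ++c)
                    cov[r][c] += (colors[i][r] - mean[r]) * (colors[i][c] - mean[c]);

        dir[0] = dir[1] = dir[2] = 1.0f;            // arbitrary nonzero start vector
        for (int iter = 0; iter < 8; ++iter)        // a handful of iterations is plenty for a 4x4 tile
        {
            float next[3] = {0, 0, 0};
            for (int r = 0; r < 3; ++r)
                for (int c = 0; c < 3; ++c)
                    next[r] += cov[r][c] * dir[c];
            float len = sqrtf(next[0]*next[0] + next[1]*next[1] + next[2]*next[2]);
            if (len == 0.0f) break;                 // degenerate region (all pixels identical)
            for (int c = 0; c < 3; ++c)
                dir[c] = next[c] / len;
        }
    }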
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	float t = list1[i]; list1[i] = list1[j]; list1[j] = t;
+	int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1;
+}
+
+float AVPCL::compress_mode2(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS_THREE];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+

+ 1059 - 0
3rdparty/nvtt/bc7/avpcl_mode3.cpp

@@ -0,0 +1,1059 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x1000 777.1x4 64p 2bi (30b)
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+#include "shapes_two.h"
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	4
+#define	INDEXBITS	2
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGB];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+#define	NREGIONS  2
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		xfm	mode  mb
+	7,7,7,7,	7,7,7,7,	7,7,7,7,	0,	0x8, 4, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGB];
+	int endpt_b_prec[NCHANNELS_RGB];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	7,7,7, 7,7,7, 7,7,7, 7,7,7,
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
+
+static void transform_forward(IntEndptsRGB_2 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+static void transform_inverse(IntEndptsRGB_2 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+// endpoints are 888,888; reduce to 777,777 and put the lsb-bit majority in the shared a_lsb/b_lsb fields of compr_endpts
+static void compress_one(const IntEndptsRGB& endpts, IntEndptsRGB_2& compr_endpts)
+{
+	int onescnt;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		onescnt += endpts.A[j] & 1;
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		nvAssert (compr_endpts.A[j] < 128);
+	}
+	compr_endpts.a_lsb = onescnt >= 2;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		onescnt += endpts.B[j] & 1;
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		nvAssert (compr_endpts.B[j] < 128);
+	}
+	compr_endpts.b_lsb = onescnt >= 2;
+}
+
+static void uncompress_one(const IntEndptsRGB_2& compr_endpts, IntEndptsRGB& endpts)
+{
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+	{
+		endpts.A[j] = (compr_endpts.A[j] << 1) | compr_endpts.a_lsb;
+		endpts.B[j] = (compr_endpts.B[j] << 1) | compr_endpts.b_lsb;
+	}
+}
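A quick round trip through compress_one/uncompress_one: an 8-bit endpoint A = (201, 100, 57) has channel LSBs {1, 0, 1}, so the majority vote sets a_lsb = 1 and the stored 7-bit values are (100, 50, 28); uncompress_one then reconstructs (201, 101, 57). Channels whose original LSB matched the majority come back exactly, the others end up off by one -- the price of sharing a single LSB (the BC7 p-bit) across the three channels of an endpoint.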
+
+static void uncompress_endpoints(const IntEndptsRGB_2 compr_endpts[NREGIONS], IntEndptsRGB endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		uncompress_one(compr_endpts[i], endpts[i]);
+}
+
+static void compress_endpoints(const IntEndptsRGB endpts[NREGIONS], IntEndptsRGB_2 compr_endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		compress_one(endpts[i], compr_endpts[i]);
+}
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGB_2 q_endpts[NREGIONS])
+{
+	IntEndptsRGB full_endpts[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		full_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+		full_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]+1);
+		full_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]+1);
+		full_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]+1);
+		full_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]+1);
+		full_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]+1);
+		compress_one(full_endpts[region], q_endpts[region]);
+	}
+}
+
+// swap endpoints as needed to ensure that the indices at index_positions have a 0 high-order bit
+static void swap_indices(IntEndptsRGB_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+		int x = POS_TO_X(position);
+		int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGB; ++i) 
+			{
+				t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
+			}
+			t = endpts[region].a_lsb; endpts[region].a_lsb = endpts[region].b_lsb; endpts[region].b_lsb = t;
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGB_2 endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		out.write(endpts[i].a_lsb, 1);
+		out.write(endpts[i].b_lsb, 1);
+	}
+
+	nvAssert (out.getptr() == 98);
+}
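As with mode 2, the 98 asserted here is the fixed header budget; assuming SHAPEBITS == 6, it is

    mode (4) + shapeindex (6) + 2 regions x 2 endpoints x 3 channels x 7 bits (84) + 4 shared lsbs (p-bits) = 98 bits,

leaving 16 x 2 - 2 anchor bits = 30 index bits for a 128-bit block.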
+
+static void read_header(Bits &in, IntEndptsRGB_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+
+	for (int j=0; j<NCHANNELS_RGB; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+	
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		endpts[i].a_lsb  = in.read(1);
+		endpts[i].b_lsb  = in.read(1);
+	}
+
+	nvAssert (in.getptr() == 98);
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void emit_block(const IntEndptsRGB_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGB_2 &endpts_2, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	IntEndptsRGB endpts;
+
+	uncompress_one(endpts_2, endpts);
+
+	// scale endpoints
+	int a, b;			// really need an IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	// constant alpha
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = 255.0f;
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGB_2 endpts[NREGIONS])
+{
+	nvUnreachable();
+}
+
+void AVPCL::decompress_mode3(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGB_2 endpts[NREGIONS];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGB_2 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+            float err = Utils::metric4(colors[i], palette[j]) * importance[i];
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGB_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGB_2 &old_endpts, IntEndptsRGB_2 &new_endpts, 
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGB_2 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float &orig_err, IntEndptsRGB_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGB_2 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGB_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGB_2 &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGB_2 new_a, new_b;
+	IntEndptsRGB_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	// note this is independent of the above search, so we don't care about the indices from the above
+	// we don't care about the above because if they differ, so what? we've already started at ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGB; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+// this will return a valid set of endpoints in opt_endpts regardless of whether it improves orig_endpts or not
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							const IntEndptsRGB_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGB_2 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGB_2 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.a_lsb = lsbmode & 1;
+			temp_in.b_lsb = (lsbmode >> 1) & 1;
+
+			// make sure we have a valid error for temp_in
+			// we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts
+			// (map_colors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position)
+            float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+            float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
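Note that perturb_one and exhaustive only walk the 7-bit A/B fields; the shared LSBs sit outside their search space. That is why the loop above re-seeds the optimization with each of the four (a_lsb, b_lsb) combinations -- 00, 01, 10, 11 -- recomputes a fresh baseline error with map_colors, and keeps whichever combination converges to the lowest error.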
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGB_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 3).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	v.w = 255.0f;	// this mode carries no alpha channel, so force it fully opaque
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
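+// note on the early exit in the index searches below: the palette entries lie in order along the segment from
+// endpoint A to endpoint B, so for a squared-error style metric the per-pixel error is unimodal in the index --
+// once the error starts increasing we have passed the minimum and can stop searching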
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[2];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				if (np < 2) alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*Vector4(direction, 0);
+		endpts[region].B = mean + maxp*Vector4(direction, 0);
+
+		// clamp endpoints
+		// the rationale for clamping here is that the actual endpoints will be clamped anyway, so the best
+		// shape should be chosen based on the clamped values
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	float t = list1[i]; list1[i] = list1[j]; list1[j] = t;
+	int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1;
+}
+
+float AVPCL::compress_mode3(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// partial selection sort -- we only need the smallest NITEMS errors placed in the first NITEMS slots
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+

+ 1214 - 0
3rdparty/nvtt/bc7/avpcl_mode4.cpp

@@ -0,0 +1,1214 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x10000 2r 1i 555x2 6x2 2bi 3bi
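+// (decoding the shorthand above: 5 mode bits, 2 rotation bits, 1 index-selection bit, two 5.5.5-bit RGB
+//  endpoints, two 6-bit alpha endpoints, then a 2-bit and a 3-bit index array; 5+2+1+30+12 = 50 header bits
+//  plus 31+47 index bits = 128 total, matching the getptr() asserts below)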
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+// there are 2 index arrays. INDEXMODE selects between the arrays being 2 & 3 bits or 3 & 2 bits
+// array 0 is always the RGB array and array 1 is always the A array
+#define	NINDEXARRAYS	2
+#define	INDEXARRAY_RGB	0
+#define INDEXARRAY_A	1
+#define INDEXARRAY_2BITS(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+#define INDEXARRAY_3BITS(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_3BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+
+#define NINDICES3	8
+#define	INDEXBITS3	3
+#define	HIGH_INDEXBIT3	(1<<(INDEXBITS3-1))
+#define	DENOM3		(NINDICES3-1)
+#define	BIAS3		(DENOM3/2)
+
+#define NINDICES2	4
+#define	INDEXBITS2	2
+#define	HIGH_INDEXBIT2	(1<<(INDEXBITS2-1))
+#define	DENOM2		(NINDICES2-1)
+#define	BIAS2		(DENOM2/2)
+
+#define	NINDICES_RGB(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES3 : NINDICES2)
+#define	INDEXBITS_RGB(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS3 : INDEXBITS2)
+#define	HIGH_INDEXBIT_RGB(indexmode)((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT3 : HIGH_INDEXBIT2)
+#define	DENOM_RGB(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM3 : DENOM2)
+#define	BIAS_RGB(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS3 : BIAS2)
+
+#define	NINDICES_A(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES2 : NINDICES3)
+#define	INDEXBITS_A(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS2 : INDEXBITS3)
+#define	HIGH_INDEXBIT_A(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT2 : HIGH_INDEXBIT3)
+#define	DENOM_A(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM2 : DENOM3)
+#define	BIAS_A(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS2 : BIAS3)
+
+#define	NSHAPES	1
+
+static int shapes[NSHAPES] =
+{
+	0x0000,
+};
+
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
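+// (REGION tests bit 15-x-4*y of the shape mask, e.g. pixel (x=2,y=1) tests bit 9; with the single all-zero
+//  shape above every pixel lands in region 0)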
+
+#define NREGIONS	1			// keep the region stuff in just in case...
+
+// encoded index compression location: region 0 is always at 0,0.
+
+#define	NBITSIZES	2			// one endpoint pair
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int transform_mode;		// bit 0 (TRANSFORM_MODE_ALPHA) set: alpha endpoints are delta-transformed; bit 1 (TRANSFORM_MODE_RGB) set: rgb endpoints are delta-transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	TRANSFORM_MODE_ALPHA	1
+#define	TRANSFORM_MODE_RGB	2
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		alpha	xfm	mode  mb encoding
+	5,5,		5,5,		5,5,		6,6,	0x0, 0x10, 5, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];
+	int endpt_b_prec[NCHANNELS_RGBA];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	5,5,5,6,	5,5,5,6,
+};
+
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
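+// (worked examples: nbits(5, false) == 3 since 5 is binary 101; nbits(5, true) == 4 to leave room for a sign
+//  bit; nbits(-5, true) == 4 since -5 fits in 4-bit two's complement 1011)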
+
+#define	R_0	ep[0].A[i]
+#define	R_1 ep[0].B[i]
+
+static void transform_forward(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	int i;
+
+	if (transform_mode & TRANSFORM_MODE_RGB)
+		for (i=CHANNEL_R; i<CHANNEL_A; ++i)
+			R_1 -= R_0;
+	if (transform_mode & TRANSFORM_MODE_ALPHA)
+	{
+		i = CHANNEL_A;
+		R_1 -= R_0;
+	}
+}
+
+static void transform_inverse(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	int i;
+
+	if (transform_mode & TRANSFORM_MODE_RGB)
+		for (i=CHANNEL_R; i<CHANNEL_A; ++i)
+			R_1 += R_0;
+	if (transform_mode & TRANSFORM_MODE_ALPHA)
+	{
+		i = CHANNEL_A;
+		R_1 += R_0;
+	}
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA q_endpts[NREGIONS])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]);
+		q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]);
+		q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]);
+		q_endpts[region].A[3] = Utils::quantize(endpts[region].A.w, pattern_prec.region_precs[region].endpt_a_prec[3]);
+
+		q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]);
+		q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]);
+		q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]);
+		q_endpts[region].B[3] = Utils::quantize(endpts[region].B.w, pattern_prec.region_precs[region].endpt_b_prec[3]);
+	}
+}
+
+// swap endpoints as needed to ensure that the index at each region's anchor position has a 0 high-order bit
+// anchor positions run 0..15 in raster order (0 at x=0 y=0, 15 at x=3 y=3), so y = (pos >> 2) & 3 and x = pos & 3 (e.g. pos 5 -> x=1, y=1)
+static void swap_indices(int shapeindex, int indexmode, IntEndptsRGBA endpts[NREGIONS], int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	int index_positions[NREGIONS];
+
+	index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int x = index_positions[region] & 3;
+		int y = (index_positions[region] >> 2) & 3;
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+
+		// swap RGB
+		if (indices[INDEXARRAY_RGB][y][x] & HIGH_INDEXBIT_RGB(indexmode))
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=CHANNEL_R; i<=CHANNEL_B; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[INDEXARRAY_RGB][y][x] = NINDICES_RGB(indexmode) - 1 - indices[INDEXARRAY_RGB][y][x];
+		}
+
+		// swap A
+		if (indices[INDEXARRAY_A][y][x] & HIGH_INDEXBIT_A(indexmode))
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=CHANNEL_A; i<=CHANNEL_A; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[INDEXARRAY_A][y][x] = NINDICES_A(indexmode) - 1 - indices[INDEXARRAY_A][y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, int rotatemode, int indexmode, Bits &out)
+{
+	// ignore shapeindex
+	out.write(p.mode, p.modebits);
+	out.write(rotatemode, ROTATEMODE_BITS);
+	out.write(indexmode, INDEXMODE_BITS);
+	for (int i=0; i<NREGIONS; ++i)
+		for (int j=0; j<NCHANNELS_RGBA; ++j)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[0]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[1]);
+		}
+	nvAssert (out.getptr() == 50);
+}
+
+static void read_header(Bits &in, IntEndptsRGBA endpts[NREGIONS], int &shapeindex, int &rotatemode, int &indexmode, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	p = patterns[pat_index];
+
+	shapeindex = 0;		// we don't have any
+
+	rotatemode = in.read(ROTATEMODE_BITS);
+	indexmode = in.read(INDEXMODE_BITS);
+	for (int i=0; i<NREGIONS; ++i)
+		for (int j=0; j<NCHANNELS_RGBA; ++j)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[0]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[1]);
+		}
+	nvAssert (in.getptr() == 50);
+}
+
+static void write_indices(const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int shapeindex, int indexmode, Bits &out)
+{
+	// the index we shorten is always the one at position 0
+
+	// do the 2 bit indices first
+	nvAssert ((indices[INDEXARRAY_2BITS(indexmode)][0][0] & HIGH_INDEXBIT2) == 0);
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		out.write(indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3], INDEXBITS2 - (i==0?1:0));	// write i..[1:0] or i..[0]
+
+	// then the 3 bit indices
+	nvAssert ((indices[INDEXARRAY_3BITS(indexmode)][0][0] & HIGH_INDEXBIT3) == 0);
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		out.write(indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3], INDEXBITS3 - (i==0?1:0));	// write i..[2:0] or i..[1:0]
+}
+
+static void read_indices(Bits &in, int shapeindex, int indexmode, int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	// the index we shorten is always the one at position 0
+
+	// do the 2 bit indices first
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS2 - (i==0?1:0));		// read i..[1:0] or i..[0]
+
+	// then the 3 bit indices
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS3 - (i==0?1:0));		// read i..[2:0] or i..[1:0]
+}
+
+static void emit_block(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int rotatemode, int indexmode, char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, rotatemode, indexmode, out);
+
+	write_indices(indices, shapeindex, indexmode, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized_rgb_a(const IntEndptsRGBA &endpts, const RegionPrec &region_prec, int indexmode, Vector3 palette_rgb[NINDICES3], float palette_a[NINDICES3])
+{
+	// scale endpoints for RGB
+	int a, b;
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]); 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]);
+
+	// interpolate R
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+		palette_rgb[i].x = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]);
+
+	// interpolate G
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+		palette_rgb[i].y = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]);
+
+	// interpolate B
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+		palette_rgb[i].z = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]); 
+	b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]);
+
+	// interpolate A
+	for (int i = 0; i < NINDICES_A(indexmode); ++i)
+		palette_a[i] = float(Utils::lerp(a, b, i, BIAS_A(indexmode), DENOM_A(indexmode)));
+
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS])
+{
+	for (int i=0; i<NCHANNELS_RGBA; ++i)
+	{
+		if (p.transform_mode)
+		{
+			// endpts[0].A[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]);	// always positive here
+			endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]);
+			endpts[1].A[i] = SIGN_EXTEND(endpts[1].A[i], p.chan[i].nbitsizes[1]);
+			endpts[1].B[i] = SIGN_EXTEND(endpts[1].B[i], p.chan[i].nbitsizes[1]);
+		}
+	}
+}
+
+static void rotate_tile(const Tile &in, int rotatemode, Tile &out)
+{
+	out.size_x = in.size_x;
+	out.size_y = in.size_y;
+
+	for (int y=0; y<in.size_y; ++y)
+	for (int x=0; x<in.size_x; ++x)
+	{
+		float t;
+		out.data[y][x] = in.data[y][x];
+
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: break;
+		case ROTATEMODE_RGBA_AGBR: t = (out.data[y][x]).x; (out.data[y][x]).x = (out.data[y][x]).w; (out.data[y][x]).w = t; break;
+		case ROTATEMODE_RGBA_RABG: t = (out.data[y][x]).y; (out.data[y][x]).y = (out.data[y][x]).w; (out.data[y][x]).w = t; break;
+		case ROTATEMODE_RGBA_RGAB: t = (out.data[y][x]).z; (out.data[y][x]).z = (out.data[y][x]).w; (out.data[y][x]).w = t; break;
+		default: nvUnreachable();
+		}
+	}
+}
+
+void AVPCL::decompress_mode4(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGBA endpts[NREGIONS];
+	int shapeindex, pat_index, rotatemode, indexmode;
+
+	read_header(in, endpts, shapeindex, rotatemode, indexmode, p, pat_index);
+	
+	sign_extend(p, endpts);
+
+	if (p.transform_mode)
+		transform_inverse(p.transform_mode, endpts);
+
+	Vector3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized_rgb_a(endpts[region], pattern_precs[pat_index].region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+
+	int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indexmode, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	Tile temp(t.size_x, t.size_y);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		temp.data[y][x] = Vector4(palette_rgb[REGION(x,y,shapeindex)][indices[INDEXARRAY_RGB][y][x]], palette_a[REGION(x,y,shapeindex)][indices[INDEXARRAY_A][y][x]]);
+
+	rotate_tile(temp, rotatemode, t);
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+// we already have a candidate mapping (and hence an error) when this is called, so take an early exit
+// if the accumulated error so far exceeds the error we already have (current_besterr)
+static float map_colors(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, const IntEndptsRGBA &endpts, const RegionPrec &region_prec, float current_besterr, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	Vector3 palette_rgb[NINDICES3];	// could be nindices2
+	float palette_a[NINDICES3];	// could be nindices2
+	float toterr = 0;
+
+	generate_palette_quantized_rgb_a(endpts, region_prec, indexmode, &palette_rgb[0], &palette_a[0]);
+
+	Vector3 rgb;
+	float a;
+
+	for (int i = 0; i < np; ++i)
+	{
+		float err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (colors[i]).x :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (colors[i]).y :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (colors[i]).z : (colors[i]).w;
+
+		rgb.x = (colors[i]).x;
+		rgb.y = (colors[i]).y;
+		rgb.z = (colors[i]).z;
+		a = (colors[i]).w;
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = Utils::metric1(a, palette_a[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					palette_alpha = palette_a[j];
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[j], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			toterr += besterr;
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return FLT_MAX;
+			}
+		}
+		else
+		{
+			// do RGB index
+			besterr = FLT_MAX;
+			int bestindex;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					bestindex = j;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).x :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).y :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : nvCheckMacro(0);
+			toterr += besterr;
+
+			// do A index
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[j], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[j], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return FLT_MAX;
+			}
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int indexmode, IntEndptsRGBA endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	Vector3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized_rgb_a(endpts[region], pattern_prec.region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector3 rgb;
+	float a;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		rgb.x = (tile.data[y][x]).x;
+		rgb.y = (tile.data[y][x]).y;
+		rgb.z = (tile.data[y][x]).z;
+		a = (tile.data[y][x]).w;
+
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (tile.data[y][x]).x :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (tile.data[y][x]).y :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (tile.data[y][x]).z : (tile.data[y][x]).w;
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = Utils::metric1(a, palette_a[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+					palette_alpha = palette_a[region][i];
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[region][i], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;
+		}
+		else
+		{
+			// do RGB index first as it has the alpha
+			besterr = FLT_MAX;
+			int bestindex;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+					bestindex = i;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).x :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).y :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : nvCheckMacro(0);
+			toterr[region] += besterr;
+
+			// do A index
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[region][i], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[region][i], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+		}
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts, 
+						  float old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (A or B, selected by do_b);
+	// e.g. for a 5-bit endpoint the step sizes tried are 16, 8, 4, 2, 1
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					indices[j][i] = temp_indices[j][i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
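+// (worked example: for a full 16-pixel tile thr_scale is 1, so an orig_err of 1500 falls in the ">1000" bucket
+//  and a 5-bit endpoint gets adelta = (1<<5)/4 = 8, i.e. we scan +/-8 quantized steps around the current value)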
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		for (int j=0; j<NINDEXARRAYS; ++j)
+		for (int i=0; i<np; ++i)
+			indices[j][i] = good_indices[j][i];
+	}
+
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, float orig_err, const IntEndptsRGBA &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA new_a, new_b;
+	IntEndptsRGBA new_endpt;
+	int do_b;
+	int orig_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int new_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices0[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices1[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+		float err0 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices0[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices1[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = temp_indices0[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[INDEXARRAY_RGB][i] != new_indices[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != new_indices[INDEXARRAY_A][i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minima to be sure
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[j][i] = temp_indices0[j][i];
+					nvAssert (orig_indices[j][i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[INDEXARRAY_RGB][i] != temp_indices0[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != temp_indices0[INDEXARRAY_A][i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, int rotatemode, int indexmode, const float orig_err[NREGIONS], 
+							const IntEndptsRGBA orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGBA temp_in, temp_out;
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		// make sure we have a valid error for temp_in
+		// we didn't change temp_in, so orig_err[region] is still valid
+		float temp_in_err = orig_err[region];
+
+		// now try to optimize these endpoints
+        float temp_out_err = optimize_one(pixels, importance, np, rotatemode, indexmode, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+		// if we find an improvement, update the best so far and correct the output endpoints and errors
+		if (temp_out_err < best_err)
+		{
+			best_err = temp_out_err;
+			opt_err[region] = temp_out_err;
+			opt_endpts[region] = temp_out;
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, int rotatemode, int indexmode, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], opt_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+
+		assign_indices(tile, shapeindex_best, rotatemode, indexmode, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(shapeindex_best, indexmode, orig_endpts, orig_indices);
+
+		if (patterns[sp].transform_mode)
+			transform_forward(patterns[sp].transform_mode, orig_endpts);
+
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transform_mode)
+				transform_inverse(patterns[sp].transform_mode, orig_endpts);
+
+			optimize_endpts(tile, shapeindex_best, rotatemode, indexmode, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+
+			assign_indices(tile, shapeindex_best, rotatemode, indexmode, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(shapeindex_best, indexmode, opt_endpts, opt_indices);
+
+			if (patterns[sp].transform_mode)
+				transform_forward(patterns[sp].transform_mode, opt_endpts);
+
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, rotatemode, indexmode, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transform_mode)
+					transform_forward(patterns[sp].transform_mode, orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, rotatemode, indexmode, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 4).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	if (v.w < 0.0f) v.w = 0.0f;
+	if (v.w > 255.0f) v.w = 255.0f;
+}
+
+// compute initial endpoints for the "RGB" portion and the "A" portion. 
+// Note these channels may have been rotated.
+static void rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[Tile::TILE_TOTAL];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		float mina = FLT_MAX, maxa = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+
+			dp = alphas[i] - mean.w;
+			if (dp < mina) mina = dp;
+			if (dp > maxa) maxa = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + Vector4(minp*direction, mina);
+		endpts[region].B = mean + Vector4(maxp*direction, maxa);
+
+		// clamp endpoints
+		// the rationale for clamping here is that the actual endpoints will be clamped anyway, so the best
+		// shape should be chosen based on the clamped values
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+}
+
+float AVPCL::compress_mode4(const Tile &t, char *block)
+{
+	FltEndpts endpts[NREGIONS];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+	int shape = 0;
+	Tile t1;
+
+	// try all rotations. refine tries the 2 different indexings.
+	for (int r = 0; r < NROTATEMODES && msebest > 0; ++r)
+	{
+		rotate_tile(t, r, t1);
+		rough(t1, shape, endpts);
+		for (int i = 0; i < NINDEXMODES && msebest > 0; ++i)
+		{
+			float mse = refine(t1, shape, r, i, endpts, tempblock);
+			if (mse < msebest)
+			{
+				memcpy(block, tempblock, sizeof(tempblock));
+				msebest = mse;
+			}
+		}
+	}
+	return msebest;
+}

+ 1216 - 0
3rdparty/nvtt/bc7/avpcl_mode5.cpp

@@ -0,0 +1,1216 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x100000 2r 777x2 8x2 2bi 2bi
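+// (decoding the shorthand above: 6 mode bits, 2 rotation bits, two 7.7.7-bit RGB endpoints, two 8-bit alpha
+//  endpoints, then two 2-bit index arrays; 6+2+42+16 = 66 header bits plus 31+31 index bits = 128 total,
+//  matching the getptr() asserts below)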
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+// there are 2 index arrays. in this mode both arrays are 2 bits wide, so INDEXMODE has no effect and is not written (the plumbing is kept for symmetry with mode 4)
+// array 0 is always the RGB array and array 1 is always the A array
+#define	NINDEXARRAYS	2
+#define	INDEXARRAY_RGB	0
+#define INDEXARRAY_A	1
+#define INDEXARRAY_2BITS(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+#define INDEXARRAY_3BITS(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_3BITS) ? INDEXARRAY_A : INDEXARRAY_RGB)
+
+#define NINDICES3	4
+#define	INDEXBITS3	2
+#define	HIGH_INDEXBIT3	(1<<(INDEXBITS3-1))
+#define	DENOM3		(NINDICES3-1)
+#define	BIAS3		(DENOM3/2)
+
+#define NINDICES2	4
+#define	INDEXBITS2	2
+#define	HIGH_INDEXBIT2	(1<<(INDEXBITS2-1))
+#define	DENOM2		(NINDICES2-1)
+#define	BIAS2		(DENOM2/2)
+
+#define	NINDICES_RGB(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES3 : NINDICES2)
+#define	INDEXBITS_RGB(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS3 : INDEXBITS2)
+#define	HIGH_INDEXBIT_RGB(indexmode)((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT3 : HIGH_INDEXBIT2)
+#define	DENOM_RGB(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM3 : DENOM2)
+#define	BIAS_RGB(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS3 : BIAS2)
+
+#define	NINDICES_A(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? NINDICES2 : NINDICES3)
+#define	INDEXBITS_A(indexmode)		((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? INDEXBITS2 : INDEXBITS3)
+#define	HIGH_INDEXBIT_A(indexmode)	((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? HIGH_INDEXBIT2 : HIGH_INDEXBIT3)
+#define	DENOM_A(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? DENOM2 : DENOM3)
+#define	BIAS_A(indexmode)			((indexmode == INDEXMODE_ALPHA_IS_2BITS) ? BIAS2 : BIAS3)
+
+#define	NSHAPES	1
+
+static int shapes[NSHAPES] =
+{
+	0x0000,
+};
+
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+#define NREGIONS	1			// keep the region stuff in just in case...
+
+// encoded index compression location: region 0 is always at 0,0.
+
+#define	NBITSIZES	2			// one endpoint pair
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int transform_mode;		// bit 0 (TRANSFORM_MODE_ALPHA) set: alpha endpoints are delta-transformed; bit 1 (TRANSFORM_MODE_RGB) set: rgb endpoints are delta-transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	TRANSFORM_MODE_ALPHA	1
+#define	TRANSFORM_MODE_RGB	2
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		alpha	xfm	mode  mb encoding
+	7,7,		7,7,		7,7,		8,8,	0x0, 0x20, 6, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];
+	int endpt_b_prec[NCHANNELS_RGBA];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	7,7,7,8,	7,7,7,8,
+};
+
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
+
+#define	R_0	ep[0].A[i]
+#define	R_1 ep[0].B[i]
+
+static void transform_forward(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	int i;
+
+	if (transform_mode & TRANSFORM_MODE_RGB)
+		for (i=CHANNEL_R; i<CHANNEL_A; ++i)
+			R_1 -= R_0;
+	if (transform_mode & TRANSFORM_MODE_ALPHA)
+	{
+		i = CHANNEL_A;
+		R_1 -= R_0;
+	}
+}
+
+static void transform_inverse(int transform_mode, IntEndptsRGBA ep[NREGIONS])
+{
+	int i;
+
+	if (transform_mode & TRANSFORM_MODE_RGB)
+		for (i=CHANNEL_R; i<CHANNEL_A; ++i)
+			R_1 += R_0;
+	if (transform_mode & TRANSFORM_MODE_ALPHA)
+	{
+		i = CHANNEL_A;
+		R_1 += R_0;
+	}
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA q_endpts[NREGIONS])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		q_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]);
+		q_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]);
+		q_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]);
+		q_endpts[region].A[3] = Utils::quantize(endpts[region].A.w, pattern_prec.region_precs[region].endpt_a_prec[3]);
+
+		q_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]);
+		q_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]);
+		q_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]);
+		q_endpts[region].B[3] = Utils::quantize(endpts[region].B.w, pattern_prec.region_precs[region].endpt_b_prec[3]);
+	}
+}
+
+// swap endpoints as needed to ensure that the index at each region's anchor position has a 0 high-order bit
+// anchor positions run 0..15 in raster order (0 at x=0 y=0, 15 at x=3 y=3), so y = (pos >> 2) & 3 and x = pos & 3 (e.g. pos 5 -> x=1, y=1)
+static void swap_indices(int shapeindex, int indexmode, IntEndptsRGBA endpts[NREGIONS], int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	int index_positions[NREGIONS];
+
+	index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int x = index_positions[region] & 3;
+		int y = (index_positions[region] >> 2) & 3;
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+
+		// swap RGB
+		if (indices[INDEXARRAY_RGB][y][x] & HIGH_INDEXBIT_RGB(indexmode))
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=CHANNEL_R; i<=CHANNEL_B; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[INDEXARRAY_RGB][y][x] = NINDICES_RGB(indexmode) - 1 - indices[INDEXARRAY_RGB][y][x];
+		}
+
+		// swap A
+		if (indices[INDEXARRAY_A][y][x] & HIGH_INDEXBIT_A(indexmode))
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=CHANNEL_A; i<=CHANNEL_A; ++i) { t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t; }
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[INDEXARRAY_A][y][x] = NINDICES_A(indexmode) - 1 - indices[INDEXARRAY_A][y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, int rotatemode, int indexmode, Bits &out)
+{
+	// ignore shapeindex
+	out.write(p.mode, p.modebits);
+	out.write(rotatemode, ROTATEMODE_BITS);
+//	out.write(indexmode, INDEXMODE_BITS);
+	for (int i=0; i<NREGIONS; ++i)
+		for (int j=0; j<NCHANNELS_RGBA; ++j)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[0]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[1]);
+		}
+	nvAssert (out.getptr() == 66);
+}
+
+static void read_header(Bits &in, IntEndptsRGBA endpts[NREGIONS], int &shapeindex, int &rotatemode, int &indexmode, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	p = patterns[pat_index];
+
+	shapeindex = 0;		// we don't have any
+
+	rotatemode = in.read(ROTATEMODE_BITS);
+
+	indexmode = 0;		// we don't have any
+
+	for (int i=0; i<NREGIONS; ++i)
+		for (int j=0; j<NCHANNELS_RGBA; ++j)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[0]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[1]);
+		}
+	nvAssert (in.getptr() == 66);
+}
+
+static void write_indices(const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int shapeindex, int indexmode, Bits &out)
+{
+	// the index we shorten is always the one at position 0
+
+	// do the 2 bit indices first
+	nvAssert ((indices[INDEXARRAY_2BITS(indexmode)][0][0] & HIGH_INDEXBIT2) == 0);
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		out.write(indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3], INDEXBITS2 - (i==0?1:0));	// write i..[1:0] or i..[0]
+
+	// then the 3 bit indices
+	nvAssert ((indices[INDEXARRAY_3BITS(indexmode)][0][0] & HIGH_INDEXBIT3) == 0);
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		out.write(indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3], INDEXBITS3 - (i==0?1:0));	// write i..[1:0] or i..[0]
+}
+
+static void read_indices(Bits &in, int shapeindex, int indexmode, int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W])
+{
+	// the indices we shorten are always at position 0
+
+	// do the 2 bit indices first
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		indices[INDEXARRAY_2BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS2 - (i==0?1:0));		// read i..[1:0] or i..[0]
+
+	// then the 3 bit indices
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+		indices[INDEXARRAY_3BITS(indexmode)][i>>2][i&3] = in.read(INDEXBITS3 - (i==0?1:0));		// read i..[1:0] or i..[0]
+}
+
+static void emit_block(const IntEndptsRGBA endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], int rotatemode, int indexmode, char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, rotatemode, indexmode, out);
+
+	write_indices(indices, shapeindex, indexmode, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized_rgb_a(const IntEndptsRGBA &endpts, const RegionPrec &region_prec, int indexmode, Vector3 palette_rgb[NINDICES3], float palette_a[NINDICES3])
+{
+	// scale endpoints for RGB
+	int a, b;
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]); 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]);
+
+	// interpolate R
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+		palette_rgb[i].x = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]);
+
+	// interpolate G
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+		palette_rgb[i].y = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]);
+
+	// interpolate B
+	for (int i = 0; i < NINDICES_RGB(indexmode); ++i)
+		palette_rgb[i].z = float(Utils::lerp(a, b, i, BIAS_RGB(indexmode), DENOM_RGB(indexmode)));
+
+	a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]); 
+	b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]);
+
+	// interpolate A
+	for (int i = 0; i < NINDICES_A(indexmode); ++i)
+		palette_a[i] = float(Utils::lerp(a, b, i, BIAS_A(indexmode), DENOM_A(indexmode)));
+}
+
+static void sign_extend(Pattern &p, IntEndptsRGBA endpts[NREGIONS])
+{
+	for (int i=0; i<NCHANNELS_RGBA; ++i)
+	{
+		if (p.transform_mode)
+		{
+			// endpts[0].A[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]);	// always positive here
+			endpts[0].B[i] = SIGN_EXTEND(endpts[0].B[i], p.chan[i].nbitsizes[0]);
+			endpts[1].A[i] = SIGN_EXTEND(endpts[1].A[i], p.chan[i].nbitsizes[1]);
+			endpts[1].B[i] = SIGN_EXTEND(endpts[1].B[i], p.chan[i].nbitsizes[1]);
+		}
+	}
+}
+
+static void rotate_tile(const Tile &in, int rotatemode, Tile &out)
+{
+	out.size_x = in.size_x;
+	out.size_y = in.size_y;
+
+	for (int y=0; y<in.size_y; ++y)
+	for (int x=0; x<in.size_x; ++x)
+	{
+		float t;
+		out.data[y][x] = in.data[y][x];
+
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: break;
+		case ROTATEMODE_RGBA_AGBR: t = (out.data[y][x]).x; (out.data[y][x]).x = (out.data[y][x]).w; (out.data[y][x]).w = t; break;
+		case ROTATEMODE_RGBA_RABG: t = (out.data[y][x]).y; (out.data[y][x]).y = (out.data[y][x]).w; (out.data[y][x]).w = t; break;
+		case ROTATEMODE_RGBA_RGAB: t = (out.data[y][x]).z; (out.data[y][x]).z = (out.data[y][x]).w; (out.data[y][x]).w = t; break;
+		default: nvUnreachable();
+		}
+	}
+}
+
+void AVPCL::decompress_mode5(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGBA endpts[NREGIONS];
+	int shapeindex, pat_index, rotatemode, indexmode;
+
+	read_header(in, endpts, shapeindex, rotatemode, indexmode, p, pat_index);
+	
+	sign_extend(p, endpts);
+
+	if (p.transform_mode)
+		transform_inverse(p.transform_mode, endpts);
+
+	Vector3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+		generate_palette_quantized_rgb_a(endpts[region], pattern_precs[pat_index].region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+
+	int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indexmode, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	Tile temp(t.size_x, t.size_y);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		temp.data[y][x] = Vector4(palette_rgb[REGION(x,y,shapeindex)][indices[INDEXARRAY_RGB][y][x]], palette_a[REGION(x,y,shapeindex)][indices[INDEXARRAY_A][y][x]]);
+
+	rotate_tile(temp, rotatemode, t);
+}
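+
+// A minimal usage sketch for the routine above: decode one 16-byte block into a 4x4 tile. It assumes
+// the caller has already checked that the block's mode bits select mode 5; decode_block_sketch is
+// illustrative only and not part of the library.
+#if 0
+static void decode_block_sketch(const char block[16])
+{
+	Tile t(4, 4);
+	AVPCL::decompress_mode5(block, t);
+	// t.data[y][x] now holds the decoded RGBA values as floats in [0,255]
+}
+#endif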
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+// the caller already has a candidate mapping (and therefore an error), so take an early exit
+// as soon as the accumulated error exceeds that value
+static float map_colors(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, const IntEndptsRGBA &endpts, const RegionPrec &region_prec, float current_besterr, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	Vector3 palette_rgb[NINDICES3];	// could be nindices2
+	float palette_a[NINDICES3];	// could be nindices2
+	float toterr = 0;
+
+	generate_palette_quantized_rgb_a(endpts, region_prec, indexmode, &palette_rgb[0], &palette_a[0]);
+
+	Vector3 rgb;
+	float a;
+
+	for (int i = 0; i < np; ++i)
+	{
+		float err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (colors[i]).x :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (colors[i]).y :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (colors[i]).z : (colors[i]).w;
+
+		rgb.x = (colors[i]).x;
+		rgb.y = (colors[i]).y;
+		rgb.z = (colors[i]).z;
+		a = (colors[i]).w;
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = Utils::metric1(a, palette_a[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					palette_alpha = palette_a[j];
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[j], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			toterr += besterr;
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return FLT_MAX;
+			}
+		}
+		else
+		{
+			// do RGB index
+			besterr = FLT_MAX;
+			int bestindex = 0;
+			for (int j = 0; j < NINDICES_RGB(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[j], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[j], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					bestindex = j;
+					indices[INDEXARRAY_RGB][i] = j;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[bestindex]).x :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[bestindex]).y :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[bestindex]).z : nvCheckMacro(0);
+			toterr += besterr;
+
+			// do A index
+			besterr = FLT_MAX;
+			for (int j = 0; j < NINDICES_A(indexmode) && besterr > 0; ++j)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[j], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[j], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][i] = j;
+				}
+			}
+			toterr += besterr;		// squared-error norms are additive since we don't do the square root
+			if (toterr > current_besterr)
+			{
+				// fill out bogus index values so it's initialized at least
+				for (int k = i; k < np; ++k)
+				{
+					indices[INDEXARRAY_RGB][k] = -1;
+					indices[INDEXARRAY_A][k] = -1;
+				}
+				return FLT_MAX;
+			}
+		}
+	}
+	return toterr;
+}
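+
+// Minimal sketch of the early-out pattern map_colors uses: accumulate the best per-pixel error and
+// abandon the candidate as soon as the running total can no longer beat current_besterr, returning
+// FLT_MAX so the caller knows the indices are not valid. per_pixel_err is a hypothetical stand-in
+// for the palette search above; the block is kept inside #if 0 so it has no effect on the library.
+#if 0
+static float early_out_sketch(int np, float current_besterr, float (*per_pixel_err)(int))
+{
+	float toterr = 0;
+	for (int i = 0; i < np; ++i)
+	{
+		toterr += per_pixel_err(i);
+		if (toterr > current_besterr)
+			return FLT_MAX;		// not better; the caller must ignore any partially written indices
+	}
+	return toterr;
+}
+#endif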
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, int rotatemode, int indexmode, IntEndptsRGBA endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	Vector3 palette_rgb[NREGIONS][NINDICES3];	// could be nindices2
+	float palette_a[NREGIONS][NINDICES3];	// could be nindices2
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized_rgb_a(endpts[region], pattern_prec.region_precs[region], indexmode, &palette_rgb[region][0], &palette_a[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector3 rgb;
+	float a;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr;
+		float palette_alpha = 0, tile_alpha = 0;
+
+		rgb.x = (tile.data[y][x]).x;
+		rgb.y = (tile.data[y][x]).y;
+		rgb.z = (tile.data[y][x]).z;
+		a = (tile.data[y][x]).w;
+
+		if(AVPCL::flag_premult)
+				tile_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (tile.data[y][x]).x :
+							 (rotatemode == ROTATEMODE_RGBA_RABG) ? (tile.data[y][x]).y :
+							 (rotatemode == ROTATEMODE_RGBA_RGAB) ? (tile.data[y][x]).z : (tile.data[y][x]).w;
+
+		// compute the two indices separately
+		// if we're doing premultiplied alpha, we need to choose first the index that
+		// determines the alpha value, and then do the other index
+
+		if (rotatemode == ROTATEMODE_RGBA_RGBA)
+		{
+			// do A index first as it has the alpha
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = Utils::metric1(a, palette_a[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+					palette_alpha = palette_a[region][i];
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+
+			// do RGB index
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphaout(rgb, tile_alpha, palette_rgb[region][i], palette_alpha);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;
+		}
+		else
+		{
+			// do RGB index first as it has the alpha
+			besterr = FLT_MAX;
+			int bestindex = 0;
+			for (int i = 0; i < NINDICES_RGB(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric3(rgb, palette_rgb[region][i], rotatemode) :
+											 Utils::metric3premult_alphain(rgb, palette_rgb[region][i], rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_RGB][y][x] = i;
+					bestindex = i;
+				}
+			}
+			palette_alpha = (rotatemode == ROTATEMODE_RGBA_AGBR) ? (palette_rgb[region][bestindex]).x :
+							(rotatemode == ROTATEMODE_RGBA_RABG) ? (palette_rgb[region][bestindex]).y :
+							(rotatemode == ROTATEMODE_RGBA_RGAB) ? (palette_rgb[region][bestindex]).z : nvCheckMacro(0);
+			toterr[region] += besterr;
+
+			// do A index
+			besterr = FLT_MAX;
+			for (int i = 0; i < NINDICES_A(indexmode) && besterr > 0; ++i)
+			{
+				err = !AVPCL::flag_premult ? Utils::metric1(a, palette_a[region][i], rotatemode) :
+											 Utils::metric1premult(a, tile_alpha, palette_a[region][i], palette_alpha, rotatemode);
+
+				if (err > besterr)	// error increased, so we're done searching
+					break;
+				if (err < besterr)
+				{
+					besterr = err;
+					indices[INDEXARRAY_A][y][x] = i;
+				}
+			}
+			toterr[region] += besterr;		// squared-error norms are additive since we don't do the square root
+		}
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, const IntEndptsRGBA &old_endpts, IntEndptsRGBA &new_endpts,
+						  float old_err, int do_b, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					indices[j][i] = temp_indices[j][i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
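+
+// One-dimensional sketch of the logarithmic search above: starting from the current quantized value,
+// try +/- steps of decreasing power-of-two size, committing a step only after both signs have been
+// tested at that size. f and prec stand in for the per-endpoint error evaluation and the channel
+// precision; this illustrates the search pattern and is not an additional code path.
+#if 0
+static int log_search_sketch(int v, int prec, float (*f)(int))
+{
+	float best = f(v);
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		int beststep = 0;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			int cand = v + sign * step;
+			if (cand < 0 || cand >= (1 << prec))
+				continue;		// stay inside the quantized range, as perturb_one does
+			float e = f(cand);
+			if (e < best) { best = e; beststep = sign * step; }
+		}
+		v += beststep;			// beststep stays 0 if neither direction helped
+	}
+	return v;
+}
+#endif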
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// the thresholds above assume np = 16; they are scaled by np / Tile::TILE_TOTAL below
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGBA &opt_endpts, int indices[NINDEXARRAYS][Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	for (int j=0; j<NINDEXARRAYS; ++j)
+	for (int i=0; i<np; ++i)
+		indices[j][i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, rotatemode, indexmode, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+					good_indices[j][i] = temp_indices[j][i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		for (int j=0; j<NINDEXARRAYS; ++j)
+		for (int i=0; i<np; ++i)
+			indices[j][i] = good_indices[j][i];
+	}
+
+	return best_err;
+}
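+
+// Worked example of the search-radius heuristic above for a 7-bit channel (1 << 7 = 128 levels)
+// on a full 16-pixel region (thr_scale = 1):
+//   err > 5000  ->  +/- 64 levels      err > 1000  ->  +/- 32
+//   err > 200   ->  +/- 16             err > 40    ->  +/- 8
+// anything smaller still gets the minimum +/- 3 window, so every candidate receives at least a
+// small exhaustive neighborhood around the endpoints found so far.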
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, int rotatemode, int indexmode, float orig_err, const IntEndptsRGBA &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA new_a, new_b;
+	IntEndptsRGBA new_endpt;
+	int do_b;
+	int orig_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int new_indices[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices0[NINDEXARRAYS][Tile::TILE_TOTAL];
+	int temp_indices1[NINDEXARRAYS][Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+        float err0 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices0[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = orig_indices[j][i] = temp_indices1[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int j=0; j<NINDEXARRAYS; ++j)
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[j][i] = temp_indices0[j][i];
+				nvAssert (orig_indices[j][i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[INDEXARRAY_RGB][i] != new_indices[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != new_indices[INDEXARRAY_A][i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, rotatemode, indexmode, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int j=0; j<NINDEXARRAYS; ++j)
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[j][i] = temp_indices0[j][i];
+					nvAssert (orig_indices[j][i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[INDEXARRAY_RGB][i] != temp_indices0[INDEXARRAY_RGB][i] || orig_indices[INDEXARRAY_A][i] != temp_indices0[INDEXARRAY_A][i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, int rotatemode, int indexmode, const float orig_err[NREGIONS], 
+							const IntEndptsRGBA orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGBA temp_in, temp_out;
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		// make sure we have a valid error for temp_in
+		// we didn't change temp_in, so orig_err[region] is still valid
+		float temp_in_err = orig_err[region];
+
+		// now try to optimize these endpoints
+        float temp_out_err = optimize_one(pixels, importance, np, rotatemode, indexmode, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+		// if we find an improvement, update the best so far and correct the output endpoints and errors
+		if (temp_out_err < best_err)
+		{
+			best_err = temp_out_err;
+			opt_err[region] = temp_out_err;
+			opt_endpts[region] = temp_out;
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, int rotatemode, int indexmode, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W], opt_indices[NINDEXARRAYS][Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+
+		assign_indices(tile, shapeindex_best, rotatemode, indexmode, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(shapeindex_best, indexmode, orig_endpts, orig_indices);
+
+		if (patterns[sp].transform_mode)
+			transform_forward(patterns[sp].transform_mode, orig_endpts);
+
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transform_mode)
+				transform_inverse(patterns[sp].transform_mode, orig_endpts);
+
+			optimize_endpts(tile, shapeindex_best, rotatemode, indexmode, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+
+			assign_indices(tile, shapeindex_best, rotatemode, indexmode, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(shapeindex_best, indexmode, opt_endpts, opt_indices);
+
+			if (patterns[sp].transform_mode)
+				transform_forward(patterns[sp].transform_mode, opt_endpts);
+
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, rotatemode, indexmode, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transform_mode)
+					transform_forward(patterns[sp].transform_mode, orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, rotatemode, indexmode, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 5).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	if (v.w < 0.0f) v.w = 0.0f;
+	if (v.w > 255.0f) v.w = 255.0f;
+}
+
+// compute initial endpoints for the "RGB" portion and the "A" portion. 
+// Note these channels may have been rotated.
+static void rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector3 colors[Tile::TILE_TOTAL];
+		float alphas[Tile::TILE_TOTAL];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x].xyz();
+				alphas[np] = tile.data[y][x].w;
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[0], alphas[0]);
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = Vector4(colors[0], alphas[0]);
+			endpts[region].B = Vector4(colors[1], alphas[1]);
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector3 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		float mina = FLT_MAX, maxa = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean.xyz(), direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+
+			dp = alphas[i] - mean.w;
+			if (dp < mina) mina = dp;
+			if (dp > maxa) maxa = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + Vector4(minp*direction, mina);
+		endpts[region].B = mean + Vector4(maxp*direction, maxa);
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+}
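+
+// Condensed sketch of the endpoint estimate above: project every color onto the principal direction
+// through the mean and take the two extreme projections as the endpoints (rough() then clamps them
+// to [0,255]). Vector3 and dot are the nvmath types already used in this file; rough_sketch itself
+// is illustrative only and kept inside #if 0.
+#if 0
+static void rough_sketch(const Vector3 colors[], int np, const Vector3 &mean, const Vector3 &dir,
+						 Vector3 &endA, Vector3 &endB)
+{
+	float minp = FLT_MAX, maxp = -FLT_MAX;
+	for (int i = 0; i < np; ++i)
+	{
+		float dp = dot(colors[i] - mean, dir);
+		if (dp < minp) minp = dp;
+		if (dp > maxp) maxp = dp;
+	}
+	endA = mean + minp * dir;
+	endB = mean + maxp * dir;
+}
+#endif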
+
+float AVPCL::compress_mode5(const Tile &t, char *block)
+{
+	FltEndpts endpts[NREGIONS];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+	int shape = 0;
+	Tile t1;
+
+	// try all rotations. refine tries the 2 different indexings.
+	for (int r = 0; r < NROTATEMODES && msebest > 0; ++r)
+	{
+		rotate_tile(t, r, t1);
+		rough(t1, shape, endpts);
+//		for (int i = 0; i < NINDEXMODES && msebest > 0; ++i)
+		for (int i = 0; i < 1 && msebest > 0; ++i)
+		{
+			float mse = refine(t1, shape, r, i, endpts, tempblock);
+			if (mse < msebest)
+			{
+				memcpy(block, tempblock, sizeof(tempblock));
+				msebest = mse;
+			}
+		}
+	}
+	return msebest;
+}

+ 1055 - 0
3rdparty/nvtt/bc7/avpcl_mode6.cpp

@@ -0,0 +1,1055 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x1000000 7777.1x2 4bi
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	16
+#define	INDEXBITS	4
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+#define	NSHAPES	1
+
+static int shapes[NSHAPES] =
+{
+	0x0000,
+};
+
+#define	REGION(x,y,shapeindex)	((shapes[shapeindex]&(1<<(15-(x)-4*(y))))!=0)
+
+#define	NREGIONS	1
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red	green	blue	alpha	mode  mb verilog
+	7,7,	7,7,	7,7,	7,7,	0x40, 7, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];
+	int endpt_b_prec[NCHANNELS_RGBA];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	7,7,7,7,	7,7,7,7,
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
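+
+// A few evaluated cases of nbits as defined above:
+//   nbits(0, *) = 0        nbits(7, false) = 3      nbits(8, false) = 4
+//   nbits(7, true) = 4     nbits(-1, true) = 1      nbits(-2, true) = 2
+// i.e. signed values get one extra bit for the sign, and -1 needs only the sign bit itself.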
+
+/*
+we're using this table to assign lsbs
+abgr	>=2	correct
+0000	0	0
+0001	0	0
+0010	0	0
+0011	1	x1
+0100	0	0
+0101	1	x1
+0110	1	x1
+0111	1	1
+1000	0	0
+1001	1	x0
+1010	1	x0
+1011	1	1
+1100	1	x0
+1101	1	1
+1110	1	1
+1111	1	1
+
+we need 8 0's and 8 1's. the x's can be either 0 or 1 as long as you get 8/8.
+I choose to assign the lsbs so that the rgb channels are as good as possible.
+*/
+
+// 8888 ->7777.1, use the "correct" column above to assign the lsb
+static void compress_one(const IntEndptsRGBA& endpts, IntEndptsRGBA_2& compr_endpts)
+{
+	int onescnt;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		// ignore the alpha channel in the count
+		onescnt += (j==CHANNEL_A) ? 0 : (endpts.A[j] & 1);
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		nvAssert (compr_endpts.A[j] < 128);
+	}
+	compr_endpts.a_lsb = onescnt >= 2;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		onescnt += (j==CHANNEL_A) ? 0 : (endpts.B[j] & 1);
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		nvAssert (compr_endpts.B[j] < 128);
+	}
+	compr_endpts.b_lsb = onescnt >= 2;
+}
+
+static void uncompress_one(const IntEndptsRGBA_2& compr_endpts, IntEndptsRGBA& endpts)
+{
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		endpts.A[j] = (compr_endpts.A[j] << 1) | compr_endpts.a_lsb;
+		endpts.B[j] = (compr_endpts.B[j] << 1) | compr_endpts.b_lsb;
+	}
+}
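+
+// Worked example of the 7777.1 round trip above, using hypothetical endpoint values:
+//   A = {0x53, 0x2A, 0x7F, 0x80}  ->  R,G,B lsbs are 1,0,1 (alpha is ignored)  ->  a_lsb = 1
+//   stored 7-bit values = {0x29, 0x15, 0x3F, 0x40}
+//   reconstructed       = {0x53, 0x2B, 0x7F, 0x81}
+// channels whose own lsb disagreed with the majority vote (G and A here) come back off by one,
+// which is the price of sharing a single lsb across the endpoint.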
+
+static void uncompress_endpoints(const IntEndptsRGBA_2 compr_endpts[NREGIONS], IntEndptsRGBA endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		uncompress_one(compr_endpts[i], endpts[i]);
+}
+
+static void compress_endpoints(const IntEndptsRGBA endpts[NREGIONS], IntEndptsRGBA_2 compr_endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		compress_one(endpts[i], compr_endpts[i]);
+}
+
+
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA_2 q_endpts[NREGIONS])
+{
+	IntEndptsRGBA full_endpts[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		full_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+		full_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]+1);
+		full_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]+1);
+		full_endpts[region].A[3] = Utils::quantize(endpts[region].A.w, pattern_prec.region_precs[region].endpt_a_prec[3]+1);
+
+		full_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]+1);
+		full_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]+1);
+		full_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]+1);
+		full_endpts[region].B[3] = Utils::quantize(endpts[region].B.w, pattern_prec.region_precs[region].endpt_b_prec[3]+1);
+
+		compress_one(full_endpts[region], q_endpts[region]);
+	}
+}
+
+// swap endpoints as needed to ensure that the anchor index has a 0 high-order bit
+// a linear index position p maps to the tile as y = (p >> 2) & 3 and x = p & 3, so position 0 is x=0 y=0 and position 15 is x=3 y=3
+static void swap_indices(IntEndptsRGBA_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	int index_positions[NREGIONS];
+
+	index_positions[0] = 0;			// since WLOG we have the high bit of the shapes at 0
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int x = index_positions[region] & 3;
+		int y = (index_positions[region] >> 2) & 3;
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGBA; ++i) 
+			{
+				t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
+			}
+			t = endpts[region].a_lsb; endpts[region].a_lsb = endpts[region].b_lsb; endpts[region].b_lsb = t;
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA_2 endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		out.write(endpts[i].a_lsb, 1);
+		out.write(endpts[i].b_lsb, 1);
+	}
+
+	nvAssert (out.getptr() == 65);
+}
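+
+// Worked bit budget matching the assert above: 7 mode bits + 4 channels * 2 endpoints * 7 bits
+// + 2 shared lsbs = 7 + 56 + 2 = 65 header bits; with the 4-bit indices written later
+// (16 of them, the anchor one bit short) the block totals 65 + 63 = 128 bits.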
+
+static void read_header(Bits &in, IntEndptsRGBA_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	p = patterns[pat_index];
+
+	shapeindex = 0;		// we don't have any
+
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+	
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		endpts[i].a_lsb  = in.read(1);
+		endpts[i].b_lsb  = in.read(1);
+	}
+
+	nvAssert (in.getptr() == 65);
+}
+
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	nvAssert ((indices[0][0] & HIGH_INDEXBIT) == 0);
+
+	// the index we shorten is always index 0
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+	{
+		if (i==0)
+			out.write(indices[i>>2][i&3], INDEXBITS-1);	// write i..[2:0]
+		else
+			out.write(indices[i>>2][i&3], INDEXBITS);	// write i..[3:0]
+	}
+
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	// the index we shorten is always index 0
+	for (int i = 0; i < Tile::TILE_TOTAL; ++i)
+	{
+		if (i==0)
+			indices[i>>2][i&3] = in.read(INDEXBITS-1);	// read i..[2:0]
+		else
+			indices[i>>2][i&3] = in.read(INDEXBITS);	// read i..[3:0]
+	}
+}
+
+static void emit_block(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGBA_2 &endpts_2, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	IntEndptsRGBA endpts;
+
+	uncompress_one(endpts_2, endpts);
+
+	// scale endpoints
+	int a, b;			// really need a IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]+1); 
+	b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = float(Utils::lerp(a, b, i, BIAS, DENOM));
+}
+
+void AVPCL::decompress_mode6(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGBA_2 endpts[NREGIONS];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float err, besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(colors[i], palette[j]) :
+									     Utils::metric4premult(colors[i], palette[j]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(tile.data[y][x], palette[region][i]) :
+										 Utils::metric4premult(tile.data[y][x], palette[region][i]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts,
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA_2 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (which)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// the thresholds above assume np = 16; they are scaled by np / Tile::TILE_TOTAL below
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA_2 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGBA_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA_2 &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA_2 new_a, new_b;
+	IntEndptsRGBA_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minima
+        float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	// this pass is independent of the search above, so the indices gathered there no longer matter;
+	// if the exhaustive pass changes them we simply restart from ch = 0 below
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							IntEndptsRGBA_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA_2 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGBA_2 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		// try all lsb modes as we search for better endpoints
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.a_lsb = lsbmode & 1;
+			temp_in.b_lsb = (lsbmode >> 1) & 1;
+
+			// make sure we have a valid error for temp_in
+			// we pass FLT_MAX here because we want an accurate temp_in_err with no early exit
+			// (map_colors computes a mapping but stops early once the error exceeds the threshold passed in that position)
+            float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+            float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+
+     simplify the above given that there is no transform now and that endpoints will always fit
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+
+		optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+
+		assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+		// (nreed) Commented out asserts because they go off all the time...not sure why
+		//for (int i=0; i<NREGIONS; ++i)
+		//	nvAssert(expected_opt_err[i] == opt_err[i]);
+		swap_indices(opt_endpts, opt_indices, shapeindex_best);
+
+		orig_toterr = opt_toterr = 0;
+		for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+		//nvAssert(opt_toterr <= orig_toterr);
+
+		if (opt_toterr < orig_toterr)
+		{
+			emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+			return opt_toterr;
+		}
+		else
+		{
+			emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+			return orig_toterr;
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 6).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	if (v.w < 0.0f) v.w = 0.0f;
+	if (v.w > 255.0f) v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr;
+
+		besterr = Utils::metric4(tile.data[y][x], palette[region][0]);
+
+		for (int i = 1; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector4 colors[Tile::TILE_TOTAL];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[0];
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[1];
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector4 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean, direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	float t = list1[i]; list1[i] = list1[j]; list1[j] = t;
+	int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1;
+}
+
+float AVPCL::compress_mode6(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=1;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+
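
For orientation between the two mode files, here is a minimal sketch of driving the mode-6 entry point above on one 4x4 tile. compress_mode6, BLOCKSIZE and the Tile members (size_x, size_y, data, importance_map) are taken from the code in this commit; the default-constructed Tile, the direct member fill, and the helper name are illustrative assumptions, not part of the library.

    // Minimal sketch (not part of the library): encode one 4x4 RGBA tile with mode 6.
    // Assumption: Tile is default-constructible and its members can be filled directly.
    #include "avpcl.h"
    #include "tile.h"
    #include "nvmath/Vector.inl"

    using namespace nv;
    using namespace AVPCL;

    void encode_tile_mode6_sketch(const Vector4 pixels[4][4], char block[AVPCL::BLOCKSIZE])
    {
        Tile t;                                    // assumption: default constructor exists
        t.size_x = 4;
        t.size_y = 4;

        for (int y = 0; y < 4; ++y)
            for (int x = 0; x < 4; ++x)
            {
                t.data[y][x] = pixels[y][x];       // RGBA values in [0, 255]
                t.importance_map[y][x] = 1.0f;     // uniform per-pixel importance
            }

        // returns the total error of the best candidate it found
        float err = AVPCL::compress_mode6(t, block);
        (void)err;
    }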

+ 1094 - 0
3rdparty/nvtt/bc7/avpcl_mode7.cpp

@@ -0,0 +1,1094 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Thanks to Jacob Munkberg ([email protected]) for the shortcut of using SVD to do the equivalent of principal components analysis
+
+// x10000000 5555.1x4 64p 2bi (30b)
+
+#include "bits.h"
+#include "tile.h"
+#include "avpcl.h"
+#include "nvcore/Debug.h"
+#include "nvmath/Vector.inl"
+#include "nvmath/Matrix.inl"
+#include "nvmath/Fitting.h"
+#include "avpcl_utils.h"
+#include "endpts.h"
+#include <cstring>
+#include <float.h>
+
+#include "shapes_two.h"
+
+using namespace nv;
+using namespace AVPCL;
+
+#define	NLSBMODES	4		// number of different lsb modes per region. since we have two .1 per region, that can have 4 values
+
+#define NINDICES	4
+#define	INDEXBITS	2
+#define	HIGH_INDEXBIT	(1<<(INDEXBITS-1))
+#define	DENOM		(NINDICES-1)
+#define	BIAS		(DENOM/2)
+
+// WORK: determine optimal traversal pattern to search for best shape -- what does the error curve look like?
+// i.e. can we search shapes in a particular order so we can see the global error minima easily and
+// stop without having to touch all shapes?
+
+#define	POS_TO_X(pos)	((pos)&3)
+#define	POS_TO_Y(pos)	(((pos)>>2)&3)
+
+#define	NBITSIZES	(NREGIONS*2)
+#define	ABITINDEX(region)	(2*(region)+0)
+#define	BBITINDEX(region)	(2*(region)+1)
+
+struct ChanBits
+{
+	int nbitsizes[NBITSIZES];	// bitsizes for one channel
+};
+
+struct Pattern
+{
+	ChanBits chan[NCHANNELS_RGBA];//  bit patterns used per channel
+	int transformed;		// if 0, deltas are unsigned and no transform; otherwise, signed and transformed
+	int mode;				// associated mode value
+	int modebits;			// number of mode bits
+	const char *encoding;			// verilog description of encoding for this mode
+};
+
+#define	NPATTERNS 1
+#define	NREGIONS  2
+
+static Pattern patterns[NPATTERNS] =
+{
+	// red		green		blue		alpha		xfm	mode  mb
+	5,5,5,5,	5,5,5,5,	5,5,5,5,	5,5,5,5,	0,	0x80, 8, "",
+};
+
+struct RegionPrec
+{
+	int	endpt_a_prec[NCHANNELS_RGBA];
+	int endpt_b_prec[NCHANNELS_RGBA];
+};
+
+struct PatternPrec
+{
+	RegionPrec region_precs[NREGIONS];
+};
+
+
+// this is the precision for each channel and region
+// NOTE: this MUST match the corresponding data in "patterns" above -- WARNING: there is NO nvAssert to check this!
+static PatternPrec pattern_precs[NPATTERNS] =
+{
+	5,5,5,5,  5,5,5,5,  5,5,5,5,  5,5,5,5,
+};
+
+// return # of bits needed to store n. handle signed or unsigned cases properly
+static int nbits(int n, bool issigned)
+{
+	int nb;
+	if (n==0)
+		return 0;	// no bits needed for 0, signed or not
+	else if (n > 0)
+	{
+		for (nb=0; n; ++nb, n>>=1) ;
+		return nb + (issigned?1:0);
+	}
+	else
+	{
+		nvAssert (issigned);
+		for (nb=0; n<-1; ++nb, n>>=1) ;
+		return nb + 1;
+	}
+}
+
+static void transform_forward(IntEndptsRGBA_2 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+static void transform_inverse(IntEndptsRGBA_2 ep[NREGIONS])
+{
+	nvUnreachable();
+}
+
+/*
+we're using this table to assign lsbs
+abgr	>=2	correct
+0000	0	0
+0001	0	0
+0010	0	0
+0011	1	x1
+0100	0	0
+0101	1	x1
+0110	1	x1
+0111	1	1
+1000	0	0
+1001	1	x0
+1010	1	x0
+1011	1	1
+1100	1	x0
+1101	1	1
+1110	1	1
+1111	1	1
+
+we need 8 0's and 8 1's. the x's can be either 0 or 1 as long as you get 8/8.
+I choose to assign the lsbs so that the rgb channels are as good as possible.
+*/
+
+// 6666 ->5555.1, use the "correct" column above to assign the lsb
+static void compress_one(const IntEndptsRGBA& endpts, IntEndptsRGBA_2& compr_endpts)
+{
+	int onescnt;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		// ignore the alpha channel in the count
+		onescnt += (j==CHANNEL_A) ? 0 : (endpts.A[j] & 1);
+		compr_endpts.A[j] = endpts.A[j] >> 1;
+		nvAssert (compr_endpts.A[j] < 32);
+	}
+	compr_endpts.a_lsb = onescnt >= 2;
+
+	onescnt = 0;
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		onescnt += (j==CHANNEL_A) ? 0 : (endpts.B[j] & 1);
+		compr_endpts.B[j] = endpts.B[j] >> 1;
+		nvAssert (compr_endpts.B[j] < 32);
+	}
+	compr_endpts.b_lsb = onescnt >= 2;
+}
+
+static void uncompress_one(const IntEndptsRGBA_2& compr_endpts, IntEndptsRGBA& endpts)
+{
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+	{
+		endpts.A[j] = (compr_endpts.A[j] << 1) | compr_endpts.a_lsb;
+		endpts.B[j] = (compr_endpts.B[j] << 1) | compr_endpts.b_lsb;
+	}
+}
+static void uncompress_endpoints(const IntEndptsRGBA_2 compr_endpts[NREGIONS], IntEndptsRGBA endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		uncompress_one(compr_endpts[i], endpts[i]);
+}
+
+static void compress_endpoints(const IntEndptsRGBA endpts[NREGIONS], IntEndptsRGBA_2 compr_endpts[NREGIONS])
+{
+	for (int i=0; i<NREGIONS; ++i)
+		compress_one(endpts[i], compr_endpts[i]);
+}
+
+static void quantize_endpts(const FltEndpts endpts[NREGIONS], const PatternPrec &pattern_prec, IntEndptsRGBA_2 q_endpts[NREGIONS])
+{
+	IntEndptsRGBA full_endpts[NREGIONS];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		full_endpts[region].A[0] = Utils::quantize(endpts[region].A.x, pattern_prec.region_precs[region].endpt_a_prec[0]+1);	// +1 since we are in uncompressed space
+		full_endpts[region].A[1] = Utils::quantize(endpts[region].A.y, pattern_prec.region_precs[region].endpt_a_prec[1]+1);
+		full_endpts[region].A[2] = Utils::quantize(endpts[region].A.z, pattern_prec.region_precs[region].endpt_a_prec[2]+1);
+		full_endpts[region].A[3] = Utils::quantize(endpts[region].A.w, pattern_prec.region_precs[region].endpt_a_prec[3]+1);
+
+		full_endpts[region].B[0] = Utils::quantize(endpts[region].B.x, pattern_prec.region_precs[region].endpt_b_prec[0]+1);
+		full_endpts[region].B[1] = Utils::quantize(endpts[region].B.y, pattern_prec.region_precs[region].endpt_b_prec[1]+1);
+		full_endpts[region].B[2] = Utils::quantize(endpts[region].B.z, pattern_prec.region_precs[region].endpt_b_prec[2]+1);
+		full_endpts[region].B[3] = Utils::quantize(endpts[region].B.w, pattern_prec.region_precs[region].endpt_b_prec[3]+1);
+
+		compress_one(full_endpts[region], q_endpts[region]);
+	}
+}
+
+// swap endpoints as needed to ensure that the anchor index of each region has a 0 high-order bit
+// an anchor position is 0 at x=0 y=0 and 15 at x=3 y=3, so y = (position >> 2) & 3 and x = position & 3
+static void swap_indices(IntEndptsRGBA_2 endpts[NREGIONS], int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex)
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		int position = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,region);
+
+		int x = POS_TO_X(position);
+		int y = POS_TO_Y(position);
+		nvAssert(REGION(x,y,shapeindex) == region);		// double check the table
+		if (indices[y][x] & HIGH_INDEXBIT)
+		{
+			// high bit is set, swap the endpts and indices for this region
+			int t;
+			for (int i=0; i<NCHANNELS_RGBA; ++i) 
+			{
+				t = endpts[region].A[i]; endpts[region].A[i] = endpts[region].B[i]; endpts[region].B[i] = t;
+			}
+			t = endpts[region].a_lsb; endpts[region].a_lsb = endpts[region].b_lsb; endpts[region].b_lsb = t;
+
+			for (int y = 0; y < Tile::TILE_H; y++)
+			for (int x = 0; x < Tile::TILE_W; x++)
+				if (REGION(x,y,shapeindex) == region)
+					indices[y][x] = NINDICES - 1 - indices[y][x];
+		}
+	}
+}
+
+static bool endpts_fit(IntEndptsRGBA_2 endpts[NREGIONS], const Pattern &p)
+{
+	return true;
+}
+
+static void write_header(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, Bits &out)
+{
+	out.write(p.mode, p.modebits);
+	out.write(shapeindex, SHAPEBITS);
+
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			out.write(endpts[i].A[j], p.chan[j].nbitsizes[ABITINDEX(i)]);
+			out.write(endpts[i].B[j], p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		out.write(endpts[i].a_lsb, 1);
+		out.write(endpts[i].b_lsb, 1);
+	}
+
+	nvAssert (out.getptr() == 98);
+}
+
+static void read_header(Bits &in, IntEndptsRGBA_2 endpts[NREGIONS], int &shapeindex, Pattern &p, int &pat_index)
+{
+	int mode = AVPCL::getmode(in);
+
+	pat_index = 0;
+	nvAssert (pat_index >= 0 && pat_index < NPATTERNS);
+	nvAssert (in.getptr() == patterns[pat_index].modebits);
+
+	shapeindex = in.read(SHAPEBITS);
+	p = patterns[pat_index];
+
+	for (int j=0; j<NCHANNELS_RGBA; ++j)
+		for (int i=0; i<NREGIONS; ++i)
+		{
+			endpts[i].A[j] = in.read(p.chan[j].nbitsizes[ABITINDEX(i)]);
+			endpts[i].B[j] = in.read(p.chan[j].nbitsizes[BBITINDEX(i)]);
+		}
+	
+	for (int i=0; i<NREGIONS; ++i)
+	{
+		endpts[i].a_lsb  = in.read(1);
+		endpts[i].b_lsb  = in.read(1);
+	}
+
+	nvAssert (in.getptr() == 98);
+}
+
+// WORK PLACEHOLDER -- keep it simple for now
+static void write_indices(const int indices[Tile::TILE_H][Tile::TILE_W], int shapeindex, Bits &out)
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		out.write(indices[y][x], INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void read_indices(Bits &in, int shapeindex, int indices[Tile::TILE_H][Tile::TILE_W])
+{
+	int positions[NREGIONS];
+
+	for (int r = 0; r < NREGIONS; ++r)
+		positions[r] = SHAPEINDEX_TO_COMPRESSED_INDICES(shapeindex,r);
+
+	for (int pos = 0; pos < Tile::TILE_TOTAL; ++pos)
+	{
+		int x = POS_TO_X(pos);
+		int y = POS_TO_Y(pos);
+
+		bool match = false;
+
+		for (int r = 0; r < NREGIONS; ++r)
+			if (positions[r] == pos) { match = true; break; }
+
+		indices[y][x]= in.read(INDEXBITS - (match ? 1 : 0));
+	}
+}
+
+static void emit_block(const IntEndptsRGBA_2 endpts[NREGIONS], int shapeindex, const Pattern &p, const int indices[Tile::TILE_H][Tile::TILE_W], char *block)
+{
+	Bits out(block, AVPCL::BITSIZE);
+
+	write_header(endpts, shapeindex, p, out);
+
+	write_indices(indices, shapeindex, out);
+
+	nvAssert(out.getptr() == AVPCL::BITSIZE);
+}
+
+static void generate_palette_quantized(const IntEndptsRGBA_2 &endpts_2, const RegionPrec &region_prec, Vector4 palette[NINDICES])
+{
+	IntEndptsRGBA endpts;
+
+	uncompress_one(endpts_2, endpts);
+
+	// scale endpoints
+	int a, b;			// really need a IntVec4...
+
+	a = Utils::unquantize(endpts.A[0], region_prec.endpt_a_prec[0]+1);	// +1 since we are in uncompressed space 
+	b = Utils::unquantize(endpts.B[0], region_prec.endpt_b_prec[0]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].x = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[1], region_prec.endpt_a_prec[1]+1); 
+	b = Utils::unquantize(endpts.B[1], region_prec.endpt_b_prec[1]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].y = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[2], region_prec.endpt_a_prec[2]+1); 
+	b = Utils::unquantize(endpts.B[2], region_prec.endpt_b_prec[2]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].z = float(Utils::lerp(a, b, i, BIAS, DENOM));
+
+	a = Utils::unquantize(endpts.A[3], region_prec.endpt_a_prec[3]+1); 
+	b = Utils::unquantize(endpts.B[3], region_prec.endpt_b_prec[3]+1);
+
+	// interpolate
+	for (int i = 0; i < NINDICES; ++i)
+		palette[i].w = float(Utils::lerp(a, b, i, BIAS, DENOM));
+}
+
+// sign extend but only if it was transformed
+static void sign_extend(Pattern &p, IntEndptsRGBA_2 endpts[NREGIONS])
+{
+	nvUnreachable();
+}
+
+void AVPCL::decompress_mode7(const char *block, Tile &t)
+{
+	Bits in(block, AVPCL::BITSIZE);
+
+	Pattern p;
+	IntEndptsRGBA_2 endpts[NREGIONS];
+	int shapeindex, pat_index;
+
+	read_header(in, endpts, shapeindex, p, pat_index);
+	
+	if (p.transformed)
+	{
+		sign_extend(p, endpts);
+		transform_inverse(endpts);
+	}
+
+	Vector4 palette[NREGIONS][NINDICES];
+	for (int r = 0; r < NREGIONS; ++r)
+		generate_palette_quantized(endpts[r], pattern_precs[pat_index].region_precs[r], &palette[r][0]);
+
+	int indices[Tile::TILE_H][Tile::TILE_W];
+
+	read_indices(in, shapeindex, indices);
+
+	nvAssert(in.getptr() == AVPCL::BITSIZE);
+
+	// lookup
+	for (int y = 0; y < Tile::TILE_H; y++)
+	for (int x = 0; x < Tile::TILE_W; x++)
+		t.data[y][x] = palette[REGION(x,y,shapeindex)][indices[y][x]];
+}
+
+// given a collection of colors and quantized endpoints, generate a palette, choose best entries, and return a single toterr
+static float map_colors(const Vector4 colors[], const float importance[], int np, const IntEndptsRGBA_2 &endpts, const RegionPrec &region_prec, float current_err, int indices[Tile::TILE_TOTAL])
+{
+	Vector4 palette[NINDICES];
+	float toterr = 0;
+	Vector4 err;
+
+	generate_palette_quantized(endpts, region_prec, palette);
+
+	for (int i = 0; i < np; ++i)
+	{
+		float err, besterr = FLT_MAX;
+
+		for (int j = 0; j < NINDICES && besterr > 0; ++j)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(colors[i], palette[j]) :
+									     Utils::metric4premult(colors[i], palette[j]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[i] = j;
+			}
+		}
+		toterr += besterr;
+
+		// check for early exit
+		if (toterr > current_err)
+		{
+			// fill out bogus index values so it's initialized at least
+			for (int k = i; k < np; ++k)
+				indices[k] = -1;
+
+			return FLT_MAX;
+		}
+	}
+	return toterr;
+}
+
+// assign indices given a tile, shape, and quantized endpoints, return toterr for each region
+static void assign_indices(const Tile &tile, int shapeindex, IntEndptsRGBA_2 endpts[NREGIONS], const PatternPrec &pattern_prec, 
+						   int indices[Tile::TILE_H][Tile::TILE_W], float toterr[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	for (int region = 0; region < NREGIONS; ++region)
+	{
+		generate_palette_quantized(endpts[region], pattern_prec.region_precs[region], &palette[region][0]);
+		toterr[region] = 0;
+	}
+
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = !AVPCL::flag_premult ? Utils::metric4(tile.data[y][x], palette[region][i]) :
+										 Utils::metric4premult(tile.data[y][x], palette[region][i]) ;
+
+			if (err > besterr)	// error increased, so we're done searching
+				break;
+			if (err < besterr)
+			{
+				besterr = err;
+				indices[y][x] = i;
+			}
+		}
+		toterr[region] += besterr;
+	}
+}
+
+// note: indices are valid only if the value returned is less than old_err; otherwise they contain -1's
+// this function returns either old_err or a value smaller (if it was successful in improving the error)
+static float perturb_one(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, const IntEndptsRGBA_2 &old_endpts, IntEndptsRGBA_2 &new_endpts,
+						  float old_err, int do_b, int indices[Tile::TILE_TOTAL])
+{
+	// we have the old endpoints: old_endpts
+	// we have the perturbed endpoints: new_endpts
+	// we have the temporary endpoints: temp_endpts
+
+	IntEndptsRGBA_2 temp_endpts;
+	float min_err = old_err;		// start with the best current error
+	int beststep;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	// copy real endpoints so we can perturb them
+	temp_endpts = new_endpts = old_endpts;
+
+	int prec = do_b ? region_prec.endpt_b_prec[ch] : region_prec.endpt_a_prec[ch];
+
+	// do a logarithmic search for the best error for this endpoint (endpoint A or B, selected by do_b)
+	for (int step = 1 << (prec-1); step; step >>= 1)
+	{
+		bool improved = false;
+		for (int sign = -1; sign <= 1; sign += 2)
+		{
+			if (do_b == 0)
+			{
+				temp_endpts.A[ch] = new_endpts.A[ch] + sign * step;
+				if (temp_endpts.A[ch] < 0 || temp_endpts.A[ch] >= (1 << prec))
+					continue;
+			}
+			else
+			{
+				temp_endpts.B[ch] = new_endpts.B[ch] + sign * step;
+				if (temp_endpts.B[ch] < 0 || temp_endpts.B[ch] >= (1 << prec))
+					continue;
+			}
+
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, min_err, temp_indices);
+
+			if (err < min_err)
+			{
+				improved = true;
+				min_err = err;
+				beststep = sign * step;
+				for (int i=0; i<np; ++i)
+					indices[i] = temp_indices[i];
+			}
+		}
+		// if this was an improvement, move the endpoint and continue search from there
+		if (improved)
+		{
+			if (do_b == 0)
+				new_endpts.A[ch] += beststep;
+			else
+				new_endpts.B[ch] += beststep;
+		}
+	}
+	return min_err;
+}
+
+// the larger the error the more time it is worth spending on an exhaustive search.
+// perturb the endpoints at least -3 to 3.
+// if err > 5000 perturb endpoints 50% of precision
+// if err > 1000 25%
+// if err > 200 12.5%
+// if err > 40  6.25%
+// for np = 16 -- adjust error thresholds as a function of np
+// always ensure endpoint ordering is preserved (no need to overlap the scan)
+// if orig_err returned from this is less than its input value, then indices[] will contain valid indices
+static float exhaustive(const Vector4 colors[], const float importance[], int np, int ch, const RegionPrec &region_prec, float orig_err, IntEndptsRGBA_2 &opt_endpts, int indices[Tile::TILE_TOTAL])
+{
+	IntEndptsRGBA_2 temp_endpts;
+	float best_err = orig_err;
+	int aprec = region_prec.endpt_a_prec[ch];
+	int bprec = region_prec.endpt_b_prec[ch];
+	int good_indices[Tile::TILE_TOTAL];
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int i=0; i<np; ++i)
+		indices[i] = -1;
+
+	float thr_scale = (float)np / (float)Tile::TILE_TOTAL;
+
+	if (orig_err == 0) return orig_err;
+
+	int adelta = 0, bdelta = 0;
+	if (orig_err > 5000.0*thr_scale)		{ adelta = (1 << aprec)/2; bdelta = (1 << bprec)/2; }
+	else if (orig_err > 1000.0*thr_scale)	{ adelta = (1 << aprec)/4; bdelta = (1 << bprec)/4; }
+	else if (orig_err > 200.0*thr_scale)	{ adelta = (1 << aprec)/8; bdelta = (1 << bprec)/8; }
+	else if (orig_err > 40.0*thr_scale)		{ adelta = (1 << aprec)/16; bdelta = (1 << bprec)/16; }
+	adelta = max(adelta, 3);
+	bdelta = max(bdelta, 3);
+
+#ifdef	DISABLE_EXHAUSTIVE
+	adelta = bdelta = 3;
+#endif
+
+	temp_endpts = opt_endpts;
+
+	// ok figure out the range of A and B
+	int alow = max(0, opt_endpts.A[ch] - adelta);
+	int ahigh = min((1<<aprec)-1, opt_endpts.A[ch] + adelta);
+	int blow = max(0, opt_endpts.B[ch] - bdelta);
+	int bhigh = min((1<<bprec)-1, opt_endpts.B[ch] + bdelta);
+
+	// now there's no need to swap the ordering of A and B
+	bool a_le_b = opt_endpts.A[ch] <= opt_endpts.B[ch];
+
+	int amin, bmin;
+
+	if (opt_endpts.A[ch] <= opt_endpts.B[ch])
+	{
+		// keep a <= b
+		for (int a = alow; a <= ahigh; ++a)
+		for (int b = max(a, blow); b < bhigh; ++b)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err;
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	else
+	{
+		// keep b <= a
+		for (int b = blow; b < bhigh; ++b)
+		for (int a = max(b, alow); a <= ahigh; ++a)
+		{
+			temp_endpts.A[ch] = a;
+			temp_endpts.B[ch] = b;
+		
+            float err = map_colors(colors, importance, np, temp_endpts, region_prec, best_err, temp_indices);
+			if (err < best_err) 
+			{ 
+				amin = a; 
+				bmin = b; 
+				best_err = err; 
+				for (int i=0; i<np; ++i)
+					good_indices[i] = temp_indices[i];
+			}
+		}
+	}
+	if (best_err < orig_err)
+	{
+		opt_endpts.A[ch] = amin;
+		opt_endpts.B[ch] = bmin;
+		orig_err = best_err;
+		// if we actually improved, update the indices
+		for (int i=0; i<np; ++i)
+			indices[i] = good_indices[i];
+	}
+	return best_err;
+}
+
+static float optimize_one(const Vector4 colors[], const float importance[], int np, float orig_err, const IntEndptsRGBA_2 &orig_endpts, const RegionPrec &region_prec, IntEndptsRGBA_2 &opt_endpts)
+{
+	float opt_err = orig_err;
+
+	opt_endpts = orig_endpts;
+
+	/*
+		err0 = perturb(rgb0, delta0)
+		err1 = perturb(rgb1, delta1)
+		if (err0 < err1)
+			if (err0 >= initial_error) break
+			rgb0 += delta0
+			next = 1
+		else
+			if (err1 >= initial_error) break
+			rgb1 += delta1
+			next = 0
+		initial_err = map()
+		for (;;)
+			err = perturb(next ? rgb1:rgb0, delta)
+			if (err >= initial_err) break
+			next? rgb1 : rgb0 += delta
+			initial_err = err
+	*/
+	IntEndptsRGBA_2 new_a, new_b;
+	IntEndptsRGBA_2 new_endpt;
+	int do_b;
+	int orig_indices[Tile::TILE_TOTAL];
+	int new_indices[Tile::TILE_TOTAL];
+	int temp_indices0[Tile::TILE_TOTAL];
+	int temp_indices1[Tile::TILE_TOTAL];
+
+	// now optimize each channel separately
+	// for the first error improvement, we save the indices. then, for any later improvement, we compare the indices
+	// if they differ, we restart the loop (which then falls back to looking for a first improvement.)
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+		// figure out which endpoint when perturbed gives the most improvement and start there
+		// if we just alternate, we can easily end up in a local minimum
+        float err0 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_a, opt_err, 0, temp_indices0);	// perturb endpt A
+        float err1 = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_b, opt_err, 1, temp_indices1);	// perturb endpt B
+
+		if (err0 < err1)
+		{
+			if (err0 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices0[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.A[ch] = new_a.A[ch];
+			opt_err = err0;
+			do_b = 1;		// do B next
+		}
+		else
+		{
+			if (err1 >= opt_err)
+				continue;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = orig_indices[i] = temp_indices1[i];
+				nvAssert (orig_indices[i] != -1);
+			}
+
+			opt_endpts.B[ch] = new_b.B[ch];
+			opt_err = err1;
+			do_b = 0;		// do A next
+		}
+		
+		// now alternate endpoints and keep trying until there is no improvement
+		for (;;)
+		{
+            float err = perturb_one(colors, importance, np, ch, region_prec, opt_endpts, new_endpt, opt_err, do_b, temp_indices0);
+			if (err >= opt_err)
+				break;
+
+			for (int i=0; i<np; ++i)
+			{
+				new_indices[i] = temp_indices0[i];
+				nvAssert (new_indices[i] != -1);
+			}
+
+			if (do_b == 0)
+				opt_endpts.A[ch] = new_endpt.A[ch];
+			else
+				opt_endpts.B[ch] = new_endpt.B[ch];
+			opt_err = err;
+			do_b = 1 - do_b;	// now move the other endpoint
+		}
+
+		// see if the indices have changed
+		int i;
+		for (i=0; i<np; ++i)
+			if (orig_indices[i] != new_indices[i])
+				break;
+
+		if (i<np)
+			ch = -1;	// start over
+	}
+
+	// finally, do a small exhaustive search around what we think is the global minimum to be sure
+	// note this is independent of the above search, so we don't reuse the indices it produced;
+	// if the indices change during this pass we simply start over again from ch=0
+	bool first = true;
+	for (int ch = 0; ch < NCHANNELS_RGBA; ++ch)
+	{
+        float new_err = exhaustive(colors, importance, np, ch, region_prec, opt_err, opt_endpts, temp_indices0);
+
+		if (new_err < opt_err)
+		{
+			opt_err = new_err;
+
+			if (first)
+			{
+				for (int i=0; i<np; ++i)
+				{
+					orig_indices[i] = temp_indices0[i];
+					nvAssert (orig_indices[i] != -1);
+				}
+				first = false;
+			}
+			else
+			{
+				// see if the indices have changed
+				int i;
+				for (i=0; i<np; ++i)
+					if (orig_indices[i] != temp_indices0[i])
+						break;
+
+				if (i<np)
+				{
+					ch = -1;	// start over
+					first = true;
+				}
+			}
+		}
+	}
+
+	return opt_err;
+}
+
+static void optimize_endpts(const Tile &tile, int shapeindex, const float orig_err[NREGIONS], 
+							IntEndptsRGBA_2 orig_endpts[NREGIONS], const PatternPrec &pattern_prec, float opt_err[NREGIONS], IntEndptsRGBA_2 opt_endpts[NREGIONS])
+{
+	Vector4 pixels[Tile::TILE_TOTAL];
+    float importance[Tile::TILE_TOTAL];
+	IntEndptsRGBA_2 temp_in, temp_out;
+	int temp_indices[Tile::TILE_TOTAL];
+
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		// collect the pixels in the region
+		int np = 0;
+
+        for (int y = 0; y < tile.size_y; y++) {
+            for (int x = 0; x < tile.size_x; x++) {
+                if (REGION(x, y, shapeindex) == region) {
+                    pixels[np] = tile.data[y][x];
+                    importance[np] = tile.importance_map[y][x];
+                    np++;
+                }
+            }
+        }
+
+		opt_endpts[region] = temp_in = orig_endpts[region];
+		opt_err[region] = orig_err[region];
+
+		float best_err = orig_err[region];
+
+		// try all lsb modes as we search for better endpoints
+		for (int lsbmode=0; lsbmode<NLSBMODES; ++lsbmode)
+		{
+			temp_in.a_lsb = lsbmode & 1;
+			temp_in.b_lsb = (lsbmode >> 1) & 1;
+
+			// make sure we have a valid error for temp_in
+			// we use FLT_MAX here because we want an accurate temp_in_err, no shortcuts
+			// (map_colors will compute a mapping but will stop if the error exceeds the value passed in the FLT_MAX position)
+			float temp_in_err = map_colors(pixels, importance, np, temp_in, pattern_prec.region_precs[region], FLT_MAX, temp_indices);
+
+			// now try to optimize these endpoints
+            float temp_out_err = optimize_one(pixels, importance, np, temp_in_err, temp_in, pattern_prec.region_precs[region], temp_out);
+
+			// if we find an improvement, update the best so far and correct the output endpoints and errors
+			if (temp_out_err < best_err)
+			{
+				best_err = temp_out_err;
+				opt_err[region] = temp_out_err;
+				opt_endpts[region] = temp_out;
+			}
+		}
+	}
+}
+
+/* optimization algorithm
+	for each pattern
+		convert endpoints using pattern precision
+		assign indices and get initial error
+		compress indices (and possibly reorder endpoints)
+		transform endpoints
+		if transformed endpoints fit pattern
+			get original endpoints back
+			optimize endpoints, get new endpoints, new indices, and new error // new error will almost always be better
+			compress new indices
+			transform new endpoints
+			if new endpoints fit pattern AND if error is improved
+				emit compressed block with new data
+			else
+				emit compressed block with original data // to try to preserve maximum endpoint precision
+*/
+
+static float refine(const Tile &tile, int shapeindex_best, const FltEndpts endpts[NREGIONS], char *block)
+{
+	float orig_err[NREGIONS], opt_err[NREGIONS], orig_toterr, opt_toterr, expected_opt_err[NREGIONS];
+	IntEndptsRGBA_2 orig_endpts[NREGIONS], opt_endpts[NREGIONS];
+	int orig_indices[Tile::TILE_H][Tile::TILE_W], opt_indices[Tile::TILE_H][Tile::TILE_W];
+
+	for (int sp = 0; sp < NPATTERNS; ++sp)
+	{
+		quantize_endpts(endpts, pattern_precs[sp], orig_endpts);
+		assign_indices(tile, shapeindex_best, orig_endpts, pattern_precs[sp], orig_indices, orig_err);
+		swap_indices(orig_endpts, orig_indices, shapeindex_best);
+		if (patterns[sp].transformed)
+			transform_forward(orig_endpts);
+		// apply a heuristic here -- we check if the endpoints fit before we try to optimize them.
+		// the assumption made is that if they don't fit now, they won't fit after optimizing.
+		if (endpts_fit(orig_endpts, patterns[sp]))
+		{
+			if (patterns[sp].transformed)
+				transform_inverse(orig_endpts);
+			optimize_endpts(tile, shapeindex_best, orig_err, orig_endpts, pattern_precs[sp], expected_opt_err, opt_endpts);
+			assign_indices(tile, shapeindex_best, opt_endpts, pattern_precs[sp], opt_indices, opt_err);
+			// (nreed) Commented out asserts because they go off all the time...not sure why
+			//for (int i=0; i<NREGIONS; ++i)
+			//	nvAssert(expected_opt_err[i] == opt_err[i]);
+			swap_indices(opt_endpts, opt_indices, shapeindex_best);
+			if (patterns[sp].transformed)
+				transform_forward(opt_endpts);
+			orig_toterr = opt_toterr = 0;
+			for (int i=0; i < NREGIONS; ++i) { orig_toterr += orig_err[i]; opt_toterr += opt_err[i]; }
+			if (endpts_fit(opt_endpts, patterns[sp]) && opt_toterr < orig_toterr)
+			{
+				emit_block(opt_endpts, shapeindex_best, patterns[sp], opt_indices, block);
+				return opt_toterr;
+			}
+			else
+			{
+				// either it stopped fitting when we optimized it, or there was no improvement
+				// so go back to the unoptimized endpoints which we know will fit
+				if (patterns[sp].transformed)
+					transform_forward(orig_endpts);
+				emit_block(orig_endpts, shapeindex_best, patterns[sp], orig_indices, block);
+				return orig_toterr;
+			}
+		}
+	}
+	nvAssert(false); //throw "No candidate found, should never happen (mode avpcl 7).";
+	return FLT_MAX;
+}
+
+static void clamp(Vector4 &v)
+{
+	if (v.x < 0.0f) v.x = 0.0f;
+	if (v.x > 255.0f) v.x = 255.0f;
+	if (v.y < 0.0f) v.y = 0.0f;
+	if (v.y > 255.0f) v.y = 255.0f;
+	if (v.z < 0.0f) v.z = 0.0f;
+	if (v.z > 255.0f) v.z = 255.0f;
+	if (v.w < 0.0f) v.w = 0.0f;
+	if (v.w > 255.0f) v.w = 255.0f;
+}
+
+static void generate_palette_unquantized(const FltEndpts endpts[NREGIONS], Vector4 palette[NREGIONS][NINDICES])
+{
+	for (int region = 0; region < NREGIONS; ++region)
+	for (int i = 0; i < NINDICES; ++i)
+		palette[region][i] = Utils::lerp(endpts[region].A, endpts[region].B, i, 0, DENOM);
+}
+
+// generate a palette from unquantized endpoints, then pick best palette color for all pixels in each region, return toterr for all regions combined
+static float map_colors(const Tile &tile, int shapeindex, const FltEndpts endpts[NREGIONS])
+{
+	// build list of possibles
+	Vector4 palette[NREGIONS][NINDICES];
+
+	generate_palette_unquantized(endpts, palette);
+
+	float toterr = 0;
+	Vector4 err;
+
+	for (int y = 0; y < tile.size_y; y++)
+	for (int x = 0; x < tile.size_x; x++)
+	{
+		int region = REGION(x,y,shapeindex);
+		float err, besterr = FLT_MAX;
+
+		for (int i = 0; i < NINDICES && besterr > 0; ++i)
+		{
+			err = Utils::metric4(tile.data[y][x], palette[region][i]);
+
+			if (err > besterr)	// error increased, so we're done searching. this works for most norms.
+				break;
+			if (err < besterr)
+				besterr = err;
+		}
+		toterr += besterr;
+	}
+	return toterr;
+}
+
+static float rough(const Tile &tile, int shapeindex, FltEndpts endpts[NREGIONS])
+{
+	for (int region=0; region<NREGIONS; ++region)
+	{
+		int np = 0;
+		Vector4 colors[Tile::TILE_TOTAL];
+		Vector4 mean(0,0,0,0);
+
+		for (int y = 0; y < tile.size_y; y++)
+		for (int x = 0; x < tile.size_x; x++)
+			if (REGION(x,y,shapeindex) == region)
+			{
+				colors[np] = tile.data[y][x];
+				mean += tile.data[y][x];
+				++np;
+			}
+
+		// handle simple cases	
+		if (np == 0)
+		{
+			Vector4 zero(0,0,0,255.0f);
+			endpts[region].A = zero;
+			endpts[region].B = zero;
+			continue;
+		}
+		else if (np == 1)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[0];
+			continue;
+		}
+		else if (np == 2)
+		{
+			endpts[region].A = colors[0];
+			endpts[region].B = colors[1];
+			continue;
+		}
+
+		mean /= float(np);
+
+		Vector4 direction = Fit::computePrincipalComponent_EigenSolver(np, colors);
+
+		// project each pixel value along the principal direction
+		float minp = FLT_MAX, maxp = -FLT_MAX;
+		for (int i = 0; i < np; i++) 
+		{
+			float dp = dot(colors[i]-mean, direction);
+			if (dp < minp) minp = dp;
+			if (dp > maxp) maxp = dp;
+		}
+
+		// choose as endpoints 2 points along the principal direction that span the projections of all of the pixel values
+		endpts[region].A = mean + minp*direction;
+		endpts[region].B = mean + maxp*direction;
+
+		// clamp endpoints
+		// the argument for clamping is that the actual endpoints need to be clamped and thus we need to choose the best
+		// shape based on endpoints being clamped
+		clamp(endpts[region].A);
+		clamp(endpts[region].B);
+	}
+
+	return map_colors(tile, shapeindex, endpts);
+}
+
+static void swap(float *list1, int *list2, int i, int j)
+{
+	float t = list1[i]; list1[i] = list1[j]; list1[j] = t;
+	int t1 = list2[i]; list2[i] = list2[j]; list2[j] = t1;
+}
+
+float AVPCL::compress_mode7(const Tile &t, char *block)
+{
+	// number of rough cases to look at. reasonable values of this are 1, NSHAPES/4, and NSHAPES
+	// NSHAPES/4 gets nearly all the cases; you can increase that a bit (say by 3 or 4) if you really want to squeeze the last bit out
+	const int NITEMS=NSHAPES/4;
+
+	// pick the best NITEMS shapes and refine these.
+	struct {
+		FltEndpts endpts[NREGIONS];
+	} all[NSHAPES];
+	float roughmse[NSHAPES];
+	int index[NSHAPES];
+	char tempblock[AVPCL::BLOCKSIZE];
+	float msebest = FLT_MAX;
+
+	for (int i=0; i<NSHAPES; ++i)
+	{
+		roughmse[i] = rough(t, i, &all[i].endpts[0]);
+		index[i] = i;
+	}
+
+	// bubble sort -- only need to bubble up the first NITEMS items
+	for (int i=0; i<NITEMS; ++i)
+	for (int j=i+1; j<NSHAPES; ++j)
+		if (roughmse[i] > roughmse[j])
+			swap(roughmse, index, i, j);
+
+	for (int i=0; i<NITEMS && msebest>0; ++i)
+	{
+		int shape = index[i];
+		float mse = refine(t, shape, &all[shape].endpts[0], tempblock);
+		if (mse < msebest)
+		{
+			memcpy(block, tempblock, sizeof(tempblock));
+			msebest = mse;
+		}
+	}
+	return msebest;
+}
+
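
The table comment next to compress_one() above describes how the shared low bit is chosen when the 6-bit-per-channel endpoints are packed down to 5 bits plus one lsb per endpoint. The following standalone sketch restates that rule with plain arrays so it can be tried in isolation; it mirrors compress_one()/uncompress_one() but is not library code, and the function names are illustrative.

    // Standalone restatement of the 6666 -> 5555.1 packing rule: drop each channel's low
    // bit and keep a single shared lsb per endpoint, chosen by majority vote over the
    // RGB (not alpha) low bits, exactly as compress_one() does.
    #include <cassert>

    void pack_5551_sketch(const int in6[4], int out5[4], int &lsb)   // channel 3 = alpha
    {
        int ones = 0;
        for (int ch = 0; ch < 4; ++ch)
        {
            if (ch != 3)                  // alpha is excluded from the vote
                ones += in6[ch] & 1;
            out5[ch] = in6[ch] >> 1;      // 6-bit value -> 5-bit value
            assert(out5[ch] < 32);
        }
        lsb = (ones >= 2) ? 1 : 0;        // majority of the three RGB low bits
    }

    void unpack_5551_sketch(const int in5[4], int lsb, int out6[4])
    {
        for (int ch = 0; ch < 4; ++ch)
            out6[ch] = (in5[ch] << 1) | lsb;   // the shared lsb becomes every channel's low bit
    }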

+ 389 - 0
3rdparty/nvtt/bc7/avpcl_utils.cpp

@@ -0,0 +1,389 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// Utility and common routines
+
+#include "avpcl_utils.h"
+#include "avpcl.h"
+#include "nvmath/Vector.inl"
+#include <math.h>
+
+using namespace nv;
+using namespace AVPCL;
+
+static const int denom7_weights[] = {0, 9, 18, 27, 37, 46, 55, 64};										// divided by 64
+static const int denom15_weights[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};		// divided by 64
+
+int Utils::lerp(int a, int b, int i, int bias, int denom)
+{
+#ifdef	USE_ZOH_INTERP
+	nvAssert (denom == 3 || denom == 7 || denom == 15);
+	nvAssert (i >= 0 && i <= denom);
+	nvAssert (bias >= 0 && bias <= denom/2);
+	nvAssert (a >= 0 && b >= 0);
+
+	int round = 0;
+#ifdef	USE_ZOH_INTERP_ROUNDED
+	round = 32;
+#endif
+
+	switch (denom)
+	{
+	case 3:	denom *= 5; i *= 5;	// fall through to case 15
+	case 15:return (a*denom15_weights[denom-i] + b*denom15_weights[i] + round) >> 6;
+	case 7:	return (a*denom7_weights[denom-i] + b*denom7_weights[i] + round) >> 6;
+	default: nvUnreachable(); return 0;
+	}
+#else
+	return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom));		// simple exact interpolation
+#endif
+}
+
+Vector4 Utils::lerp(Vector4::Arg a, Vector4::Arg b, int i, int bias, int denom)
+{
+#ifdef	USE_ZOH_INTERP
+	nvAssert (denom == 3 || denom == 7 || denom == 15);
+	nvAssert (i >= 0 && i <= denom);
+	nvAssert (bias >= 0 && bias <= denom/2);
+//	nvAssert (a >= 0 && b >= 0);
+
+	// no need to bias these as this is an exact division
+
+	switch (denom)
+	{
+	case 3:	denom *= 5; i *= 5;	// fall through to case 15
+	case 15:return (a*float(denom15_weights[denom-i]) + b*float(denom15_weights[i])) / 64.0f;
+	case 7:	return (a*float(denom7_weights[denom-i]) + b*float(denom7_weights[i])) / 64.0f;
+	default: nvUnreachable(); return Vector4(0);
+	}
+#else
+	return (((a)*((denom)-i)+(b)*(i)+(bias))/(denom));		// simple exact interpolation
+#endif
+}
+
+
+int Utils::unquantize(int q, int prec)
+{
+	int unq;
+
+	nvAssert (prec > 3);	// we only want to do one replicate
+
+#ifdef USE_ZOH_QUANT
+	if (prec >= 8)
+		unq = q;
+	else if (q == 0) 
+		unq = 0;
+	else if (q == ((1<<prec)-1)) 
+		unq = 255;
+	else
+		unq = (q * 256 + 128) >> prec;
+#else
+	// avpcl unquantizer -- bit replicate
+	unq = (q << (8-prec)) | (q >> (2*prec-8));
+#endif
+
+	return unq;
+}
+
+// quantize to the best value -- i.e., minimize unquantize error
+int Utils::quantize(float value, int prec)
+{
+	int q, unq;
+
+	nvAssert (prec > 3);	// we only want to do one replicate
+
+	unq = (int)floor(value + 0.5f);
+	nvAssert (unq <= 255);
+
+#ifdef USE_ZOH_QUANT
+	q = (prec >= 8) ? unq : (unq << prec) / 256;
+#else
+	// avpcl quantizer -- scale properly for best possible bit-replicated result
+	q = (unq * ((1<<prec)-1) + 127)/255;
+#endif
+
+	nvAssert (q >= 0 && q < (1 << prec));
+
+	return q;
+}
+
+float Utils::metric4(Vector4::Arg a, Vector4::Arg b)
+{
+	Vector4 err = a - b;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// weigh the components
+		err.x *= rwt;
+		err.y *= gwt;
+		err.z *= bwt;
+	}
+
+	return lengthSquared(err);
+}
+
+// WORK -- implement rotatemode for the below -- that changes where the rwt, gwt, and bwt's go.
+float Utils::metric3(Vector3::Arg a, Vector3::Arg b, int rotatemode)
+{
+	Vector3 err = a - b;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else if (AVPCL::flag_nonuniform_ati)
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// adjust weights based on rotatemode
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: break;
+		case ROTATEMODE_RGBA_AGBR: rwt = 1.0f; break;
+		case ROTATEMODE_RGBA_RABG: gwt = 1.0f; break;
+		case ROTATEMODE_RGBA_RGAB: bwt = 1.0f; break;
+		default: nvUnreachable();
+		}
+
+		// weigh the components
+		err.x *= rwt;
+		err.y *= gwt;
+		err.z *= bwt;
+	}
+
+	return lengthSquared(err);
+}
+
+float Utils::metric1(const float a, const float b, int rotatemode)
+{
+	float err = a - b;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt, awt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else if (AVPCL::flag_nonuniform_ati)
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// adjust weights based on rotatemode
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break;
+		case ROTATEMODE_RGBA_AGBR: awt = rwt; break;
+		case ROTATEMODE_RGBA_RABG: awt = gwt; break;
+		case ROTATEMODE_RGBA_RGAB: awt = bwt; break;
+		default: nvUnreachable();
+		}
+
+		// weigh the components
+		err *= awt;
+	}
+
+	return err * err;
+}
+
+float Utils::premult(float r, float a)
+{
+	// note that the args are really integers stored in floats
+	int R = int(r), A = int(a);
+
+	nvAssert ((R==r) && (A==a));
+
+	return float((R*A + 127)/255);
+}
+
+static void premult4(Vector4& rgba)
+{
+	rgba.x = Utils::premult(rgba.x, rgba.w);
+	rgba.y = Utils::premult(rgba.y, rgba.w);
+	rgba.z = Utils::premult(rgba.z, rgba.w);
+}
+
+static void premult3(Vector3& rgb, float a)
+{
+	rgb.x = Utils::premult(rgb.x, a);
+	rgb.y = Utils::premult(rgb.y, a);
+	rgb.z = Utils::premult(rgb.z, a);
+}
+
+float Utils::metric4premult(Vector4::Arg a, Vector4::Arg b)
+{
+	Vector4 pma = a, pmb = b;
+
+	premult4(pma);
+	premult4(pmb);
+
+	Vector4 err = pma - pmb;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// weigh the components
+		err.x *= rwt;
+		err.y *= gwt;
+		err.z *= bwt;
+	}
+
+	return lengthSquared(err);
+}
+
+float Utils::metric3premult_alphaout(Vector3::Arg rgb0, float a0, Vector3::Arg rgb1, float a1)
+{
+	Vector3 pma = rgb0, pmb = rgb1;
+
+	premult3(pma, a0);
+	premult3(pmb, a1);
+
+	Vector3 err = pma - pmb;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// weigh the components
+		err.x *= rwt;
+		err.y *= gwt;
+		err.z *= bwt;
+	}
+
+	return lengthSquared(err);
+}
+
+float Utils::metric3premult_alphain(Vector3::Arg rgb0, Vector3::Arg rgb1, int rotatemode)
+{
+	Vector3 pma = rgb0, pmb = rgb1;
+
+	switch(rotatemode)
+	{
+	case ROTATEMODE_RGBA_RGBA:
+		// this function isn't supposed to be called for this rotatemode
+		nvUnreachable();
+		break;
+	case ROTATEMODE_RGBA_AGBR:
+		pma.y = premult(pma.y, pma.x);
+		pma.z = premult(pma.z, pma.x);
+		pmb.y = premult(pmb.y, pmb.x);
+		pmb.z = premult(pmb.z, pmb.x);
+		break;
+	case ROTATEMODE_RGBA_RABG:
+		pma.x = premult(pma.x, pma.y);
+		pma.z = premult(pma.z, pma.y);
+		pmb.x = premult(pmb.x, pmb.y);
+		pmb.z = premult(pmb.z, pmb.y);
+		break;
+	case ROTATEMODE_RGBA_RGAB:
+		pma.x = premult(pma.x, pma.z);
+		pma.y = premult(pma.y, pma.z);
+		pmb.x = premult(pmb.x, pmb.z);
+		pmb.y = premult(pmb.y, pmb.z);
+		break;
+	default: nvUnreachable();
+	}
+
+	Vector3 err = pma - pmb;
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else /*if (AVPCL::flag_nonuniform_ati)*/
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// weigh the components
+		err.x *= rwt;
+		err.y *= gwt;
+		err.z *= bwt;
+	}
+
+	return lengthSquared(err);
+}
+
+float Utils::metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode)
+{
+	float err = premult(rgb0, a0) - premult(rgb1, a1);
+
+	// if nonuniform, select weights and weigh away
+	if (AVPCL::flag_nonuniform || AVPCL::flag_nonuniform_ati)
+	{
+		float rwt, gwt, bwt, awt;
+		if (AVPCL::flag_nonuniform)
+		{
+			rwt = 0.299f; gwt = 0.587f; bwt = 0.114f;
+		}
+		else if (AVPCL::flag_nonuniform_ati)
+		{
+			rwt = 0.3086f; gwt = 0.6094f; bwt = 0.0820f;
+		}
+
+		// adjust weights based on rotatemode
+		switch(rotatemode)
+		{
+		case ROTATEMODE_RGBA_RGBA: awt = 1.0f; break;
+		case ROTATEMODE_RGBA_AGBR: awt = rwt; break;
+		case ROTATEMODE_RGBA_RABG: awt = gwt; break;
+		case ROTATEMODE_RGBA_RGAB: awt = bwt; break;
+		default: nvUnreachable();
+		}
+
+		// weigh the components
+		err *= awt;
+	}
+
+	return err * err;
+}
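
The two weight tables at the top of this file drive Utils::lerp() when USE_ZOH_INTERP is defined. A quick standalone check (not library code) shows that each pair of complementary weights sums to 64, so the table-driven form is the familiar ((64 - w)*a + w*b + 32) >> 6 interpolation; the last line works one concrete value.

    // Standalone check of the denom7/denom15 weight tables copied from above.
    #include <cstdio>

    int main()
    {
        static const int w7[]  = {0, 9, 18, 27, 37, 46, 55, 64};
        static const int w15[] = {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64};

        for (int i = 0; i <= 7; ++i)
            if (w7[7 - i] + w7[i] != 64) printf("denom7 weight mismatch at %d\n", i);
        for (int i = 0; i <= 15; ++i)
            if (w15[15 - i] + w15[i] != 64) printf("denom15 weight mismatch at %d\n", i);

        // example: interpolate a=0, b=255 at index 3 of 7 -> (0*37 + 255*27 + 32) >> 6 == 108
        int a = 0, b = 255, i = 3;
        printf("lerp(0,255,3/7) = %d\n", (a * w7[7 - i] + b * w7[i] + 32) >> 6);
        return 0;
    }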

+ 61 - 0
3rdparty/nvtt/bc7/avpcl_utils.h

@@ -0,0 +1,61 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+// utility class holding common routines
+#ifndef _AVPCL_UTILS_H
+#define _AVPCL_UTILS_H
+
+#include "nvmath/Vector.h"
+
+namespace AVPCL {
+
+inline int SIGN_EXTEND(int x, int nb) { return ((((x)&(1<<((nb)-1)))?((~0)<<(nb)):0)|(x)); }
+
+static const int INDEXMODE_BITS				= 1;		// 2 different index modes
+static const int NINDEXMODES				= (1<<(INDEXMODE_BITS));
+static const int INDEXMODE_ALPHA_IS_3BITS	= 0;
+static const int INDEXMODE_ALPHA_IS_2BITS	= 1;
+
+static const int ROTATEMODE_BITS		= 2;		// 4 different rotate modes
+static const int NROTATEMODES			= (1<<(ROTATEMODE_BITS));
+static const int ROTATEMODE_RGBA_RGBA	= 0;
+static const int ROTATEMODE_RGBA_AGBR	= 1;
+static const int ROTATEMODE_RGBA_RABG	= 2;
+static const int ROTATEMODE_RGBA_RGAB	= 3;
+
+class Utils
+{
+public:
+	// error metrics
+	static float metric4(nv::Vector4::Arg a, nv::Vector4::Arg b);
+	static float metric3(nv::Vector3::Arg a, nv::Vector3::Arg b, int rotatemode);
+	static float metric1(float a, float b, int rotatemode);
+
+	static float metric4premult(nv::Vector4::Arg rgba0, nv::Vector4::Arg rgba1);
+	static float metric3premult_alphaout(nv::Vector3::Arg rgb0, float a0, nv::Vector3::Arg rgb1, float a1);
+	static float metric3premult_alphain(nv::Vector3::Arg rgb0, nv::Vector3::Arg rgb1, int rotatemode);
+	static float metric1premult(float rgb0, float a0, float rgb1, float a1, int rotatemode);
+
+	static float premult(float r, float a);
+
+	// quantization and unquantization
+	static int unquantize(int q, int prec);
+	static int quantize(float value, int prec);
+
+	// lerping
+	static int lerp(int a, int b, int i, int bias, int denom);
+	static nv::Vector4 lerp(nv::Vector4::Arg a, nv::Vector4::Arg b, int i, int bias, int denom);
+};
+
+}
+
+#endif
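
SIGN_EXTEND above widens an nb-bit two's-complement value to a full int. A few concrete values as a sketch (not library code), assuming the 3rdparty/nvtt/bc7 directory is on the include path:

    // Small sketch exercising AVPCL::SIGN_EXTEND from avpcl_utils.h.
    #include <cassert>
    #include "avpcl_utils.h"   // assumption: bc7 directory is on the include path

    void sign_extend_sketch()
    {
        assert(AVPCL::SIGN_EXTEND(0x0F, 5) == 15);    // high bit clear: value is unchanged
        assert(AVPCL::SIGN_EXTEND(0x1F, 5) == -1);    // 5-bit 11111 sign-extends to -1
        assert(AVPCL::SIGN_EXTEND(0x10, 5) == -16);   // 5-bit 10000 sign-extends to -16
    }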

+ 76 - 0
3rdparty/nvtt/bc7/bits.h

@@ -0,0 +1,76 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_BITS_H
+#define _AVPCL_BITS_H
+
+// read/write a bitstream
+
+#include "nvcore/Debug.h"
+
+namespace AVPCL {
+
+class Bits
+{
+public:
+
+	Bits(char *data, int maxdatabits) { nvAssert (data && maxdatabits > 0); bptr = bend = 0; bits = data; maxbits = maxdatabits; readonly = 0;}
+	Bits(const char *data, int availdatabits) { nvAssert (data && availdatabits > 0); bptr = 0; bend = availdatabits; cbits = data; maxbits = availdatabits; readonly = 1;}
+
+	void write(int value, int nbits) {
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		for (int i=0; i<nbits; ++i)
+			writeone(value>>i);
+	}
+	int read(int nbits) { 
+		nvAssert (nbits >= 0 && nbits < 32);
+		nvAssert (sizeof(int)>= 4);
+		int out = 0;
+		for (int i=0; i<nbits; ++i)
+			out |= readone() << i;
+		return out;
+	}
+	int getptr() { return bptr; }
+	void setptr(int ptr) { nvAssert (ptr >= 0 && ptr < maxbits); bptr = ptr; }
+	int getsize() { return bend; }
+
+private:
+	int	bptr;		// next bit to read
+	int bend;		// last written bit + 1
+	char *bits;		// ptr to user bit stream
+	const char *cbits;	// ptr to const user bit stream
+	int maxbits;	// max size of user bit stream
+	char readonly;	// 1 if this is a read-only stream
+
+	int readone() {
+		nvAssert (bptr < bend);
+		if (bptr >= bend) return 0;
+		int bit = (readonly ? cbits[bptr>>3] : bits[bptr>>3]) & (1 << (bptr & 7));
+		++bptr;
+		return bit != 0;
+	}
+	void writeone(int bit) {
+		nvAssert (!readonly); // "Writing a read-only bit stream"
+		nvAssert (bptr < maxbits);
+		if (bptr >= maxbits) return;
+		if (bit&1)
+			bits[bptr>>3] |= 1 << (bptr & 7);
+		else
+			bits[bptr>>3] &= ~(1 << (bptr & 7));
+		if (bptr++ >= bend) bend = bptr;
+	}
+};
+
+}
+
+#endif
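
The Bits class above reads and writes an LSB-first bit stream over a caller-owned buffer; a writable stream is built from a char* and a read-only stream from a const char*. A minimal round-trip sketch (not library code; the field widths are arbitrary, not any real mode's layout):

    // Round-trip sketch for AVPCL::Bits.
    #include <cstring>
    #include <cassert>
    #include "bits.h"   // assumption: bc7 directory is on the include path

    void bits_roundtrip_sketch()
    {
        char block[16] = {};                     // one 128-bit block's worth of storage

        AVPCL::Bits out(block, 128);             // writable stream, capacity 128 bits
        out.write(0x40, 7);                      // an arbitrary 7-bit field
        out.write(13, 6);                        // an arbitrary 6-bit field
        out.write(21, 5);                        // an arbitrary 5-bit field
        assert(out.getptr() == 18);              // 7 + 6 + 5 bits written so far

        const char *cblock = block;              // bind to the read-only constructor
        AVPCL::Bits in(cblock, 128);
        assert(in.read(7) == 0x40);
        assert(in.read(6) == 13);
        assert(in.read(5) == 21);
    }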

+ 81 - 0
3rdparty/nvtt/bc7/endpts.h

@@ -0,0 +1,81 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_ENDPTS_H
+#define _AVPCL_ENDPTS_H
+
+// endpoint definitions and routines to search through endpoint space
+
+#include "nvmath/Vector.h"
+
+namespace AVPCL {
+
+static const int NCHANNELS_RGB	= 3;
+static const int NCHANNELS_RGBA	= 4;
+static const int CHANNEL_R		= 0;
+static const int CHANNEL_G		= 1;
+static const int CHANNEL_B		= 2;
+static const int CHANNEL_A		= 3;
+
+struct FltEndpts
+{
+	nv::Vector4	A;
+	nv::Vector4	B;
+};
+
+struct IntEndptsRGB
+{
+	int		A[NCHANNELS_RGB];
+	int		B[NCHANNELS_RGB];
+};
+
+struct IntEndptsRGB_1
+{
+	int		A[NCHANNELS_RGB];
+	int		B[NCHANNELS_RGB];
+	int		lsb;				// shared lsb for A and B
+};
+
+struct IntEndptsRGB_2
+{
+	int		A[NCHANNELS_RGB];
+	int		B[NCHANNELS_RGB];
+	int		a_lsb;				// lsb for A
+	int		b_lsb;				// lsb for B
+};
+
+
+struct IntEndptsRGBA
+{
+	int		A[NCHANNELS_RGBA];
+	int		B[NCHANNELS_RGBA];
+};
+
+struct IntEndptsRGBA_2
+{
+	int		A[NCHANNELS_RGBA];
+	int		B[NCHANNELS_RGBA];
+	int		a_lsb;				// lsb for A
+	int		b_lsb;				// lsb for B
+};
+
+struct IntEndptsRGBA_2a
+{
+	int		A[NCHANNELS_RGBA];
+	int		B[NCHANNELS_RGBA];
+	int		a_lsb;				// lsb for RGB channels of A
+	int		b_lsb;				// lsb for RGB channels of B
+};
+
+}
+
+#endif

+ 132 - 0
3rdparty/nvtt/bc7/shapes_three.h

@@ -0,0 +1,132 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef	_AVPCL_SHAPES_THREE_H
+#define _AVPCL_SHAPES_THREE_H
+
+// shapes for 3 regions
+
+#define NREGIONS 3
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+static int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   0, 2, 2, 2,   
+0, 0, 1, 1,   0, 0, 1, 1,   2, 0, 0, 1,   0, 0, 2, 2,   
+0, 2, 2, 1,   2, 2, 1, 1,   2, 2, 1, 1,   0, 0, 1, 1,   
+2, 2, 2, 2,   2, 2, 2, 1,   2, 2, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 2, 2,   0, 0, 1, 1,   
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 2, 2,   0, 0, 1, 1,   
+1, 1, 2, 2,   0, 0, 2, 2,   1, 1, 1, 1,   2, 2, 1, 1,   
+1, 1, 2, 2,   0, 0, 2, 2,   1, 1, 1, 1,   2, 2, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 2,   
+0, 0, 0, 0,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 1, 2,   
+1, 1, 1, 1,   1, 1, 1, 1,   2, 2, 2, 2,   0, 0, 1, 2,   
+2, 2, 2, 2,   2, 2, 2, 2,   2, 2, 2, 2,   0, 0, 1, 2,   
+
+0, 1, 1, 2,   0, 1, 2, 2,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 2,   0, 1, 2, 2,   0, 1, 1, 2,   2, 0, 0, 1,   
+0, 1, 1, 2,   0, 1, 2, 2,   1, 1, 2, 2,   2, 2, 0, 0,   
+0, 1, 1, 2,   0, 1, 2, 2,   1, 2, 2, 2,   2, 2, 2, 0,   
+
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 0,   0, 0, 2, 2,   
+0, 0, 1, 1,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 2, 2,   
+0, 1, 1, 2,   2, 0, 0, 1,   1, 1, 2, 2,   0, 0, 2, 2,   
+1, 1, 2, 2,   2, 2, 0, 0,   1, 1, 2, 2,   1, 1, 1, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   1, 1, 0, 0,   
+0, 2, 2, 2,   2, 2, 2, 1,   0, 1, 2, 2,   2, 2, 1, 0,   
+0, 2, 2, 2,   2, 2, 2, 1,   0, 1, 2, 2,   2, 2, 1, 0,   
+
+0, 1, 2, 2,   0, 0, 1, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 2, 2,   0, 0, 1, 2,   1, 2, 2, 1,   0, 1, 1, 0,   
+0, 0, 1, 1,   1, 1, 2, 2,   1, 2, 2, 1,   1, 2, 2, 1,   
+0, 0, 0, 0,   2, 2, 2, 2,   0, 1, 1, 0,   1, 2, 2, 1,   
+
+0, 0, 2, 2,   0, 1, 1, 0,   0, 0, 1, 1,   0, 0, 0, 0,   
+1, 1, 0, 2,   0, 1, 1, 0,   0, 1, 2, 2,   2, 0, 0, 0,   
+1, 1, 0, 2,   2, 0, 0, 2,   0, 1, 2, 2,   2, 2, 1, 1,   
+0, 0, 2, 2,   2, 2, 2, 2,   0, 0, 1, 1,   2, 2, 2, 1,   
+
+0, 0, 0, 0,   0, 2, 2, 2,   0, 0, 1, 1,   0, 1, 2, 0,   
+0, 0, 0, 2,   0, 0, 2, 2,   0, 0, 1, 2,   0, 1, 2, 0,   
+1, 1, 2, 2,   0, 0, 1, 2,   0, 0, 2, 2,   0, 1, 2, 0,   
+1, 2, 2, 2,   0, 0, 1, 1,   0, 2, 2, 2,   0, 1, 2, 0,   
+
+0, 0, 0, 0,   0, 1, 2, 0,   0, 1, 2, 0,   0, 0, 1, 1,   
+1, 1, 1, 1,   1, 2, 0, 1,   2, 0, 1, 2,   2, 2, 0, 0,   
+2, 2, 2, 2,   2, 0, 1, 2,   1, 2, 0, 1,   1, 1, 2, 2,   
+0, 0, 0, 0,   0, 1, 2, 0,   0, 1, 2, 0,   0, 0, 1, 1,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 0, 0, 0,   0, 0, 2, 2,   
+1, 1, 2, 2,   0, 1, 0, 1,   0, 0, 0, 0,   1, 1, 2, 2,   
+2, 2, 0, 0,   2, 2, 2, 2,   2, 1, 2, 1,   0, 0, 2, 2,   
+0, 0, 1, 1,   2, 2, 2, 2,   2, 1, 2, 1,   1, 1, 2, 2,   
+
+0, 0, 2, 2,   0, 2, 2, 0,   0, 1, 0, 1,   0, 0, 0, 0,   
+0, 0, 1, 1,   1, 2, 2, 1,   2, 2, 2, 2,   2, 1, 2, 1,   
+0, 0, 2, 2,   0, 2, 2, 0,   2, 2, 2, 2,   2, 1, 2, 1,   
+0, 0, 1, 1,   1, 2, 2, 1,   0, 1, 0, 1,   2, 1, 2, 1,   
+
+0, 1, 0, 1,   0, 2, 2, 2,   0, 0, 0, 2,   0, 0, 0, 0,   
+0, 1, 0, 1,   0, 1, 1, 1,   1, 1, 1, 2,   2, 1, 1, 2,   
+0, 1, 0, 1,   0, 2, 2, 2,   0, 0, 0, 2,   2, 1, 1, 2,   
+2, 2, 2, 2,   0, 1, 1, 1,   1, 1, 1, 2,   2, 1, 1, 2,   
+
+0, 2, 2, 2,   0, 0, 0, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   1, 1, 1, 2,   0, 1, 1, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   1, 1, 1, 2,   0, 1, 1, 0,   2, 1, 1, 2,   
+0, 2, 2, 2,   0, 0, 0, 2,   2, 2, 2, 2,   2, 1, 1, 2,   
+
+0, 1, 1, 0,   0, 0, 2, 2,   0, 0, 2, 2,   0, 0, 0, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 0, 0,   
+2, 2, 2, 2,   0, 0, 1, 1,   1, 1, 2, 2,   0, 0, 0, 0,   
+2, 2, 2, 2,   0, 0, 2, 2,   0, 0, 2, 2,   2, 1, 1, 2,   
+
+0, 0, 0, 2,   0, 2, 2, 2,   0, 1, 0, 1,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 2, 2, 2,   2, 2, 2, 2,   2, 0, 1, 1,   
+0, 0, 0, 2,   0, 2, 2, 2,   2, 2, 2, 2,   2, 2, 0, 1,   
+0, 0, 0, 1,   1, 2, 2, 2,   2, 2, 2, 2,   2, 2, 2, 0,
+};
+
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
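+
+// Layout note (editorial): the table above packs four shapes per 64-int block.
+// Shape si occupies the 4-wide column (si & 3) within row block (si >> 2), and
+// REGION(x, y, si) returns which of the NREGIONS subsets pixel (x, y) belongs
+// to. For example, REGION(0, 0, 0) == 0 and REGION(3, 0, 3) == 2 in the first
+// row of shapes above.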
+
+static int shapeindex_to_compressed_indices[NSHAPES*3] = 
+{
+	0, 3,15,  0, 3, 8,  0,15, 8,  0,15, 3,
+	0, 8,15,  0, 3,15,  0,15, 3,  0,15, 8,
+	0, 8,15,  0, 8,15,  0, 6,15,  0, 6,15,
+	0, 6,15,  0, 5,15,  0, 3,15,  0, 3, 8,
+
+	0, 3,15,  0, 3, 8,  0, 8,15,  0,15, 3,
+	0, 3,15,  0, 3, 8,  0, 6,15,  0,10, 8,
+	0, 5, 3,  0, 8,15,  0, 8, 6,  0, 6,10,
+	0, 8,15,  0, 5,15,  0,15,10,  0,15, 8,
+
+	0, 8,15,  0,15, 3,  0, 3,15,  0, 5,10,
+	0, 6,10,  0,10, 8,  0, 8, 9,  0,15,10,
+	0,15, 6,  0, 3,15,  0,15, 8,  0, 5,15,
+	0,15, 3,  0,15, 6,  0,15, 6,  0,15, 8,
+
+	0, 3,15,  0,15, 3,  0, 5,15,  0, 5,15,
+	0, 5,15,  0, 8,15,  0, 5,15,  0,10,15,
+	0, 5,15,  0,10,15,  0, 8,15,  0,13,15,
+	0,15, 3,  0,12,15,  0, 3,15,  0, 3, 8
+
+};
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*3+(region)]
+
+#endif

+ 133 - 0
3rdparty/nvtt/bc7/shapes_two.h

@@ -0,0 +1,133 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_SHAPES_TWO_H
+#define _AVPCL_SHAPES_TWO_H
+
+// shapes for two regions
+
+#define NREGIONS 2
+#define NSHAPES 64
+#define SHAPEBITS 6
+
+static int shapes[NSHAPES*16] = 
+{
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 0, 1, 1,   0, 0, 0, 1,   
+0, 0, 0, 1,   0, 1, 1, 1,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 0, 0,   
+0, 0, 0, 1,   1, 1, 1, 1,   0, 1, 1, 1,   0, 0, 0, 1,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+0, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 0,   
+1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   1, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 0, 0,   0, 1, 1, 1,   
+1, 0, 0, 0,   0, 0, 0, 1,   0, 0, 0, 0,   0, 0, 1, 1,   
+1, 1, 1, 0,   0, 0, 0, 0,   1, 0, 0, 0,   0, 0, 0, 1,   
+1, 1, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 0, 0, 0,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 0, 0, 0,   0, 1, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 0, 0,   1, 0, 0, 0,   0, 0, 1, 1,   
+0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   0, 0, 0, 1,   
+
+0, 0, 1, 1,   0, 0, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 1,   1, 0, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+0, 0, 0, 0,   1, 1, 0, 0,   0, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 0, 1,   0, 0, 0, 0,   0, 1, 1, 1,   0, 0, 1, 1,   
+0, 1, 1, 1,   1, 1, 1, 1,   0, 0, 0, 1,   1, 0, 0, 1,   
+1, 1, 1, 0,   1, 1, 1, 1,   1, 0, 0, 0,   1, 0, 0, 1,   
+1, 0, 0, 0,   0, 0, 0, 0,   1, 1, 1, 0,   1, 1, 0, 0,   
+
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   0, 0, 1, 1,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   0, 0, 1, 1,   
+0, 1, 0, 1,   0, 0, 0, 0,   0, 1, 0, 1,   1, 1, 0, 0,   
+0, 1, 0, 1,   1, 1, 1, 1,   1, 0, 1, 0,   1, 1, 0, 0,   
+
+0, 0, 1, 1,   0, 1, 0, 1,   0, 1, 1, 0,   0, 1, 0, 1,   
+1, 1, 0, 0,   0, 1, 0, 1,   1, 0, 0, 1,   1, 0, 1, 0,   
+0, 0, 1, 1,   1, 0, 1, 0,   0, 1, 1, 0,   1, 0, 1, 0,   
+1, 1, 0, 0,   1, 0, 1, 0,   1, 0, 0, 1,   0, 1, 0, 1,   
+
+0, 1, 1, 1,   0, 0, 0, 1,   0, 0, 1, 1,   0, 0, 1, 1,   
+0, 0, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   1, 0, 1, 1,   
+1, 1, 0, 0,   1, 1, 0, 0,   0, 1, 0, 0,   1, 1, 0, 1,   
+1, 1, 1, 0,   1, 0, 0, 0,   1, 1, 0, 0,   1, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 0, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   0, 1, 1, 0,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   0, 1, 1, 0,   
+0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   0, 0, 0, 0,   
+
+0, 1, 0, 0,   0, 0, 1, 0,   0, 0, 0, 0,   0, 0, 0, 0,   
+1, 1, 1, 0,   0, 1, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 1, 0, 0,   0, 0, 1, 0,   0, 1, 1, 1,   1, 1, 1, 0,   
+0, 0, 0, 0,   0, 0, 0, 0,   0, 0, 1, 0,   0, 1, 0, 0,   
+
+0, 1, 1, 0,   0, 0, 1, 1,   0, 1, 1, 0,   0, 0, 1, 1,   
+1, 1, 0, 0,   0, 1, 1, 0,   0, 0, 1, 1,   1, 0, 0, 1,   
+1, 0, 0, 1,   1, 1, 0, 0,   1, 0, 0, 1,   1, 1, 0, 0,   
+0, 0, 1, 1,   1, 0, 0, 1,   1, 1, 0, 0,   0, 1, 1, 0,   
+
+0, 1, 1, 0,   0, 1, 1, 0,   0, 1, 1, 1,   0, 0, 0, 1,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 1, 1, 0,   1, 0, 0, 0,   
+1, 1, 0, 0,   0, 0, 1, 1,   1, 0, 0, 0,   1, 1, 1, 0,   
+1, 0, 0, 1,   1, 0, 0, 1,   0, 0, 0, 1,   0, 1, 1, 1,   
+
+0, 0, 0, 0,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+1, 1, 1, 1,   0, 0, 1, 1,   0, 0, 1, 0,   0, 1, 0, 0,   
+0, 0, 1, 1,   1, 1, 1, 1,   1, 1, 1, 0,   0, 1, 1, 1,   
+0, 0, 1, 1,   0, 0, 0, 0,   1, 1, 1, 0,   0, 1, 1, 1,   
+
+};
+
+#define	REGION(x,y,si)	shapes[((si)&3)*4+((si)>>2)*64+(x)+(y)*16]
+
+static int shapeindex_to_compressed_indices[NSHAPES*2] = 
+{
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0,15,  0,15,  0,15,
+
+	0,15,  0, 2,  0, 8,  0, 2,
+	0, 2,  0, 8,  0, 8,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 8,  0, 8,  0, 2,  0, 2,
+
+	0,15,  0,15,  0, 6,  0, 8,
+	0, 2,  0, 8,  0,15,  0,15,
+	0, 2,  0, 8,  0, 2,  0, 2,
+	0, 2,  0,15,  0,15,  0, 6,
+
+	0, 6,  0, 2,  0, 6,  0, 8,
+	0,15,  0,15,  0, 2,  0, 2,
+	0,15,  0,15,  0,15,  0,15,
+	0,15,  0, 2,  0, 2,  0,15
+
+};
+#define SHAPEINDEX_TO_COMPRESSED_INDICES(si,region)  shapeindex_to_compressed_indices[(si)*2+(region)]
+
+#endif

+ 41 - 0
3rdparty/nvtt/bc7/tile.h

@@ -0,0 +1,41 @@
+/*
+Copyright 2007 nVidia, Inc.
+Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
+
+You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 
+
+Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, 
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+
+See the License for the specific language governing permissions and limitations under the License.
+*/
+
+#ifndef _AVPCL_TILE_H
+#define _AVPCL_TILE_H
+
+#include "nvmath/Vector.h"
+#include <math.h>
+#include "avpcl_utils.h"
+
+namespace AVPCL {
+
+// extract a tile of pixels from an array
+
+class Tile
+{
+public:
+	static const int TILE_H = 4;
+	static const int TILE_W = 4;
+	static const int TILE_TOTAL = TILE_H * TILE_W;
+	nv::Vector4 data[TILE_H][TILE_W];
+    float importance_map[TILE_H][TILE_W];
+	int	size_x, size_y;			// actual size of tile
+
+	Tile() {};
+	~Tile(){};
+	Tile(int xs, int ys) {size_x = xs; size_y = ys;}
+};
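+
+// Usage sketch (illustrative only, not part of the original library): fill a
+// full 4x4 tile before handing it to a compressor. Assumes the four-component
+// Vector4 constructor from nvmath; r, g, b, a are hypothetical inputs.
+//
+//   Tile t(4, 4);
+//   for (int y = 0; y < t.size_y; ++y)
+//       for (int x = 0; x < t.size_x; ++x) {
+//           t.data[y][x] = nv::Vector4(r, g, b, a);
+//           t.importance_map[y][x] = 1.0f;
+//       }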
+
+}
+
+#endif

+ 437 - 0
3rdparty/nvtt/nvcore/Array.inl

@@ -0,0 +1,437 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_ARRAY_INL
+#define NV_CORE_ARRAY_INL
+
+#include "array.h"
+
+#include "stream.h"
+#include "utils.h" // swap
+
+#include <string.h>	// memmove
+#include <new> // for placement new
+
+
+
+namespace nv 
+{
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::append()
+    {
+        uint old_size = m_size;
+        uint new_size = m_size + 1;
+
+        setArraySize(new_size);
+
+        construct_range(m_buffer, new_size, old_size);
+
+        return m_buffer[old_size]; // Return reference to last element.
+    }
+
+    // Push an element at the end of the vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::push_back( const T & val )
+    {
+#if 1
+        nvDebugCheck(&val < m_buffer || &val >= m_buffer+m_size);
+
+        uint old_size = m_size;
+        uint new_size = m_size + 1;
+
+        setArraySize(new_size);
+
+        construct_range(m_buffer, new_size, old_size, val);
+#else
+        uint new_size = m_size + 1;
+
+        if (new_size > m_capacity)
+        {
+            // @@ Is there any way to avoid this copy?
+            // @@ Can we create a copy without side effects? Ie. without calls to constructor/destructor. Use alloca + memcpy?
+            // @@ Assert instead of copy?
+            const T copy(val);	// create a copy in case value is inside of this array.
+
+            setArraySize(new_size);
+
+            new (m_buffer+new_size-1) T(copy);
+        }
+        else
+        {
+            m_size = new_size;
+            new(m_buffer+new_size-1) T(val);
+        }
+#endif // 0/1
+    }
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::pushBack( const T & val )
+    {
+        push_back(val);
+    }
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::append( const T & val )
+    {
+        push_back(val);
+        return *this;
+    }
+
+    // Qt-like push operator.
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::operator<< ( T & t )
+    {
+        push_back(t);
+        return *this;
+    }
+
+    // Pop the element at the end of the vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::pop_back()
+    {
+        nvDebugCheck( m_size > 0 );
+        resize( m_size - 1 );
+    }
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::popBack(uint count)
+    {
+        nvDebugCheck(m_size >= count);
+        resize(m_size - count);
+    }
+
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::popFront(uint count)
+    {
+        nvDebugCheck(m_size >= count);
+        //resize(m_size - count);
+
+        if (m_size == count) {
+            clear();
+        }
+        else {
+            destroy_range(m_buffer, 0, count);
+
+            memmove(m_buffer, m_buffer + count, sizeof(T) * (m_size - count));
+
+            m_size -= count;
+        }
+
+    }
+
+
+    // Get back element.
+    template <typename T>
+    NV_FORCEINLINE const T & Array<T>::back() const
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[m_size-1];
+    }
+
+    // Get back element.
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::back()
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[m_size-1];
+    }
+
+    // Get front element.
+    template <typename T>
+    NV_FORCEINLINE const T & Array<T>::front() const
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[0];
+    }
+
+    // Get front element.
+    template <typename T>
+    NV_FORCEINLINE T & Array<T>::front()
+    {
+        nvDebugCheck( m_size > 0 );
+        return m_buffer[0];
+    }
+
+    // Check if the given element is contained in the array.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::contains(const T & e) const
+    {
+        return find(e, NULL);
+    }
+
+    // Return true if element found.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::find(const T & element, uint * indexPtr) const
+    {
+        return find(element, 0, m_size, indexPtr);
+    }
+
+    // Return true if element found within the given range.
+    template <typename T>
+    NV_FORCEINLINE bool Array<T>::find(const T & element, uint begin, uint end, uint * indexPtr) const
+    {
+        return ::nv::find(element, m_buffer, begin, end, indexPtr);
+    }
+
+
+    // Remove the element at the given index. This is an expensive operation!
+    template <typename T>
+    void Array<T>::removeAt(uint index)
+    {
+        nvDebugCheck(index >= 0 && index < m_size);
+
+        if (m_size == 1) {
+            clear();
+        }
+        else {
+            m_buffer[index].~T();
+
+            memmove(m_buffer+index, m_buffer+index+1, sizeof(T) * (m_size - 1 - index));
+            m_size--;
+        }
+    }
+
+    // Remove the first instance of the given element.
+    template <typename T>
+    bool Array<T>::remove(const T & element)
+    {
+        uint index;
+        if (find(element, &index)) {
+            removeAt(index);
+            return true;
+        }
+        return false;
+    }
+
+    // Insert the given element at the given index shifting all the elements up.
+    template <typename T>
+    void Array<T>::insertAt(uint index, const T & val/*=T()*/)
+    {
+        nvDebugCheck( index >= 0 && index <= m_size );
+
+        setArraySize(m_size + 1);
+
+        if (index < m_size - 1) {
+            memmove(m_buffer+index+1, m_buffer+index, sizeof(T) * (m_size - 1 - index));
+        }
+
+        // Copy-construct into the newly opened slot.
+        new(m_buffer+index) T(val);
+    }
+
+    // Append the given data to our vector.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::append(const Array<T> & other)
+    {
+        append(other.m_buffer, other.m_size);
+    }
+
+    // Append the given data to our vector.
+    template <typename T>
+    void Array<T>::append(const T other[], uint count)
+    {
+        if (count > 0) {
+            const uint old_size = m_size;
+
+            setArraySize(m_size + count);
+
+            for (uint i = 0; i < count; i++ ) {
+                new(m_buffer + old_size + i) T(other[i]);
+            }
+        }
+    }
+
+
+    // Remove the given element by replacing it with the last one.
+    template <typename T> 
+    void Array<T>::replaceWithLast(uint index)
+    {
+        nvDebugCheck( index < m_size );
+        nv::swap(m_buffer[index], back());      // @@ Is this OK when index == size-1?
+        (m_buffer+m_size-1)->~T();
+        m_size--;
+    }
+
+    // Resize the vector preserving existing elements.
+    template <typename T> 
+    void Array<T>::resize(uint new_size)
+    {
+        uint old_size = m_size;
+
+        // Destruct old elements (if we're shrinking).
+        destroy_range(m_buffer, new_size, old_size);
+
+        setArraySize(new_size);
+
+        // Call default constructors
+        construct_range(m_buffer, new_size, old_size);
+    }
+
+
+    // Resize the vector preserving existing elements and initializing the
+    // new ones with the given value.
+    template <typename T> 
+    void Array<T>::resize(uint new_size, const T & elem)
+    {
+        nvDebugCheck(&elem < m_buffer || &elem > m_buffer+m_size);
+
+        uint old_size = m_size;
+
+        // Destruct old elements (if we're shrinking).
+        destroy_range(m_buffer, new_size, old_size);
+
+        setArraySize(new_size);
+
+        // Call copy constructors
+        construct_range(m_buffer, new_size, old_size, elem);
+    }
+
+    // Fill array with the given value.
+    template <typename T>
+    void Array<T>::fill(const T & elem)
+    {
+        fill(m_buffer, m_size, elem);
+    }
+
+    // Clear the buffer.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::clear()
+    {
+        nvDebugCheck(isValidPtr(m_buffer));
+
+        // Destruct old elements
+        destroy_range(m_buffer, 0, m_size);
+
+        m_size = 0;
+    }
+
+    // Shrink the allocated vector.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::shrink()
+    {
+        if (m_size < m_capacity) {
+            setArrayCapacity(m_size);
+        }
+    }
+
+    // Preallocate space.
+    template <typename T> 
+    NV_FORCEINLINE void Array<T>::reserve(uint desired_size)
+    {
+        if (desired_size > m_capacity) {
+            setArrayCapacity(desired_size);
+        }
+    }
+
+    // Copy elements to this array. Resizes it if needed.
+    template <typename T>
+    NV_FORCEINLINE void Array<T>::copy(const T * data, uint count)
+    {
+#if 1   // Simpler, but maybe not as efficient.
+        destroy_range(m_buffer, 0, m_size);
+
+        setArraySize(count);
+
+        construct_range(m_buffer, count, 0, data);
+#else
+        const uint old_size = m_size;
+
+        destroy_range(m_buffer, count, old_size);
+
+        setArraySize(count);
+
+        copy_range(m_buffer, data, old_size);
+
+        construct_range(m_buffer, count, old_size, data);
+#endif
+    }
+
+    // Assignment operator.
+    template <typename T>
+    NV_FORCEINLINE Array<T> & Array<T>::operator=( const Array<T> & a )
+    {
+        copy(a.m_buffer, a.m_size);
+        return *this;
+    }
+
+    // Release ownership of allocated memory and returns pointer to it.
+    template <typename T>
+    T * Array<T>::release() {
+        T * tmp = m_buffer;
+        m_buffer = NULL;
+        m_capacity = 0;
+        m_size = 0;
+        return tmp;
+    }
+
+
+
+    // Change array size.
+    template <typename T> 
+    inline void Array<T>::setArraySize(uint new_size) {
+        m_size = new_size;
+
+        if (new_size > m_capacity) {
+            uint new_buffer_size;
+            if (m_capacity == 0) {
+                // first allocation is exact
+                new_buffer_size = new_size;
+            }
+            else {
+                // following allocations grow array by 25%
+                new_buffer_size = new_size + (new_size >> 2);
+            }
+
+            setArrayCapacity( new_buffer_size );
+        }
+    }
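+
+    // Worked example (editorial): with the 25% growth rule above, the first
+    // append into an empty array allocates exactly 1 element; growing a
+    // 16-element array to 17 reallocates to 17 + (17 >> 2) = 21 elements.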
+
+    // Change array capacity.
+    template <typename T> 
+    inline void Array<T>::setArrayCapacity(uint new_capacity) {
+        nvDebugCheck(new_capacity >= m_size);
+
+        if (new_capacity == 0) {
+            // free the buffer.
+            if (m_buffer != NULL) {
+                free<T>(m_buffer);
+                m_buffer = NULL;
+            }
+        }
+        else {
+            // realloc the buffer
+            m_buffer = realloc<T>(m_buffer, new_capacity);
+        }
+
+        m_capacity = new_capacity;
+    }
+
+    // Array serialization.
+    template <typename Typ> 
+    inline Stream & operator<< ( Stream & s, Array<Typ> & p )
+    {
+        if (s.isLoading()) {
+            uint size;
+            s << size;
+            p.resize( size );
+        }
+        else {
+            s << p.m_size;
+        }
+
+        for (uint i = 0; i < p.m_size; i++) {
+            s << p.m_buffer[i];
+        }
+
+        return s;
+    }
+
+    // Swap the members of the two given vectors.
+    template <typename Typ>
+    inline void swap(Array<Typ> & a, Array<Typ> & b)
+    {
+        nv::swap(a.m_buffer, b.m_buffer);
+        nv::swap(a.m_capacity, b.m_capacity);
+        nv::swap(a.m_size, b.m_size);
+    }
+
+
+} // nv namespace
+
+#endif // NV_CORE_ARRAY_INL

+ 216 - 0
3rdparty/nvtt/nvcore/Debug.h

@@ -0,0 +1,216 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_DEBUG_H
+#define NV_CORE_DEBUG_H
+
+#include "nvcore.h"
+
+#include <stdarg.h> // va_list
+
+
+// Make sure we are using our assert.
+#undef assert
+
+#define NV_ABORT_DEBUG      1
+#define NV_ABORT_IGNORE     2
+#define NV_ABORT_EXIT       3
+
+#define nvNoAssert(exp) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    (void)sizeof(exp); \
+    NV_MULTI_LINE_MACRO_END
+
+#if NV_NO_ASSERT
+
+#   define nvAssert(exp) nvNoAssert(exp)
+#   define nvCheck(exp) nvNoAssert(exp)
+#   define nvDebugAssert(exp) nvNoAssert(exp)
+#   define nvDebugCheck(exp) nvNoAssert(exp)
+#   define nvDebugBreak() nvNoAssert(0)
+
+#else // NV_NO_ASSERT
+
+#   if NV_CC_MSVC
+        // @@ Does this work in msvc-6 and earlier?
+#       define nvDebugBreak()       __debugbreak()
+//#       define nvDebugBreak()        __asm { int 3 }
+#   elif NV_OS_ORBIS
+#       define nvDebugBreak()       __debugbreak()
+#   elif NV_CC_GNUC
+#       define nvDebugBreak()       __builtin_trap()
+#   else
+#       error "No nvDebugBreak()!"
+#   endif
+
+/*
+#   elif NV_CC_GNUC || NV_CPU_PPC && NV_OS_DARWIN
+        // @@ Use __builtin_trap() on GCC
+#       define nvDebugBreak()       __asm__ volatile ("trap")
+#   elif (NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64) && NV_OS_DARWIN
+#       define nvDebugBreak()       __asm__ volatile ("int3")
+#   elif NV_CC_GNUC || NV_CPU_X86 || NV_CPU_X86_64
+#       define nvDebugBreak()       __asm__ ( "int %0" : :"I"(3) )
+#   else
+#       include <signal.h>
+#       define nvDebugBreak()       raise(SIGTRAP)
+#   endif
+*/
+
+#define nvDebugBreakOnce() \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    static bool firstTime = true; \
+    if (firstTime) { firstTime = false; nvDebugBreak(); } \
+    NV_MULTI_LINE_MACRO_END
+
+#define nvAssertMacro(exp) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+    if (!(exp)) { \
+        if (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) { \
+            nvDebugBreak(); \
+        } \
+    } \
+    NV_MULTI_LINE_MACRO_END
+
+// GCC, LLVM need "##" before the __VA_ARGS__, MSVC doesn't care
+#define nvAssertMacroWithIgnoreAll(exp,...) \
+    NV_MULTI_LINE_MACRO_BEGIN \
+        static bool ignoreAll = false; \
+        if (!ignoreAll && !(exp)) { \
+            int result = nvAbort(#exp, __FILE__, __LINE__, __FUNC__, ##__VA_ARGS__); \
+            if (result == NV_ABORT_DEBUG) { \
+                nvDebugBreak(); \
+            } else if (result == NV_ABORT_IGNORE) { \
+                ignoreAll = true; \
+            } \
+        } \
+    NV_MULTI_LINE_MACRO_END
+
+// Interesting assert macro from Insomniac:
+// http://www.gdcvault.com/play/1015319/Developing-Imperfect-Software-How-to
+// Used as follows:
+// if (nvCheck(i < count)) {
+//     normal path
+// } else {
+//     fixup code.
+// }
+// This style of macro could be combined with __builtin_expect to let the compiler know failure is unlikely.
+#define nvCheckMacro(exp) \
+    (\
+        (exp) ? true : ( \
+            (nvAbort(#exp, __FILE__, __LINE__, __FUNC__) == NV_ABORT_DEBUG) ? (nvDebugBreak(), true) : ( false ) \
+        ) \
+    )
+
+
+#define nvAssert(exp)    nvAssertMacro(exp)
+#define nvCheck(exp)     nvAssertMacro(exp)
+
+#if defined(_DEBUG)
+#   define nvDebugAssert(exp)   nvAssertMacro(exp)
+#   define nvDebugCheck(exp)    nvAssertMacro(exp)
+#else // _DEBUG
+#   define nvDebugAssert(exp)   nvNoAssert(exp)
+#   define nvDebugCheck(exp)    nvNoAssert(exp)
+#endif // _DEBUG
+
+#endif // NV_NO_ASSERT
+
+// Use nvAssume for very simple expresions only: nvAssume(0), nvAssume(value == true), etc.
+/*#if !defined(_DEBUG)
+#   if NV_CC_MSVC
+#       define nvAssume(exp)    __assume(exp)
+#   else
+#       define nvAssume(exp)    nvCheck(exp)
+#   endif
+#else
+#   define nvAssume(exp)    nvCheck(exp)
+#endif*/
+
+#if defined(_DEBUG)
+#  if NV_CC_MSVC
+#   define nvUnreachable() nvAssert(0 && "unreachable"); __assume(0)
+#  else
+#   define nvUnreachable() nvAssert(0 && "unreachable"); __builtin_unreachable()
+#  endif
+#else
+#  if NV_CC_MSVC
+#   define nvUnreachable() __assume(0)
+#  else
+#   define nvUnreachable() __builtin_unreachable()
+#  endif
+#endif
+
+
+#define nvError(x)      nvAbort(x, __FILE__, __LINE__, __FUNC__)
+#define nvWarning(x)    nvDebugPrint("*** Warning %s/%d: %s\n", __FILE__, __LINE__, (x))
+
+#ifndef NV_DEBUG_PRINT
+#define NV_DEBUG_PRINT 1 //defined(_DEBUG)
+#endif
+
+#if NV_DEBUG_PRINT
+#define nvDebug(...)    nvDebugPrint(__VA_ARGS__)
+#else
+#if NV_CC_MSVC
+#define nvDebug(...)    __noop(__VA_ARGS__)
+#else
+#define nvDebug(...)    ((void)0) // Non-msvc platforms do not evaluate arguments?
+#endif
+#endif
+
+
+NVCORE_API int nvAbort(const char *exp, const char *file, int line, const char * func = NULL, const char * msg = NULL, ...) __attribute__((format (printf, 5, 6)));
+NVCORE_API void NV_CDECL nvDebugPrint( const char *msg, ... ) __attribute__((format (printf, 1, 2)));
+
+namespace nv
+{
+    inline bool isValidPtr(const void * ptr) {
+    #if NV_CPU_X86_64
+        if (ptr == NULL) return true;
+        if (reinterpret_cast<uint64>(ptr) < 0x10000ULL) return false;
+        if (reinterpret_cast<uint64>(ptr) >= 0x000007FFFFFEFFFFULL) return false;
+    #else
+	    if (reinterpret_cast<uint32>(ptr) == 0xcccccccc) return false;
+	    if (reinterpret_cast<uint32>(ptr) == 0xcdcdcdcd) return false;
+	    if (reinterpret_cast<uint32>(ptr) == 0xdddddddd) return false;
+	    if (reinterpret_cast<uint32>(ptr) == 0xffffffff) return false;
+    #endif
+        return true;
+    }
+
+    // Message handler interface.
+    struct MessageHandler {
+        virtual void log(const char * str, va_list arg) = 0;
+        virtual ~MessageHandler() {}
+    };
+
+    // Assert handler interface.
+    struct AssertHandler {
+        virtual int assertion(const char *exp, const char *file, int line, const char *func, const char *msg, va_list arg) = 0;
+        virtual ~AssertHandler() {}
+    };
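+
+    // Usage sketch (illustrative, not part of the original library): route
+    // nvDebug output through a custom handler. Assumes <stdio.h> for vprintf.
+    //
+    //   struct StdoutHandler : public nv::MessageHandler {
+    //       virtual void log(const char * str, va_list arg) { vprintf(str, arg); }
+    //   };
+    //   static StdoutHandler s_handler;
+    //   nv::debug::setMessageHandler(&s_handler);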
+
+
+    namespace debug
+    {
+        NVCORE_API void dumpInfo();
+        NVCORE_API void dumpCallstack( MessageHandler *messageHandler, int callstackLevelsToSkip = 0 );
+
+        NVCORE_API void setMessageHandler( MessageHandler * messageHandler );
+        NVCORE_API void resetMessageHandler();
+
+        NVCORE_API void setAssertHandler( AssertHandler * assertHanlder );
+        NVCORE_API void resetAssertHandler();
+
+        NVCORE_API void enableSigHandler(bool interactive);
+        NVCORE_API void disableSigHandler();
+
+        NVCORE_API bool isDebuggerPresent();
+        NVCORE_API bool attachToDebugger();
+
+        NVCORE_API void terminate(int code);
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_DEBUG_H

+ 181 - 0
3rdparty/nvtt/nvcore/array.h

@@ -0,0 +1,181 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_ARRAY_H
+#define NV_CORE_ARRAY_H
+
+/*
+This array class requires the elements to be relocatable; it uses memmove and realloc. Ideally I should be
+using swap, but I honestly don't care. The only thing that you should be aware of is that internal pointers
+are not supported.
+
+Note also that push_back and resize do not support inserting elements that are already in the same
+container. This is forbidden to prevent an extra copy.
+*/
+
+
+#include "memory.h"
+#include "debug.h"
+#include "foreach.h" // pseudoindex
+
+
+namespace nv 
+{
+    class Stream;
+
+    /**
+    * Replacement for std::vector that is easier to debug and provides
+    * some nice foreach enumerators. 
+    */
+    template<typename T>
+    class NVCORE_CLASS Array {
+    public:
+        typedef uint size_type;
+
+        // Default constructor.
+        NV_FORCEINLINE Array() : m_buffer(NULL), m_capacity(0), m_size(0) {}
+
+        // Copy constructor.
+        NV_FORCEINLINE Array(const Array & a) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            copy(a.m_buffer, a.m_size);
+        }
+
+        // Constructor that initializes the vector with the given elements.
+        NV_FORCEINLINE Array(const T * ptr, uint num) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            copy(ptr, num);
+        }
+
+        // Allocate array.
+        NV_FORCEINLINE explicit Array(uint capacity) : m_buffer(NULL), m_capacity(0), m_size(0) {
+            setArrayCapacity(capacity);
+        }
+
+        // Destructor.
+        NV_FORCEINLINE ~Array() {
+            clear();
+            free<T>(m_buffer);
+        }
+
+
+        /// Const element access.
+        NV_FORCEINLINE const T & operator[]( uint index ) const
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+        NV_FORCEINLINE const T & at( uint index ) const
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+
+        /// Element access.
+        NV_FORCEINLINE T & operator[] ( uint index )
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+        NV_FORCEINLINE T & at( uint index )
+        {
+            nvDebugCheck(index < m_size);
+            return m_buffer[index];
+        }
+
+        /// Get vector size.
+        NV_FORCEINLINE uint size() const { return m_size; }
+
+        /// Get vector size.
+        NV_FORCEINLINE uint count() const { return m_size; }
+
+        /// Get vector capacity.
+        NV_FORCEINLINE uint capacity() const { return m_capacity; }
+
+        /// Get const vector pointer.
+        NV_FORCEINLINE const T * buffer() const { return m_buffer; }
+
+        /// Get vector pointer.
+        NV_FORCEINLINE T * buffer() { return m_buffer; }
+
+        /// Provide begin/end pointers for C++11 range-based for loops.
+        NV_FORCEINLINE T * begin() { return m_buffer; }
+        NV_FORCEINLINE T * end() { return m_buffer + m_size; }
+        NV_FORCEINLINE const T * begin() const { return m_buffer; }
+        NV_FORCEINLINE const T * end() const { return m_buffer + m_size; }
+
+        /// Is vector empty.
+        NV_FORCEINLINE bool isEmpty() const { return m_size == 0; }
+
+        /// Is a null vector.
+        NV_FORCEINLINE bool isNull() const { return m_buffer == NULL; }
+
+
+        T & append();
+        void push_back( const T & val );
+        void pushBack( const T & val );
+        Array<T> & append( const T & val );
+        Array<T> & operator<< ( T & t );
+        void pop_back();
+        void popBack(uint count = 1);
+        void popFront(uint count = 1);
+        const T & back() const;
+        T & back();
+        const T & front() const;
+        T & front();
+        bool contains(const T & e) const;
+        bool find(const T & element, uint * indexPtr) const;
+        bool find(const T & element, uint begin, uint end, uint * indexPtr) const;
+        void removeAt(uint index);
+        bool remove(const T & element);
+        void insertAt(uint index, const T & val = T());
+        void append(const Array<T> & other);
+        void append(const T other[], uint count);
+        void replaceWithLast(uint index);
+        void resize(uint new_size);
+        void resize(uint new_size, const T & elem);
+        void fill(const T & elem);
+        void clear();
+        void shrink();
+        void reserve(uint desired_size);
+        void copy(const T * data, uint count);
+        Array<T> & operator=( const Array<T> & a );
+        T * release();
+
+
+        // Array enumerator.
+        typedef uint PseudoIndex;
+
+        NV_FORCEINLINE PseudoIndex start() const { return 0; }
+        NV_FORCEINLINE bool isDone(const PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); return i == this->m_size; }
+        NV_FORCEINLINE void advance(PseudoIndex & i) const { nvDebugCheck(i <= this->m_size); i++; }
+
+#if NV_CC_MSVC
+        NV_FORCEINLINE T & operator[]( const PseudoIndexWrapper & i ) {
+            return m_buffer[i(this)];
+        }
+        NV_FORCEINLINE const T & operator[]( const PseudoIndexWrapper & i ) const {
+            return m_buffer[i(this)];
+        }
+#endif
+
+        // Friends.
+        template <typename Typ> 
+        friend Stream & operator<< ( Stream & s, Array<Typ> & p );
+
+        template <typename Typ>
+        friend void swap(Array<Typ> & a, Array<Typ> & b);
+
+
+    protected:
+
+        void setArraySize(uint new_size);
+        void setArrayCapacity(uint new_capacity);
+
+        T * m_buffer;
+        uint m_capacity;
+        uint m_size;
+
+    };
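+
+    // Usage sketch (illustrative, not part of the original library):
+    //
+    //   nv::Array<int> values;
+    //   values.reserve(16);
+    //   values.pushBack(7);
+    //   values.append(42);
+    //   foreach (i, values) {          // PseudoIndex-based enumeration
+    //       nvDebug("%d\n", values[i]);
+    //   }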
+
+
+} // nv namespace
+
+#endif // NV_CORE_ARRAY_H

+ 53 - 0
3rdparty/nvtt/nvcore/defsgnucdarwin.h

@@ -0,0 +1,53 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
+#include <stddef.h> // operator new, size_t, NULL
+
+// Function linkage
+#define DLL_IMPORT
+#if __GNUC__ >= 4
+#	define DLL_EXPORT __attribute__((visibility("default")))
+#	define DLL_EXPORT_CLASS DLL_EXPORT
+#else
+#	define DLL_EXPORT
+#	define DLL_EXPORT_CLASS
+#endif
+
+// Function calling modes
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall))
+#define NV_FORCEINLINE	__attribute__((always_inline)) inline
+#define NV_DEPRECATED   __attribute__((deprecated))
+#define NV_THREAD_LOCAL //ACS: there's no "__thread" or equivalent on iOS/OSX
+
+#if __GNUC__ > 2
+#define NV_PURE     __attribute__((pure))
+#define NV_CONST    __attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly.
+#if __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict    __restrict__

+ 59 - 0
3rdparty/nvtt/nvcore/defsgnuclinux.h

@@ -0,0 +1,59 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+#include <stdint.h> // uint8_t, int8_t, ... uintptr_t
+#include <stddef.h> // operator new, size_t, NULL
+
+// Function linkage
+#define DLL_IMPORT
+#if __GNUC__ >= 4
+#   define DLL_EXPORT   __attribute__((visibility("default")))
+#   define DLL_EXPORT_CLASS DLL_EXPORT
+#else
+#   define DLL_EXPORT
+#   define DLL_EXPORT_CLASS
+#endif
+
+// Function calling modes
+#if NV_CPU_X86
+#   define NV_CDECL     __attribute__((cdecl))
+#   define NV_STDCALL   __attribute__((stdcall))
+#else
+#   define NV_CDECL 
+#   define NV_STDCALL
+#endif
+
+#define NV_FASTCALL     __attribute__((fastcall))
+//#if __GNUC__ > 3
+// It seems that GCC does not assume always_inline implies inline. I think this depends on the GCC version :(
+#define NV_FORCEINLINE  inline __attribute__((always_inline))
+//#else
+// Some compilers complain that inline and always_inline are redundant.
+//#define NV_FORCEINLINE  __attribute__((always_inline))
+//#endif
+#define NV_DEPRECATED   __attribute__((deprecated))
+#define NV_THREAD_LOCAL __thread 
+
+#if __GNUC__ > 2
+#define NV_PURE     __attribute__((pure))
+#define NV_CONST    __attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly.
+#if __STDC_VERSION__ < 199901L
+#   if __GNUC__ >= 2
+#       define __FUNC__ __PRETTY_FUNCTION__ // __FUNCTION__
+#   else
+#       define __FUNC__ "<unknown>"
+#   endif
+#else
+#   define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict    __restrict__

+ 65 - 0
3rdparty/nvtt/nvcore/defsgnucwin32.h

@@ -0,0 +1,65 @@
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+//#include <cstddef> // size_t, NULL
+
+// Function linkage
+#define DLL_IMPORT	__declspec(dllimport)
+#define DLL_EXPORT	__declspec(dllexport)
+#define DLL_EXPORT_CLASS DLL_EXPORT
+
+// Function calling modes
+#if NV_CPU_X86
+#	define NV_CDECL 	__attribute__((cdecl))
+#	define NV_STDCALL	__attribute__((stdcall))
+#else
+#	define NV_CDECL 
+#	define NV_STDCALL
+#endif
+
+#define NV_FASTCALL		__attribute__((fastcall))
+#define NV_FORCEINLINE	__attribute__((always_inline))
+#define NV_DEPRECATED   __attribute__((deprecated))
+
+#if __GNUC__ > 2
+#define NV_PURE		__attribute__((pure))
+#define NV_CONST	__attribute__((const))
+#else
+#define NV_PURE
+#define NV_CONST
+#endif
+
+#define NV_NOINLINE __attribute__((noinline))
+
+// Define __FUNC__ properly.
+#if __STDC_VERSION__ < 199901L
+#	if __GNUC__ >= 2
+#		define __FUNC__ __PRETTY_FUNCTION__	// __FUNCTION__
+#	else
+#		define __FUNC__ "<unknown>"
+#	endif
+#else
+#	define __FUNC__ __PRETTY_FUNCTION__
+#endif
+
+#define restrict	__restrict__
+
+/*
+// Type definitions
+typedef unsigned char		uint8;
+typedef signed char			int8;
+
+typedef unsigned short		uint16;
+typedef signed short		int16;
+
+typedef unsigned int		uint32;
+typedef signed int			int32;
+
+typedef unsigned long long	uint64;
+typedef signed long long	int64;
+
+// Aliases
+typedef uint32				uint;
+*/
+

+ 94 - 0
3rdparty/nvtt/nvcore/defsvcwin32.h

@@ -0,0 +1,94 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_H
+#error "Do not include this file directly."
+#endif
+
+// Function linkage
+#define DLL_IMPORT __declspec(dllimport)
+#define DLL_EXPORT __declspec(dllexport)
+#define DLL_EXPORT_CLASS DLL_EXPORT
+
+// Function calling modes
+#define NV_CDECL        __cdecl
+#define NV_STDCALL      __stdcall
+#define NV_FASTCALL     __fastcall
+#define NV_DEPRECATED
+
+#define NV_PURE
+#define NV_CONST
+
+// Set standard function names.
+#if _MSC_VER < 1900
+#   define snprintf _snprintf
+#endif
+#if _MSC_VER < 1500
+#   define vsnprintf _vsnprintf
+#endif
+#if _MSC_VER < 1700
+#   define strtoll _strtoi64
+#   define strtoull _strtoui64
+#endif
+#define chdir _chdir
+#define getcwd _getcwd 
+
+#if _MSC_VER < 1800 // va_copy was added in VS 2013 (_MSC_VER 1800).
+#define va_copy(a, b) (a) = (b)
+#endif
+
+#if !defined restrict
+#define restrict
+#endif
+
+// Ignore gcc attributes.
+#define __attribute__(X)
+
+#if !defined __FUNC__
+#define __FUNC__ __FUNCTION__ 
+#endif
+
+#define NV_NOINLINE __declspec(noinline)
+#define NV_FORCEINLINE __forceinline
+
+#define NV_THREAD_LOCAL __declspec(thread)
+
+/*
+// Type definitions
+typedef unsigned char       uint8;
+typedef signed char         int8;
+
+typedef unsigned short      uint16;
+typedef signed short        int16;
+
+typedef unsigned int        uint32;
+typedef signed int          int32;
+
+typedef unsigned __int64    uint64;
+typedef signed __int64      int64;
+
+// Aliases
+typedef uint32              uint;
+*/
+
+// Unwanted VC++ warnings to disable.
+/*
+#pragma warning(disable : 4244)     // conversion to float, possible loss of data
+#pragma warning(disable : 4245)     // conversion from 'enum ' to 'unsigned long', signed/unsigned mismatch
+#pragma warning(disable : 4100)     // unreferenced formal parameter
+#pragma warning(disable : 4514)     // unreferenced inline function has been removed
+#pragma warning(disable : 4710)     // inline function not expanded
+#pragma warning(disable : 4127)     // Conditional expression is constant
+#pragma warning(disable : 4305)     // truncation from 'const double' to 'float'
+#pragma warning(disable : 4505)     // unreferenced local function has been removed
+
+#pragma warning(disable : 4702)     // unreachable code in inline expanded function
+#pragma warning(disable : 4711)     // function selected for automatic inlining
+#pragma warning(disable : 4725)     // Pentium fdiv bug
+
+#pragma warning(disable : 4786)     // Identifier was truncated and cannot be debugged.
+
+#pragma warning(disable : 4675)     // resolved overload was found by argument-dependent lookup
+*/
+
+#pragma warning(1 : 4705)     // Report unused local variables.
+#pragma warning(1 : 4555)     // Expression has no effect.

+ 68 - 0
3rdparty/nvtt/nvcore/foreach.h

@@ -0,0 +1,68 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#pragma once
+#ifndef NV_CORE_FOREACH_H
+#define NV_CORE_FOREACH_H
+
+/*
+These foreach macros are very non-standard and somewhat confusing, but I like them.
+*/
+
+#include "nvcore.h"
+
+#if NV_CC_GNUC // If typeof or decltype is available:
+#if !NV_CC_CPP11
+#   define NV_DECLTYPE typeof // Use the non-standard typeof extension, which behaves like C++11 decltype.
+#else
+#   define NV_DECLTYPE decltype
+#endif
+
+/*
+Ideally we would like to write this:
+
+#define NV_FOREACH(i, container) \
+    for(NV_DECLTYPE(container)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i))
+
+But gcc versions prior to 4.7 required an intermediate type. See:
+https://gcc.gnu.org/bugzilla/show_bug.cgi?id=6709
+*/
+
+#define NV_FOREACH(i, container) \
+    typedef NV_DECLTYPE(container) NV_STRING_JOIN2(cont,__LINE__); \
+    for(NV_STRING_JOIN2(cont,__LINE__)::PseudoIndex i((container).start()); !(container).isDone(i); (container).advance(i))
+
+#else // If typeof not available:
+
+#include <new> // placement new
+
+struct PseudoIndexWrapper {
+    template <typename T>
+    PseudoIndexWrapper(const T & container) {
+        nvStaticCheck(sizeof(typename T::PseudoIndex) <= sizeof(memory));
+        new (memory) typename T::PseudoIndex(container.start());
+    }
+    // PseudoIndex cannot have a dtor!
+
+    template <typename T> typename T::PseudoIndex & operator()(const T * /*container*/) {
+        return *reinterpret_cast<typename T::PseudoIndex *>(memory);
+    }
+    template <typename T> const typename T::PseudoIndex & operator()(const T * /*container*/) const {
+        return *reinterpret_cast<const typename T::PseudoIndex *>(memory);
+    }
+
+    uint8 memory[4];	// Increase the size if we have bigger enumerators.
+};
+
+#define NV_FOREACH(i, container) \
+    for(PseudoIndexWrapper i(container); !(container).isDone(i(&(container))); (container).advance(i(&(container))))
+
+#endif
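+
+// Usage sketch (illustrative): NV_FOREACH expands to a typedef plus a for
+// statement, so it must appear where a full statement is allowed.
+//
+//   nv::Array<float> weights;  // any container exposing the PseudoIndex protocol
+//   float sum = 0.0f;
+//   NV_FOREACH(i, weights) {
+//       sum += weights[i];
+//   }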
+
+// Declare foreach keyword.
+#if !defined NV_NO_USE_KEYWORDS
+#   define foreach NV_FOREACH
+#   define foreach_index NV_FOREACH
+#endif
+
+
+#endif // NV_CORE_FOREACH_H

+ 83 - 0
3rdparty/nvtt/nvcore/hash.h

@@ -0,0 +1,83 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#pragma once
+#ifndef NV_CORE_HASH_H
+#define NV_CORE_HASH_H
+
+#include "nvcore.h"
+
+namespace nv
+{
+    inline uint sdbmHash(const void * data_in, uint size, uint h = 5381)
+    {
+        const uint8 * data = (const uint8 *) data_in;
+        uint i = 0;
+        while (i < size) {
+            h = (h << 16) + (h << 6) - h + (uint) data[i++];
+        }
+        return h;
+    }
+
+    // Note that this hash does not handle NaN properly.
+    inline uint sdbmFloatHash(const float * f, uint count, uint h = 5381)
+    {
+        for (uint i = 0; i < count; i++) {
+            //nvDebugCheck(nv::isFinite(*f));
+            union { float f; uint32 i; } x = { f[i] };
+            if (x.i == 0x80000000) x.i = 0;
+            h = sdbmHash(&x, 4, h);
+        }
+        return h;
+    }
+
+
+    template <typename T>
+    inline uint hash(const T & t, uint h = 5381)
+    {
+        return sdbmHash(&t, sizeof(T), h);
+    }
+
+    template <>
+    inline uint hash(const float & f, uint h)
+    {
+        return sdbmFloatHash(&f, 1, h);
+    }
+
+
+    // Functors for hash table:
+    template <typename Key> struct Hash 
+    {
+        uint operator()(const Key & k) const {
+            return hash(k);
+        }
+    };
+
+    template <typename Key> struct Equal
+    {
+        bool operator()(const Key & k0, const Key & k1) const {
+            return k0 == k1;
+        }
+    };
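+
+    // Usage sketch (illustrative, not part of the original library): these
+    // functors are what a hash table implementation would instantiate for its
+    // key type. bucketCount is hypothetical.
+    //
+    //   nv::Hash<uint>  hasher;
+    //   nv::Equal<uint> equal;
+    //   uint bucket = hasher(1234u) % bucketCount;
+    //   bool same   = equal(1234u, 1234u);   // true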
+
+
+    // @@ Move to Utils.h?
+    template <typename T1, typename T2>
+    struct Pair {
+        T1 first;
+        T2 second;
+    };
+
+    template <typename T1, typename T2>
+    bool operator==(const Pair<T1,T2> & p0, const Pair<T1,T2> & p1) {
+        return p0.first == p1.first && p0.second == p1.second;
+    }
+
+    template <typename T1, typename T2>
+    uint hash(const Pair<T1,T2> & p, uint h = 5381) {
+        return hash(p.second, hash(p.first));
+    }
+
+
+} // nv namespace
+
+#endif // NV_CORE_HASH_H

+ 29 - 0
3rdparty/nvtt/nvcore/memory.h

@@ -0,0 +1,29 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_MEMORY_H
+#define NV_CORE_MEMORY_H
+
+#include "nvcore.h"
+
+#include <string.h> // memset, used by zero() below
+
+namespace nv {
+
+    // C++ helpers.
+    template <typename T> NV_FORCEINLINE T * malloc(size_t count) {
+        return (T *)::malloc(sizeof(T) * count);
+    }
+
+    template <typename T> NV_FORCEINLINE T * realloc(T * ptr, size_t count) {
+        return (T *)::realloc(ptr, sizeof(T) * count);
+    }
+
+    template <typename T> NV_FORCEINLINE void free(const T * ptr) {
+        ::free((void *)ptr);
+    }
+
+    template <typename T> NV_FORCEINLINE void zero(T & data) {
+        memset(&data, 0, sizeof(T));
+    }
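+
+    // Usage sketch (illustrative, not part of the original library): the typed
+    // helpers wrap the C allocator, so they pair with each other rather than
+    // with new/delete.
+    //
+    //   float * samples = nv::malloc<float>(256);
+    //   samples = nv::realloc<float>(samples, 512);
+    //   nv::free(samples);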
+
+} // nv namespace
+
+#endif // NV_CORE_MEMORY_H

+ 299 - 0
3rdparty/nvtt/nvcore/nvcore.h

@@ -0,0 +1,299 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_H
+#define NV_CORE_H
+
+// Function linkage
+#if NVCORE_SHARED
+#ifdef NVCORE_EXPORTS
+#define NVCORE_API DLL_EXPORT
+#define NVCORE_CLASS DLL_EXPORT_CLASS
+#else
+#define NVCORE_API DLL_IMPORT
+#define NVCORE_CLASS DLL_IMPORT
+#endif
+#else // NVCORE_SHARED
+#define NVCORE_API
+#define NVCORE_CLASS
+#endif // NVCORE_SHARED
+
+
+// Platform definitions
+#include "posh.h"
+
+// OS:
+// NV_OS_WIN32
+// NV_OS_WIN64
+// NV_OS_MINGW
+// NV_OS_CYGWIN
+// NV_OS_LINUX
+// NV_OS_UNIX
+// NV_OS_DARWIN
+// NV_OS_XBOX
+// NV_OS_ORBIS
+// NV_OS_IOS
+
+#define NV_OS_STRING POSH_OS_STRING
+
+#if defined POSH_OS_LINUX
+#   define NV_OS_LINUX 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_ORBIS
+#   define NV_OS_ORBIS 1
+#elif defined POSH_OS_FREEBSD
+#   define NV_OS_FREEBSD 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_OPENBSD
+#   define NV_OS_OPENBSD 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_CYGWIN32
+#   define NV_OS_CYGWIN 1
+#elif defined POSH_OS_MINGW
+#   define NV_OS_MINGW 1
+#   define NV_OS_WIN32 1
+#elif defined POSH_OS_OSX
+#   define NV_OS_DARWIN 1
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_IOS
+#   define NV_OS_DARWIN 1 //ACS should we keep this on IOS?
+#   define NV_OS_UNIX 1
+#   define NV_OS_IOS 1
+#elif defined POSH_OS_UNIX
+#   define NV_OS_UNIX 1
+#elif defined POSH_OS_WIN64
+#   define NV_OS_WIN32 1
+#   define NV_OS_WIN64 1
+#elif defined POSH_OS_WIN32
+#   define NV_OS_WIN32 1
+#elif defined POSH_OS_XBOX
+#   define NV_OS_XBOX 1
+#else
+#   error "Unsupported OS"
+#endif
+
+
+// Threading:
+// some platforms don't implement __thread or similar for thread-local-storage
+#if NV_OS_UNIX || NV_OS_ORBIS || NV_OS_IOS //ACStodoIOS darwin instead of ios?
+#   define NV_OS_USE_PTHREAD 1
+#   if NV_OS_DARWIN || NV_OS_IOS
+#       define NV_OS_HAS_TLS_QUALIFIER 0
+#   else
+#       define NV_OS_HAS_TLS_QUALIFIER 1
+#   endif
+#else
+#   define NV_OS_USE_PTHREAD 0
+#   define NV_OS_HAS_TLS_QUALIFIER 1
+#endif
+
+
+// CPUs:
+// NV_CPU_X86
+// NV_CPU_X86_64
+// NV_CPU_PPC
+// NV_CPU_ARM
+// NV_CPU_AARCH64
+
+#define NV_CPU_STRING   POSH_CPU_STRING
+
+#if defined POSH_CPU_X86_64
+//#   define NV_CPU_X86 1
+#   define NV_CPU_X86_64 1
+#elif defined POSH_CPU_X86
+#   define NV_CPU_X86 1
+#elif defined POSH_CPU_PPC
+#   define NV_CPU_PPC 1
+#elif defined POSH_CPU_STRONGARM
+#   define NV_CPU_ARM 1
+#elif defined POSH_CPU_AARCH64
+#   define NV_CPU_AARCH64 1
+#else
+#   error "Unsupported CPU"
+#endif
+
+
+// Compiler:
+// NV_CC_GNUC
+// NV_CC_MSVC
+// NV_CC_CLANG
+
+#if defined POSH_COMPILER_CLANG
+#   define NV_CC_CLANG  1
+#   define NV_CC_GNUC   1    // Clang is compatible with GCC.
+#   define NV_CC_STRING "clang"
+#elif defined POSH_COMPILER_GCC
+#   define NV_CC_GNUC   1
+#   define NV_CC_STRING "gcc"
+#elif defined POSH_COMPILER_MSVC
+#   define NV_CC_MSVC   1
+#   define NV_CC_STRING "msvc"
+#else
+#   error "Unsupported compiler"
+#endif
+
+#if NV_CC_MSVC
+#define NV_CC_CPP11 (__cplusplus > 199711L || _MSC_VER >= 1800) // Visual Studio 2013 has all the features we use, but doesn't advertise full C++11 support yet.
+#else
+// @@ IC: This works in CLANG, what about GCC?
+// @@ ES: Doesn't work in gcc. These 3 features are available in GCC >= 4.4.
+#ifdef __clang__
+#define NV_CC_CPP11 (__has_feature(cxx_deleted_functions) && __has_feature(cxx_rvalue_references) && __has_feature(cxx_static_assert))
+#elif defined __GNUC__ 
+#define NV_CC_CPP11 ( __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 4))
+#endif
+#endif
+
+// Endiannes:
+#define NV_LITTLE_ENDIAN    POSH_LITTLE_ENDIAN
+#define NV_BIG_ENDIAN       POSH_BIG_ENDIAN
+#define NV_ENDIAN_STRING    POSH_ENDIAN_STRING
+
+
+// Define the right printf prefix for size_t arguments:
+#if POSH_64BIT_POINTER
+#  define NV_SIZET_PRINTF_PREFIX POSH_I64_PRINTF_PREFIX
+#else
+#  define NV_SIZET_PRINTF_PREFIX
+#endif
+
+
+// Type definitions:
+typedef posh_u8_t   uint8;
+typedef posh_i8_t   int8;
+
+typedef posh_u16_t  uint16;
+typedef posh_i16_t  int16;
+
+typedef posh_u32_t  uint32;
+typedef posh_i32_t  int32;
+
+typedef posh_u64_t  uint64;
+typedef posh_i64_t  int64;
+
+// Aliases
+typedef uint32      uint;
+
+
+// Version string:
+#define NV_VERSION_STRING \
+    NV_OS_STRING "/" NV_CC_STRING "/" NV_CPU_STRING"/" \
+    NV_ENDIAN_STRING"-endian - " __DATE__ "-" __TIME__
+
+
+// Disable copy constructor and assignment operator. 
+#if NV_CC_CPP11
+#define NV_FORBID_COPY(C) \
+    C( const C & ) = delete; \
+    C &operator=( const C & ) = delete
+#else
+#define NV_FORBID_COPY(C) \
+    private: \
+    C( const C & ); \
+    C &operator=( const C & )
+#endif
+
+// Disable dynamic allocation on the heap. 
+// See Prohibiting Heap-Based Objects in More Effective C++.
+#define NV_FORBID_HEAPALLOC() \
+    private: \
+    void *operator new(size_t size); \
+    void *operator new[](size_t size)
+    //static void *operator new(size_t size); \
+    //static void *operator new[](size_t size);
+
+// String concatenation macros.
+#define NV_STRING_JOIN2(arg1, arg2) NV_DO_STRING_JOIN2(arg1, arg2)
+#define NV_DO_STRING_JOIN2(arg1, arg2) arg1 ## arg2
+#define NV_STRING_JOIN3(arg1, arg2, arg3) NV_DO_STRING_JOIN3(arg1, arg2, arg3)
+#define NV_DO_STRING_JOIN3(arg1, arg2, arg3) arg1 ## arg2 ## arg3
+#define NV_STRING2(x) #x
+#define NV_STRING(x) NV_STRING2(x)
+
+#if NV_CC_MSVC
+#define NV_MULTI_LINE_MACRO_BEGIN do {  
+#define NV_MULTI_LINE_MACRO_END \
+    __pragma(warning(push)) \
+    __pragma(warning(disable:4127)) \
+    } while(false) \
+    __pragma(warning(pop))  
+#else
+#define NV_MULTI_LINE_MACRO_BEGIN do {
+#define NV_MULTI_LINE_MACRO_END } while(false)
+#endif
+
+#if NV_CC_CPP11
+#define nvStaticCheck(x) static_assert((x), "Static assert "#x" failed")
+#else
+#define nvStaticCheck(x) typedef char NV_STRING_JOIN2(__static_assert_,__LINE__)[(x)]
+#endif
+#define NV_COMPILER_CHECK(x) nvStaticCheck(x)   // I like this name best.
+
+// Make sure type definitions are fine.
+NV_COMPILER_CHECK(sizeof(int8) == 1);
+NV_COMPILER_CHECK(sizeof(uint8) == 1);
+NV_COMPILER_CHECK(sizeof(int16) == 2);
+NV_COMPILER_CHECK(sizeof(uint16) == 2);
+NV_COMPILER_CHECK(sizeof(int32) == 4);
+NV_COMPILER_CHECK(sizeof(uint32) == 4);
+NV_COMPILER_CHECK(sizeof(int64) == 8);
+NV_COMPILER_CHECK(sizeof(uint64) == 8);
+
+
+#define NV_ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0]))
+
+#if 0 // Disabled in The Witness.
+#if NV_CC_MSVC
+#define NV_MESSAGE(x) message(__FILE__ "(" NV_STRING(__LINE__) ") : " x)
+#else
+#define NV_MESSAGE(x) message(x)
+#endif
+#else
+#define NV_MESSAGE(x) 
+#endif
+
+
+// Startup initialization macro.
+#define NV_AT_STARTUP(some_code) \
+    namespace { \
+        static struct NV_STRING_JOIN2(AtStartup_, __LINE__) { \
+            NV_STRING_JOIN2(AtStartup_, __LINE__)() { some_code; } \
+        } \
+        NV_STRING_JOIN3(AtStartup_, __LINE__, Instance); \
+    }
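+
+// Usage sketch (illustrative): runs the given code before main() by
+// constructing a file-scope static object.
+//
+//   NV_AT_STARTUP(nvDebug("nvcore initialized\n"));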
+
+// Indicate to the compiler that the parameter is not used, to suppress compiler warnings.
+#define NV_UNUSED(a) ((a)=(a))
+
+// Null index. @@ Move this somewhere else... it's only used by nvmesh.
+//const unsigned int NIL = unsigned int(~0);
+//#define NIL uint(~0)
+
+// Null pointer.
+#ifndef NULL
+#define NULL 0
+#endif
+
+// Platform includes
+#if NV_CC_MSVC
+#   if NV_OS_WIN32
+#       include "DefsVcWin32.h"
+#   elif NV_OS_XBOX
+#       include "DefsVcXBox.h"
+#   else
+#       error "MSVC: Platform not supported"
+#   endif
+#elif NV_CC_GNUC
+#   if NV_OS_LINUX
+#       include "DefsGnucLinux.h"
+#   elif NV_OS_DARWIN || NV_OS_FREEBSD || NV_OS_OPENBSD
+#       include "DefsGnucDarwin.h"
+#   elif NV_OS_MINGW
+#       include "DefsGnucWin32.h"
+#   elif NV_OS_CYGWIN
+#       error "GCC: Cygwin not supported"
+#   else
+#       error "GCC: Platform not supported"
+#   endif
+#endif
+
+#endif // NV_CORE_H

+ 1030 - 0
3rdparty/nvtt/nvcore/posh.h

@@ -0,0 +1,1030 @@
+/**
+@file posh.h
+@author Brian Hook
+@version 1.3.001
+
+Header file for POSH, the Portable Open Source Harness project.
+
+NOTE: Unlike most header files, this one is designed to be included
+multiple times, which is why it does not have the @#ifndef/@#define
+preamble.
+
+POSH relies on environment specified preprocessor symbols in order
+to infer as much as possible about the target OS/architecture and
+the host compiler capabilities.
+
+NOTE: POSH is simple and focused. It attempts to provide basic
+functionality and information, but it does NOT attempt to emulate
+missing functionality.  I am also not willing to make POSH dirty
+and hackish to support truly ancient and/or outmoded and/or bizarre
+technologies such as non-ANSI compilers, systems with non-IEEE
+floating point formats, segmented 16-bit operating systems, etc.
+
+Please refer to the accompanying HTML documentation or visit
+http://www.poshlib.org for more information on how to use POSH.
+
+LICENSE:
+
+Copyright (c) 2004, Brian Hook
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+      copyright notice, this list of conditions and the following
+      disclaimer in the documentation and/or other materials provided
+      with the distribution.
+
+    * The names of this package's contributors may not
+      be used to endorse or promote products derived from this
+      software without specific prior written permission.
+
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+REVISION:
+
+I've been lax about revision histories, so this starts at, um, 1.3.001.
+Sorry for any inconveniences.
+
+1.3.001 - 2/23/2006 - Incorporated fix for bug reported by Bill Cary,
+                      where I was not detecting Visual Studio
+                      compilation on x86-64 systems.  Added check for
+                      _M_X64 which should fix that.
+
+*/
+/*
+I have yet to find an authoritative reference on preprocessor
+symbols, but so far this is what I've gleaned:
+
+GNU GCC/G++:
+   - __GNUC__: GNU C version
+   - __GNUG__: GNU C++ compiler
+   - __sun__ : on Sun platforms
+   - __svr4__: on Solaris and other SysV R4 platforms
+   - __mips__: on MIPS processor platforms
+   - __sparc_v9__: on Sparc 64-bit CPUs
+   - __sparcv9: 64-bit Solaris
+   - __MIPSEL__: mips processor, compiled for little endian
+   - __MIPSEB__: mips processor, compiled for big endian
+   - _R5900: MIPS/Sony/Toshiba R5900 (PS2)
+   - mc68000: 68K
+   - m68000: 68K
+   - m68k: 68K
+   - __palmos__: PalmOS
+
+Intel C/C++ Compiler:
+   - __ECC      : compiler version, IA64 only
+   - __EDG__
+   - __ELF__
+   - __GXX_ABI_VERSION
+   - __i386     : IA-32 only
+   - __i386__   : IA-32 only
+   - i386       : IA-32 only
+   - __ia64     : IA-64 only
+   - __ia64__   : IA-64 only
+   - ia64       : IA-64 only
+   - __ICC      : IA-32 only
+   - __INTEL_COMPILER : IA-32 or IA-64, newer versions only
+
+Apple's C/C++ Compiler for OS X:
+   - __APPLE_CC__
+   - __APPLE__
+   - __BIG_ENDIAN__
+   - __APPLE__
+   - __ppc__
+   - __MACH__
+
+DJGPP:
+   - __MSDOS__
+   - __unix__
+   - __unix
+   - __GNUC__
+   - __GO32
+   - DJGPP
+   - __i386, __i386, i386
+
+Cray's C compiler:
+   - _ADDR64: if 64-bit pointers
+   - _UNICOS: 
+   - __unix:
+
+SGI's CC compiler predefines the following (and more) with -ansi:
+   - __sgi
+   - __unix
+   - __host_mips
+   - _SYSTYPE_SVR4
+   - __mips
+   - _MIPSEB
+   - anyone know if there is a predefined symbol for the compiler?!
+
+MinGW:
+   - as GnuC but also defines _WIN32, __WIN32, WIN32, _X86_, __i386, __i386__, and several others
+   - __MINGW32__
+
+Cygwin:
+   - as Gnu C, but also
+   - __unix__
+   - __CYGWIN32__
+
+Microsoft Visual Studio predefines the following:
+   - _MSC_VER
+   - _WIN32: on Win32
+   - _M_IX86 (on x86 systems)
+   - _M_X64: on x86-64 systems
+   - _M_ALPHA (on DEC AXP systems)
+   - _SH3: WinCE, Hitachi SH-3
+   - _MIPS: WinCE, MIPS
+   - _ARM: WinCE, ARM
+
+Sun's C Compiler:
+   - sun and _sun
+   - unix and _unix
+   - sparc and _sparc (SPARC systems only)
+   - i386 and _i386 (x86 systems only)
+   - __SVR4 (Solaris only)
+   - __sparcv9: 64-bit solaris
+   - __SUNPRO_C
+   - _LP64: defined in 64-bit LP64 mode, but only if <sys/types.h> is included
+
+Borland C/C++ predefines the following:
+   - __BORLANDC__:
+
+DEC/Compaq C/C++ on Alpha:
+   - __alpha
+   - __arch64__
+   - __unix__ (on Tru64 Unix)
+   - __osf__
+   - __DECC
+   - __DECCXX (C++ compilation)
+   - __DECC_VER
+   - __DECCXX_VER
+
+IBM's AIX compiler:
+   - __64BIT__ if 64-bit mode
+   - _AIX
+   - __IBMC__: C compiler version
+   - __IBMCPP__: C++ compiler version
+   - _LONG_LONG: compiler allows long long
+
+Watcom:
+   - __WATCOMC__
+   - __DOS__ : if targeting DOS
+   - __386__ : if 32-bit support
+   - __WIN32__ : if targeting 32-bit Windows
+
+HP-UX C/C++ Compiler:
+   - __hpux
+   - __unix
+   - __hppa (on PA-RISC)
+   - __LP64__: if compiled in 64-bit mode
+
+Metrowerks:
+   - __MWERKS__
+   - __powerpc__
+   - _powerc
+   - __MC68K__
+   - macintosh when compiling for MacOS
+   - __INTEL__ for x86 targets
+   - __POWERPC__
+
+LLVM:
+   - __llvm__
+   - __clang__
+*/
+
+/*
+** ----------------------------------------------------------------------------
+** Include <limits.h> optionally
+** ----------------------------------------------------------------------------
+*/
+#ifdef POSH_USE_LIMITS_H
+#  include <limits.h>
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Determine compilation environment
+** ----------------------------------------------------------------------------
+*/
+#if defined __ECC || defined __ICC || defined __INTEL_COMPILER
+#  define POSH_COMPILER_STRING "Intel C/C++"
+#  define POSH_COMPILER_INTEL 1
+#endif
+
+#if ( defined __host_mips || defined __sgi ) && !defined __GNUC__
+#  define POSH_COMPILER_STRING    "MIPSpro C/C++"
+#  define POSH_COMPILER_MIPSPRO 1 
+#endif
+
+#if defined __hpux && !defined __GNUC__
+#  define POSH_COMPILER_STRING "HP-UX CC"
+#  define POSH_COMPILER_HPCC 1 
+#endif
+
+#if defined __clang__
+#  define POSH_COMPILER_STRING "Clang"
+#  define POSH_COMPILER_CLANG 1
+#endif
+
+#if defined __GNUC__ && !defined __clang__
+#  define POSH_COMPILER_STRING "Gnu GCC"
+#  define POSH_COMPILER_GCC 1
+#endif
+
+#if defined __APPLE_CC__
+   /* we don't define the compiler string here, let it be GNU */
+#  define POSH_COMPILER_APPLECC 1
+#endif
+
+#if defined __IBMC__ || defined __IBMCPP__
+#  define POSH_COMPILER_STRING "IBM C/C++"
+#  define POSH_COMPILER_IBM 1
+#endif
+
+#if defined _MSC_VER
+#  define POSH_COMPILER_STRING "Microsoft Visual C++"
+#  define POSH_COMPILER_MSVC 1
+#endif
+
+#if defined __SUNPRO_C
+#  define POSH_COMPILER_STRING "Sun Pro" 
+#  define POSH_COMPILER_SUN 1
+#endif
+
+#if defined __BORLANDC__
+#  define POSH_COMPILER_STRING "Borland C/C++"
+#  define POSH_COMPILER_BORLAND 1
+#endif
+
+#if defined __MWERKS__
+#  define POSH_COMPILER_STRING     "MetroWerks CodeWarrior"
+#  define POSH_COMPILER_METROWERKS 1
+#endif
+
+#if defined __DECC || defined __DECCXX
+#  define POSH_COMPILER_STRING "Compaq/DEC C/C++"
+#  define POSH_COMPILER_DEC 1
+#endif
+
+#if defined __WATCOMC__
+#  define POSH_COMPILER_STRING "Watcom C/C++"
+#  define POSH_COMPILER_WATCOM 1
+#endif
+
+#if !defined POSH_COMPILER_STRING
+#  define POSH_COMPILER_STRING "Unknown compiler"
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Determine target operating system
+** ----------------------------------------------------------------------------
+*/
+#if defined linux || defined __linux__
+#  define POSH_OS_LINUX 1 
+#  define POSH_OS_STRING "Linux"
+#endif
+
+#if defined __FreeBSD__
+#  define POSH_OS_FREEBSD 1 
+#  define POSH_OS_STRING "FreeBSD"
+#endif
+
+#if defined __OpenBSD__
+#  define POSH_OS_OPENBSD 1
+#  define POSH_OS_STRING "OpenBSD"
+#endif
+
+#if defined __CYGWIN32__
+#  define POSH_OS_CYGWIN32 1
+#  define POSH_OS_STRING "Cygwin"
+#endif
+
+#if defined GEKKO
+#  define POSH_OS_GAMECUBE
+#  define __powerpc__
+#  define POSH_OS_STRING "GameCube"
+#endif
+
+#if defined __MINGW32__
+#  define POSH_OS_MINGW 1
+#  define POSH_OS_STRING "MinGW"
+#endif
+
+#if defined GO32 && defined DJGPP && defined __MSDOS__ 
+#  define POSH_OS_GO32 1
+#  define POSH_OS_STRING "GO32/MS-DOS"
+#endif
+
+/* NOTE: make sure you use /bt=DOS if compiling for 32-bit DOS,
+   otherwise Watcom assumes host=target */
+#if defined __WATCOMC__  && defined __386__ && defined __DOS__
+#  define POSH_OS_DOS32 1
+#  define POSH_OS_STRING "DOS/32-bit"
+#endif
+
+#if defined _UNICOS
+#  define POSH_OS_UNICOS 1
+#  define POSH_OS_STRING "UNICOS"
+#endif
+
+#if ( defined __MWERKS__ && defined __powerc && !defined macintosh ) || defined __APPLE_CC__ || defined macosx
+#  define POSH_OS_OSX 1
+#  define POSH_OS_STRING "MacOS X"
+#endif
+
+#if defined __sun__ || defined sun || defined __sun || defined __solaris__
+#  if defined __SVR4 || defined __svr4__ || defined __solaris__
+#     define POSH_OS_STRING "Solaris"
+#     define POSH_OS_SOLARIS 1
+#  endif
+#  if !defined POSH_OS_STRING
+#     define POSH_OS_STRING "SunOS"
+#     define POSH_OS_SUNOS 1
+#  endif
+#endif
+
+#if defined __sgi__ || defined sgi || defined __sgi
+#  define POSH_OS_IRIX 1
+#  define POSH_OS_STRING "Irix"
+#endif
+
+#if defined __hpux__ || defined __hpux
+#  define POSH_OS_HPUX 1
+#  define POSH_OS_STRING "HP-UX"
+#endif
+
+#if defined _AIX
+#  define POSH_OS_AIX 1
+#  define POSH_OS_STRING "AIX"
+#endif
+
+#if ( defined __alpha && defined __osf__ )
+#  define POSH_OS_TRU64 1
+#  define POSH_OS_STRING "Tru64"
+#endif
+
+#if defined __BEOS__ || defined __beos__
+#  define POSH_OS_BEOS 1
+#  define POSH_OS_STRING "BeOS"
+#endif
+
+#if defined amiga || defined amigados || defined AMIGA || defined _AMIGA
+#  define POSH_OS_AMIGA 1
+#  define POSH_OS_STRING "Amiga"
+#endif
+
+#if defined __unix__
+#  define POSH_OS_UNIX 1 
+#  if !defined POSH_OS_STRING
+#     define POSH_OS_STRING "Unix-like(generic)"
+#  endif
+#endif
+
+#if defined _WIN32_WCE
+#  define POSH_OS_WINCE 1
+#  define POSH_OS_STRING "Windows CE"
+#endif
+
+#if defined _XBOX || defined _XBOX_VER
+#  define POSH_OS_XBOX 1
+#  define POSH_OS_STRING "XBOX"
+#endif
+
+#if defined _WIN32 || defined WIN32 || defined __NT__ || defined __WIN32__
+#  define POSH_OS_WIN32 1
+#  if !defined POSH_OS_XBOX
+#     if defined _WIN64
+#        define POSH_OS_WIN64 1
+#        define POSH_OS_STRING "Win64"
+#     else
+#        if !defined POSH_OS_STRING
+#           define POSH_OS_STRING "Win32"
+#        endif
+#     endif
+#  endif
+#endif
+
+#if defined __palmos__
+#  define POSH_OS_PALM 1
+#  define POSH_OS_STRING "PalmOS"
+#endif
+
+#if defined THINK_C || defined macintosh
+#  define POSH_OS_MACOS 1
+#  define POSH_OS_STRING "MacOS"
+#endif
+
+/*
+** -----------------------------------------------------------------------------
+** Determine target CPU
+** -----------------------------------------------------------------------------
+*/
+
+#if defined GEKKO
+#  define POSH_CPU_PPC750 1
+#  define POSH_CPU_STRING "IBM PowerPC 750 (NGC)"
+#endif
+
+#if defined mc68000 || defined m68k || defined __MC68K__ || defined m68000
+#  define POSH_CPU_68K 1
+#  define POSH_CPU_STRING "MC68000"
+#endif
+
+#if defined __PPC__ || defined __POWERPC__  || defined powerpc || defined _POWER || defined __ppc__ || defined __powerpc__ || defined _M_PPC
+#  define POSH_CPU_PPC 1
+#  if !defined POSH_CPU_STRING
+#    if defined __powerpc64__
+#       define POSH_CPU_STRING "PowerPC64"
+#    else
+#       define POSH_CPU_STRING "PowerPC"
+#    endif
+#  endif
+#endif
+
+#if defined _CRAYT3E || defined _CRAYMPP
+#  define POSH_CPU_CRAYT3E 1 /* target processor is a DEC Alpha 21164 used in a Cray T3E*/
+#  define POSH_CPU_STRING "Cray T3E (Alpha 21164)"
+#endif
+
+#if defined CRAY || defined _CRAY && !defined _CRAYT3E
+#  error Non-AXP Cray systems not supported
+#endif
+
+#if defined _SH3
+#  define POSH_CPU_SH3 1
+#  define POSH_CPU_STRING "Hitachi SH-3"
+#endif
+
+#if defined __sh4__ || defined __SH4__
+#  define POSH_CPU_SH3 1
+#  define POSH_CPU_SH4 1
+#  define POSH_CPU_STRING "Hitachi SH-4"
+#endif
+
+#if defined __sparc__ || defined __sparc
+#  if defined __arch64__ || defined __sparcv9 || defined __sparc_v9__
+#     define POSH_CPU_SPARC64 1 
+#     define POSH_CPU_STRING "Sparc/64"
+#  else
+#     define POSH_CPU_STRING "Sparc/32"
+#  endif
+#  define POSH_CPU_SPARC 1
+#endif
+
+#if defined ARM || defined __arm__ || defined _ARM
+#  define POSH_CPU_STRONGARM 1
+#  define POSH_CPU_STRING "ARM"
+#endif
+
+#if defined __aarch64__
+#  define POSH_CPU_AARCH64 1
+#  define POSH_CPU_STRING "ARM64"
+#endif
+
+#if defined mips || defined __mips__ || defined __MIPS__ || defined _MIPS
+#  define POSH_CPU_MIPS 1 
+#  if defined _R5900
+#    define POSH_CPU_STRING "MIPS R5900 (PS2)"
+#  else
+#    define POSH_CPU_STRING "MIPS"
+#  endif
+#endif
+
+#if defined __ia64 || defined _M_IA64 || defined __ia64__ 
+#  define POSH_CPU_IA64 1
+#  define POSH_CPU_STRING "IA64"
+#endif
+
+#if defined __X86__ || defined __i386__ || defined i386 || defined _M_IX86 || defined __386__ || defined __x86_64__ || defined _M_X64
+#  define POSH_CPU_X86 1
+#  if defined __x86_64__ || defined _M_X64
+#     define POSH_CPU_X86_64 1 
+#  endif
+#  if defined POSH_CPU_X86_64
+#     define POSH_CPU_STRING "AMD x86-64"
+#  else
+#     define POSH_CPU_STRING "Intel 386+"
+#  endif
+#endif
+
+#if defined __alpha || defined alpha || defined _M_ALPHA || defined __alpha__
+#  define POSH_CPU_AXP 1
+#  define POSH_CPU_STRING "AXP"
+#endif
+
+#if defined __hppa || defined hppa
+#  define POSH_CPU_HPPA 1
+#  define POSH_CPU_STRING "PA-RISC"
+#endif
+
+#if !defined POSH_CPU_STRING
+#  error POSH cannot determine target CPU
+#  define POSH_CPU_STRING "Unknown" /* this is here for Doxygen's benefit */
+#endif
+
+/*
+** -----------------------------------------------------------------------------
+** Attempt to autodetect building for embedded on Sony PS2
+** -----------------------------------------------------------------------------
+*/
+#if !defined POSH_OS_STRING
+#  if !defined FORCE_DOXYGEN
+#    define POSH_OS_EMBEDDED 1 
+#  endif
+#  if defined _R5900
+#     define POSH_OS_STRING "Sony PS2(embedded)"
+#  else
+#     define POSH_OS_STRING "Embedded/Unknown"
+#  endif
+#endif
+
+/*
+** ---------------------------------------------------------------------------
+** Handle cdecl, stdcall, fastcall, etc.
+** ---------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_X86 && !defined POSH_CPU_X86_64
+#  if defined __GNUC__
+#     define POSH_CDECL __attribute__((cdecl))
+#     define POSH_STDCALL __attribute__((stdcall))
+#     define POSH_FASTCALL __attribute__((fastcall))
+#  elif ( defined _MSC_VER || defined __WATCOMC__ || defined __BORLANDC__ || defined __MWERKS__ )
+#     define POSH_CDECL    __cdecl
+#     define POSH_STDCALL  __stdcall
+#     define POSH_FASTCALL __fastcall
+#  endif
+#else
+#  define POSH_CDECL    
+#  define POSH_STDCALL  
+#  define POSH_FASTCALL 
+#endif
+
+/*
+** ---------------------------------------------------------------------------
+** Define POSH_IMPORTEXPORT signature based on POSH_DLL and POSH_BUILDING_LIB
+** ---------------------------------------------------------------------------
+*/
+
+/*
+** We undefine this so that multiple inclusions will work
+*/
+#if defined POSH_IMPORTEXPORT
+#  undef POSH_IMPORTEXPORT
+#endif
+
+#if defined POSH_DLL
+#   if defined POSH_OS_WIN32
+#      if defined _MSC_VER 
+#         if ( _MSC_VER >= 800 )
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __declspec( dllexport )
+#            else
+#               define POSH_IMPORTEXPORT __declspec( dllimport )
+#            endif
+#         else
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __export
+#            else
+#               define POSH_IMPORTEXPORT 
+#            endif
+#         endif
+#      endif  /* defined _MSC_VER */
+#      if defined __BORLANDC__
+#         if ( __BORLANDC__ >= 0x500 )
+#            if defined POSH_BUILDING_LIB 
+#               define POSH_IMPORTEXPORT __declspec( dllexport )
+#            else
+#               define POSH_IMPORTEXPORT __declspec( dllimport )
+#            endif
+#         else
+#            if defined POSH_BUILDING_LIB
+#               define POSH_IMPORTEXPORT __export
+#            else
+#               define POSH_IMPORTEXPORT 
+#            endif
+#         endif
+#      endif /* defined __BORLANDC__ */
+       /* for all other compilers, we're just making a blanket assumption */
+#      if defined __GNUC__ || defined __WATCOMC__ || defined __MWERKS__
+#         if defined POSH_BUILDING_LIB
+#            define POSH_IMPORTEXPORT __declspec( dllexport )
+#         else
+#            define POSH_IMPORTEXPORT __declspec( dllimport )
+#         endif
+#      endif /* all other compilers */
+#      if !defined POSH_IMPORTEXPORT
+#         error Building DLLs not supported on this compiler ([email protected] if you know how)
+#      endif
+#   endif /* defined POSH_OS_WIN32 */
+#endif
+
+/* On pretty much everything else, we can thankfully just ignore this */
+#if !defined POSH_IMPORTEXPORT
+#  define POSH_IMPORTEXPORT
+#endif
+
+#if defined FORCE_DOXYGEN
+#  define POSH_DLL    
+#  define POSH_BUILDING_LIB
+#  undef POSH_DLL
+#  undef POSH_BUILDING_LIB
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** (Re)define POSH_PUBLIC_API export signature 
+** ----------------------------------------------------------------------------
+*/
+#ifdef POSH_PUBLIC_API
+#  undef POSH_PUBLIC_API
+#endif
+
+#if ( ( defined _MSC_VER ) && ( _MSC_VER < 800 ) ) || ( defined __BORLANDC__ && ( __BORLANDC__ < 0x500 ) )
+#  define POSH_PUBLIC_API(rtype) extern rtype POSH_IMPORTEXPORT 
+#else
+#  define POSH_PUBLIC_API(rtype) extern POSH_IMPORTEXPORT rtype
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Try to infer endianness.  Basically we just go through the CPUs we know are
+** little endian, and assume anything that isn't one of those is big endian.
+** As a sanity check, we also do this with operating systems we know are
+** little endian, such as Windows.  Some processors are bi-endian, such as 
+** the MIPS series, so we have to be careful about those.
+** ----------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_X86 || defined POSH_CPU_AXP || defined POSH_CPU_STRONGARM || defined POSH_CPU_AARCH64 || defined POSH_OS_WIN32 || defined POSH_OS_WINCE || defined __MIPSEL__
+#  define POSH_ENDIAN_STRING "little"
+#  define POSH_LITTLE_ENDIAN 1
+#else
+#  define POSH_ENDIAN_STRING "big"
+#  define POSH_BIG_ENDIAN 1
+#endif
+
+#if defined FORCE_DOXYGEN
+#  define POSH_LITTLE_ENDIAN
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** Cross-platform compile time assertion macro
+** ----------------------------------------------------------------------------
+*/
+#define POSH_COMPILE_TIME_ASSERT(name, x) typedef int _POSH_dummy_ ## name[(x) ? 1 : -1 ]
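The macro relies on the fact that a false condition produces an array type of size -1, which no conforming compiler accepts, so the check fails at compile time rather than at run time. A short illustration (the assertion names are made up for the example; the first line assumes a platform with 16-bit shorts, which posh.h itself assumes below):

    /* Passes: expands to  typedef int _POSH_dummy_short_is_2_bytes[1];  */
    POSH_COMPILE_TIME_ASSERT(short_is_2_bytes, sizeof(short) == 2);

    /* Would not compile: the array size would be -1.
    POSH_COMPILE_TIME_ASSERT(short_is_8_bytes, sizeof(short) == 8);
    */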
+
+/*
+** ----------------------------------------------------------------------------
+** 64-bit Integer
+**
+** We don't require 64-bit support, nor do we emulate its functionality, we
+** simply export it if it's available.  Since we can't count on <limits.h>
+** for 64-bit support, we ignore the POSH_USE_LIMITS_H directive.
+** ----------------------------------------------------------------------------
+*/
+#if defined ( __LP64__ ) || defined ( __powerpc64__ ) || defined POSH_CPU_SPARC64
+#  define POSH_64BIT_INTEGER 1
+typedef long posh_i64_t; 
+typedef unsigned long posh_u64_t;
+#  define POSH_I64( x ) ((posh_i64_t)x)
+#  define POSH_U64( x ) ((posh_u64_t)x)
+#  define POSH_I64_PRINTF_PREFIX "l"
+#elif defined _MSC_VER || defined __BORLANDC__ || defined __WATCOMC__ || ( defined __alpha && defined __DECC )
+#  define POSH_64BIT_INTEGER 1
+typedef __int64 posh_i64_t;
+typedef unsigned __int64 posh_u64_t;
+#  define POSH_I64( x ) ((posh_i64_t)(x##i64))
+#  define POSH_U64( x ) ((posh_u64_t)(x##ui64))
+#  define POSH_I64_PRINTF_PREFIX "I64"
+#elif defined __GNUC__ || defined __MWERKS__ || defined __SUNPRO_C || defined __SUNPRO_CC || defined __APPLE_CC__ || defined POSH_OS_IRIX || defined _LONG_LONG || defined _CRAYC
+#  define POSH_64BIT_INTEGER 1
+typedef long long posh_i64_t;
+typedef unsigned long long posh_u64_t;
+#  define POSH_U64( x ) ((posh_u64_t)(x##LL))
+#  define POSH_I64( x ) ((posh_i64_t)(x##LL))
+#  define POSH_I64_PRINTF_PREFIX "ll"
+#endif
+
+/* hack */
+/*#ifdef __MINGW32__
+#undef POSH_I64
+#undef POSH_U64
+#undef POSH_I64_PRINTF_PREFIX
+#define POSH_I64( x ) ((posh_i64_t)x)
+#define POSH_U64( x ) ((posh_u64_t)x)
+#define POSH_I64_PRINTF_PREFIX "I64"
+#endif*/
+
+#ifdef FORCE_DOXYGEN
+typedef long long posh_i64_t;
+typedef unsigned long posh_u64_t;
+#  define POSH_64BIT_INTEGER
+#  define POSH_I64_PRINTF_PREFIX
+#  define POSH_I64(x)
+#  define POSH_U64(x)
+#endif
+
+/** Minimum value for a 64-bit signed integer */
+#define POSH_I64_MIN  POSH_I64(0x8000000000000000)
+/** Maximum value for a 64-bit signed integer */
+#define POSH_I64_MAX  POSH_I64(0x7FFFFFFFFFFFFFFF)
+/** Minimum value for a 64-bit unsigned integer */
+#define POSH_U64_MIN  POSH_U64(0)
+/** Maximum value for a 64-bit unsigned integer */
+#define POSH_U64_MAX  POSH_U64(0xFFFFFFFFFFFFFFFF)
+
+/* ----------------------------------------------------------------------------
+** Basic Sized Types
+**
+** These types are expected to be EXACTLY sized so you can use them for
+** serialization.
+** ----------------------------------------------------------------------------
+*/
+#define POSH_FALSE 0 
+#define POSH_TRUE  1 
+
+typedef int            posh_bool_t;
+typedef unsigned char  posh_byte_t;
+
+/* NOTE: These assume that CHAR_BIT is 8!! */
+typedef unsigned char  posh_u8_t;
+typedef signed char    posh_i8_t;
+
+#if defined POSH_USE_LIMITS_H
+#  if CHAR_BIT > 8
+#    error This machine uses 9-bit characters.  This is a warning, you can comment this out now.
+#  endif /* CHAR_BIT > 8 */
+
+/* 16-bit */
+#  if ( USHRT_MAX == 65535 ) 
+   typedef unsigned short posh_u16_t;
+   typedef short          posh_i16_t;
+#  else
+   /* Yes, in theory there could still be a 16-bit character type and shorts are
+      32-bits in size...if you find such an architecture, let me know =P */
+#    error No 16-bit type found
+#  endif
+
+/* 32-bit */
+#  if ( INT_MAX == 2147483647 )
+  typedef unsigned       posh_u32_t;
+  typedef int            posh_i32_t;
+#  elif ( LONG_MAX == 2147483647 )
+  typedef unsigned long  posh_u32_t;
+  typedef long           posh_i32_t;
+#  else
+#    error No 32-bit type found
+#  endif
+
+#else /* POSH_USE_LIMITS_H */
+
+  typedef unsigned short posh_u16_t;
+  typedef short          posh_i16_t;
+
+#  if !defined POSH_OS_PALM
+  typedef unsigned       posh_u32_t;
+  typedef int            posh_i32_t;
+#  else
+  typedef unsigned long  posh_u32_t;
+  typedef long           posh_i32_t;
+#  endif
+#endif
+
+/** Minimum value for a byte */
+#define POSH_BYTE_MIN    0
+/** Maximum value for an 8-bit unsigned value */
+#define POSH_BYTE_MAX    255
+/** Minimum value for a 16-bit signed value */
+#define POSH_I16_MIN     ( ( posh_i16_t ) 0x8000 )
+/** Maximum value for a 16-bit signed value */
+#define POSH_I16_MAX     ( ( posh_i16_t ) 0x7FFF ) 
+/** Minimum value for a 16-bit unsigned value */
+#define POSH_U16_MIN     0
+/** Maximum value for a 16-bit unsigned value */
+#define POSH_U16_MAX     ( ( posh_u16_t ) 0xFFFF )
+/** Minimum value for a 32-bit signed value */
+#define POSH_I32_MIN     ( ( posh_i32_t ) 0x80000000 )
+/** Maximum value for a 32-bit signed value */
+#define POSH_I32_MAX     ( ( posh_i32_t ) 0x7FFFFFFF )
+/** Minimum value for a 32-bit unsigned value */
+#define POSH_U32_MIN     0
+/** Maximum value for a 32-bit unsigned value */
+#define POSH_U32_MAX     ( ( posh_u32_t ) 0xFFFFFFFF )
+
+/*
+** ----------------------------------------------------------------------------
+** Sanity checks on expected sizes
+** ----------------------------------------------------------------------------
+*/
+#if !defined FORCE_DOXYGEN
+
+POSH_COMPILE_TIME_ASSERT(posh_byte_t, sizeof(posh_byte_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_u8_t, sizeof(posh_u8_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_i8_t, sizeof(posh_i8_t) == 1);
+POSH_COMPILE_TIME_ASSERT(posh_u16_t, sizeof(posh_u16_t) == 2);
+POSH_COMPILE_TIME_ASSERT(posh_i16_t, sizeof(posh_i16_t) == 2);
+POSH_COMPILE_TIME_ASSERT(posh_u32_t, sizeof(posh_u32_t) == 4);
+POSH_COMPILE_TIME_ASSERT(posh_i32_t, sizeof(posh_i32_t) == 4);
+
+#if !defined POSH_NO_FLOAT
+   POSH_COMPILE_TIME_ASSERT(posh_testfloat_t, sizeof(float)==4 );
+   POSH_COMPILE_TIME_ASSERT(posh_testdouble_t, sizeof(double)==8);
+#endif
+
+#if defined POSH_64BIT_INTEGER
+   POSH_COMPILE_TIME_ASSERT(posh_u64_t, sizeof(posh_u64_t) == 8);
+   POSH_COMPILE_TIME_ASSERT(posh_i64_t, sizeof(posh_i64_t) == 8);
+#endif
+
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** 64-bit pointer support
+** ----------------------------------------------------------------------------
+*/
+#if defined POSH_CPU_AXP && ( defined POSH_OS_TRU64 || defined POSH_OS_LINUX )
+#  define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_CPU_X86_64 && defined POSH_OS_LINUX
+#  define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_CPU_SPARC64 || defined POSH_OS_WIN64 || defined __64BIT__ || defined __LP64 || defined _LP64 || defined __LP64__ || defined _ADDR64 || defined _CRAYC
+#   define POSH_64BIT_POINTER 1
+#endif
+
+#if defined POSH_64BIT_POINTER
+   POSH_COMPILE_TIME_ASSERT( posh_64bit_pointer, sizeof( void * ) == 8 );
+#elif !defined FORCE_DOXYGEN
+/* if this assertion is hit then you're on a system that either has 64-bit
+   addressing and we didn't catch it, or you're on a system with 16-bit
+   pointers.  In the latter case, POSH doesn't actually care, we're just
+   triggering this assertion to make sure you're aware of the situation,
+   so feel free to delete it.
+
+   If this assertion is triggered on a known 32 or 64-bit platform, 
+   please let us know ([email protected]) */
+   POSH_COMPILE_TIME_ASSERT( posh_32bit_pointer, sizeof( void * ) == 4 );
+#endif
+
+#if defined FORCE_DOXYGEN
+#  define POSH_64BIT_POINTER
+#endif
+
+/*
+** ----------------------------------------------------------------------------
+** POSH Utility Functions
+**
+** These are optional POSH utility functions that are not required if you don't
+** need anything except static checking of your host and target environment.
+** 
+** These functions are NOT wrapped with POSH_PUBLIC_API because I didn't want
+** to enforce their export if your own library is only using them internally.
+** ----------------------------------------------------------------------------
+*/
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+const char *POSH_GetArchString( void );
+
+#if !defined POSH_NO_FLOAT
+
+posh_u32_t  POSH_LittleFloatBits( float f );
+posh_u32_t  POSH_BigFloatBits( float f );
+float       POSH_FloatFromLittleBits( posh_u32_t bits );
+float       POSH_FloatFromBigBits( posh_u32_t bits );
+
+void        POSH_DoubleBits( double d, posh_byte_t dst[ 8 ] );
+double      POSH_DoubleFromBits( const posh_byte_t src[ 8 ] );
+
+/* unimplemented
+float      *POSH_WriteFloatToLittle( void *dst, float f );
+float      *POSH_WriteFloatToBig( void *dst, float f );
+float       POSH_ReadFloatFromLittle( const void *src );
+float       POSH_ReadFloatFromBig( const void *src );
+
+double     *POSH_WriteDoubleToLittle( void *dst, double d );
+double     *POSH_WriteDoubleToBig( void *dst, double d );
+double      POSH_ReadDoubleFromLittle( const void *src );
+double      POSH_ReadDoubleFromBig( const void *src );
+*/
+#endif /* !defined POSH_NO_FLOAT */
+
+#if defined FORCE_DOXYGEN
+#  define POSH_NO_FLOAT
+#  undef  POSH_NO_FLOAT
+#endif
+
+extern posh_u16_t  POSH_SwapU16( posh_u16_t u );
+extern posh_i16_t  POSH_SwapI16( posh_i16_t u );
+extern posh_u32_t  POSH_SwapU32( posh_u32_t u );
+extern posh_i32_t  POSH_SwapI32( posh_i32_t u );
+
+#if defined POSH_64BIT_INTEGER
+
+extern posh_u64_t  POSH_SwapU64( posh_u64_t u );
+extern posh_i64_t  POSH_SwapI64( posh_i64_t u );
+
+#endif /*POSH_64BIT_INTEGER */
+
+extern posh_u16_t *POSH_WriteU16ToLittle( void *dst, posh_u16_t value );
+extern posh_i16_t *POSH_WriteI16ToLittle( void *dst, posh_i16_t value );
+extern posh_u32_t *POSH_WriteU32ToLittle( void *dst, posh_u32_t value );
+extern posh_i32_t *POSH_WriteI32ToLittle( void *dst, posh_i32_t value );
+
+extern posh_u16_t *POSH_WriteU16ToBig( void *dst, posh_u16_t value );
+extern posh_i16_t *POSH_WriteI16ToBig( void *dst, posh_i16_t value );
+extern posh_u32_t *POSH_WriteU32ToBig( void *dst, posh_u32_t value );
+extern posh_i32_t *POSH_WriteI32ToBig( void *dst, posh_i32_t value );
+
+extern posh_u16_t  POSH_ReadU16FromLittle( const void *src );
+extern posh_i16_t  POSH_ReadI16FromLittle( const void *src );
+extern posh_u32_t  POSH_ReadU32FromLittle( const void *src );
+extern posh_i32_t  POSH_ReadI32FromLittle( const void *src );
+
+extern posh_u16_t  POSH_ReadU16FromBig( const void *src );
+extern posh_i16_t  POSH_ReadI16FromBig( const void *src );
+extern posh_u32_t  POSH_ReadU32FromBig( const void *src );
+extern posh_i32_t  POSH_ReadI32FromBig( const void *src );
+
+#if defined POSH_64BIT_INTEGER
+extern posh_u64_t *POSH_WriteU64ToLittle( void *dst, posh_u64_t value );
+extern posh_i64_t *POSH_WriteI64ToLittle( void *dst, posh_i64_t value );
+extern posh_u64_t *POSH_WriteU64ToBig( void *dst, posh_u64_t value );
+extern posh_i64_t *POSH_WriteI64ToBig( void *dst, posh_i64_t value );
+
+extern posh_u64_t  POSH_ReadU64FromLittle( const void *src );
+extern posh_i64_t  POSH_ReadI64FromLittle( const void *src );
+extern posh_u64_t  POSH_ReadU64FromBig( const void *src );
+extern posh_i64_t  POSH_ReadI64FromBig( const void *src );
+#endif /* POSH_64BIT_INTEGER */
+
+#if defined POSH_LITTLE_ENDIAN
+
+#  define POSH_LittleU16(x) (x)
+#  define POSH_LittleU32(x) (x)
+#  define POSH_LittleI16(x) (x)
+#  define POSH_LittleI32(x) (x)
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_LittleU64(x) (x)
+#    define POSH_LittleI64(x) (x)
+#  endif /* defined POSH_64BIT_INTEGER */
+
+#  define POSH_BigU16(x) POSH_SwapU16(x)
+#  define POSH_BigU32(x) POSH_SwapU32(x)
+#  define POSH_BigI16(x) POSH_SwapI16(x)
+#  define POSH_BigI32(x) POSH_SwapI32(x)
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_BigU64(x) POSH_SwapU64(x)
+#    define POSH_BigI64(x) POSH_SwapI64(x)
+#  endif /* defined POSH_64BIT_INTEGER */
+
+#else
+
+#  define POSH_BigU16(x) (x)
+#  define POSH_BigU32(x) (x)
+#  define POSH_BigI16(x) (x)
+#  define POSH_BigI32(x) (x)
+
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_BigU64(x) (x)
+#    define POSH_BigI64(x) (x)
+#  endif /* POSH_64BIT_INTEGER */
+
+#  define POSH_LittleU16(x) POSH_SwapU16(x)
+#  define POSH_LittleU32(x) POSH_SwapU32(x)
+#  define POSH_LittleI16(x) POSH_SwapI16(x)
+#  define POSH_LittleI32(x) POSH_SwapI32(x)
+
+#  if defined POSH_64BIT_INTEGER
+#    define POSH_LittleU64(x) POSH_SwapU64(x)
+#    define POSH_LittleI64(x) POSH_SwapI64(x)
+#  endif /* POSH_64BIT_INTEGER */
+
+#endif
+
+#ifdef __cplusplus
+}
+#endif
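Everything above is pure compile-time detection, so a translation unit can report what POSH inferred without linking against any implementation file. A minimal sketch, assuming only this header:

    #include <stdio.h>
    #include "posh.h"

    int main(void)
    {
        /* All of these strings are #defined above from predefined compiler macros. */
        printf("compiler: %s\n", POSH_COMPILER_STRING);
        printf("os:       %s\n", POSH_OS_STRING);
        printf("cpu:      %s\n", POSH_CPU_STRING);
        printf("endian:   %s\n", POSH_ENDIAN_STRING);
    #if defined POSH_64BIT_INTEGER
        printf("i64 max:  %" POSH_I64_PRINTF_PREFIX "d\n", POSH_I64_MAX);
    #endif
        return 0;
    }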
+
+

+ 459 - 0
3rdparty/nvtt/nvcore/stdstream.h

@@ -0,0 +1,459 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#include "nvcore.h"
+#include "stream.h"
+#include "array.h"
+
+#include <stdio.h> // fopen
+#include <string.h> // memcpy
+
+namespace nv
+{
+
+    // Portable version of fopen.
+    inline FILE * fileOpen(const char * fileName, const char * mode)
+    {
+        nvCheck(fileName != NULL);
+#if NV_CC_MSVC && _MSC_VER >= 1400
+        FILE * fp;
+        if (fopen_s(&fp, fileName, mode) == 0) {
+            return fp;
+        }
+        return NULL;
+#else
+        return fopen(fileName, mode);
+#endif
+    }
+
+
+    /// Base stdio stream.
+    class NVCORE_CLASS StdStream : public Stream
+    {
+        NV_FORBID_COPY(StdStream);
+    public:
+
+        /// Ctor.
+        StdStream( FILE * fp, bool autoclose ) : m_fp(fp), m_autoclose(autoclose) { }
+
+        /// Dtor. 
+        virtual ~StdStream()
+        {
+            if( m_fp != NULL && m_autoclose ) {
+#if NV_OS_WIN32
+                _fclose_nolock( m_fp );
+#else
+                fclose( m_fp );
+#endif
+            }
+        }
+
+
+        /** @name Stream implementation. */
+        //@{
+        virtual void seek( uint pos )
+        {
+            nvDebugCheck(m_fp != NULL);
+            nvDebugCheck(pos <= size());
+#if NV_OS_WIN32
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            fseek(m_fp, pos, SEEK_SET);
+#endif
+        }
+
+        virtual uint tell() const
+        {
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return _ftell_nolock(m_fp);
+#else
+            return (uint)ftell(m_fp);
+#endif
+        }
+
+        virtual uint size() const
+        {
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            uint pos = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, 0, SEEK_END);
+            uint end = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            uint pos = (uint)ftell(m_fp);
+            fseek(m_fp, 0, SEEK_END);
+            uint end = (uint)ftell(m_fp);
+            fseek(m_fp, pos, SEEK_SET);
+#endif
+            return end;
+        }
+
+        virtual bool isError() const
+        {
+            return m_fp == NULL || ferror( m_fp ) != 0;
+        }
+
+        virtual void clearError()
+        {
+            nvDebugCheck(m_fp != NULL);
+            clearerr(m_fp);
+        }
+
+        // @@ The original implementation used feof, which only returns true after an attempt to read *past* the end of the stream.
+        // That is, after reading the last byte of a file, isAtEnd would still return false even though the stream pointer is at the file end. That is not the intent, and it was inconsistent with MemoryStream, so this
+        // implementation uses ftell and fseek to determine our location within the file.
+        virtual bool isAtEnd() const
+        {
+            if (m_fp == NULL) return true;
+            //nvDebugCheck(m_fp != NULL);
+            //return feof( m_fp ) != 0;
+#if NV_OS_WIN32
+            uint pos = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, 0, SEEK_END);
+            uint end = _ftell_nolock(m_fp);
+            _fseek_nolock(m_fp, pos, SEEK_SET);
+#else
+            uint pos = (uint)ftell(m_fp);
+            fseek(m_fp, 0, SEEK_END);
+            uint end = (uint)ftell(m_fp);
+            fseek(m_fp, pos, SEEK_SET);
+#endif
+            return pos == end;
+        }
+
+        /// Always true.
+        virtual bool isSeekable() const { return true; }
+        //@}
+
+    protected:
+
+        FILE * m_fp;
+        bool m_autoclose;
+
+    };
+
+
+    /// Standard output stream.
+    class NVCORE_CLASS StdOutputStream : public StdStream
+    {
+        NV_FORBID_COPY(StdOutputStream);
+    public:
+
+        /// Construct stream by file name.
+        StdOutputStream( const char * name ) : StdStream(fileOpen(name, "wb"), /*autoclose=*/true) { }
+
+        /// Construct stream by file handle.
+        StdOutputStream( FILE * fp, bool autoclose ) : StdStream(fp, autoclose)
+        {
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Write data.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return (uint)_fwrite_nolock(data, 1, len, m_fp);
+#elif NV_OS_LINUX
+            return (uint)fwrite_unlocked(data, 1, len, m_fp);
+#elif NV_OS_DARWIN
+            // @@ No error checking, always returns len.
+            for (uint i = 0; i < len; i++) {
+                putc_unlocked(((char *)data)[i], m_fp);
+            }
+            return len;
+#else
+            return (uint)fwrite(data, 1, len, m_fp);
+#endif
+        }
+
+        virtual bool isLoading() const
+        {
+            return false;
+        }
+
+        virtual bool isSaving() const
+        {
+            return true;
+        }
+        //@}
+
+    };
+
+
+    /// Standard input stream.
+    class NVCORE_CLASS StdInputStream : public StdStream
+    {
+        NV_FORBID_COPY(StdInputStream);
+    public:
+
+        /// Construct stream by file name.
+        StdInputStream( const char * name ) : StdStream(fileOpen(name, "rb"), /*autoclose=*/true) { }
+
+        /// Construct stream by file handle.
+        StdInputStream( FILE * fp, bool autoclose=true ) : StdStream(fp, autoclose)
+        {
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Read data.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(m_fp != NULL);
+#if NV_OS_WIN32
+            return (uint)_fread_nolock(data, 1, len, m_fp);
+#elif NV_OS_LINUX
+            return (uint)fread_unlocked(data, 1, len, m_fp);
+#elif NV_OS_DARWIN
+            // @@ No error checking, always returns len.
+            for (uint i = 0; i < len; i++) {
+                ((char *)data)[i] = getc_unlocked(m_fp);
+            }
+            return len;
+#else
+            return (uint)fread(data, 1, len, m_fp);
+#endif
+            
+        }
+
+        virtual bool isLoading() const
+        {
+            return true;
+        }
+
+        virtual bool isSaving() const
+        {
+            return false;
+        }
+        //@}
+    };
+
+
+
+    /// Memory input stream.
+    class NVCORE_CLASS MemoryInputStream : public Stream
+    {
+        NV_FORBID_COPY(MemoryInputStream);
+    public:
+
+        /// Ctor.
+        MemoryInputStream( const uint8 * mem, uint size ) : m_mem(mem), m_ptr(mem), m_size(size) { }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Read data.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            nvDebugCheck(!isError());
+
+            uint left = m_size - tell();
+            if (len > left) len = left;
+
+            memcpy( data, m_ptr, len );
+            m_ptr += len;
+
+            return len;
+        }
+
+        virtual void seek( uint pos )
+        {
+            nvDebugCheck(!isError());
+            m_ptr = m_mem + pos;
+            nvDebugCheck(!isError());
+        }
+
+        virtual uint tell() const
+        {
+            nvDebugCheck(m_ptr >= m_mem);
+            return uint(m_ptr - m_mem);
+        }
+
+        virtual uint size() const
+        {
+            return m_size;
+        }
+
+        virtual bool isError() const
+        {
+            return m_mem == NULL || m_ptr > m_mem + m_size || m_ptr < m_mem;
+        }
+
+        virtual void clearError()
+        {
+            // Nothing to do.
+        }
+
+        virtual bool isAtEnd() const
+        {
+            return m_ptr == m_mem + m_size;
+        }
+
+        /// Always true.
+        virtual bool isSeekable() const
+        {
+            return true;
+        }
+
+        virtual bool isLoading() const
+        {
+            return true;
+        }
+
+        virtual bool isSaving() const
+        {
+            return false;
+        }
+        //@}
+
+        const uint8 * ptr() const { return m_ptr; }
+
+
+    private:
+
+        const uint8 * m_mem;
+        const uint8 * m_ptr;
+        uint m_size;
+
+    };
+
+
+    /// Buffer output stream.
+    class NVCORE_CLASS BufferOutputStream : public Stream
+    {
+        NV_FORBID_COPY(BufferOutputStream);
+    public:
+
+        BufferOutputStream(Array<uint8> & buffer) : m_buffer(buffer) { }
+
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            m_buffer.append((uint8 *)data, len);
+            return len;
+        }
+
+        virtual void seek( uint /*pos*/ ) { /*Not implemented*/ }
+        virtual uint tell() const { return m_buffer.size(); }
+        virtual uint size() const { return m_buffer.size(); }
+
+        virtual bool isError() const { return false; }
+        virtual void clearError() {}
+
+        virtual bool isAtEnd() const { return true; }
+        virtual bool isSeekable() const { return false; }
+        virtual bool isLoading() const { return false; }
+        virtual bool isSaving() const { return true; }
+
+    private:
+        Array<uint8> & m_buffer;
+    };
+
+
+    /// Protected input stream.
+    class NVCORE_CLASS ProtectedStream : public Stream
+    {
+        NV_FORBID_COPY(ProtectedStream);
+    public:
+
+        /// Ctor.
+        ProtectedStream( Stream & s ) : m_s(&s), m_autodelete(false)
+        { 
+        }
+
+        /// Ctor.
+        ProtectedStream( Stream * s, bool autodelete = true ) : 
+        m_s(s), m_autodelete(autodelete) 
+        {
+            nvDebugCheck(m_s != NULL);
+        }
+
+        /// Dtor.
+        virtual ~ProtectedStream()
+        {
+            if( m_autodelete ) {
+                delete m_s;
+            }
+        }
+
+        /** @name Stream implementation. */
+        //@{
+        /// Read data.
+        virtual uint serialize( void * data, uint len )
+        {
+            nvDebugCheck(data != NULL);
+            len = m_s->serialize( data, len );
+
+            if( m_s->isError() ) {
+                throw;
+            }
+
+            return len;
+        }
+
+        virtual void seek( uint pos )
+        {
+            m_s->seek( pos );
+
+            if( m_s->isError() ) {
+                throw;
+            }
+        }
+
+        virtual uint tell() const
+        {
+            return m_s->tell();
+        }
+
+        virtual uint size() const
+        {
+            return m_s->size();
+        }
+
+        virtual bool isError() const
+        {
+            return m_s->isError();
+        }
+
+        virtual void clearError()
+        {
+            m_s->clearError();
+        }
+
+        virtual bool isAtEnd() const
+        {
+            return m_s->isAtEnd();
+        }
+
+        virtual bool isSeekable() const
+        {
+            return m_s->isSeekable();
+        }
+
+        virtual bool isLoading() const
+        {
+            return m_s->isLoading();
+        }
+
+        virtual bool isSaving() const
+        {
+            return m_s->isSaving();
+        }
+        //@}
+
+
+    private:
+
+        Stream * const m_s;
+        bool const m_autodelete;
+
+    };
+
+} // nv namespace
+
+
+//#endif // NV_CORE_STDSTREAM_H
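A short round trip through the classes above, writing to an in-memory buffer instead of a file; a sketch that assumes the accompanying nvcore headers (in particular array.h's buffer()/size() accessors) are on the include path:

    #include "stdstream.h"

    int main()
    {
        using namespace nv;

        // Write four bytes into a growable buffer.
        Array<uint8> buffer;
        BufferOutputStream out(buffer);
        uint8 magic[4] = { 'N', 'V', 'T', 'T' };
        out.serialize(magic, 4);

        // Read them back through a MemoryInputStream.
        MemoryInputStream in(buffer.buffer(), buffer.size());
        uint8 check[4] = { 0, 0, 0, 0 };
        in.serialize(check, 4);

        return (check[3] == 'T' && in.isAtEnd()) ? 0 : 1;
    }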

+ 163 - 0
3rdparty/nvtt/nvcore/stream.h

@@ -0,0 +1,163 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_STREAM_H
+#define NV_CORE_STREAM_H
+
+#include "nvcore.h"
+#include "debug.h"
+
+namespace nv
+{
+
+    /// Base stream class.
+    class NVCORE_CLASS Stream {
+    public:
+
+        enum ByteOrder {
+            LittleEndian = false,
+            BigEndian = true,
+        };
+
+        /// Get the byte order of the system.
+        static ByteOrder getSystemByteOrder() { 
+#if NV_LITTLE_ENDIAN
+            return LittleEndian;
+#else
+            return BigEndian;
+#endif
+        }
+
+
+        /// Ctor.
+        Stream() : m_byteOrder(LittleEndian) { }
+
+        /// Virtual destructor.
+        virtual ~Stream() {}
+
+        /// Set byte order.
+        void setByteOrder(ByteOrder bo) { m_byteOrder = bo; }
+
+        /// Get byte order.
+        ByteOrder byteOrder() const { return m_byteOrder; }
+
+
+        /// Serialize the given data.
+        virtual uint serialize( void * data, uint len ) = 0;
+
+        /// Move to the given position in the archive.
+        virtual void seek( uint pos ) = 0;
+
+        /// Return the current position in the archive.
+        virtual uint tell() const = 0;
+
+        /// Return the current size of the archive.
+        virtual uint size() const = 0;
+
+        /// Determine if there has been any error.
+        virtual bool isError() const = 0;
+
+        /// Clear errors.
+        virtual void clearError() = 0;
+
+        /// Return true if the stream is at the end.
+        virtual bool isAtEnd() const = 0;
+
+        /// Return true if the stream is seekable.
+        virtual bool isSeekable() const = 0;
+
+        /// Return true if this is an input stream.
+        virtual bool isLoading() const = 0;
+
+        /// Return true if this is an output stream.
+        virtual bool isSaving() const = 0;
+
+
+        void advance(uint offset) { seek(tell() + offset); }
+
+
+        // friends	
+        friend Stream & operator<<( Stream & s, bool & c ) {
+#if NV_OS_DARWIN && !NV_CC_CPP11
+            nvStaticCheck(sizeof(bool) == 4);
+            uint8 b = c ? 1 : 0;
+            s.serialize( &b, 1 );
+            c = (b == 1);
+#else
+            nvStaticCheck(sizeof(bool) == 1);
+            s.serialize( &c, 1 );
+#endif
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, char & c ) {
+            nvStaticCheck(sizeof(char) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, uint8 & c ) {
+            nvStaticCheck(sizeof(uint8) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, int8 & c ) {
+            nvStaticCheck(sizeof(int8) == 1);
+            s.serialize( &c, 1 );
+            return s;
+        }
+        friend Stream & operator<<( Stream & s, uint16 & c ) {
+            nvStaticCheck(sizeof(uint16) == 2);
+            return s.byteOrderSerialize( &c, 2 );
+        }
+        friend Stream & operator<<( Stream & s, int16 & c ) {
+            nvStaticCheck(sizeof(int16) == 2);
+            return s.byteOrderSerialize( &c, 2 );
+        }
+        friend Stream & operator<<( Stream & s, uint32 & c ) {
+            nvStaticCheck(sizeof(uint32) == 4);
+            return s.byteOrderSerialize( &c, 4 );
+        }
+        friend Stream & operator<<( Stream & s, int32 & c ) {
+            nvStaticCheck(sizeof(int32) == 4);
+            return s.byteOrderSerialize( &c, 4 );
+        }
+        friend Stream & operator<<( Stream & s, uint64 & c ) {
+            nvStaticCheck(sizeof(uint64) == 8);
+            return s.byteOrderSerialize( &c, 8 );
+        }
+        friend Stream & operator<<( Stream & s, int64 & c ) {
+            nvStaticCheck(sizeof(int64) == 8);
+            return s.byteOrderSerialize( &c, 8 );
+        }
+        friend Stream & operator<<( Stream & s, float & c ) {
+            nvStaticCheck(sizeof(float) == 4);
+            return s.byteOrderSerialize( &c, 4 );
+        }
+        friend Stream & operator<<( Stream & s, double & c ) {
+            nvStaticCheck(sizeof(double) == 8);
+            return s.byteOrderSerialize( &c, 8 );
+        }
+
+    protected:
+
+        /// Serialize in the stream byte order.
+        Stream & byteOrderSerialize( void * v, uint len ) {
+            if( m_byteOrder == getSystemByteOrder() ) {
+                serialize( v, len );
+            }
+            else {
+                for( uint i = len; i > 0; i-- ) {
+                    serialize( (uint8 *)v + i - 1, 1 );
+                }
+            }
+            return *this;
+        }
+
+
+    private:
+
+        ByteOrder m_byteOrder;
+
+    };
+
+} // nv namespace
+
+#endif // NV_CORE_STREAM_H
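byteOrderSerialize is what the operator<< overloads funnel through: when the stream's byte order differs from the host's, each value is serialized one byte at a time in reverse. A sketch of reading a big-endian uint32 through a MemoryInputStream (defined in stdstream.h earlier in this commit):

    #include "stdstream.h"   // MemoryInputStream; also pulls in stream.h

    int main()
    {
        using namespace nv;

        // 0x01020304 stored big-endian.
        const uint8 bytes[4] = { 0x01, 0x02, 0x03, 0x04 };

        MemoryInputStream in(bytes, 4);
        in.setByteOrder(Stream::BigEndian);

        uint32 value = 0;
        in << value;    // swapped per byte on a little-endian host, read directly on a big-endian one

        return value == 0x01020304 ? 0 : 1;
    }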

+ 429 - 0
3rdparty/nvtt/nvcore/strlib.h

@@ -0,0 +1,429 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_STRING_H
+#define NV_CORE_STRING_H
+
+#include "debug.h"
+#include "hash.h" // hash
+
+//#include <string.h> // strlen, etc.
+
+#if NV_OS_WIN32
+#define NV_PATH_SEPARATOR '\\'
+#else
+#define NV_PATH_SEPARATOR '/'
+#endif
+
+namespace nv
+{
+
+    NVCORE_API uint strHash(const char * str, uint h) NV_PURE;
+
+    /// String hash based on Bernstein's hash.
+    inline uint strHash(const char * data, uint h = 5381)
+    {
+        uint i = 0;
+        while(data[i] != 0) {
+            h = (33 * h) ^ uint(data[i]);
+            i++;
+        }
+        return h;
+    }
+
+    template <> struct Hash<const char *> {
+        uint operator()(const char * str) const { return strHash(str); }
+    };
+
+    NVCORE_API uint strLen(const char * str) NV_PURE;                       // Asserts on NULL strings.
+
+    NVCORE_API int strDiff(const char * s1, const char * s2) NV_PURE;       // Asserts on NULL strings.
+    NVCORE_API int strCaseDiff(const char * s1, const char * s2) NV_PURE;   // Asserts on NULL strings.
+    NVCORE_API bool strEqual(const char * s1, const char * s2) NV_PURE;     // Accepts NULL strings.
+    NVCORE_API bool strCaseEqual(const char * s1, const char * s2) NV_PURE; // Accepts NULL strings.
+
+    template <> struct Equal<const char *> {
+        bool operator()(const char * a, const char * b) const { return strEqual(a, b); }
+    };
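strHash is fully inline, so it can be evaluated without a separate implementation file, and the Hash specialization above lets raw C strings serve as keys in nv's hashed containers. A tiny sketch, assuming the nvcore headers are on the include path:

    #include "strlib.h"

    int main()
    {
        using namespace nv;

        uint a = strHash("bc6h");      // Bernstein hash with the default 5381 seed

        Hash<const char *> hasher;     // functor form used by the hashed containers
        return (hasher("bc6h") == a) ? 0 : 1;
    }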
+
+    NVCORE_API bool strBeginsWith(const char * dst, const char * prefix) NV_PURE;
+    NVCORE_API bool strEndsWith(const char * dst, const char * suffix) NV_PURE;
+
+
+    NVCORE_API void strCpy(char * dst, uint size, const char * src);
+    NVCORE_API void strCpy(char * dst, uint size, const char * src, uint len);
+    NVCORE_API void strCat(char * dst, uint size, const char * src);
+
+    NVCORE_API const char * strSkipWhiteSpace(const char * str);
+    NVCORE_API char * strSkipWhiteSpace(char * str);
+
+    NVCORE_API bool strMatch(const char * str, const char * pat) NV_PURE;
+
+    NVCORE_API bool isNumber(const char * str) NV_PURE;
+
+    /* @@ Implement these two functions and modify StringBuilder to use them?
+    NVCORE_API void strFormat(const char * dst, const char * fmt, ...);
+    NVCORE_API void strFormatList(const char * dst, const char * fmt, va_list arg);
+
+    template <size_t count> void strFormatSafe(char (&buffer)[count], const char *fmt, ...) __attribute__((format (printf, 2, 3)));
+    template <size_t count> void strFormatSafe(char (&buffer)[count], const char *fmt, ...) {
+        va_list args;
+        va_start(args, fmt);
+        strFormatList(buffer, count, fmt, args);
+        va_end(args);
+    }
+    template <size_t count> void strFormatListSafe(char (&buffer)[count], const char *fmt, va_list arg) {
+        va_list tmp;
+        va_copy(tmp, args);
+        strFormatList(buffer, count, fmt, tmp);
+        va_end(tmp);
+    }*/
+
+    template <int count> void strCpySafe(char (&buffer)[count], const char *src) {
+        strCpy(buffer, count, src);
+    }
+
+    template <int count> void strCatSafe(char (&buffer)[count], const char * src) {
+        strCat(buffer, count, src);
+    }
+
+
+
+    /// String builder.
+    class NVCORE_CLASS StringBuilder
+    {
+    public:
+
+        StringBuilder();
+        explicit StringBuilder( uint size_hint );
+        StringBuilder(const char * str);
+        StringBuilder(const char * str, uint len);
+        StringBuilder(const StringBuilder & other);
+
+        ~StringBuilder();
+
+        StringBuilder & format( const char * format, ... ) __attribute__((format (printf, 2, 3)));
+        StringBuilder & formatList( const char * format, va_list arg );
+
+        StringBuilder & append(const char * str);
+		StringBuilder & append(const char * str, uint len);
+        StringBuilder & appendFormat(const char * format, ...) __attribute__((format (printf, 2, 3)));
+        StringBuilder & appendFormatList(const char * format, va_list arg);
+
+        StringBuilder & appendSpace(uint n);
+
+        StringBuilder & number( int i, int base = 10 );
+        StringBuilder & number( uint i, int base = 10 );
+
+        StringBuilder & reserve(uint size_hint);
+        StringBuilder & copy(const char * str);
+        StringBuilder & copy(const char * str, uint len);
+        StringBuilder & copy(const StringBuilder & str);
+
+        StringBuilder & toLower();
+        StringBuilder & toUpper();
+
+        bool endsWith(const char * str) const;
+        bool beginsWith(const char * str) const;
+
+        char * reverseFind(char c);
+
+        void reset();
+        bool isNull() const { return m_size == 0; }
+
+        // const char * accessors
+        //operator const char * () const { return m_str; }
+        //operator char * () { return m_str; }
+        const char * str() const { return m_str; }
+        char * str() { return m_str; }
+
+        char * release();
+
+        /// Implement value semantics.
+        StringBuilder & operator=( const StringBuilder & s ) {
+            return copy(s);
+        }
+
+        /// Implement value semantics.
+        StringBuilder & operator=( const char * s ) {
+            return copy(s);
+        }
+
+        /// Equal operator.
+        bool operator==( const StringBuilder & s ) const {
+            return strMatch(s.m_str, m_str);
+        }
+
+        /// Return the exact length.
+        uint length() const { return isNull() ? 0 : strLen(m_str); }
+
+        /// Return the size of the string container.
+        uint capacity() const { return m_size; }
+
+        /// Return the hash of the string.
+        uint hash() const { return isNull() ? 0 : strHash(m_str); }
+
+        // Swap strings.
+        friend void swap(StringBuilder & a, StringBuilder & b);
+
+    protected:
+
+        /// Size of the string container.
+        uint m_size;
+
+        /// String.
+        char * m_str;
+
+    };
+
+
+    /// Path string. @@ This should be called PathBuilder.
+    class NVCORE_CLASS Path : public StringBuilder
+    {
+    public:
+        Path() : StringBuilder() {}
+        explicit Path(int size_hint) : StringBuilder(size_hint) {}
+        Path(const char * str) : StringBuilder(str) {}
+        Path(const Path & path) : StringBuilder(path) {}
+
+        const char * fileName() const;
+        const char * extension() const;
+
+        void translatePath(char pathSeparator = NV_PATH_SEPARATOR);
+
+        void appendSeparator(char pathSeparator = NV_PATH_SEPARATOR);
+
+        void stripFileName();
+        void stripExtension();
+
+        // statics
+        NVCORE_API static char separator();
+        NVCORE_API static const char * fileName(const char *);
+        NVCORE_API static const char * extension(const char *);
+
+        NVCORE_API static void translatePath(char * path, char pathSeparator = NV_PATH_SEPARATOR);
+    };
+
+
+    /// String class.
+    class NVCORE_CLASS String
+    {
+    public:
+
+        /// Constructs a null string. @sa isNull()
+        String()
+        {
+            data = NULL;
+        }
+
+        /// Constructs a shared copy of str.
+        String(const String & str)
+        {
+            data = str.data;
+            if (data != NULL) addRef();
+        }
+
+        /// Constructs a shared string from a standard string.
+        String(const char * str)
+        {
+            setString(str);
+        }
+
+        /// Constructs a shared string from a standard string.
+        String(const char * str, int length)
+        {
+            setString(str, length);
+        }
+
+        /// Constructs a shared string from a StringBuilder.
+        String(const StringBuilder & str)
+        {
+            setString(str);
+        }
+
+        /// Dtor.
+        ~String()
+        {
+            release();
+        }
+
+        String clone() const;
+
+        /// Release the current string and allocate a new one.
+        const String & operator=( const char * str )
+        {
+            release();
+            setString( str );
+            return *this;
+        }
+
+        /// Release the current string and allocate a new one.
+        const String & operator=( const StringBuilder & str )
+        {
+            release();
+            setString( str );
+            return *this;
+        }
+
+        /// Implement value semantics.
+        String & operator=( const String & str )
+        {
+            if (str.data != data)
+            {
+                release();
+                data = str.data;
+                addRef();
+            }
+            return *this;
+        }
+
+        /// Equal operator.
+        bool operator==( const String & str ) const
+        {
+            return strMatch(str.data, data);
+        }
+
+        /// Equal operator.
+        bool operator==( const char * str ) const
+        {
+            return strMatch(str, data);
+        }
+
+        /// Not equal operator.
+        bool operator!=( const String & str ) const
+        {
+            return !strMatch(str.data, data);
+        }
+
+        /// Not equal operator.
+        bool operator!=( const char * str ) const
+        {
+            return !strMatch(str, data);
+        }
+
+        /// Returns true if this string is the null string.
+        bool isNull() const { return data == NULL; }
+
+        /// Return the exact length.
+        uint length() const { nvDebugCheck(data != NULL); return strLen(data); }
+
+        /// Return the hash of the string.
+        uint hash() const { nvDebugCheck(data != NULL); return strHash(data); }
+
+        /// const char * cast operator.
+        operator const char * () const { return data; }
+
+        /// Get string pointer.
+        const char * str() const { return data; }
+
+
+    private:
+
+        // Add reference count.
+        void addRef();
+
+        // Decrease reference count.
+        void release();
+
+        uint16 getRefCount() const
+        {
+            nvDebugCheck(data != NULL);
+            return *reinterpret_cast<const uint16 *>(data - 2);
+        }
+
+        void setRefCount(uint16 count) {
+            nvDebugCheck(data != NULL);
+            nvCheck(count < 0xFFFF);
+            *reinterpret_cast<uint16 *>(const_cast<char *>(data - 2)) = uint16(count);
+        }
+
+        void setData(const char * str) {
+            data = str + 2;
+        }
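+
+        // Shared string layout, as implied by setData/setRefCount above: a uint16
+        // reference count is stored immediately before the characters, so 'data'
+        // points two bytes into the allocation and the count lives at data - 2.
+        //
+        //     [uint16 refCount][chars ... '\0']
+        //                       ^-- data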
+
+        void allocString(const char * str)
+        {
+            allocString(str, strLen(str));
+        }
+
+        void allocString(const char * str, uint length);
+
+        void setString(const char * str);
+        void setString(const char * str, uint length);
+        void setString(const StringBuilder & str);
+
+        // Swap strings.
+        friend void swap(String & a, String & b);
+
+    private:
+
+        const char * data;
+
+    };
+
+    template <> struct Hash<String> {
+        uint operator()(const String & str) const { return str.hash(); }
+    };
+
+
+    // Like AutoPtr, but for const char strings.
+    class AutoString
+    {
+        NV_FORBID_COPY(AutoString);
+        NV_FORBID_HEAPALLOC();
+    public:
+
+        // Ctor.
+        AutoString(const char * p = NULL) : m_ptr(p) { }
+
+#if NV_CC_CPP11
+        // Move ctor.
+        AutoString(AutoString && ap) : m_ptr(ap.m_ptr) { ap.m_ptr = NULL; }
+#endif
+        
+        // Dtor. Deletes owned pointer.
+        ~AutoString() {
+            delete [] m_ptr;
+            m_ptr = NULL;
+        }
+
+        // Delete owned pointer and assign new one.
+        void operator=(const char * p) {
+            if (p != m_ptr) 
+            {
+                delete [] m_ptr;
+                m_ptr = p;
+            }
+        }
+
+        // Get pointer.
+        const char * ptr() const { return m_ptr; }
+        operator const char *() const { return m_ptr; }
+
+        // Relinquishes ownership of the underlying pointer and returns it.
+        const char * release() {
+            const char * tmp = m_ptr;
+            m_ptr = NULL;
+            return tmp;
+        }
+
+        // comparison operators.
+        friend bool operator == (const AutoString & ap, const char * const p) {
+            return (ap.ptr() == p);
+        }
+        friend bool operator != (const AutoString & ap, const char * const p) {
+            return (ap.ptr() != p);
+        }
+        friend bool operator == (const char * const p, const AutoString & ap) {
+            return (ap.ptr() == p);
+        }
+        friend bool operator != (const char * const p, const AutoString & ap) {
+            return (ap.ptr() != p);
+        }
+
+    private:
+        const char * m_ptr;
+    };
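+
+    // Usage sketch (illustrative; assumes 'copy' is a string allocated with new[]):
+    //
+    //     AutoString s(copy);          // takes ownership
+    //     doSomethingWith(s.ptr());
+    //                                  // delete[]'d automatically when s goes out of scope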
+
+} // nv namespace
+
+#endif // NV_CORE_STRING_H

+ 281 - 0
3rdparty/nvtt/nvcore/utils.h

@@ -0,0 +1,281 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_CORE_UTILS_H
+#define NV_CORE_UTILS_H
+
+#include "debug.h" // nvdebugcheck
+
+#include <new> // for placement new
+
+
+// Just in case. Grrr.
+#undef min
+#undef max
+
+#define NV_INT8_MIN    (-128)
+#define NV_INT8_MAX    127
+#define NV_UINT8_MAX    255
+#define NV_INT16_MIN    (-32767-1)
+#define NV_INT16_MAX    32767
+#define NV_UINT16_MAX   0xffff
+#define NV_INT32_MIN    (-2147483647-1)
+#define NV_INT32_MAX    2147483647
+#define NV_UINT32_MAX   0xffffffff
+#define NV_INT64_MAX    POSH_I64(9223372036854775807)
+#define NV_INT64_MIN    (-POSH_I64(9223372036854775807)-1)
+#define NV_UINT64_MAX   POSH_U64(0xffffffffffffffff)
+
+#define NV_HALF_MAX     65504.0F
+#define NV_FLOAT_MAX    3.402823466e+38F
+
+#define NV_INTEGER_TO_FLOAT_MAX  16777217     // 2^24 + 1. Every integer of smaller magnitude can be stored exactly in a 32bit float.
+
+
+namespace nv
+{
+    // Less error prone than casting. From CB:
+    // http://cbloomrants.blogspot.com/2011/06/06-17-11-c-casting-is-devil.html
+
+    // These intentionally look like casts.
+
+    // uint32 casts:
+    template <typename T> inline uint32 U32(T x) { return x; }
+    template <> inline uint32 U32<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT32_MAX); return (uint32)x; }
+    template <> inline uint32 U32<int64>(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT32_MAX); return (uint32)x; }
+    //template <> inline uint32 U32<uint32>(uint32 x) { return x; }
+    template <> inline uint32 U32<int32>(int32 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+    //template <> inline uint32 U32<uint16>(uint16 x) { return x; }
+    template <> inline uint32 U32<int16>(int16 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+    //template <> inline uint32 U32<uint8>(uint8 x) { return x; }
+    template <> inline uint32 U32<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint32)x; }
+
+    // int32 casts:
+    template <typename T> inline int32 I32(T x) { return x; }
+    template <> inline int32 I32<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
+    template <> inline int32 I32<int64>(int64 x) { nvDebugCheck(x >= NV_INT32_MIN && x <= NV_UINT32_MAX); return (int32)x; }
+    template <> inline int32 I32<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT32_MAX); return (int32)x; }
+    //template <> inline int32 I32<int32>(int32 x) { return x; }
+    //template <> inline int32 I32<uint16>(uint16 x) { return x; }
+    //template <> inline int32 I32<int16>(int16 x) { return x; }
+    //template <> inline int32 I32<uint8>(uint8 x) { return x; }
+    //template <> inline int32 I32<int8>(int8 x) { return x; }
+
+    // uint16 casts:
+    template <typename T> inline uint16 U16(T x) { return x; }
+    template <> inline uint16 U16<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16<int64>(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16<uint32>(uint32 x) { nvDebugCheck(x <= NV_UINT16_MAX); return (uint16)x; }
+    template <> inline uint16 U16<int32>(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT16_MAX); return (uint16)x; }
+    //template <> inline uint16 U16<uint16>(uint16 x) { return x; }
+    template <> inline uint16 U16<int16>(int16 x) { nvDebugCheck(x >= 0); return (uint16)x; }
+    //template <> inline uint16 U16<uint8>(uint8 x) { return x; }
+    template <> inline uint16 U16<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint16)x; }
+
+    // int16 casts:
+    template <typename T> inline int16 I16(T x) { return x; }
+    template <> inline int16 I16<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<int64>(int64 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_UINT16_MAX); return (int16)x; }
+    template <> inline int16 I16<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    template <> inline int16 I16<int32>(int32 x) { nvDebugCheck(x >= NV_INT16_MIN && x <= NV_UINT16_MAX); return (int16)x; }
+    template <> inline int16 I16<uint16>(uint16 x) { nvDebugCheck(x <= NV_INT16_MAX); return (int16)x; }
+    //template <> inline int16 I16<int16>(int16 x) { return x; }
+    //template <> inline int16 I16<uint8>(uint8 x) { return x; }
+    //template <> inline int16 I16<int8>(int8 x) { return x; }
+
+    // uint8 casts:
+    template <typename T> inline uint8 U8(T x) { return x; }
+    template <> inline uint8 U8<uint64>(uint64 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<int64>(int64 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<uint32>(uint32 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<int32>(int32 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<uint16>(uint16 x) { nvDebugCheck(x <= NV_UINT8_MAX); return (uint8)x; }
+    template <> inline uint8 U8<int16>(int16 x) { nvDebugCheck(x >= 0 && x <= NV_UINT8_MAX); return (uint8)x; }
+    //template <> inline uint8 U8<uint8>(uint8 x) { return x; }
+    template <> inline uint8 U8<int8>(int8 x) { nvDebugCheck(x >= 0); return (uint8)x; }
+    //template <> inline uint8 U8<float>(int8 x) { nvDebugCheck(x >= 0.0f && x <= 255.0f); return (uint8)x; }
+
+    // int8 casts:
+    template <typename T> inline int8 I8(T x) { return x; }
+    template <> inline int8 I8<uint64>(uint64 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int64>(int64 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint32>(uint32 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int32>(int32 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint16>(uint16 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    template <> inline int8 I8<int16>(int16 x) { nvDebugCheck(x >= NV_INT8_MIN && x <= NV_UINT8_MAX); return (int8)x; }
+    template <> inline int8 I8<uint8>(uint8 x) { nvDebugCheck(x <= NV_INT8_MAX); return (int8)x; }
+    //template <> inline int8 I8<int8>(int8 x) { return x; }
+
+    // float casts:
+    template <typename T> inline float F32(T x) { return x; }
+    template <> inline float F32<uint64>(uint64 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32<int64>(int64 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32<uint32>(uint32 x) { nvDebugCheck(x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    template <> inline float F32<int32>(int32 x) { nvDebugCheck(x >= -NV_INTEGER_TO_FLOAT_MAX && x <= NV_INTEGER_TO_FLOAT_MAX); return (float)x; }
+    // The compiler should not complain about these conversions:
+    //template <> inline float F32<uint16>(uint16 x) { nvDebugCheck(return (float)x; }
+    //template <> inline float F32<int16>(int16 x) { nvDebugCheck(return (float)x; }
+    //template <> inline float F32<uint8>(uint8 x) { nvDebugCheck(return (float)x; }
+    //template <> inline float F32<int8>(int8 x) { nvDebugCheck(return (float)x; }
+
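+    // Usage sketch for the checked casts above (illustrative; the variables are made up):
+    //
+    //     uint64 byteCount = ...;
+    //     uint32 count32 = U32(byteCount);   // asserts in debug builds if byteCount > NV_UINT32_MAX
+    //     int32 offset = I32(count32);       // asserts if count32 > NV_INT32_MAX
+    //
+    // When asserts are disabled these behave like plain casts.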
+
+    /// Swap two values.
+    template <typename T> 
+    inline void swap(T & a, T & b)
+    {
+        T temp(a);
+        a = b; 
+        b = temp;
+    }
+
+    /// Return the maximum of the two arguments. For floating point values, it returns the second value if the first is NaN.
+    template <typename T> 
+    //inline const T & max(const T & a, const T & b)
+    inline T max(const T & a, const T & b)
+    {
+        return (b < a) ? a : b;
+    }
+
+	/// Return the maximum of the four arguments.
+	template <typename T> 
+	//inline const T & max4(const T & a, const T & b, const T & c)
+	inline T max4(const T & a, const T & b, const T & c, const T & d)
+	{
+		return max(max(a, b), max(c, d));
+	}
+
+    /// Return the maximum of the three arguments.
+    template <typename T> 
+    //inline const T & max3(const T & a, const T & b, const T & c)
+    inline T max3(const T & a, const T & b, const T & c)
+    {
+        return max(a, max(b, c));
+    }
+
+    /// Return the minimum of two values.
+    template <typename T> 
+    //inline const T & min(const T & a, const T & b)
+    inline T min(const T & a, const T & b)
+    {
+        return (a < b) ? a : b;
+    }
+
+    /// Return the minimum of the three arguments.
+    template <typename T> 
+    //inline const T & min3(const T & a, const T & b, const T & c)
+    inline T min3(const T & a, const T & b, const T & c)
+    {
+        return min(a, min(b, c));
+    }
+
+    /// Clamp between two values.
+    template <typename T> 
+    //inline const T & clamp(const T & x, const T & a, const T & b)
+    inline T clamp(const T & x, const T & a, const T & b)
+    {
+        return min(max(x, a), b);
+    }
+
+    /** Return the next power of two. 
+    * @see http://graphics.stanford.edu/~seander/bithacks.html
+    * @warning Behaviour for 0 is undefined.
+    * @note isPowerOfTwo(x) == true -> nextPowerOfTwo(x) == x
+    * @note nextPowerOfTwo(x) = 2 << log2(x-1)
+    */
+    inline uint nextPowerOfTwo( uint x )
+    {
+        nvDebugCheck( x != 0 );
+#if 1	// On modern CPUs this is supposed to be as fast as using the bsr instruction.
+        x--;
+        x |= x >> 1;
+        x |= x >> 2;
+        x |= x >> 4;
+        x |= x >> 8;
+        x |= x >> 16;
+        return x+1;	
+#else
+        uint p = 1;
+        while( x > p ) {
+            p += p;
+        }
+        return p;
+#endif
+    }
+
+    /// Return true if @a n is a power of two.
+    inline bool isPowerOfTwo( uint n )
+    {
+        return (n & (n-1)) == 0;
+    }
+
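+    // Worked examples (illustrative):
+    //
+    //     nextPowerOfTwo(17) == 32      nextPowerOfTwo(32) == 32
+    //     isPowerOfTwo(48)   == false   isPowerOfTwo(0)    == true (degenerate case)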
+
+    // @@ Move this to utils?
+    /// Delete all the elements of a container.
+    template <typename T>
+    void deleteAll(T & container)
+    {
+        for (typename T::PseudoIndex i = container.start(); !container.isDone(i); container.advance(i))
+        {
+            delete container[i];
+        }
+    }
+
+
+
+    // @@ Specialize these methods for numeric, pointer, and pod types.
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size) {
+        for (uint i = old_size; i < new_size; i++) {
+            new(ptr+i) T; // placement new
+        }
+    }
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T & elem) {
+        for (uint i = old_size; i < new_size; i++) {
+            new(ptr+i) T(elem); // placement new
+        }
+    }
+
+    template <typename T>
+    void construct_range(T * restrict ptr, uint new_size, uint old_size, const T * src) {
+        for (uint i = old_size; i < new_size; i++) {
+            new(ptr+i) T(src[i]); // placement new
+        }
+    }
+
+    template <typename T>
+    void destroy_range(T * restrict ptr, uint new_size, uint old_size) {
+        for (uint i = new_size; i < old_size; i++) {
+            (ptr+i)->~T(); // Explicit call to the destructor
+        }
+    }
+
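+    // Usage sketch for the range helpers above (illustrative): growing a raw buffer
+    // from old_size to new_size constructed elements, then shrinking it back.
+    //
+    //     construct_range(ptr, new_size, old_size);   // placement-new elements [old_size, new_size)
+    //     destroy_range(ptr, old_size, new_size);     // destroy elements [old_size, new_size) again
+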
+    template <typename T>
+    void fill(T * restrict dst, uint count, const T & value) {
+        for (uint i = 0; i < count; i++) {
+            dst[i] = value;
+        }
+    }
+
+    template <typename T>
+    void copy_range(T * restrict dst, const T * restrict src, uint count) {
+        for (uint i = 0; i < count; i++) {
+            dst[i] = src[i];
+        }
+    }
+
+    template <typename T>
+    bool find(const T & element, const T * restrict ptr, uint begin, uint end, uint * index) {
+        for (uint i = begin; i < end; i++) {
+            if (ptr[i] == element) {
+                if (index != NULL) *index = i;
+                return true;
+            }
+        }
+        return false;
+    }
+
+} // nv namespace
+
+#endif // NV_CORE_UTILS_H

+ 921 - 0
3rdparty/nvtt/nvmath/Vector.inl

@@ -0,0 +1,921 @@
+// This code is in the public domain -- [email protected]
+
+#ifndef NV_MATH_VECTOR_INL
+#define NV_MATH_VECTOR_INL
+
+#include "vector.h"
+#include "nvcore/utils.h" // min, max
+#include "nvcore/hash.h" // hash
+
+namespace nv
+{
+
+    // Helpers to convert vector types. Assume T has x,y members and 2 argument constructor.
+    //template <typename T> T to(Vector2::Arg v) { return T(v.x, v.y); }
+
+    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
+    //template <typename T> T to(Vector3::Arg v) { return T(v.x, v.y, v.z); }
+
+    // Helpers to convert vector types. Assume T has x,y,z members and 3 argument constructor.
+    //template <typename T> T to(Vector4::Arg v) { return T(v.x, v.y, v.z, v.w); }
+
+
+    // Vector2
+    inline Vector2::Vector2() {}
+    inline Vector2::Vector2(float f) : x(f), y(f) {}
+    inline Vector2::Vector2(float x, float y) : x(x), y(y) {}
+    inline Vector2::Vector2(Vector2::Arg v) : x(v.x), y(v.y) {}
+
+    inline const Vector2 & Vector2::operator=(Vector2::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        return *this;
+    }
+
+    inline const float * Vector2::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector2::set(float x, float y)
+    {
+        this->x = x;
+        this->y = y;
+    }
+
+    inline Vector2 Vector2::operator-() const
+    {
+        return Vector2(-x, -y);
+    }
+
+    inline void Vector2::operator+=(Vector2::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+    }
+
+    inline void Vector2::operator-=(Vector2::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+    }
+
+    inline void Vector2::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+    }
+
+    inline void Vector2::operator*=(Vector2::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+    }
+
+    inline bool operator==(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x == b.x && a.y == b.y; 
+    }
+    inline bool operator!=(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x != b.x || a.y != b.y; 
+    }
+
+
+    // Vector3
+    inline Vector3::Vector3() {}
+    inline Vector3::Vector3(float f) : x(f), y(f), z(f) {}
+    inline Vector3::Vector3(float x, float y, float z) : x(x), y(y), z(z) {}
+    inline Vector3::Vector3(Vector2::Arg v, float z) : x(v.x), y(v.y), z(z) {}
+    inline Vector3::Vector3(Vector3::Arg v) : x(v.x), y(v.y), z(v.z) {}
+
+    inline const Vector3 & Vector3::operator=(Vector3::Arg v)
+    {
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        return *this;
+    }
+
+
+    inline Vector2 Vector3::xy() const
+    {
+        return Vector2(x, y);
+    }
+
+    inline const float * Vector3::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector3::set(float x, float y, float z)
+    {
+        this->x = x;
+        this->y = y;
+        this->z = z;
+    }
+
+    inline Vector3 Vector3::operator-() const
+    {
+        return Vector3(-x, -y, -z);
+    }
+
+    inline void Vector3::operator+=(Vector3::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+        z += v.z;
+    }
+
+    inline void Vector3::operator-=(Vector3::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+        z -= v.z;
+    }
+
+    inline void Vector3::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+        z *= s;
+    }
+
+    inline void Vector3::operator/=(float s)
+    {
+        float is = 1.0f / s;
+        x *= is;
+        y *= is;
+        z *= is;
+    }
+
+    inline void Vector3::operator*=(Vector3::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+        z *= v.z;
+    }
+
+    inline void Vector3::operator/=(Vector3::Arg v)
+    {
+        x /= v.x;
+        y /= v.y;
+        z /= v.z;
+    }
+
+    inline bool operator==(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x == b.x && a.y == b.y && a.z == b.z; 
+    }
+    inline bool operator!=(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x != b.x || a.y != b.y || a.z != b.z; 
+    }
+
+
+    // Vector4
+    inline Vector4::Vector4() {}
+    inline Vector4::Vector4(float f) : x(f), y(f), z(f), w(f) {}
+    inline Vector4::Vector4(float x, float y, float z, float w) : x(x), y(y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, float z, float w) : x(v.x), y(v.y), z(z), w(w) {}
+    inline Vector4::Vector4(Vector2::Arg v, Vector2::Arg u) : x(v.x), y(v.y), z(u.x), w(u.y) {}
+    inline Vector4::Vector4(Vector3::Arg v, float w) : x(v.x), y(v.y), z(v.z), w(w) {}
+    inline Vector4::Vector4(Vector4::Arg v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+
+    inline const Vector4 & Vector4::operator=(const Vector4 & v)
+    {
+        x = v.x;
+        y = v.y;
+        z = v.z;
+        w = v.w;
+        return *this;
+    }
+
+    inline Vector2 Vector4::xy() const
+    {
+        return Vector2(x, y);
+    }
+
+    inline Vector2 Vector4::zw() const
+    {
+        return Vector2(z, w);
+    }
+
+    inline Vector3 Vector4::xyz() const
+    {
+        return Vector3(x, y, z);
+    }
+
+    inline const float * Vector4::ptr() const
+    {
+        return &x;
+    }
+
+    inline void Vector4::set(float x, float y, float z, float w)
+    {
+        this->x = x;
+        this->y = y;
+        this->z = z;
+        this->w = w;
+    }
+
+    inline Vector4 Vector4::operator-() const
+    {
+        return Vector4(-x, -y, -z, -w);
+    }
+
+    inline void Vector4::operator+=(Vector4::Arg v)
+    {
+        x += v.x;
+        y += v.y;
+        z += v.z;
+        w += v.w;
+    }
+
+    inline void Vector4::operator-=(Vector4::Arg v)
+    {
+        x -= v.x;
+        y -= v.y;
+        z -= v.z;
+        w -= v.w;
+    }
+
+    inline void Vector4::operator*=(float s)
+    {
+        x *= s;
+        y *= s;
+        z *= s;
+        w *= s;
+    }
+
+    inline void Vector4::operator/=(float s)
+    {
+        x /= s;
+        y /= s;
+        z /= s;
+        w /= s;
+    }
+
+    inline void Vector4::operator*=(Vector4::Arg v)
+    {
+        x *= v.x;
+        y *= v.y;
+        z *= v.z;
+        w *= v.w;
+    }
+
+    inline void Vector4::operator/=(Vector4::Arg v)
+    {
+        x /= v.x;
+        y /= v.y;
+        z /= v.z;
+        w /= v.w;
+    }
+
+    inline bool operator==(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; 
+    }
+    inline bool operator!=(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x != b.x || a.y != b.y || a.z != b.z || a.w != b.w; 
+    }
+
+
+
+    // Functions
+
+
+    // Vector2
+
+    inline Vector2 add(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(a.x + b.x, a.y + b.y);
+    }
+    inline Vector2 operator+(Vector2::Arg a, Vector2::Arg b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector2 sub(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(a.x - b.x, a.y - b.y);
+    }
+    inline Vector2 operator-(Vector2::Arg a, Vector2::Arg b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector2 scale(Vector2::Arg v, float s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 scale(Vector2::Arg v, Vector2::Arg s)
+    {
+        return Vector2(v.x * s.x, v.y * s.y);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v1, Vector2::Arg v2)
+    {
+        return Vector2(v1.x*v2.x, v1.y*v2.y);
+    }
+
+    inline Vector2 operator*(float s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector2 lerp(Vector2::Arg v1, Vector2::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector2(v1.x * s + t * v2.x, v1.y * s + t * v2.y);
+    }
+
+    inline float dot(Vector2::Arg a, Vector2::Arg b)
+    {
+        return a.x * b.x + a.y * b.y;
+    }
+
+    inline float lengthSquared(Vector2::Arg v)
+    {
+        return v.x * v.x + v.y * v.y;
+    }
+
+    inline float length(Vector2::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float distance(Vector2::Arg a, Vector2::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float inverseLength(Vector2::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector2::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector2 normalize(Vector2::Arg v, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        NV_UNUSED(epsilon);
+        nvDebugCheck(!isZero(l, epsilon));
+        Vector2 n = scale(v, 1.0f / l);
+        nvDebugCheck(isNormalized(n));
+        return n;
+    }
+
+    inline Vector2 normalizeSafe(Vector2::Arg v, Vector2::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        if (isZero(l, epsilon)) {
+            return fallback;
+        }
+        return scale(v, 1.0f / l);
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector2 normalizeFast(Vector2::Arg v)
+    {
+        const float very_small_float = 1.0e-037f;
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
+    inline bool equal(Vector2::Arg v1, Vector2::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon);
+    }
+
+    inline Vector2 min(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(min(a.x, b.x), min(a.y, b.y));
+    }
+
+    inline Vector2 max(Vector2::Arg a, Vector2::Arg b)
+    {
+        return Vector2(max(a.x, b.x), max(a.y, b.y));
+    }
+
+    inline Vector2 clamp(Vector2::Arg v, float min, float max)
+    {
+        return Vector2(clamp(v.x, min, max), clamp(v.y, min, max));
+    }
+
+    inline Vector2 saturate(Vector2::Arg v)
+    {
+        return Vector2(saturate(v.x), saturate(v.y));
+    }
+
+    inline bool isFinite(Vector2::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y);
+    }
+
+    inline Vector2 validate(Vector2::Arg v, Vector2::Arg fallback = Vector2(0.0f))
+    {
+        if (!isFinite(v)) return fallback;
+        Vector2 vf = v;
+        nv::floatCleanup(vf.component, 2);
+        return vf;
+    }
+
+    // Note, this is the area scaled by 2!
+    inline float triangleArea(Vector2::Arg v0, Vector2::Arg v1)
+    {
+	    return (v0.x * v1.y - v0.y * v1.x); // * 0.5f;
+    }
+    inline float triangleArea(Vector2::Arg a, Vector2::Arg b, Vector2::Arg c)
+    {
+        // IC: While it may be appealing to use the following expression:
+        //return (c.x * a.y + a.x * b.y + b.x * c.y - b.x * a.y - c.x * b.y - a.x * c.y); // * 0.5f;
+
+        // That's actually a terrible idea. Small triangles far from the origin can end up producing fairly large floating point
+        // numbers and the result becomes very unstable and dependent on the order of the factors.
+
+        // Instead, it's preferable to subtract the vertices first, and multiply the resulting small values together. The result
+        // in this case is always much more accurate (as long as the triangle is small) and less dependent on the location of
+        // the triangle.
+
+        //return ((a.x - c.x) * (b.y - c.y) - (a.y - c.y) * (b.x - c.x)); // * 0.5f;
+        return triangleArea(a-c, b-c);
+    }
+
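+    // For example (illustrative): with a = (1e5f, 1e5f), b = (1e5f + 1.0f, 1e5f), c = (1e5f, 1e5f + 1.0f)
+    // the doubled area is exactly 1, but the expanded form sums terms of magnitude ~1e10, where one float
+    // ulp is already ~1024, so the naive result can be off by thousands. Subtracting c first keeps every
+    // factor small and the computation exact.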
+
+    template <>
+    inline uint hash(const Vector2 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 2, h);
+    }
+
+
+
+    // Vector3
+
+    inline Vector3 add(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.x + b.x, a.y + b.y, a.z + b.z);
+    }
+    inline Vector3 add(Vector3::Arg a, float b)
+    {
+        return Vector3(a.x + b, a.y + b, a.z + b);
+    }
+    inline Vector3 operator+(Vector3::Arg a, Vector3::Arg b)
+    {
+        return add(a, b);
+    }
+    inline Vector3 operator+(Vector3::Arg a, float b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector3 sub(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.x - b.x, a.y - b.y, a.z - b.z);
+    }
+    inline Vector3 sub(Vector3::Arg a, float b)
+    {
+        return Vector3(a.x - b, a.y - b, a.z - b);
+    }
+    inline Vector3 operator-(Vector3::Arg a, Vector3::Arg b)
+    {
+        return sub(a, b);
+    }
+    inline Vector3 operator-(Vector3::Arg a, float b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector3 cross(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, float s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, Vector3::Arg s)
+    {
+        return Vector3(v.x * s.x, v.y * s.y, v.z * s.z);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(float s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, Vector3::Arg s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    /*inline Vector3 add_scaled(Vector3::Arg a, Vector3::Arg b, float s)
+    {
+        return Vector3(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s);
+    }*/
+
+    inline Vector3 lerp(Vector3::Arg v1, Vector3::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector3(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z);
+    }
+
+    inline float dot(Vector3::Arg a, Vector3::Arg b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z;
+    }
+
+    inline float lengthSquared(Vector3::Arg v)
+    {
+        return v.x * v.x + v.y * v.y + v.z * v.z;
+    }
+
+    inline float length(Vector3::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float distance(Vector3::Arg a, Vector3::Arg b)
+    {
+        return length(a - b);
+    }
+
+    inline float distanceSquared(Vector3::Arg a, Vector3::Arg b)
+    {
+        return lengthSquared(a - b);
+    }
+
+    inline float inverseLength(Vector3::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector3::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector3 normalize(Vector3::Arg v, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        NV_UNUSED(epsilon);
+        nvDebugCheck(!isZero(l, epsilon));
+        Vector3 n = scale(v, 1.0f / l);
+        nvDebugCheck(isNormalized(n));
+        return n;
+    }
+
+    inline Vector3 normalizeSafe(Vector3::Arg v, Vector3::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        if (isZero(l, epsilon)) {
+            return fallback;
+        }
+        return scale(v, 1.0f / l);
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector3 normalizeFast(Vector3::Arg v)
+    {
+        const float very_small_float = 1.0e-037f;
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
+    inline bool equal(Vector3::Arg v1, Vector3::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon);
+    }
+
+    inline Vector3 min(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z));
+    }
+
+    inline Vector3 max(Vector3::Arg a, Vector3::Arg b)
+    {
+        return Vector3(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z));
+    }
+
+    inline Vector3 clamp(Vector3::Arg v, float min, float max)
+    {
+        return Vector3(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max));
+    }
+
+    inline Vector3 saturate(Vector3::Arg v)
+    {
+        return Vector3(saturate(v.x), saturate(v.y), saturate(v.z));
+    }
+
+    inline Vector3 floor(Vector3::Arg v)
+    {
+        return Vector3(floorf(v.x), floorf(v.y), floorf(v.z));
+    }
+
+    inline Vector3 ceil(Vector3::Arg v)
+    {
+        return Vector3(ceilf(v.x), ceilf(v.y), ceilf(v.z));
+    }
+
+    inline bool isFinite(Vector3::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z);
+    }
+
+    inline Vector3 validate(Vector3::Arg v, Vector3::Arg fallback = Vector3(0.0f))
+    {
+        if (!isFinite(v)) return fallback;
+        Vector3 vf = v;
+        nv::floatCleanup(vf.component, 3);
+        return vf;
+    }
+
+    inline Vector3 reflect(Vector3::Arg v, Vector3::Arg n)
+    {
+	    return v - (2 * dot(v, n)) * n;
+    }
+
+    template <>
+    inline uint hash(const Vector3 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 3, h);
+    }
+
+
+    // Vector4
+
+    inline Vector4 add(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(a.x + b.x, a.y + b.y, a.z + b.z, a.w + b.w);
+    }
+    inline Vector4 operator+(Vector4::Arg a, Vector4::Arg b)
+    {
+        return add(a, b);
+    }
+
+    inline Vector4 sub(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(a.x - b.x, a.y - b.y, a.z - b.z, a.w - b.w);
+    }
+    inline Vector4 operator-(Vector4::Arg a, Vector4::Arg b)
+    {
+        return sub(a, b);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, float s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, Vector4::Arg s)
+    {
+        return Vector4(v.x * s.x, v.y * s.y, v.z * s.z, v.w * s.w);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, float s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(float s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, Vector4::Arg s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, float s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    /*inline Vector4 add_scaled(Vector4::Arg a, Vector4::Arg b, float s)
+    {
+        return Vector4(a.x + b.x * s, a.y + b.y * s, a.z + b.z * s, a.w + b.w * s);
+    }*/
+
+    inline Vector4 lerp(Vector4::Arg v1, Vector4::Arg v2, float t)
+    {
+        const float s = 1.0f - t;
+        return Vector4(v1.x * s + t * v2.x, v1.y * s + t * v2.y, v1.z * s + t * v2.z, v1.w * s + t * v2.w);
+    }
+
+    inline float dot(Vector4::Arg a, Vector4::Arg b)
+    {
+        return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w;
+    }
+
+    inline float lengthSquared(Vector4::Arg v)
+    {
+        return v.x * v.x + v.y * v.y + v.z * v.z + v.w * v.w;
+    }
+
+    inline float length(Vector4::Arg v)
+    {
+        return sqrtf(lengthSquared(v));
+    }
+
+    inline float inverseLength(Vector4::Arg v)
+    {
+        return 1.0f / sqrtf(lengthSquared(v));
+    }
+
+    inline bool isNormalized(Vector4::Arg v, float epsilon = NV_NORMAL_EPSILON)
+    {
+        return equal(length(v), 1, epsilon);
+    }
+
+    inline Vector4 normalize(Vector4::Arg v, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        NV_UNUSED(epsilon);
+        nvDebugCheck(!isZero(l, epsilon));
+        Vector4 n = scale(v, 1.0f / l);
+        nvDebugCheck(isNormalized(n));
+        return n;
+    }
+
+    inline Vector4 normalizeSafe(Vector4::Arg v, Vector4::Arg fallback, float epsilon = NV_EPSILON)
+    {
+        float l = length(v);
+        if (isZero(l, epsilon)) {
+            return fallback;
+        }
+        return scale(v, 1.0f / l);
+    }
+
+    // Safe, branchless normalization from Andy Firth. All error checking omitted.
+    // http://altdevblogaday.com/2011/08/21/practical-flt-point-tricks/
+    inline Vector4 normalizeFast(Vector4::Arg v)
+    {
+        const float very_small_float = 1.0e-037f;
+        float l = very_small_float + length(v);
+        return scale(v, 1.0f / l);
+    }
+
+    inline bool equal(Vector4::Arg v1, Vector4::Arg v2, float epsilon = NV_EPSILON)
+    {
+        return equal(v1.x, v2.x, epsilon) && equal(v1.y, v2.y, epsilon) && equal(v1.z, v2.z, epsilon) && equal(v1.w, v2.w, epsilon);
+    }
+
+    inline Vector4 min(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(min(a.x, b.x), min(a.y, b.y), min(a.z, b.z), min(a.w, b.w));
+    }
+
+    inline Vector4 max(Vector4::Arg a, Vector4::Arg b)
+    {
+        return Vector4(max(a.x, b.x), max(a.y, b.y), max(a.z, b.z), max(a.w, b.w));
+    }
+
+    inline Vector4 clamp(Vector4::Arg v, float min, float max)
+    {
+        return Vector4(clamp(v.x, min, max), clamp(v.y, min, max), clamp(v.z, min, max), clamp(v.w, min, max));
+    }
+
+    inline Vector4 saturate(Vector4::Arg v)
+    {
+        return Vector4(saturate(v.x), saturate(v.y), saturate(v.z), saturate(v.w));
+    }
+
+    inline bool isFinite(Vector4::Arg v)
+    {
+        return isFinite(v.x) && isFinite(v.y) && isFinite(v.z) && isFinite(v.w);
+    }
+
+    inline Vector4 validate(Vector4::Arg v, Vector4::Arg fallback = Vector4(0.0f))
+    {
+        if (!isFinite(v)) return fallback;
+        Vector4 vf = v;
+        nv::floatCleanup(vf.component, 4);
+        return vf;
+    }
+
+    template <>
+    inline uint hash(const Vector4 & v, uint h)
+    {
+        return sdbmFloatHash(v.component, 4, h);
+    }
+
+
+#if NV_OS_IOS // LLVM is not happy with implicit conversion of immediate constants to float
+
+    //int:
+
+    inline Vector2 scale(Vector2::Arg v, int s)
+    {
+        return Vector2(v.x * s, v.y * s);
+    }
+
+    inline Vector2 operator*(Vector2::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator*(int s, Vector2::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector2 operator/(Vector2::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector3 scale(Vector3::Arg v, int s)
+    {
+        return Vector3(v.x * s, v.y * s, v.z * s);
+    }
+
+    inline Vector3 operator*(Vector3::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator*(int s, Vector3::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    inline Vector4 scale(Vector4::Arg v, int s)
+    {
+        return Vector4(v.x * s, v.y * s, v.z * s, v.w * s);
+    }
+
+    inline Vector4 operator*(Vector4::Arg v, int s)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator*(int s, Vector4::Arg v)
+    {
+        return scale(v, s);
+    }
+
+    inline Vector4 operator/(Vector4::Arg v, int s)
+    {
+        return scale(v, 1.0f/s);
+    }
+
+    //double:
+
+    inline Vector3 operator*(Vector3::Arg v, double s)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator*(double s, Vector3::Arg v)
+    {
+        return scale(v, (float)s);
+    }
+
+    inline Vector3 operator/(Vector3::Arg v, double s)
+    {
+        return scale(v, 1.f/((float)s));
+    }    
+        
+#endif //NV_OS_IOS
+
+} // nv namespace
+
+#endif // NV_MATH_VECTOR_INL

+ 1200 - 0
3rdparty/nvtt/nvmath/fitting.cpp

@@ -0,0 +1,1200 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#include "fitting.h"
+#include "vector.inl"
+#include "plane.inl"
+#include "matrix.inl"
+
+#include "nvcore/array.inl"
+#include "nvcore/utils.h" // max, swap
+
+using namespace nv;
+
+// @@ Move to EigenSolver.h
+
+// @@ We should be able to do something cheaper...
+static Vector3 estimatePrincipalComponent(const float * __restrict matrix)
+{
+	const Vector3 row0(matrix[0], matrix[1], matrix[2]);
+	const Vector3 row1(matrix[1], matrix[3], matrix[4]);
+	const Vector3 row2(matrix[2], matrix[4], matrix[5]);
+
+	float r0 = lengthSquared(row0);
+	float r1 = lengthSquared(row1);
+	float r2 = lengthSquared(row2);
+
+	if (r0 > r1 && r0 > r2) return row0;
+	if (r1 > r2) return row1;
+	return row2;
+}
+
+
+static inline Vector3 firstEigenVector_PowerMethod(const float *__restrict matrix)
+{
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
+    {
+        return Vector3(0.0f);
+    }
+
+    Vector3 v = estimatePrincipalComponent(matrix);
+
+    const int NUM = 8;
+    for (int i = 0; i < NUM; i++)
+    {
+        float x = v.x * matrix[0] + v.y * matrix[1] + v.z * matrix[2];
+        float y = v.x * matrix[1] + v.y * matrix[3] + v.z * matrix[4];
+        float z = v.x * matrix[2] + v.y * matrix[4] + v.z * matrix[5];
+
+        float norm = max(max(x, y), z);
+
+        v = Vector3(x, y, z) / norm;
+    }
+
+    return v;
+}
+
+
+Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points)
+{
+    Vector3 centroid(0.0f);
+
+    for (int i = 0; i < n; i++)
+    {
+        centroid += points[i];
+    }
+    centroid /= float(n);
+
+    return centroid;
+}
+
+Vector3 nv::Fit::computeCentroid(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
+{
+    Vector3 centroid(0.0f);
+    float total = 0.0f;
+
+    for (int i = 0; i < n; i++)
+    {
+        total += weights[i];
+        centroid += weights[i]*points[i];
+    }
+    centroid /= total;
+
+    return centroid;
+}
+
+Vector4 nv::Fit::computeCentroid(int n, const Vector4 *__restrict points)
+{
+    Vector4 centroid(0.0f);
+
+    for (int i = 0; i < n; i++)
+    {
+        centroid += points[i];
+    }
+    centroid /= float(n);
+
+    return centroid;
+}
+
+Vector4 nv::Fit::computeCentroid(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric)
+{
+    Vector4 centroid(0.0f);
+    float total = 0.0f;
+
+    for (int i = 0; i < n; i++)
+    {
+        total += weights[i];
+        centroid += weights[i]*points[i];
+    }
+    centroid /= total;
+
+    return centroid;
+}
+
+
+
+Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, float *__restrict covariance)
+{
+    // compute the centroid
+    Vector3 centroid = computeCentroid(n, points);
+
+    // compute covariance matrix
+    for (int i = 0; i < 6; i++)
+    {
+        covariance[i] = 0.0f;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        Vector3 v = points[i] - centroid;
+
+        covariance[0] += v.x * v.x;
+        covariance[1] += v.x * v.y;
+        covariance[2] += v.x * v.z;
+        covariance[3] += v.y * v.y;
+        covariance[4] += v.y * v.z;
+        covariance[5] += v.z * v.z;
+    }
+
+    return centroid;
+}
+
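+// The six entries written above pack the upper triangle of the symmetric 3x3 covariance
+// matrix in row-major order, which is the layout eigenSolveSymmetric3 expects:
+//
+//     | cov[0] cov[1] cov[2] |
+//     |        cov[3] cov[4] |
+//     |               cov[5] |
+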
+Vector3 nv::Fit::computeCovariance(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, float *__restrict covariance)
+{
+    // compute the centroid
+    Vector3 centroid = computeCentroid(n, points, weights, metric);
+
+    // compute covariance matrix
+    for (int i = 0; i < 6; i++)
+    {
+        covariance[i] = 0.0f;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        Vector3 a = (points[i] - centroid) * metric;
+        Vector3 b = weights[i]*a;
+
+        covariance[0] += a.x * b.x;
+        covariance[1] += a.x * b.y;
+        covariance[2] += a.x * b.z;
+        covariance[3] += a.y * b.y;
+        covariance[4] += a.y * b.z;
+        covariance[5] += a.z * b.z;
+    }
+
+    return centroid;
+}
+
+Vector4 nv::Fit::computeCovariance(int n, const Vector4 *__restrict points, float *__restrict covariance)
+{
+    // compute the centroid
+    Vector4 centroid = computeCentroid(n, points);
+
+    // compute covariance matrix
+    for (int i = 0; i < 10; i++)
+    {
+        covariance[i] = 0.0f;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        Vector4 v = points[i] - centroid;
+
+        covariance[0] += v.x * v.x;
+        covariance[1] += v.x * v.y;
+        covariance[2] += v.x * v.z;
+        covariance[3] += v.x * v.w;
+
+		covariance[4] += v.y * v.y;
+        covariance[5] += v.y * v.z;
+        covariance[6] += v.y * v.w;
+
+		covariance[7] += v.z * v.z;
+		covariance[8] += v.z * v.w;
+
+		covariance[9] += v.w * v.w;
+	}
+
+    return centroid;
+}
+
+Vector4 nv::Fit::computeCovariance(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric, float *__restrict covariance)
+{
+    // compute the centroid
+    Vector4 centroid = computeCentroid(n, points, weights, metric);
+
+    // compute covariance matrix
+    for (int i = 0; i < 10; i++)
+    {
+        covariance[i] = 0.0f;
+    }
+
+    for (int i = 0; i < n; i++)
+    {
+        Vector4 a = (points[i] - centroid) * metric;
+        Vector4 b = weights[i]*a;
+
+        covariance[0] += a.x * b.x;
+        covariance[1] += a.x * b.y;
+        covariance[2] += a.x * b.z;
+        covariance[3] += a.x * b.w;
+
+		covariance[4] += a.y * b.y;
+        covariance[5] += a.y * b.z;
+        covariance[6] += a.y * b.w;
+
+		covariance[7] += a.z * b.z;
+		covariance[8] += a.z * b.w;
+
+		covariance[9] += a.w * b.w;
+    }
+
+    return centroid;
+}
+
+
+
+Vector3 nv::Fit::computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points)
+{
+    float matrix[6];
+    computeCovariance(n, points, matrix);
+
+    return firstEigenVector_PowerMethod(matrix);
+}
+
+Vector3 nv::Fit::computePrincipalComponent_PowerMethod(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
+{
+    float matrix[6];
+    computeCovariance(n, points, weights, metric, matrix);
+
+    return firstEigenVector_PowerMethod(matrix);
+}
+
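+// Usage sketch (illustrative): the power-method variant above returns the dominant axis of a
+// point cloud, e.g. to seed the endpoint direction for a block of colors before refinement:
+//
+//     Vector3 axis = Fit::computePrincipalComponent_PowerMethod(count, colors);
+//     // project the colors onto 'axis' around their centroid to pick initial endpoints.
+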
+
+
+static inline Vector3 firstEigenVector_EigenSolver3(const float *__restrict matrix)
+{
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
+    {
+        return Vector3(0.0f);
+    }
+
+    float eigenValues[3];
+    Vector3 eigenVectors[3];
+	if (!nv::Fit::eigenSolveSymmetric3(matrix, eigenValues, eigenVectors))
+	{
+		return Vector3(0.0f);
+	}
+
+	return eigenVectors[0];
+}
+
+Vector3 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector3 *__restrict points)
+{
+    float matrix[6];
+    computeCovariance(n, points, matrix);
+
+    return firstEigenVector_EigenSolver3(matrix);
+}
+
+Vector3 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric)
+{
+    float matrix[6];
+    computeCovariance(n, points, weights, metric, matrix);
+
+    return firstEigenVector_EigenSolver3(matrix);
+}
+
+
+
+static inline Vector4 firstEigenVector_EigenSolver4(const float *__restrict matrix)
+{
+    if (matrix[0] == 0 && matrix[4] == 0 && matrix[7] == 0 && matrix[9] == 0)
+    {
+        return Vector4(0.0f);
+    }
+
+    float eigenValues[4];
+    Vector4 eigenVectors[4];
+	if (!nv::Fit::eigenSolveSymmetric4(matrix, eigenValues, eigenVectors))
+	{
+		return Vector4(0.0f);
+	}
+
+	return eigenVectors[0];
+}
+
+Vector4 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector4 *__restrict points)
+{
+    float matrix[10];
+    computeCovariance(n, points, matrix);
+
+    return firstEigenVector_EigenSolver4(matrix);
+}
+
+Vector4 nv::Fit::computePrincipalComponent_EigenSolver(int n, const Vector4 *__restrict points, const float *__restrict weights, Vector4::Arg metric)
+{
+    float matrix[10];
+    computeCovariance(n, points, weights, metric, matrix);
+
+    return firstEigenVector_EigenSolver4(matrix);
+}
+
+
+
+void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R);
+
+Vector3 nv::Fit::computePrincipalComponent_SVD(int n, const Vector3 *__restrict points)
+{
+	// Store the points in an n x n matrix
+    Array<float> Q; Q.resize(n*n, 0.0f);
+	for (int i = 0; i < n; ++i)
+	{
+		Q[i*n+0] = points[i].x;
+		Q[i*n+1] = points[i].y;
+		Q[i*n+2] = points[i].z;
+	}
+
+	// Alloc space for the SVD outputs
+    Array<float> diag; diag.resize(n, 0.0f);
+    Array<float> R; R.resize(n*n, 0.0f);
+
+	ArvoSVD(n, n, &Q[0], &diag[0], &R[0]);
+
+	// Get the principal component
+	return Vector3(R[0], R[1], R[2]);
+}
+
+Vector4 nv::Fit::computePrincipalComponent_SVD(int n, const Vector4 *__restrict points)
+{
+	// Store the points in an n x n matrix
+    Array<float> Q; Q.resize(n*n, 0.0f);
+	for (int i = 0; i < n; ++i)
+	{
+		Q[i*n+0] = points[i].x;
+		Q[i*n+1] = points[i].y;
+		Q[i*n+2] = points[i].z;
+		Q[i*n+3] = points[i].w;
+	}
+
+	// Alloc space for the SVD outputs
+    Array<float> diag; diag.resize(n, 0.0f);
+    Array<float> R; R.resize(n*n, 0.0f);
+
+	ArvoSVD(n, n, &Q[0], &diag[0], &R[0]);
+
+	// Get the principal component
+	return Vector4(R[0], R[1], R[2], R[3]);
+}
+
+
+
+Plane nv::Fit::bestPlane(int n, const Vector3 *__restrict points)
+{
+    // compute the centroid and covariance
+    float matrix[6];
+    Vector3 centroid = computeCovariance(n, points, matrix);
+
+    if (matrix[0] == 0 && matrix[3] == 0 && matrix[5] == 0)
+    {
+        // If no plane defined, then return a horizontal plane.
+        return Plane(Vector3(0, 0, 1), centroid);
+    }
+
+    float eigenValues[3];
+    Vector3 eigenVectors[3];
+    if (!eigenSolveSymmetric3(matrix, eigenValues, eigenVectors)) {
+        // If no plane defined, then return a horizontal plane.
+        return Plane(Vector3(0, 0, 1), centroid);
+    }
+
+    return Plane(eigenVectors[2], centroid);
+}
+
+bool nv::Fit::isPlanar(int n, const Vector3 * points, float epsilon/*=NV_EPSILON*/)
+{
+    // compute the centroid and covariance
+    float matrix[6];
+    computeCovariance(n, points, matrix);
+
+    float eigenValues[3];
+    Vector3 eigenVectors[3];
+    if (!eigenSolveSymmetric3(matrix, eigenValues, eigenVectors)) {
+        return false;
+    }
+
+    return eigenValues[2] < epsilon;
+}
+
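+// Usage sketch (illustrative):
+//
+//     Plane p = Fit::bestPlane(count, points);    // normal = least-variance eigenvector
+//     if (Fit::isPlanar(count, points)) {
+//         // the smallest covariance eigenvalue is below NV_EPSILON, i.e. the points have
+//         // essentially no spread along the plane normal.
+//     }
+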
+
+
+// Tridiagonal solver from Charles Bloom. 
+// Householder transforms followed by QL decomposition. 
+// Seems to be based on the code from Numerical Recipes in C.
+
+static void EigenSolver3_Tridiagonal(float mat[3][3], float * diag, float * subd);
+static bool EigenSolver3_QLAlgorithm(float mat[3][3], float * diag, float * subd);
+
+bool nv::Fit::eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3])
+{
+    nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
+
+    float subd[3];
+    float diag[3];
+    float work[3][3];
+
+    work[0][0] = matrix[0];
+    work[0][1] = work[1][0] = matrix[1];
+    work[0][2] = work[2][0] = matrix[2];
+    work[1][1] = matrix[3];
+    work[1][2] = work[2][1] = matrix[4];
+    work[2][2] = matrix[5];
+
+    EigenSolver3_Tridiagonal(work, diag, subd);
+    if (!EigenSolver3_QLAlgorithm(work, diag, subd))
+    {
+        for (int i = 0; i < 3; i++) {
+            eigenValues[i] = 0;
+            eigenVectors[i] = Vector3(0);
+        }
+        return false;
+    }
+
+    for (int i = 0; i < 3; i++) {
+        eigenValues[i] = (float)diag[i];
+    }
+
+    // eigenvectors are the columns; make them the rows :
+
+    for (int i=0; i < 3; i++)
+    {
+        for (int j = 0; j < 3; j++)
+        {
+            eigenVectors[j].component[i] = (float) work[i][j];
+        }
+    }
+
+    // shuffle to sort by singular value :
+    if (eigenValues[2] > eigenValues[0] && eigenValues[2] > eigenValues[1])
+    {
+        swap(eigenValues[0], eigenValues[2]);
+        swap(eigenVectors[0], eigenVectors[2]);
+    }
+    if (eigenValues[1] > eigenValues[0])
+    {
+        swap(eigenValues[0], eigenValues[1]);
+        swap(eigenVectors[0], eigenVectors[1]);
+    }
+    if (eigenValues[2] > eigenValues[1])
+    {
+        swap(eigenValues[1], eigenValues[2]);
+        swap(eigenVectors[1], eigenVectors[2]);
+    }
+
+    nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2]);
+    nvDebugCheck(eigenValues[1] >= eigenValues[2]);
+
+    return true;
+}
+
+static void EigenSolver3_Tridiagonal(float mat[3][3], float * diag, float * subd)
+{
+    // Householder reduction T = Q^t M Q
+    //   Input:   
+    //     mat, symmetric 3x3 matrix M
+    //   Output:  
+    //     mat, orthogonal matrix Q
+    //     diag, diagonal entries of T
+    //     subd, subdiagonal entries of T (T is symmetric)
+    const float epsilon = 1e-08f;
+
+    float a = mat[0][0];
+    float b = mat[0][1];
+    float c = mat[0][2];
+    float d = mat[1][1];
+    float e = mat[1][2];
+    float f = mat[2][2];
+
+    diag[0] = a;
+    subd[2] = 0.f;
+    if (fabsf(c) >= epsilon)
+    {
+        const float ell = sqrtf(b*b+c*c);
+        b /= ell;
+        c /= ell;
+        const float q = 2*b*e+c*(f-d);
+        diag[1] = d+c*q;
+        diag[2] = f-c*q;
+        subd[0] = ell;
+        subd[1] = e-b*q;
+        mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
+        mat[1][0] = 0; mat[1][1] = b; mat[1][2] = c;
+        mat[2][0] = 0; mat[2][1] = c; mat[2][2] = -b;
+    }
+    else
+    {
+        diag[1] = d;
+        diag[2] = f;
+        subd[0] = b;
+        subd[1] = e;
+        mat[0][0] = 1; mat[0][1] = 0; mat[0][2] = 0;
+        mat[1][0] = 0; mat[1][1] = 1; mat[1][2] = 0;
+        mat[2][0] = 0; mat[2][1] = 0; mat[2][2] = 1;
+    }
+}
+
+static bool EigenSolver3_QLAlgorithm(float mat[3][3], float * diag, float * subd)
+{
+    // QL iteration with implicit shifting to reduce matrix from tridiagonal
+    // to diagonal
+    const int maxiter = 32;
+
+    for (int ell = 0; ell < 3; ell++)
+    {
+        int iter;
+        for (iter = 0; iter < maxiter; iter++)
+        {
+            int m;
+            for (m = ell; m <= 1; m++)
+            {
+                float dd = fabsf(diag[m]) + fabsf(diag[m+1]);
+                if ( fabsf(subd[m]) + dd == dd )
+                    break;
+            }
+            if ( m == ell )
+                break;
+
+            float g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
+            float r = sqrtf(g*g+1);
+            if ( g < 0 )
+                g = diag[m]-diag[ell]+subd[ell]/(g-r);
+            else
+                g = diag[m]-diag[ell]+subd[ell]/(g+r);
+            float s = 1, c = 1, p = 0;
+            for (int i = m-1; i >= ell; i--)
+            {
+                float f = s*subd[i], b = c*subd[i];
+                if ( fabsf(f) >= fabsf(g) )
+                {
+                    c = g/f;
+                    r = sqrtf(c*c+1);
+                    subd[i+1] = f*r;
+                    c *= (s = 1/r);
+                }
+                else
+                {
+                    s = f/g;
+                    r = sqrtf(s*s+1);
+                    subd[i+1] = g*r;
+                    s *= (c = 1/r);
+                }
+                g = diag[i+1]-p;
+                r = (diag[i]-g)*s+2*b*c;
+                p = s*r;
+                diag[i+1] = g+p;
+                g = c*r-b;
+
+                for (int k = 0; k < 3; k++)
+                {
+                    f = mat[k][i+1];
+                    mat[k][i+1] = s*mat[k][i]+c*f;
+                    mat[k][i] = c*mat[k][i]-s*f;
+                }
+            }
+            diag[ell] -= p;
+            subd[ell] = g;
+            subd[m] = 0;
+        }
+
+        if ( iter == maxiter )
+            // should not get here under normal circumstances
+            return false;
+    }
+
+    return true;
+}
+
+
+
+// Tridiagonal solver for 4x4 symmetric matrices.
+
+static void EigenSolver4_Tridiagonal(float mat[4][4], float * diag, float * subd);
+static bool EigenSolver4_QLAlgorithm(float mat[4][4], float * diag, float * subd);
+
+bool nv::Fit::eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4])
+{
+    nvDebugCheck(matrix != NULL && eigenValues != NULL && eigenVectors != NULL);
+
+    float subd[4];
+    float diag[4];
+    float work[4][4];
+
+    work[0][0] = matrix[0];
+    work[0][1] = work[1][0] = matrix[1];
+    work[0][2] = work[2][0] = matrix[2];
+    work[0][3] = work[3][0] = matrix[3];
+    work[1][1] = matrix[4];
+    work[1][2] = work[2][1] = matrix[5];
+    work[1][3] = work[3][1] = matrix[6];
+    work[2][2] = matrix[7];
+    work[2][3] = work[3][2] = matrix[8];
+    work[3][3] = matrix[9];
+
+    EigenSolver4_Tridiagonal(work, diag, subd);
+    if (!EigenSolver4_QLAlgorithm(work, diag, subd))
+    {
+        for (int i = 0; i < 4; i++) {
+            eigenValues[i] = 0;
+            eigenVectors[i] = Vector4(0);
+        }
+        return false;
+    }
+
+    for (int i = 0; i < 4; i++) {
+        eigenValues[i] = (float)diag[i];
+    }
+
+    // eigenvectors are the columns; make them the rows
+
+    for (int i = 0; i < 4; i++)
+    {
+        for (int j = 0; j < 4; j++)
+        {
+            eigenVectors[j].component[i] = (float) work[i][j];
+        }
+    }
+
+    // sort by eigenvalue (descending)
+
+	for (int i = 0; i < 3; ++i)
+	{
+		for (int j = i+1; j < 4; ++j)
+		{
+			if (eigenValues[j] > eigenValues[i])
+			{
+				swap(eigenValues[i], eigenValues[j]);
+				swap(eigenVectors[i], eigenVectors[j]);
+			}
+		}
+	}
+
+    nvDebugCheck(eigenValues[0] >= eigenValues[1] && eigenValues[0] >= eigenValues[2] && eigenValues[0] >= eigenValues[3]);
+    nvDebugCheck(eigenValues[1] >= eigenValues[2] && eigenValues[1] >= eigenValues[3]);
+    nvDebugCheck(eigenValues[2] >= eigenValues[3]);
+
+    return true;
+}
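+
+// Usage sketch (illustrative, not part of the original source): the packed input to
+// eigenSolveSymmetric4 is the upper triangle of the symmetric 4x4 matrix in row-major
+// order, matching the unpacking above:
+//
+//   float cov[10];   // { m00, m01, m02, m03, m11, m12, m13, m22, m23, m33 }
+//   float values[4];
+//   Vector4 vectors[4];
+//   if (nv::Fit::eigenSolveSymmetric4(cov, values, vectors)) {
+//       // values[0] is the largest eigenvalue and vectors[0] its eigenvector.
+//   }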
+
+inline float signNonzero(float x)
+{
+	return (x >= 0.0f) ? 1.0f : -1.0f;
+}
+
+static void EigenSolver4_Tridiagonal(float mat[4][4], float * diag, float * subd)
+{
+    // Householder reduction T = Q^t M Q
+    //   Input:   
+    //     mat, symmetric 4x4 matrix M
+    //   Output:  
+    //     mat, orthogonal matrix Q
+    //     diag, diagonal entries of T
+    //     subd, subdiagonal entries of T (T is symmetric)
+
+	static const int n = 4;
+
+	// Set epsilon relative to size of elements in matrix
+	static const float relEpsilon = 1e-6f;
+	float maxElement = 0.0f;
+	for (int i = 0; i < n; ++i)
+		for (int j = 0; j < n; ++j)
+			maxElement = max(maxElement, fabsf(mat[i][j]));
+	float epsilon = relEpsilon * maxElement;
+
+	// Iterative algorithm, works for any size of matrix but might be slower than
+	// a closed-form solution for symmetric 4x4 matrices.  Based on this article:
+	// http://en.wikipedia.org/wiki/Householder_transformation#Tridiagonalization
+
+	Matrix A, Q(identity);
+	memcpy(&A, mat, sizeof(float)*n*n);
+
+	// We proceed from left to right, making the off-tridiagonal entries zero in
+	// one column of the matrix at a time.
+	for (int k = 0; k < n - 2; ++k)
+	{
+		float sum = 0.0f;
+		for (int j = k+1; j < n; ++j)
+			sum += A(j,k)*A(j,k);
+		float alpha = -signNonzero(A(k+1,k)) * sqrtf(sum);
+		float r = sqrtf(0.5f * (alpha*alpha - A(k+1,k)*alpha));
+
+		// If r is zero, skip this column - already in tridiagonal form
+		if (fabsf(r) < epsilon)
+			continue;
+
+		float v[n] = {};
+		v[k+1] = 0.5f * (A(k+1,k) - alpha) / r;
+		for (int j = k+2; j < n; ++j)
+			v[j] = 0.5f * A(j,k) / r;
+
+		Matrix P(identity);
+		for (int i = 0; i < n; ++i)
+			for (int j = 0; j < n; ++j)
+				P(i,j) -= 2.0f * v[i] * v[j];
+
+		A = mul(mul(P, A), P);
+		Q = mul(Q, P);
+	}
+
+	nvDebugCheck(fabsf(A(2,0)) < epsilon);
+	nvDebugCheck(fabsf(A(0,2)) < epsilon);
+	nvDebugCheck(fabsf(A(3,0)) < epsilon);
+	nvDebugCheck(fabsf(A(0,3)) < epsilon);
+	nvDebugCheck(fabsf(A(3,1)) < epsilon);
+	nvDebugCheck(fabsf(A(1,3)) < epsilon);
+
+	for (int i = 0; i < n; ++i)
+		diag[i] = A(i,i);
+	for (int i = 0; i < n - 1; ++i)
+		subd[i] = A(i+1,i);
+	subd[n-1] = 0.0f;
+
+	memcpy(mat, &Q, sizeof(float)*n*n);
+}
+
+static bool EigenSolver4_QLAlgorithm(float mat[4][4], float * diag, float * subd)
+{
+    // QL iteration with implicit shifting to reduce matrix from tridiagonal
+    // to diagonal
+    const int maxiter = 32;
+
+    for (int ell = 0; ell < 4; ell++)
+    {
+        int iter;
+        for (iter = 0; iter < maxiter; iter++)
+        {
+            int m;
+            for (m = ell; m < 3; m++)
+            {
+                float dd = fabsf(diag[m]) + fabsf(diag[m+1]);
+                if ( fabsf(subd[m]) + dd == dd )
+                    break;
+            }
+            if ( m == ell )
+                break;
+
+            float g = (diag[ell+1]-diag[ell])/(2*subd[ell]);
+            float r = sqrtf(g*g+1);
+            if ( g < 0 )
+                g = diag[m]-diag[ell]+subd[ell]/(g-r);
+            else
+                g = diag[m]-diag[ell]+subd[ell]/(g+r);
+            float s = 1, c = 1, p = 0;
+            for (int i = m-1; i >= ell; i--)
+            {
+                float f = s*subd[i], b = c*subd[i];
+                if ( fabsf(f) >= fabsf(g) )
+                {
+                    c = g/f;
+                    r = sqrtf(c*c+1);
+                    subd[i+1] = f*r;
+                    c *= (s = 1/r);
+                }
+                else
+                {
+                    s = f/g;
+                    r = sqrtf(s*s+1);
+                    subd[i+1] = g*r;
+                    s *= (c = 1/r);
+                }
+                g = diag[i+1]-p;
+                r = (diag[i]-g)*s+2*b*c;
+                p = s*r;
+                diag[i+1] = g+p;
+                g = c*r-b;
+
+                for (int k = 0; k < 4; k++)
+                {
+                    f = mat[k][i+1];
+                    mat[k][i+1] = s*mat[k][i]+c*f;
+                    mat[k][i] = c*mat[k][i]-s*f;
+                }
+            }
+            diag[ell] -= p;
+            subd[ell] = g;
+            subd[m] = 0;
+        }
+
+        if ( iter == maxiter )
+            // should not get here under normal circumstances
+            return false;
+    }
+
+    return true;
+}
+
+
+
+int nv::Fit::compute4Means(int n, const Vector3 *__restrict points, const float *__restrict weights, Vector3::Arg metric, Vector3 *__restrict cluster)
+{
+    // Compute principal component.
+    float matrix[6];
+    Vector3 centroid = computeCovariance(n, points, weights, metric, matrix);
+    Vector3 principal = firstEigenVector_PowerMethod(matrix);
+
+    // Pick initial solution.
+    int mini, maxi;
+    mini = maxi = 0;
+
+    float mindps, maxdps;
+    mindps = maxdps = dot(points[0] - centroid, principal);
+
+    for (int i = 1; i < n; ++i)
+    {
+        float dps = dot(points[i] - centroid, principal);
+
+        if (dps < mindps) {
+            mindps = dps;
+            mini = i;
+        }
+        else if (dps > maxdps) {
+            maxdps = dps;
+            maxi = i;
+        }
+    }
+
+    cluster[0] = centroid + mindps * principal;
+    cluster[1] = centroid + maxdps * principal;
+    cluster[2] = (2.0f * cluster[0] + cluster[1]) / 3.0f;
+    cluster[3] = (2.0f * cluster[1] + cluster[0]) / 3.0f;
+
+    // Now we have to iteratively refine the clusters.
+    while (true)
+    {
+        Vector3 newCluster[4] = { Vector3(0.0f), Vector3(0.0f), Vector3(0.0f), Vector3(0.0f) };
+        float total[4] = {0, 0, 0, 0};
+
+        for (int i = 0; i < n; ++i)
+        {
+            // Find nearest cluster.
+            int nearest = 0;
+            float mindist = FLT_MAX;
+            for (int j = 0; j < 4; j++)
+            {
+                float dist = lengthSquared((cluster[j] - points[i]) * metric);
+                if (dist < mindist)
+                {
+                    mindist = dist;
+                    nearest = j;
+                }
+            }
+
+            newCluster[nearest] += weights[i] * points[i];
+            total[nearest] += weights[i];
+        }
+
+        for (int j = 0; j < 4; j++)
+        {
+            if (total[j] != 0)
+                newCluster[j] /= total[j];
+        }
+
+        if (equal(cluster[0], newCluster[0]) && equal(cluster[1], newCluster[1]) && 
+            equal(cluster[2], newCluster[2]) && equal(cluster[3], newCluster[3]))
+        {
+            return (total[0] != 0) + (total[1] != 0) + (total[2] != 0) + (total[3] != 0);
+        }
+
+        cluster[0] = newCluster[0];
+        cluster[1] = newCluster[1];
+        cluster[2] = newCluster[2];
+        cluster[3] = newCluster[3];
+
+        // Sort clusters by weight.
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = i; j > 0 && total[j] > total[j - 1]; j--)
+            {
+                swap( total[j], total[j - 1] );
+                swap( cluster[j], cluster[j - 1] );
+            }
+        }
+    }
+}
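+
+// Usage sketch (illustrative, not part of the original source): compute4Means seeds four
+// cluster centers along the principal axis and refines them with Lloyd-style iterations
+// until the centers stop moving.
+//
+//   Vector3 clusters[4];
+//   int count = nv::Fit::compute4Means(n, points, weights, Vector3(1.0f), clusters);
+//   // count is the number of non-empty clusters, in [1, 4]; clusters are sorted by total weight.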
+
+
+
+// Adaptation of James Arvo's SVD code, as found in ZOH.
+
+inline float Sqr(float x) { return x*x; }
+
+inline float svd_pythag( float a, float b )
+{
+	float at = fabsf(a);
+	float bt = fabsf(b);
+	if( at > bt )
+		return at * sqrtf( 1.0f + Sqr( bt / at ) );
+	else if( bt > 0.0f )
+		return bt * sqrtf( 1.0f + Sqr( at / bt ) );
+	else return 0.0f;
+}
+
+inline float SameSign( float a, float b ) 
+{
+	float t;
+	if( b >= 0.0f ) t = fabsf( a );
+	else t = -fabsf( a );
+	return t;
+}
+
+void ArvoSVD(int rows, int cols, float * Q, float * diag, float * R)
+{
+	static const int MaxIterations = 30;
+
+	int    i, j, k, l, p, q, iter;
+	float  c, f, h, s, x, y, z;
+	float  norm  = 0.0f;
+	float  g     = 0.0f;
+	float  scale = 0.0f;
+
+    Array<float> temp; temp.resize(cols, 0.0f);
+
+	for( i = 0; i < cols; i++ ) 
+	{
+		temp[i] = scale * g;
+		scale   = 0.0f;
+		g       = 0.0f;
+		s       = 0.0f;
+		l       = i + 1;
+
+		if( i < rows )
+		{
+			for( k = i; k < rows; k++ ) scale += fabsf( Q[k*cols+i] );
+			if( scale != 0.0f ) 
+			{
+				for( k = i; k < rows; k++ ) 
+				{
+					Q[k*cols+i] /= scale;
+					s += Sqr( Q[k*cols+i] );
+				}
+				f = Q[i*cols+i];
+				g = -SameSign( sqrtf(s), f );
+				h = f * g - s;
+				Q[i*cols+i] = f - g;
+				if( i != cols - 1 )
+				{
+					for( j = l; j < cols; j++ ) 
+					{
+						s = 0.0f;
+						for( k = i; k < rows; k++ ) s += Q[k*cols+i] * Q[k*cols+j];
+						f = s / h;
+						for( k = i; k < rows; k++ ) Q[k*cols+j] += f * Q[k*cols+i];
+					}
+				}
+				for( k = i; k < rows; k++ ) Q[k*cols+i] *= scale;
+			}
+		}
+
+		diag[i] = scale * g;
+		g       = 0.0f;
+		s       = 0.0f;
+		scale   = 0.0f;
+
+		if( i < rows && i != cols - 1 ) 
+		{
+			for( k = l; k < cols; k++ ) scale += fabsf( Q[i*cols+k] );
+			if( scale != 0.0f ) 
+			{
+				for( k = l; k < cols; k++ ) 
+				{
+					Q[i*cols+k] /= scale;
+					s += Sqr( Q[i*cols+k] );
+				}
+				f = Q[i*cols+l];
+				g = -SameSign( sqrtf(s), f );
+				h = f * g - s;
+				Q[i*cols+l] = f - g;
+				for( k = l; k < cols; k++ ) temp[k] = Q[i*cols+k] / h;
+				if( i != rows - 1 ) 
+				{
+					for( j = l; j < rows; j++ ) 
+					{
+						s = 0.0f;
+						for( k = l; k < cols; k++ ) s += Q[j*cols+k] * Q[i*cols+k];
+						for( k = l; k < cols; k++ ) Q[j*cols+k] += s * temp[k];
+					}
+				}
+				for( k = l; k < cols; k++ ) Q[i*cols+k] *= scale;
+			}
+		}
+		norm = max( norm, fabsf( diag[i] ) + fabsf( temp[i] ) );
+	}
+
+
+	for( i = cols - 1; i >= 0; i-- ) 
+	{
+		if( i < cols - 1 ) 
+		{
+			if( g != 0.0f ) 
+			{
+				for( j = l; j < cols; j++ ) R[i*cols+j] = ( Q[i*cols+j] / Q[i*cols+l] ) / g;
+				for( j = l; j < cols; j++ ) 
+				{
+					s = 0.0f;
+					for( k = l; k < cols; k++ ) s += Q[i*cols+k] * R[j*cols+k];
+					for( k = l; k < cols; k++ ) R[j*cols+k] += s * R[i*cols+k];
+				}
+			}
+			for( j = l; j < cols; j++ ) 
+			{
+				R[i*cols+j] = 0.0f;
+				R[j*cols+i] = 0.0f;
+			}
+		}
+		R[i*cols+i] = 1.0f;
+		g = temp[i];
+		l = i;
+	}
+
+
+	for( i = cols - 1; i >= 0; i-- ) 
+	{
+		l = i + 1;
+		g = diag[i];
+		if( i < cols - 1 ) for( j = l; j < cols; j++ ) Q[i*cols+j] = 0.0f;
+		if( g != 0.0f ) 
+		{
+			g = 1.0f / g;
+			if( i != cols - 1 ) 
+			{
+				for( j = l; j < cols; j++ ) 
+				{
+					s = 0.0f;
+					for( k = l; k < rows; k++ ) s += Q[k*cols+i] * Q[k*cols+j];
+					f = ( s / Q[i*cols+i] ) * g;
+					for( k = i; k < rows; k++ ) Q[k*cols+j] += f * Q[k*cols+i];
+				}
+			}
+			for( j = i; j < rows; j++ ) Q[j*cols+i] *= g;
+		} 
+		else 
+		{
+			for( j = i; j < rows; j++ ) Q[j*cols+i] = 0.0f;
+		}
+		Q[i*cols+i] += 1.0f;
+	}
+
+
+	for( k = cols - 1; k >= 0; k-- ) 
+	{
+		for( iter = 1; iter <= MaxIterations; iter++ ) 
+		{
+			int jump;
+
+			for( l = k; l >= 0; l-- )
+			{
+				q = l - 1;
+				if( fabsf( temp[l] ) + norm == norm ) { jump = 1; break; }
+				if( fabsf( diag[q] ) + norm == norm ) { jump = 0; break; }
+			}
+
+			if( !jump )
+			{
+				c = 0.0f;
+				s = 1.0f;
+				for( i = l; i <= k; i++ )
+				{
+					f = s * temp[i];
+					temp[i] *= c;
+					if( fabsf( f ) + norm == norm ) break;
+					g = diag[i];
+					h = svd_pythag( f, g );
+					diag[i] = h;
+					h = 1.0f / h;
+					c = g * h;
+					s = -f * h;
+					for( j = 0; j < rows; j++ ) 
+					{
+						y = Q[j*cols+q];
+						z = Q[j*cols+i];
+						Q[j*cols+q] = y * c + z * s;
+						Q[j*cols+i] = z * c - y * s;
+					}
+				}
+			}
+
+			z = diag[k];
+			if( l == k ) 
+			{
+				if( z < 0.0f ) 
+				{
+					diag[k] = -z;
+					for( j = 0; j < cols; j++ ) R[k*cols+j] *= -1.0f; 
+				}
+				break;
+			}
+			if( iter >= MaxIterations ) return;
+			x = diag[l];
+			q = k - 1;
+			y = diag[q];
+			g = temp[q];
+			h = temp[k];
+			f = ( ( y - z ) * ( y + z ) + ( g - h ) * ( g + h ) ) / ( 2.0f * h * y );
+			g = svd_pythag( f, 1.0f );
+			f = ( ( x - z ) * ( x + z ) + h * ( ( y / ( f + SameSign( g, f ) ) ) - h ) ) / x;
+			c = 1.0f;
+			s = 1.0f;
+			for( j = l; j <= q; j++ ) 
+			{
+				i = j + 1;
+				g = temp[i];
+				y = diag[i];
+				h = s * g;
+				g = c * g;
+				z = svd_pythag( f, h );
+				temp[j] = z;
+				c = f / z;
+				s = h / z;
+				f = x * c + g * s;
+				g = g * c - x * s;
+				h = y * s;
+				y = y * c;
+				for( p = 0; p < cols; p++ ) 
+				{
+					x = R[j*cols+p];
+					z = R[i*cols+p];
+					R[j*cols+p] = x * c + z * s;
+					R[i*cols+p] = z * c - x * s;
+				}
+				z = svd_pythag( f, h );
+				diag[j] = z;
+				if( z != 0.0f ) 
+				{
+					z = 1.0f / z;
+					c = f * z;
+					s = h * z;
+				}
+				f = c * g + s * y;
+				x = c * y - s * g;
+				for( p = 0; p < rows; p++ ) 
+				{
+					y = Q[p*cols+j];
+					z = Q[p*cols+i];
+					Q[p*cols+j] = y * c + z * s;
+					Q[p*cols+i] = z * c - y * s;
+				}
+			}
+			temp[l] = 0.0f;
+			temp[k] = f;
+			diag[k] = x;
+		}
+	}
+
+	// Sort the singular values into descending order.
+
+	for( i = 0; i < cols - 1; i++ )
+	{
+		float biggest = diag[i];  // Biggest singular value so far.
+		int   bindex  = i;        // The row/col it occurred in.
+		for( j = i + 1; j < cols; j++ )
+		{
+			if( diag[j] > biggest ) 
+			{
+				biggest = diag[j];
+				bindex  = j;
+			}            
+		}
+		if( bindex != i )  // Need to swap rows and columns.
+		{
+			// Swap columns in Q.
+			for (int j = 0; j < rows; ++j)
+				swap(Q[j*cols+i], Q[j*cols+bindex]);
+
+			// Swap rows in R.
+			for (int j = 0; j < cols; ++j)
+				swap(R[i*cols+j], R[bindex*cols+j]);
+
+			// Swap elements in diag.
+			swap(diag[i], diag[bindex]);
+		}
+	}
+}
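+
+// Note (illustrative, not part of the original source): ArvoSVD factors the rows x cols input
+// in place. On return, the columns of Q hold the left singular vectors, diag holds the cols
+// singular values in descending order, and the rows of R hold the corresponding right singular
+// vectors, so the input matrix M is reconstructed as M = Q * diag(diag) * R.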

+ 49 - 0
3rdparty/nvtt/nvmath/fitting.h

@@ -0,0 +1,49 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_MATH_FITTING_H
+#define NV_MATH_FITTING_H
+
+#include "vector.h"
+#include "plane.h"
+
+namespace nv
+{
+    namespace Fit
+    {
+        Vector3 computeCentroid(int n, const Vector3 * points);
+        Vector3 computeCentroid(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+        Vector4 computeCentroid(int n, const Vector4 * points);
+        Vector4 computeCentroid(int n, const Vector4 * points, const float * weights, const Vector4 & metric);
+
+        Vector3 computeCovariance(int n, const Vector3 * points, float * covariance);
+        Vector3 computeCovariance(int n, const Vector3 * points, const float * weights, const Vector3 & metric, float * covariance);
+
+        Vector4 computeCovariance(int n, const Vector4 * points, float * covariance);
+        Vector4 computeCovariance(int n, const Vector4 * points, const float * weights, const Vector4 & metric, float * covariance);
+
+        Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points);
+        Vector3 computePrincipalComponent_PowerMethod(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+        Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points);
+        Vector3 computePrincipalComponent_EigenSolver(int n, const Vector3 * points, const float * weights, const Vector3 & metric);
+
+        Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points);
+        Vector4 computePrincipalComponent_EigenSolver(int n, const Vector4 * points, const float * weights, const Vector4 & metric);
+
+        Vector3 computePrincipalComponent_SVD(int n, const Vector3 * points);
+        Vector4 computePrincipalComponent_SVD(int n, const Vector4 * points);
+
+        Plane bestPlane(int n, const Vector3 * points);
+        bool isPlanar(int n, const Vector3 * points, float epsilon = NV_EPSILON);
+
+        bool eigenSolveSymmetric3(const float matrix[6], float eigenValues[3], Vector3 eigenVectors[3]);
+        bool eigenSolveSymmetric4(const float matrix[10], float eigenValues[4], Vector4 eigenVectors[4]);
+
+        // Returns number of clusters [1-4].
+        int compute4Means(int n, const Vector3 * points, const float * weights, const Vector3 & metric, Vector3 * cluster);
+    }
+
+} // nv namespace
+
+#endif // NV_MATH_FITTING_H

+ 112 - 0
3rdparty/nvtt/nvmath/matrix.h

@@ -0,0 +1,112 @@
+// This code is in the public domain -- [email protected]
+
+#ifndef NV_MATH_MATRIX_H
+#define NV_MATH_MATRIX_H
+
+#include "vector.h"
+
+// - Matrices are stored in memory in *column-major* order.
+// - Points are to be thought of as column vectors.
+// - Transformation of a point p by a matrix M is: p' = M * p
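+//
+// Illustrative indexing sketch (not part of the original header): with column-major
+// storage, element (row, col) of the 4x4 Matrix lives at m_data[col * 4 + row].
+// For example:
+//
+//   Matrix m(identity);
+//   m(0, 3) = 5.0f;           // writes m_data[12]; (0,3) is the x translation term
+//   Vector4 t = m.column(3);  // (5, 0, 0, 1)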
+
+namespace nv
+{
+    enum identity_t { identity };
+
+    // 3x3 matrix.
+    class NVMATH_CLASS Matrix3
+    {
+    public:
+        Matrix3();
+        explicit Matrix3(float f);
+        explicit Matrix3(identity_t);
+        Matrix3(const Matrix3 & m);
+        Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2);
+
+        float data(uint idx) const;
+        float & data(uint idx);
+        float get(uint row, uint col) const;
+        float operator()(uint row, uint col) const;
+        float & operator()(uint row, uint col);
+
+        Vector3 row(uint i) const;
+        Vector3 column(uint i) const;
+
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator+=(const Matrix3 & m);
+        void operator-=(const Matrix3 & m);
+
+        void scale(float s);
+        void scale(Vector3::Arg s);
+        float determinant() const;
+
+    private:
+        float m_data[9];
+    };
+
+    // Solve equation system using LU decomposition and back-substitution.
+    extern bool solveLU(const Matrix3 & m, const Vector3 & b, Vector3 * x);
+
+    // Solve equation system using Cramer's inverse.
+    extern bool solveCramer(const Matrix3 & A, const Vector3 & b, Vector3 * x);
+
+
+    // 4x4 matrix.
+    class NVMATH_CLASS Matrix
+    {
+    public:
+        typedef Matrix const & Arg;
+
+        Matrix();
+        explicit Matrix(float f);
+        explicit Matrix(identity_t);
+        Matrix(const Matrix3 & m);
+        Matrix(const Matrix & m);
+        Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3);
+        //explicit Matrix(const float m[]);	// m is assumed to contain 16 elements
+
+        float data(uint idx) const;
+        float & data(uint idx);
+        float get(uint row, uint col) const;
+        float operator()(uint row, uint col) const;
+        float & operator()(uint row, uint col);
+        const float * ptr() const;
+
+        Vector4 row(uint i) const;
+        Vector4 column(uint i) const;
+
+        void zero();
+        void identity();
+
+        void scale(float s);
+        void scale(Vector3::Arg s);
+        void translate(Vector3::Arg t);
+        void rotate(float theta, float v0, float v1, float v2);
+        float determinant() const;
+
+        void operator+=(const Matrix & m);
+        void operator-=(const Matrix & m);
+
+        void apply(Matrix::Arg m);
+
+    private:
+        float m_data[16];
+    };
+
+    // Solve equation system using LU decomposition and back-substitution.
+    extern bool solveLU(const Matrix & A, const Vector4 & b, Vector4 * x);
+
+    // Solve equation system using Cramer's inverse.
+    extern bool solveCramer(const Matrix & A, const Vector4 & b, Vector4 * x);
+
+    // Compute inverse using LU decomposition.
+    extern Matrix inverseLU(const Matrix & m);
+
+    // Compute inverse using Gaussian elimination and partial pivoting.
+    extern Matrix inverse(const Matrix & m);
+    extern Matrix3 inverse(const Matrix3 & m);
+
+} // nv namespace
+
+#endif // NV_MATH_MATRIX_H

+ 1274 - 0
3rdparty/nvtt/nvmath/matrix.inl

@@ -0,0 +1,1274 @@
+// This code is in the public domain -- [email protected]
+
+#pragma once
+#ifndef NV_MATH_MATRIX_INL
+#define NV_MATH_MATRIX_INL
+
+#include "Matrix.h"
+
+namespace nv
+{
+    inline Matrix3::Matrix3() {}
+    
+    inline Matrix3::Matrix3(float f)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] = f;
+        }
+    }
+
+    inline Matrix3::Matrix3(identity_t)
+    {
+        for(int i = 0; i < 3; i++) {
+            for(int j = 0; j < 3; j++) {
+                m_data[3*j+i] = (i == j) ? 1.0f : 0.0f;
+            }
+        }
+    }
+
+    inline Matrix3::Matrix3(const Matrix3 & m)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] = m.m_data[i];
+        }
+    }
+    
+    inline Matrix3::Matrix3(Vector3::Arg v0, Vector3::Arg v1, Vector3::Arg v2)
+    {
+        m_data[0] = v0.x; m_data[1] = v0.y; m_data[2] = v0.z;
+        m_data[3] = v1.x; m_data[4] = v1.y; m_data[5] = v1.z;
+        m_data[6] = v2.x; m_data[7] = v2.y; m_data[8] = v2.z;
+    }
+
+    inline float Matrix3::data(uint idx) const
+    {
+        nvDebugCheck(idx < 9);
+        return m_data[idx];
+    }
+    inline float & Matrix3::data(uint idx)
+    {
+        nvDebugCheck(idx < 9);
+        return m_data[idx];
+    }
+    inline float Matrix3::get(uint row, uint col) const
+    {
+        nvDebugCheck(row < 3 && col < 3);
+        return m_data[col * 3 + row];
+    }
+    inline float Matrix3::operator()(uint row, uint col) const
+    {
+        nvDebugCheck(row < 3 && col < 3);
+        return m_data[col * 3 + row];
+    }
+    inline float & Matrix3::operator()(uint row, uint col)
+    {
+        nvDebugCheck(row < 3 && col < 3);
+        return m_data[col * 3 + row];
+    }
+
+    inline Vector3 Matrix3::row(uint i) const
+    {
+        nvDebugCheck(i < 3);
+        return Vector3(get(i, 0), get(i, 1), get(i, 2));
+    }
+    inline Vector3 Matrix3::column(uint i) const
+    {
+        nvDebugCheck(i < 3);
+        return Vector3(get(0, i), get(1, i), get(2, i));
+    }
+
+    inline void Matrix3::operator*=(float s)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] *= s;
+        }
+    }
+
+    inline void Matrix3::operator/=(float s)
+    {
+        float is = 1.0f /s;
+        for(int i = 0; i < 9; i++) {
+            m_data[i] *= is;
+        }
+    }
+
+    inline void Matrix3::operator+=(const Matrix3 & m)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] += m.m_data[i];
+        }
+    }
+
+    inline void Matrix3::operator-=(const Matrix3 & m)
+    {
+        for(int i = 0; i < 9; i++) {
+            m_data[i] -= m.m_data[i];
+        }
+    }
+
+    inline Matrix3 operator+(const Matrix3 & a, const Matrix3 & b)
+    {
+        Matrix3 m = a;
+        m += b;
+        return m;
+    }
+
+    inline Matrix3 operator-(const Matrix3 & a, const Matrix3 & b)
+    {
+        Matrix3 m = a;
+        m -= b;
+        return m;
+    }
+
+    inline Matrix3 operator*(const Matrix3 & a, float s)
+    {
+        Matrix3 m = a;
+        m *= s;
+        return m;
+    }
+
+    inline Matrix3 operator*(float s, const Matrix3 & a)
+    {
+        Matrix3 m = a;
+        m *= s;
+        return m;
+    }
+
+    inline Matrix3 operator/(const Matrix3 & a, float s)
+    {
+        Matrix3 m = a;
+        m /= s;
+        return m;
+    }
+
+    inline Matrix3 mul(const Matrix3 & a, const Matrix3 & b)
+    {
+        Matrix3 m;
+
+        for(int i = 0; i < 3; i++) {
+            const float ai0 = a(i,0), ai1 = a(i,1), ai2 = a(i,2);
+            m(i, 0) = ai0 * b(0,0) + ai1 * b(1,0) + ai2 * b(2,0);
+            m(i, 1) = ai0 * b(0,1) + ai1 * b(1,1) + ai2 * b(2,1);
+            m(i, 2) = ai0 * b(0,2) + ai1 * b(1,2) + ai2 * b(2,2);
+        }
+
+        return m;
+    }
+
+    inline Matrix3 operator*(const Matrix3 & a, const Matrix3 & b)
+    {
+        return mul(a, b);
+    }
+
+    // Transform the given 3d vector with the given matrix.
+    inline Vector3 transform(const Matrix3 & m, const Vector3 & p)
+    {
+        return Vector3(
+            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2),
+            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2),
+            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2));
+    }
+
+    inline void Matrix3::scale(float s)
+    {
+        for (int i = 0; i < 9; i++) {
+            m_data[i] *= s;
+        }
+    }
+
+    inline void Matrix3::scale(Vector3::Arg s)
+    {
+        m_data[0] *= s.x; m_data[1] *= s.x; m_data[2] *= s.x;
+        m_data[3] *= s.y; m_data[4] *= s.y; m_data[5] *= s.y;
+        m_data[6] *= s.z; m_data[7] *= s.z; m_data[8] *= s.z;
+    }
+
+    inline float Matrix3::determinant() const
+    {
+        return 
+            get(0,0) * get(1,1) * get(2,2) + 
+            get(0,1) * get(1,2) * get(2,0) + 
+            get(0,2) * get(1,0) * get(2,1) -
+            get(0,2) * get(1,1) * get(2,0) - 
+            get(0,1) * get(1,0) * get(2,2) -
+            get(0,0) * get(1,2) * get(2,1);
+    }
+
+    // Inverse using Cramer's rule.
+    inline Matrix3 inverseCramer(const Matrix3 & m)
+    {
+        const float det = m.determinant();
+        if (equal(det, 0.0f, 0.0f)) {
+            return Matrix3(0);
+        }
+
+        Matrix3 r;
+
+        r.data(0) =  - m.data(5) * m.data(7) + m.data(4) * m.data(8);
+        r.data(1) =  + m.data(5) * m.data(6) - m.data(3) * m.data(8);
+        r.data(2) =  - m.data(4) * m.data(6) + m.data(3) * m.data(7);
+
+        r.data(3) =  + m.data(2) * m.data(7) - m.data(1) * m.data(8);
+        r.data(4) =  - m.data(2) * m.data(6) + m.data(0) * m.data(8);
+        r.data(5) =  + m.data(1) * m.data(6) - m.data(0) * m.data(7);
+
+        r.data(6) =  - m.data(2) * m.data(4) + m.data(1) * m.data(5);
+        r.data(7) =  + m.data(2) * m.data(3) - m.data(0) * m.data(5);
+        r.data(8) =  - m.data(1) * m.data(3) + m.data(0) * m.data(4);
+
+        r.scale(1.0f / det);
+
+        return r;
+    }
+
+
+
+    inline Matrix::Matrix()
+    {
+    }
+
+    inline Matrix::Matrix(float f)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] = f;
+        }
+    }
+
+    inline Matrix::Matrix(identity_t)
+    {
+        for(int i = 0; i < 4; i++) {
+            for(int j = 0; j < 4; j++) {
+                m_data[4*j+i] = (i == j) ? 1.0f : 0.0f;
+            }
+        }
+    }
+
+    inline Matrix::Matrix(const Matrix & m)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] = m.m_data[i];
+        }
+    }
+
+    inline Matrix::Matrix(const Matrix3 & m)
+    {
+        for(int i = 0; i < 3; i++) {
+            for(int j = 0; j < 3; j++) {
+                operator()(i, j) = m.get(i, j);
+            }
+        }
+        for(int i = 0; i < 4; i++) {
+            operator()(3, i) = 0;
+            operator()(i, 3) = 0;
+        }
+        operator()(3, 3) = 1.0f; // keep the homogeneous corner at 1 so the embedded 3x3 stays a valid affine transform
+    }
+
+    inline Matrix::Matrix(Vector4::Arg v0, Vector4::Arg v1, Vector4::Arg v2, Vector4::Arg v3)
+    {
+        m_data[ 0] = v0.x; m_data[ 1] = v0.y; m_data[ 2] = v0.z; m_data[ 3] = v0.w;
+        m_data[ 4] = v1.x; m_data[ 5] = v1.y; m_data[ 6] = v1.z; m_data[ 7] = v1.w;
+        m_data[ 8] = v2.x; m_data[ 9] = v2.y; m_data[10] = v2.z; m_data[11] = v2.w;
+        m_data[12] = v3.x; m_data[13] = v3.y; m_data[14] = v3.z; m_data[15] = v3.w;
+    }
+
+    /*inline Matrix::Matrix(const float m[])
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] = m[i];
+        }
+    }*/
+
+
+    // Accessors
+    inline float Matrix::data(uint idx) const
+    {
+        nvDebugCheck(idx < 16);
+        return m_data[idx];
+    }
+    inline float & Matrix::data(uint idx)
+    {
+        nvDebugCheck(idx < 16);
+        return m_data[idx];
+    }
+    inline float Matrix::get(uint row, uint col) const
+    {
+        nvDebugCheck(row < 4 && col < 4);
+        return m_data[col * 4 + row];
+    }
+    inline float Matrix::operator()(uint row, uint col) const
+    {
+        nvDebugCheck(row < 4 && col < 4);
+        return m_data[col * 4 + row];
+    }
+    inline float & Matrix::operator()(uint row, uint col)
+    {
+        nvDebugCheck(row < 4 && col < 4);
+        return m_data[col * 4 + row];
+    }
+
+    inline const float * Matrix::ptr() const
+    {
+        return m_data;
+    }
+
+    inline Vector4 Matrix::row(uint i) const
+    {
+        nvDebugCheck(i < 4);
+        return Vector4(get(i, 0), get(i, 1), get(i, 2), get(i, 3));
+    }
+
+    inline Vector4 Matrix::column(uint i) const
+    {
+        nvDebugCheck(i < 4);
+        return Vector4(get(0, i), get(1, i), get(2, i), get(3, i));
+    }
+
+    inline void Matrix::zero()
+    {
+        m_data[0] = 0; m_data[1] = 0; m_data[2] = 0; m_data[3] = 0;
+        m_data[4] = 0; m_data[5] = 0; m_data[6] = 0; m_data[7] = 0;
+        m_data[8] = 0; m_data[9] = 0; m_data[10] = 0; m_data[11] = 0;
+        m_data[12] = 0; m_data[13] = 0; m_data[14] = 0; m_data[15] = 0;
+    }
+
+    inline void Matrix::identity()
+    {
+        m_data[0] = 1; m_data[1] = 0; m_data[2] = 0; m_data[3] = 0;
+        m_data[4] = 0; m_data[5] = 1; m_data[6] = 0; m_data[7] = 0;
+        m_data[8] = 0; m_data[9] = 0; m_data[10] = 1; m_data[11] = 0;
+        m_data[12] = 0; m_data[13] = 0; m_data[14] = 0; m_data[15] = 1;
+    }
+
+    // Apply scale.
+    inline void Matrix::scale(float s)
+    {
+        m_data[0] *= s; m_data[1] *= s; m_data[2] *= s; m_data[3] *= s;
+        m_data[4] *= s; m_data[5] *= s; m_data[6] *= s; m_data[7] *= s;
+        m_data[8] *= s; m_data[9] *= s; m_data[10] *= s; m_data[11] *= s;
+        m_data[12] *= s; m_data[13] *= s; m_data[14] *= s; m_data[15] *= s;
+    }
+
+    // Apply scale.
+    inline void Matrix::scale(Vector3::Arg s)
+    {
+        m_data[0] *= s.x; m_data[1] *= s.x; m_data[2] *= s.x; m_data[3] *= s.x;
+        m_data[4] *= s.y; m_data[5] *= s.y; m_data[6] *= s.y; m_data[7] *= s.y;
+        m_data[8] *= s.z; m_data[9] *= s.z; m_data[10] *= s.z; m_data[11] *= s.z;
+    }
+
+    // Apply translation.
+    inline void Matrix::translate(Vector3::Arg t)
+    {
+        m_data[12] = m_data[0] * t.x + m_data[4] * t.y + m_data[8]  * t.z + m_data[12];
+        m_data[13] = m_data[1] * t.x + m_data[5] * t.y + m_data[9]  * t.z + m_data[13];
+        m_data[14] = m_data[2] * t.x + m_data[6] * t.y + m_data[10] * t.z + m_data[14];
+        m_data[15] = m_data[3] * t.x + m_data[7] * t.y + m_data[11] * t.z + m_data[15];
+    }
+
+    Matrix rotation(float theta, float v0, float v1, float v2);
+
+    // Apply rotation.
+    inline void Matrix::rotate(float theta, float v0, float v1, float v2)
+    {
+        Matrix R(rotation(theta, v0, v1, v2));
+        apply(R);
+    }
+
+    // Apply transform.
+    inline void Matrix::apply(Matrix::Arg m)
+    {
+        nvDebugCheck(this != &m);
+
+        for(int i = 0; i < 4; i++) {
+            const float ai0 = get(i,0), ai1 = get(i,1), ai2 = get(i,2), ai3 = get(i,3);
+            m_data[0 + i] = ai0 * m(0,0) + ai1 * m(1,0) + ai2 * m(2,0) + ai3 * m(3,0);
+            m_data[4 + i] = ai0 * m(0,1) + ai1 * m(1,1) + ai2 * m(2,1) + ai3 * m(3,1);
+            m_data[8 + i] = ai0 * m(0,2) + ai1 * m(1,2) + ai2 * m(2,2) + ai3 * m(3,2);
+            m_data[12+ i] = ai0 * m(0,3) + ai1 * m(1,3) + ai2 * m(2,3) + ai3 * m(3,3);
+        }
+    }
+
+    // Get scale matrix.
+    inline Matrix scale(Vector3::Arg s)
+    {
+        Matrix m(identity);
+        m(0,0) = s.x;
+        m(1,1) = s.y;
+        m(2,2) = s.z;
+        return m;
+    }
+
+    // Get scale matrix.
+    inline Matrix scale(float s)
+    {
+        Matrix m(identity);
+        m(0,0) = m(1,1) = m(2,2) = s;
+        return m;
+    }
+
+    // Get translation matrix.
+    inline Matrix translation(Vector3::Arg t)
+    {
+        Matrix m(identity);
+        m(0,3) = t.x;
+        m(1,3) = t.y;
+        m(2,3) = t.z;
+        return m;
+    }
+
+    // Get rotation matrix.
+    inline Matrix rotation(float theta, float v0, float v1, float v2)
+    {
+        float cost = cosf(theta);
+        float sint = sinf(theta);
+
+        Matrix m(identity);
+
+        if( 1 == v0 && 0 == v1 && 0 == v2 ) {
+            m(1,1) = cost; m(2,1) = -sint;
+            m(1,2) = sint; m(2,2) = cost;
+        }
+        else if( 0 == v0  && 1 == v1 && 0 == v2 ) {
+            m(0,0) = cost; m(2,0) = sint;
+            m(0,2) = -sint; m(2,2) = cost;
+        }
+        else if( 0 == v0 && 0 == v1 && 1 == v2 ) {
+            m(0,0) = cost; m(1,0) = -sint;
+            m(0,1) = sint; m(1,1) = cost;
+        } 
+        else {
+            float a2, b2, c2;
+            a2 = v0 * v0;
+            b2 = v1 * v1;
+            c2 = v2 * v2;
+
+            float iscale = 1.0f / sqrtf(a2 + b2 + c2);
+            v0 *= iscale;
+            v1 *= iscale;
+            v2 *= iscale;
+
+            float abm, acm, bcm;
+            float mcos, asin, bsin, csin;
+            mcos = 1.0f - cost;
+            abm = v0 * v1 * mcos;
+            acm = v0 * v2 * mcos;
+            bcm = v1 * v2 * mcos;
+            asin = v0 * sint;
+            bsin = v1 * sint;
+            csin = v2 * sint;
+            m(0,0) = a2 * mcos + cost;
+            m(1,0) = abm - csin;
+            m(2,0) = acm + bsin;
+            m(0,1) = abm + csin;
+            m(1,1) = b2 * mcos + cost;
+            m(2,1) = bcm - asin;
+            m(0,2) = acm - bsin;
+            m(1,2) = bcm + asin;
+            m(2,2) = c2 * mcos + cost;
+        }
+        return m;
+    }
+
+    //Matrix rotation(float yaw, float pitch, float roll);
+    //Matrix skew(float angle, Vector3::Arg v1, Vector3::Arg v2);
+
+    // Get frustum matrix.
+    inline Matrix frustum(float xmin, float xmax, float ymin, float ymax, float zNear, float zFar)
+    {
+        Matrix m(0.0f);
+
+        float doubleznear = 2.0f * zNear;
+        float one_deltax = 1.0f / (xmax - xmin);
+        float one_deltay = 1.0f / (ymax - ymin);
+        float one_deltaz = 1.0f / (zFar - zNear);
+
+        m(0,0) = doubleznear * one_deltax;
+        m(1,1) = doubleznear * one_deltay;
+        m(0,2) = (xmax + xmin) * one_deltax;
+        m(1,2) = (ymax + ymin) * one_deltay;
+        m(2,2) = -(zFar + zNear) * one_deltaz;
+        m(3,2) = -1.0f;
+        m(2,3) = -(zFar * doubleznear) * one_deltaz;
+
+        return m;
+    }
+
+    // Get inverse frustum matrix.
+    inline Matrix frustumInverse(float xmin, float xmax, float ymin, float ymax, float zNear, float zFar)
+    {
+        Matrix m(0.0f);
+
+        float one_doubleznear = 1.0f / (2.0f * zNear);
+        float one_doubleznearzfar = 1.0f / (2.0f * zNear * zFar);
+
+        m(0,0) = (xmax - xmin) * one_doubleznear;
+        m(0,3) = (xmax + xmin) * one_doubleznear;
+        m(1,1) = (ymax - ymin) * one_doubleznear;
+        m(1,3) = (ymax + ymin) * one_doubleznear;
+        m(2,3) = -1;
+        m(3,2) = -(zFar - zNear) * one_doubleznearzfar;
+        m(3,3) = (zFar + zNear) * one_doubleznearzfar;
+
+        return m;
+    }
+
+    // Get infinite frustum matrix.
+    inline Matrix frustum(float xmin, float xmax, float ymin, float ymax, float zNear)
+    {
+        Matrix m(0.0f);
+
+        float doubleznear = 2.0f * zNear;
+        float one_deltax = 1.0f / (xmax - xmin);
+        float one_deltay = 1.0f / (ymax - ymin);
+        float nudge = 1.0f; // 0.999f;
+
+        m(0,0) = doubleznear * one_deltax;
+        m(1,1) = doubleznear * one_deltay;
+        m(0,2) = (xmax + xmin) * one_deltax;
+        m(1,2) = (ymax + ymin) * one_deltay;
+        m(2,2) = -1.0f * nudge;
+        m(3,2) = -1.0f;
+        m(2,3) = -doubleznear * nudge;
+
+        return m;
+    }
+
+    // Get perspective matrix.
+    inline Matrix perspective(float fovy, float aspect, float zNear, float zFar)
+    {
+        float xmax = zNear * tanf(fovy / 2);
+        float xmin = -xmax;
+
+        float ymax = xmax / aspect;
+        float ymin = -ymax;
+
+        return frustum(xmin, xmax, ymin, ymax, zNear, zFar);	
+    }
+
+    // Get inverse perspective matrix.
+    inline Matrix perspectiveInverse(float fovy, float aspect, float zNear, float zFar)
+    {
+        float xmax = zNear * tanf(fovy / 2);
+        float xmin = -xmax;
+
+        float ymax = xmax / aspect;
+        float ymin = -ymax;
+
+        return frustumInverse(xmin, xmax, ymin, ymax, zNear, zFar);	
+    }
+
+    // Get infinite perspective matrix.
+    inline Matrix perspective(float fovy, float aspect, float zNear)
+    {
+        float x = zNear * tanf(fovy / 2);
+        float y = x / aspect;
+        return frustum( -x, x, -y, y, zNear );	
+    }
+
+    // Get matrix determinant.
+    inline float Matrix::determinant() const
+    {
+        return 
+            m_data[3] * m_data[6] * m_data[ 9] * m_data[12] - m_data[2] * m_data[7] * m_data[ 9] * m_data[12] - m_data[3] * m_data[5] * m_data[10] * m_data[12] + m_data[1] * m_data[7] * m_data[10] * m_data[12] +
+            m_data[2] * m_data[5] * m_data[11] * m_data[12] - m_data[1] * m_data[6] * m_data[11] * m_data[12] - m_data[3] * m_data[6] * m_data[ 8] * m_data[13] + m_data[2] * m_data[7] * m_data[ 8] * m_data[13] +
+            m_data[3] * m_data[4] * m_data[10] * m_data[13] - m_data[0] * m_data[7] * m_data[10] * m_data[13] - m_data[2] * m_data[4] * m_data[11] * m_data[13] + m_data[0] * m_data[6] * m_data[11] * m_data[13] +
+            m_data[3] * m_data[5] * m_data[ 8] * m_data[14] - m_data[1] * m_data[7] * m_data[ 8] * m_data[14] - m_data[3] * m_data[4] * m_data[ 9] * m_data[14] + m_data[0] * m_data[7] * m_data[ 9] * m_data[14] +
+            m_data[1] * m_data[4] * m_data[11] * m_data[14] - m_data[0] * m_data[5] * m_data[11] * m_data[14] - m_data[2] * m_data[5] * m_data[ 8] * m_data[15] + m_data[1] * m_data[6] * m_data[ 8] * m_data[15] +
+            m_data[2] * m_data[4] * m_data[ 9] * m_data[15] - m_data[0] * m_data[6] * m_data[ 9] * m_data[15] - m_data[1] * m_data[4] * m_data[10] * m_data[15] + m_data[0] * m_data[5] * m_data[10] * m_data[15];
+    }
+
+    inline Matrix transpose(Matrix::Arg m)
+    {
+        Matrix r;
+        for (int i = 0; i < 4; i++)
+        {
+            for (int j = 0; j < 4; j++)
+            {
+                r(i, j) = m(j, i);
+            }
+        }
+        return r;
+    }
+
+    // Inverse using Cramer's rule.
+    inline Matrix inverseCramer(Matrix::Arg m)
+    {
+        Matrix r;
+        r.data( 0) = m.data(6)*m.data(11)*m.data(13) - m.data(7)*m.data(10)*m.data(13) + m.data(7)*m.data(9)*m.data(14) - m.data(5)*m.data(11)*m.data(14) - m.data(6)*m.data(9)*m.data(15) + m.data(5)*m.data(10)*m.data(15);
+        r.data( 1) = m.data(3)*m.data(10)*m.data(13) - m.data(2)*m.data(11)*m.data(13) - m.data(3)*m.data(9)*m.data(14) + m.data(1)*m.data(11)*m.data(14) + m.data(2)*m.data(9)*m.data(15) - m.data(1)*m.data(10)*m.data(15);
+        r.data( 2) = m.data(2)*m.data( 7)*m.data(13) - m.data(3)*m.data( 6)*m.data(13) + m.data(3)*m.data(5)*m.data(14) - m.data(1)*m.data( 7)*m.data(14) - m.data(2)*m.data(5)*m.data(15) + m.data(1)*m.data( 6)*m.data(15);
+        r.data( 3) = m.data(3)*m.data( 6)*m.data( 9) - m.data(2)*m.data( 7)*m.data( 9) - m.data(3)*m.data(5)*m.data(10) + m.data(1)*m.data( 7)*m.data(10) + m.data(2)*m.data(5)*m.data(11) - m.data(1)*m.data( 6)*m.data(11);
+        r.data( 4) = m.data(7)*m.data(10)*m.data(12) - m.data(6)*m.data(11)*m.data(12) - m.data(7)*m.data(8)*m.data(14) + m.data(4)*m.data(11)*m.data(14) + m.data(6)*m.data(8)*m.data(15) - m.data(4)*m.data(10)*m.data(15);
+        r.data( 5) = m.data(2)*m.data(11)*m.data(12) - m.data(3)*m.data(10)*m.data(12) + m.data(3)*m.data(8)*m.data(14) - m.data(0)*m.data(11)*m.data(14) - m.data(2)*m.data(8)*m.data(15) + m.data(0)*m.data(10)*m.data(15);
+        r.data( 6) = m.data(3)*m.data( 6)*m.data(12) - m.data(2)*m.data( 7)*m.data(12) - m.data(3)*m.data(4)*m.data(14) + m.data(0)*m.data( 7)*m.data(14) + m.data(2)*m.data(4)*m.data(15) - m.data(0)*m.data( 6)*m.data(15);
+        r.data( 7) = m.data(2)*m.data( 7)*m.data( 8) - m.data(3)*m.data( 6)*m.data( 8) + m.data(3)*m.data(4)*m.data(10) - m.data(0)*m.data( 7)*m.data(10) - m.data(2)*m.data(4)*m.data(11) + m.data(0)*m.data( 6)*m.data(11);
+        r.data( 8) = m.data(5)*m.data(11)*m.data(12) - m.data(7)*m.data( 9)*m.data(12) + m.data(7)*m.data(8)*m.data(13) - m.data(4)*m.data(11)*m.data(13) - m.data(5)*m.data(8)*m.data(15) + m.data(4)*m.data( 9)*m.data(15);
+        r.data( 9) = m.data(3)*m.data( 9)*m.data(12) - m.data(1)*m.data(11)*m.data(12) - m.data(3)*m.data(8)*m.data(13) + m.data(0)*m.data(11)*m.data(13) + m.data(1)*m.data(8)*m.data(15) - m.data(0)*m.data( 9)*m.data(15);
+        r.data(10) = m.data(1)*m.data( 7)*m.data(12) - m.data(3)*m.data( 5)*m.data(12) + m.data(3)*m.data(4)*m.data(13) - m.data(0)*m.data( 7)*m.data(13) - m.data(1)*m.data(4)*m.data(15) + m.data(0)*m.data( 5)*m.data(15);
+        r.data(11) = m.data(3)*m.data( 5)*m.data( 8) - m.data(1)*m.data( 7)*m.data( 8) - m.data(3)*m.data(4)*m.data( 9) + m.data(0)*m.data( 7)*m.data( 9) + m.data(1)*m.data(4)*m.data(11) - m.data(0)*m.data( 5)*m.data(11);
+        r.data(12) = m.data(6)*m.data( 9)*m.data(12) - m.data(5)*m.data(10)*m.data(12) - m.data(6)*m.data(8)*m.data(13) + m.data(4)*m.data(10)*m.data(13) + m.data(5)*m.data(8)*m.data(14) - m.data(4)*m.data( 9)*m.data(14);
+        r.data(13) = m.data(1)*m.data(10)*m.data(12) - m.data(2)*m.data( 9)*m.data(12) + m.data(2)*m.data(8)*m.data(13) - m.data(0)*m.data(10)*m.data(13) - m.data(1)*m.data(8)*m.data(14) + m.data(0)*m.data( 9)*m.data(14);
+        r.data(14) = m.data(2)*m.data( 5)*m.data(12) - m.data(1)*m.data( 6)*m.data(12) - m.data(2)*m.data(4)*m.data(13) + m.data(0)*m.data( 6)*m.data(13) + m.data(1)*m.data(4)*m.data(14) - m.data(0)*m.data( 5)*m.data(14);
+        r.data(15) = m.data(1)*m.data( 6)*m.data( 8) - m.data(2)*m.data( 5)*m.data( 8) + m.data(2)*m.data(4)*m.data( 9) - m.data(0)*m.data( 6)*m.data( 9) - m.data(1)*m.data(4)*m.data(10) + m.data(0)*m.data( 5)*m.data(10);
+        r.scale(1.0f / m.determinant());
+        return r;
+    }
+
+    inline Matrix isometryInverse(Matrix::Arg m)
+    {
+        Matrix r(identity);
+
+        // transposed 3x3 upper left matrix
+        for (int i = 0; i < 3; i++)
+        {
+            for (int j = 0; j < 3; j++)
+            {
+                r(i, j) = m(j, i);
+            }
+        }
+
+        // translate by the negative offsets
+        r.translate(-Vector3(m.data(12), m.data(13), m.data(14)));
+
+        return r;
+    }
+
+    // Transform the given 3d point with the given matrix.
+    inline Vector3 transformPoint(Matrix::Arg m, Vector3::Arg p)
+    {
+        return Vector3(
+            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + m(0,3),
+            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + m(1,3),
+            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + m(2,3));
+    }
+
+    // Transform the given 3d vector with the given matrix.
+    inline Vector3 transformVector(Matrix::Arg m, Vector3::Arg p)
+    {
+        return Vector3(
+            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2),
+            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2),
+            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2));
+    }
+
+    // Transform the given 4d vector with the given matrix.
+    inline Vector4 transform(Matrix::Arg m, Vector4::Arg p)
+    {
+        return Vector4(
+            p.x * m(0,0) + p.y * m(0,1) + p.z * m(0,2) + p.w * m(0,3),
+            p.x * m(1,0) + p.y * m(1,1) + p.z * m(1,2) + p.w * m(1,3),
+            p.x * m(2,0) + p.y * m(2,1) + p.z * m(2,2) + p.w * m(2,3),
+            p.x * m(3,0) + p.y * m(3,1) + p.z * m(3,2) + p.w * m(3,3));
+    }
+
+    inline Matrix mul(Matrix::Arg a, Matrix::Arg b)
+    {
+        // @@ Is this the right order? mul(a, b) = b * a
+        Matrix m = a;
+        m.apply(b);
+        return m;
+    }
+
+    inline void Matrix::operator+=(const Matrix & m)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] += m.m_data[i];
+        }
+    }
+
+    inline void Matrix::operator-=(const Matrix & m)
+    {
+        for(int i = 0; i < 16; i++) {
+            m_data[i] -= m.m_data[i];
+        }
+    }
+
+    inline Matrix operator+(const Matrix & a, const Matrix & b)
+    {
+        Matrix m = a;
+        m += b;
+        return m;
+    }
+
+    inline Matrix operator-(const Matrix & a, const Matrix & b)
+    {
+        Matrix m = a;
+        m -= b;
+        return m;
+    }
+
+
+} // nv namespace
+
+
+#if 0 // old code.
+/** @name Special matrices. */
+//@{
+/** Generate a translation matrix. */
+void TranslationMatrix(const Vec3 & v) {
+    data[0] = 1; data[1] = 0; data[2] = 0; data[3] = 0;
+    data[4] = 0; data[5] = 1; data[6] = 0; data[7] = 0;
+    data[8] = 0; data[9] = 0; data[10] = 1; data[11] = 0;
+    data[12] = v.x; data[13] = v.y; data[14] = v.z; data[15] = 1;
+}
+
+/** Rotate theta degrees around v. */
+void RotationMatrix( float theta, float v0, float v1, float v2 ) {
+    float cost = cos(theta);
+    float sint = sin(theta);
+
+    if( 1 == v0 && 0 == v1 && 0 == v2 ) {
+        data[0] = 1.0f;	data[1] = 0.0f;	data[2] = 0.0f;	data[3] = 0.0f;
+        data[4] = 0.0f;	data[5] = cost;	data[6] = -sint;data[7] = 0.0f;
+        data[8] = 0.0f;	data[9] = sint;	data[10] = cost;data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    }
+    else if( 0 == v0  && 1 == v1 && 0 == v2 ) {
+        data[0] = cost;	data[1] = 0.0f;	data[2] = sint;	data[3] = 0.0f;
+        data[4] = 0.0f;	data[5] = 1.0f;	data[6] = 0.0f;	data[7] = 0.0f;
+        data[8] = -sint;data[9] = 0.0f;data[10] = cost;	data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    }
+    else if( 0 == v0 && 0 == v1 && 1 == v2 ) {
+        data[0] = cost;	data[1] = -sint;data[2] = 0.0f;	data[3] = 0.0f;
+        data[4] = sint; data[5] = cost;	data[6] = 0.0f;	data[7] = 0.0f;
+        data[8] = 0.0f;	data[9] = 0.0f;	data[10] = 1.0f;data[11] = 0.0f;
+        data[12] = 0.0f;data[13] = 0.0f;data[14] = 0.0f;data[15] = 1.0f;
+    } 
+    else {
+        //we need scale a,b,c to unit length.
+        float a2, b2, c2;
+        a2 = v0 * v0;
+        b2 = v1 * v1;
+        c2 = v2 * v2;
+
+        float iscale = 1.0f / sqrtf(a2 + b2 + c2);
+        v0 *= iscale;
+        v1 *= iscale;
+        v2 *= iscale;
+
+        float abm, acm, bcm;
+        float mcos, asin, bsin, csin;
+        mcos = 1.0f - cost;
+        abm = v0 * v1 * mcos;
+        acm = v0 * v2 * mcos;
+        bcm = v1 * v2 * mcos;
+        asin = v0 * sint;
+        bsin = v1 * sint;
+        csin = v2 * sint;
+        data[0] = a2 * mcos + cost;
+        data[1] = abm - csin;
+        data[2] = acm + bsin;
+        data[3] = abm + csin;
+        data[4] = 0.0f;
+        data[5] = b2 * mcos + cost;
+        data[6] = bcm - asin;
+        data[7] = acm - bsin;
+        data[8] = 0.0f;
+        data[9] = bcm + asin;
+        data[10] = c2 * mcos + cost;
+        data[11] = 0.0f;
+        data[12] = 0.0f;
+        data[13] = 0.0f;
+        data[14] = 0.0f;
+        data[15] = 1.0f;
+    }
+}
+
+/*
+void SkewMatrix(float angle, const Vec3 & v1, const Vec3 & v2) {
+v1.Normalize();
+v2.Normalize();
+
+Vec3 v3;
+v3.Cross(v1, v2);
+v3.Normalize();
+
+// Get skew factor.
+float costheta = Vec3DotProduct(v1, v2);
+float sintheta = Real.Sqrt(1 - costheta * costheta);
+float skew = tan(Trig.DegreesToRadians(angle) + acos(sintheta)) * sintheta - costheta;
+
+// Build orthonormal matrix.
+v1 = FXVector3.Cross(v3, v2);
+v1.Normalize();
+
+Matrix R = Matrix::Identity;
+R[0, 0] = v3.X; // Not sure this is in the correct order...
+R[1, 0] = v3.Y;
+R[2, 0] = v3.Z;
+R[0, 1] = v1.X;
+R[1, 1] = v1.Y;
+R[2, 1] = v1.Z;
+R[0, 2] = v2.X;
+R[1, 2] = v2.Y;
+R[2, 2] = v2.Z;
+
+// Build skew matrix.
+Matrix S = Matrix::Identity;
+S[2, 1] = -skew;
+
+// Return skew transform.
+return R * S * R.Transpose;	// Not sure this is in the correct order...
+}
+*/
+
+/**
+* Generate rotation matrix for the euler angles. This is the same as computing
+* 3 rotation matrices and multiplying them together in our custom order.
+*
+* @todo Have to recompute this code for our new convention.
+**/
+void RotationMatrix( float yaw, float pitch, float roll ) {
+    float sy = sin(yaw+ToRadian(90));
+    float cy = cos(yaw+ToRadian(90));
+    float sp = sin(pitch-ToRadian(90));
+    float cp = cos(pitch-ToRadian(90));
+    float sr = sin(roll);
+    float cr = cos(roll);
+
+    data[0] = cr*cy + sr*sp*sy;
+    data[1] = cp*sy;
+    data[2] = -sr*cy + cr*sp*sy;
+    data[3] = 0;
+
+    data[4] = -cr*sy + sr*sp*cy;
+    data[5] = cp*cy;
+    data[6] = sr*sy + cr*sp*cy;
+    data[7] = 0;
+
+    data[8] = sr*cp;
+    data[9] = -sp;
+    data[10] = cr*cp;
+    data[11] = 0;
+
+    data[12] = 0;
+    data[13] = 0;
+    data[14] = 0;
+    data[15] = 1;
+}
+
+/** Create a frustum matrix with the far plane at the infinity. */
+void Frustum( float xmin, float xmax, float ymin, float ymax, float zNear, float zFar ) {
+    float one_deltax, one_deltay, one_deltaz, doubleznear;
+
+    doubleznear = 2.0f * zNear;
+    one_deltax = 1.0f / (xmax - xmin);
+    one_deltay = 1.0f / (ymax - ymin);
+    one_deltaz = 1.0f / (zFar - zNear);
+
+    data[0] = (float)(doubleznear * one_deltax);
+    data[1] = 0.0f;
+    data[2] = 0.0f;
+    data[3] = 0.0f;
+    data[4] = 0.0f;
+    data[5] = (float)(doubleznear * one_deltay);
+    data[6] = 0.f;
+    data[7] = 0.f;
+    data[8] = (float)((xmax + xmin) * one_deltax);
+    data[9] = (float)((ymax + ymin) * one_deltay);
+    data[10] = (float)(-(zFar + zNear) * one_deltaz);
+    data[11] = -1.f;
+    data[12] = 0.f;
+    data[13] = 0.f;
+    data[14] = (float)(-(zFar * doubleznear) * one_deltaz);
+    data[15] = 0.f;
+}
+
+/** Create a frustum matrix with the far plane at the infinity. */
+void FrustumInf( float xmin, float xmax, float ymin, float ymax, float zNear ) {
+    float one_deltax, one_deltay, doubleznear, nudge;
+
+    doubleznear = 2.0f * zNear;
+    one_deltax = 1.0f / (xmax - xmin);
+    one_deltay = 1.0f / (ymax - ymin);
+    nudge = 1.0; // 0.999;
+
+    data[0] = doubleznear * one_deltax;
+    data[1] = 0.0f;
+    data[2] = 0.0f;
+    data[3] = 0.0f;
+
+    data[4] = 0.0f;
+    data[5] = doubleznear * one_deltay;
+    data[6] = 0.f;
+    data[7] = 0.f;
+
+    data[8] = (xmax + xmin) * one_deltax;
+    data[9] = (ymax + ymin) * one_deltay;
+    data[10] = -1.0f * nudge;
+    data[11] = -1.0f;
+
+    data[12] = 0.f;
+    data[13] = 0.f;
+    data[14] = -doubleznear * nudge;
+    data[15] = 0.f;
+}
+
+/** Create an inverse frustum matrix with the far plane at the infinity. */
+void FrustumInfInv( float left, float right, float bottom, float top, float zNear ) {
+    // this matrix is wrong (not really tested); I think it should be transposed.
+    data[0] = (right - left) / (2 * zNear);
+    data[1] = 0;
+    data[2] = 0;
+    data[3] = (right + left) / (2 * zNear);
+    data[4] = 0;
+    data[5] = (top - bottom) / (2 * zNear);
+    data[6] = 0;
+    data[7] = (top + bottom) / (2 * zNear);
+    data[8] = 0;
+    data[9] = 0;
+    data[10] = 0;
+    data[11] = -1;
+    data[12] = 0;
+    data[13] = 0;
+    data[14] = -1 / (2 * zNear);
+    data[15] = 1 / (2 * zNear);
+}
+
+/** Create an homogeneous projection matrix. */
+void Perspective( float fov, float aspect, float zNear, float zFar ) {
+    float xmin, xmax, ymin, ymax;
+
+    xmax = zNear * tan( fov/2 );
+    xmin = -xmax;
+
+    ymax = xmax / aspect;
+    ymin = -ymax;
+
+    Frustum(xmin, xmax, ymin, ymax, zNear, zFar);
+}
+
+/** Create a projection matrix with the far plane at the infinity. */
+void PerspectiveInf( float fov, float aspect, float zNear ) {
+    float x = zNear * tan( fov/2 );
+    float y = x / aspect;
+    FrustumInf( -x, x, -y, y, zNear );
+}
+
+/** Create an inverse projection matrix with far plane at the infinity. */
+void PerspectiveInfInv( float fov, float aspect, float zNear ) {
+    float x = zNear * tan( fov/2 );
+    float y = x / aspect;
+    FrustumInfInv( -x, x, -y, y, zNear );
+}
+
+/** Build bone matrix from quatertion and offset. */
+void BoneMatrix(const Quat & q, const Vec3 & offset) {
+    float x2, y2, z2, xx, xy, xz, yy, yz, zz, wx, wy, wz;
+
+    // calculate coefficients
+    x2 = q.x + q.x;
+    y2 = q.y + q.y;
+    z2 = q.z + q.z;
+
+    xx = q.x * x2;   xy = q.x * y2;   xz = q.x * z2;
+    yy = q.y * y2;   yz = q.y * z2;   zz = q.z * z2;
+    wx = q.w * x2;   wy = q.w * y2;   wz = q.w * z2;
+
+    data[0] = 1.0f - (yy + zz); 	
+    data[1] = xy - wz;
+    data[2] = xz + wy;		
+    data[3] = 0.0f;
+
+    data[4] = xy + wz;		
+    data[5] = 1.0f - (xx + zz);
+    data[6] = yz - wx;		
+    data[7] = 0.0f;
+
+    data[8] = xz - wy;		
+    data[9] = yz + wx;
+    data[10] = 1.0f - (xx + yy);		
+    data[11] = 0.0f;
+
+    data[12] = offset.x;
+    data[13] = offset.y;
+    data[14] = offset.z;			
+    data[15] = 1.0f;
+}
+
+//@}
+
+
+/** @name Transformations: */
+//@{
+
+/** Apply a general scale. */
+void Scale( float x, float y, float z ) {
+    data[0] *= x;	data[4] *= y;	data[8]  *= z;
+    data[1] *= x;	data[5] *= y;	data[9]  *= z;
+    data[2] *= x;	data[6] *= y;	data[10] *= z;
+    data[3] *= x;	data[7] *= y;	data[11] *= z;
+}
+
+/** Apply a rotation of theta degrees around the axis v*/
+void Rotate( float theta, const Vec3 & v ) {
+    Matrix b;
+    b.RotationMatrix( theta, v[0], v[1], v[2] );
+    Multiply4x3( b );
+}
+
+/** Apply a rotation of theta degrees around the axis v*/
+void Rotate( float theta, float v0, float v1, float v2 ) {
+    Matrix b;
+    b.RotationMatrix( theta, v0, v1, v2 );
+    Multiply4x3( b );
+}
+
+/**
+* Translate the matrix by t. This is the same as multiplying by a
+* translation matrix with the given offset.
+* this = T * this
+*/
+void Translate( const Vec3 &t ) {
+    data[12] = data[0] * t.x + data[4] * t.y + data[8]  * t.z + data[12];
+    data[13] = data[1] * t.x + data[5] * t.y + data[9]  * t.z + data[13];
+    data[14] = data[2] * t.x + data[6] * t.y + data[10] * t.z + data[14];
+    data[15] = data[3] * t.x + data[7] * t.y + data[11] * t.z + data[15];
+}
+
+/** 
+* Translate the matrix by x, y, z. This is the same as multiplying by a 
+* translation matrix with the given offsets.
+*/
+void Translate( float x, float y, float z ) {
+    data[12] = data[0] * x + data[4] * y + data[8]  * z + data[12];
+    data[13] = data[1] * x + data[5] * y + data[9]  * z + data[13];
+    data[14] = data[2] * x + data[6] * y + data[10] * z + data[14];
+    data[15] = data[3] * x + data[7] * y + data[11] * z + data[15];
+}
+
+/** Compute the transposed matrix. */
+void Transpose() {
+    piSwap(data[1], data[4]);
+    piSwap(data[2], data[8]);
+    piSwap(data[6], data[9]);
+    piSwap(data[3], data[12]);
+    piSwap(data[7], data[13]);
+    piSwap(data[11], data[14]);
+}
+
+/** Compute the inverse of a rigid-body/isometry/orthonormal matrix. */
+void IsometryInverse() {
+    // transposed 3x3 upper left matrix
+    piSwap(data[1], data[4]);
+    piSwap(data[2], data[8]);
+    piSwap(data[6], data[9]);
+
+    // translate by the negative offsets
+    Vec3 v(-data[12], -data[13], -data[14]);
+    data[12] = data[13] = data[14] = 0;
+    Translate(v);
+}
+
+/** Compute the inverse of the affine portion of this matrix. */
+void AffineInverse() {
+    data[12] = data[13] = data[14] = 0;
+    Transpose();
+}
+//@}
+
+/** @name Matrix operations: */
+//@{
+
+/** Return the determinant of this matrix. */
+float Determinant() const {
+    return	data[0] * data[5] * data[10] * data[15] + 
+        data[1] * data[6] * data[11] * data[12] +
+        data[2] * data[7] * data[ 8] * data[13] +
+        data[3] * data[4] * data[ 9] * data[14] -
+        data[3] * data[6] * data[ 9] * data[12] -
+        data[2] * data[5] * data[ 8] * data[15] -
+        data[1] * data[4] * data[11] * data[14] -
+        data[0] * data[7] * data[10] * data[12];
+}
+
+
+/** Standard matrix product: this *= B. */
+void Multiply4x4( const Matrix & restrict B ) {
+    Multiply4x4(*this, B);
+}
+
+/** Standard matrix product: this = A * B. this != B*/
+void Multiply4x4( const Matrix & A, const Matrix & restrict B ) {
+    piDebugCheck(this != &B);
+
+    for(int i = 0; i < 4; i++) {
+        const float ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3);
+        GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
+        GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
+        GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
+        GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
+    }
+
+    /* Unrolled but does not allow this == A
+    data[0] = A.data[0] * B.data[0] + A.data[4] * B.data[1] + A.data[8] * B.data[2] + A.data[12] * B.data[3];
+    data[1] = A.data[1] * B.data[0] + A.data[5] * B.data[1] + A.data[9] * B.data[2] + A.data[13] * B.data[3];
+    data[2] = A.data[2] * B.data[0] + A.data[6] * B.data[1] + A.data[10] * B.data[2] + A.data[14] * B.data[3];
+    data[3] = A.data[3] * B.data[0] + A.data[7] * B.data[1] + A.data[11] * B.data[2] + A.data[15] * B.data[3];
+    data[4] = A.data[0] * B.data[4] + A.data[4] * B.data[5] + A.data[8] * B.data[6] + A.data[12] * B.data[7];
+    data[5] = A.data[1] * B.data[4] + A.data[5] * B.data[5] + A.data[9] * B.data[6] + A.data[13] * B.data[7];
+    data[6] = A.data[2] * B.data[4] + A.data[6] * B.data[5] + A.data[10] * B.data[6] + A.data[14] * B.data[7];
+    data[7] = A.data[3] * B.data[4] + A.data[7] * B.data[5] + A.data[11] * B.data[6] + A.data[15] * B.data[7];
+    data[8] = A.data[0] * B.data[8] + A.data[4] * B.data[9] + A.data[8] * B.data[10] + A.data[12] * B.data[11];
+    data[9] = A.data[1] * B.data[8] + A.data[5] * B.data[9] + A.data[9] * B.data[10] + A.data[13] * B.data[11];
+    data[10]= A.data[2] * B.data[8] + A.data[6] * B.data[9] + A.data[10] * B.data[10] + A.data[14] * B.data[11];
+    data[11]= A.data[3] * B.data[8] + A.data[7] * B.data[9] + A.data[11] * B.data[10] + A.data[15] * B.data[11];
+    data[12]= A.data[0] * B.data[12] + A.data[4] * B.data[13] + A.data[8] * B.data[14] + A.data[12] * B.data[15];
+    data[13]= A.data[1] * B.data[12] + A.data[5] * B.data[13] + A.data[9] * B.data[14] + A.data[13] * B.data[15];
+    data[14]= A.data[2] * B.data[12] + A.data[6] * B.data[13] + A.data[10] * B.data[14] + A.data[14] * B.data[15];
+    data[15]= A.data[3] * B.data[12] + A.data[7] * B.data[13] + A.data[11] * B.data[14] + A.data[15] * B.data[15];
+    */
+}
+
+/** Standard matrix product: this *= B. */
+void Multiply4x3( const Matrix & restrict B ) {
+    Multiply4x3(*this, B);
+}
+
+/** Standard product of matrices, where the last row is [0 0 0 1]. */
+void Multiply4x3( const Matrix & A, const Matrix & restrict B ) {
+    piDebugCheck(this != &B);
+
+    for(int i = 0; i < 3; i++) {
+        const float ai0 = A(i,0), ai1 = A(i,1), ai2 = A(i,2), ai3 = A(i,3);
+        GetElem(i,0) = ai0 * B(0,0) + ai1 * B(1,0) + ai2 * B(2,0) + ai3 * B(3,0);
+        GetElem(i,1) = ai0 * B(0,1) + ai1 * B(1,1) + ai2 * B(2,1) + ai3 * B(3,1);
+        GetElem(i,2) = ai0 * B(0,2) + ai1 * B(1,2) + ai2 * B(2,2) + ai3 * B(3,2);
+        GetElem(i,3) = ai0 * B(0,3) + ai1 * B(1,3) + ai2 * B(2,3) + ai3 * B(3,3);
+    }
+    data[3] = 0.0f; data[7] = 0.0f; data[11] = 0.0f; data[15] = 1.0f;
+
+    /* Unrolled but does not allow this == A
+    data[0] = a.data[0] * b.data[0] + a.data[4] * b.data[1] + a.data[8] * b.data[2] + a.data[12] * b.data[3];
+    data[1] = a.data[1] * b.data[0] + a.data[5] * b.data[1] + a.data[9] * b.data[2] + a.data[13] * b.data[3];
+    data[2] = a.data[2] * b.data[0] + a.data[6] * b.data[1] + a.data[10] * b.data[2] + a.data[14] * b.data[3];
+    data[3] = 0.0f;
+    data[4] = a.data[0] * b.data[4] + a.data[4] * b.data[5] + a.data[8] * b.data[6] + a.data[12] * b.data[7];
+    data[5] = a.data[1] * b.data[4] + a.data[5] * b.data[5] + a.data[9] * b.data[6] + a.data[13] * b.data[7];
+    data[6] = a.data[2] * b.data[4] + a.data[6] * b.data[5] + a.data[10] * b.data[6] + a.data[14] * b.data[7];
+    data[7] = 0.0f;
+    data[8] = a.data[0] * b.data[8] + a.data[4] * b.data[9] + a.data[8] * b.data[10] + a.data[12] * b.data[11];
+    data[9] = a.data[1] * b.data[8] + a.data[5] * b.data[9] + a.data[9] * b.data[10] + a.data[13] * b.data[11];
+    data[10]= a.data[2] * b.data[8] + a.data[6] * b.data[9] + a.data[10] * b.data[10] + a.data[14] * b.data[11];
+    data[11]= 0.0f;
+    data[12]= a.data[0] * b.data[12] + a.data[4] * b.data[13] + a.data[8] * b.data[14] + a.data[12] * b.data[15];
+    data[13]= a.data[1] * b.data[12] + a.data[5] * b.data[13] + a.data[9] * b.data[14] + a.data[13] * b.data[15];
+    data[14]= a.data[2] * b.data[12] + a.data[6] * b.data[13] + a.data[10] * b.data[14] + a.data[14] * b.data[15];
+    data[15]= 1.0f;
+    */
+}
+//@}
+
+
+/** @name Vector operations: */
+//@{
+
+/** Transform 3d vector (w=0). */
+void TransformVec3(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10];
+}
+/** Transform 3d vector by the transpose (w=0). */
+void TransformVec3T(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[1] + orig.z * data[2];
+    dest->y = orig.x * data[4] + orig.y * data[5] + orig.z * data[6];
+    dest->z = orig.x * data[8] + orig.y * data[9] + orig.z * data[10];
+}
+
+/** Transform a 3d homogeneous vector, where the fourth coordinate is assumed to be 1. */
+void TransformPoint(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+}
+
+/** Transform a point, normalize it, and return w. */
+float TransformPointAndNormalize(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    float w;
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    w = 1 / (orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15]);
+    *dest *= w;
+    return w;
+}
+
+/** Transform a point and return w. */
+float TransformPointReturnW(const Vec3 & restrict orig, Vec3 * restrict dest) const {
+    piDebugCheck(&orig != dest);
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    return orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15];
+}
+
+/** Transform a normalized 3d point by a 4d matrix and return the resulting 4d vector. */
+void TransformVec4(const Vec3 & orig, Vec4 * dest) const {
+    dest->x = orig.x * data[0] + orig.y * data[4] + orig.z * data[8] + data[12];
+    dest->y = orig.x * data[1] + orig.y * data[5] + orig.z * data[9] + data[13];
+    dest->z = orig.x * data[2] + orig.y * data[6] + orig.z * data[10] + data[14];
+    dest->w = orig.x * data[3] + orig.y * data[7] + orig.z * data[11] + data[15];
+}
+//@}
+
+/** @name Matrix analysis. */
+//@{
+
+/** Get the ZYZ euler angles from the matrix. Assumes the matrix is orthonormal. */
+void GetEulerAnglesZYZ(float * s, float * t, float * r) const {
+    if( GetElem(2,2) < 1.0f ) {
+        if( GetElem(2,2) > -1.0f ) {
+            // 	cs*ct*cr-ss*sr 		-ss*ct*cr-cs*sr		st*cr
+            //	cs*ct*sr+ss*cr		-ss*ct*sr+cs*cr		st*sr
+            //	-cs*st				ss*st				ct
+            *s = atan2(GetElem(1,2), -GetElem(0,2));
+            *t = acos(GetElem(2,2));
+            *r = atan2(GetElem(2,1), GetElem(2,0));		
+        }
+        else {
+            // 	-c(s-r)	 	s(s-r)		0
+            //	s(s-r)		c(s-r)		0
+            //	0			0			-1
+            *s = atan2(GetElem(0, 1), -GetElem(0, 0)); // = s-r
+            *t = PI;
+            *r = 0;
+        }
+    }
+    else {
+        // 	c(s+r)		-s(s+r)		0
+        //	s(s+r)		c(s+r)		0
+        //	0			0			1
+        *s = atan2(GetElem(0, 1), GetElem(0, 0)); // = s+r
+        *t = 0;
+        *r = 0;
+    }
+}
+
+//@}
+
+MATHLIB_API friend PiStream & operator<< ( PiStream & s, Matrix & m );
+
+/** Print to debug output. */
+void Print() const {
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[0], data[4], data[8], data[12] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[1], data[5], data[9], data[13] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[2], data[6], data[10], data[14] );
+    piDebug( "[ %5.2f %5.2f %5.2f %5.2f ]\n", data[3], data[7], data[11], data[15] );
+}
+
+
+public:
+
+    float data[16];
+
+};
+#endif
+
+
+#endif // NV_MATH_MATRIX_INL
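
A quick standalone sketch (not part of the library) of the storage convention the Matrix helpers above rely on: the 16 floats are column-major, with the translation in data[12..14], which is exactly how TransformPoint indexes them.

#include <cstdio>

// Mirrors Matrix::TransformPoint above: column-major data, translation in data[12..14].
static void transformPoint(const float data[16], const float in[3], float out[3])
{
    out[0] = in[0]*data[0] + in[1]*data[4] + in[2]*data[8]  + data[12];
    out[1] = in[0]*data[1] + in[1]*data[5] + in[2]*data[9]  + data[13];
    out[2] = in[0]*data[2] + in[1]*data[6] + in[2]*data[10] + data[14];
}

int main()
{
    // Identity rotation with a translation of (10, 20, 30) stored in the last column.
    const float m[16] = { 1,0,0,0,  0,1,0,0,  0,0,1,0,  10,20,30,1 };
    const float p[3]  = { 1, 2, 3 };
    float q[3];
    transformPoint(m, p, q);
    printf("%g %g %g\n", q[0], q[1], q[2]); // prints: 11 22 33
    return 0;
}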

+ 56 - 0
3rdparty/nvtt/nvmath/nvmath.h

@@ -0,0 +1,56 @@
+// This code is in the public domain -- [email protected]
+
+#ifndef NV_MATH_H
+#define NV_MATH_H
+
+#include <math.h>
+#include <float.h>  // finite, isnan
+
+#include "nvcore/utils.h"   // max, clamp
+
+#define NVMATH_API
+#define NVMATH_CLASS
+
+#define PI                  float(3.1415926535897932384626433833)
+#define NV_EPSILON          (0.0001f)
+#define NV_NORMAL_EPSILON   (0.001f)
+
+namespace nv
+{
+    inline float toRadian(float degree) { return degree * (PI / 180.0f); }
+    inline float toDegree(float radian) { return radian * (180.0f / PI); }
+
+    // Robust floating point comparisons:
+    // http://realtimecollisiondetection.net/blog/?p=89
+    inline bool equal(const float f0, const float f1, const float epsilon = NV_EPSILON)
+    {
+        //return fabs(f0-f1) <= epsilon;
+        return fabs(f0-f1) <= epsilon * max3(1.0f, fabsf(f0), fabsf(f1));
+    }
+
+    inline bool isZero(const float f, const float epsilon = NV_EPSILON)
+    {
+        return fabsf(f) <= epsilon;
+    }
+
+    inline bool isFinite(const float f)
+    {
+        return _finite(f) != 0;
+    }
+
+    // Eliminates negative zeros from a float array.
+    inline void floatCleanup(float * fp, int n)
+    {
+        for (int i = 0; i < n; i++) {
+            //nvDebugCheck(isFinite(fp[i]));
+            union { float f; uint32 i; } x = { fp[i] };
+            if (x.i == 0x80000000) fp[i] = 0.0f;
+        }
+    }
+
+    inline float saturate(float f) {
+        return clamp(f, 0.0f, 1.0f);
+    }
+}
+
+#endif // NV_MATH_H
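
The equal() helper above scales the tolerance by the larger operand magnitude, following the linked realtimecollisiondetection.net post. A standalone sketch of the same formula (using std::max in place of the library's max3) shows why a plain absolute epsilon fails for large values:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Relative comparison, same formula as nv::equal() above.
static bool equalRelative(float a, float b, float epsilon = 0.0001f)
{
    return std::fabs(a - b) <= epsilon * std::max({ 1.0f, std::fabs(a), std::fabs(b) });
}

int main()
{
    const float a = 1000000.0f;
    const float b = 1000000.0625f; // one float ULP away from a
    printf("absolute epsilon: %d\n", (int)(std::fabs(a - b) <= 0.0001f)); // 0: rejected
    printf("relative epsilon: %d\n", (int)equalRelative(a, b));           // 1: accepted
    return 0;
}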

+ 40 - 0
3rdparty/nvtt/nvmath/plane.h

@@ -0,0 +1,40 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#ifndef NV_MATH_PLANE_H
+#define NV_MATH_PLANE_H
+
+#include "nvmath.h"
+#include "vector.h"
+
+namespace nv
+{
+    class Matrix;
+
+    class NVMATH_CLASS Plane
+    {
+    public:
+        Plane();
+        Plane(float x, float y, float z, float w);
+        Plane(const Vector4 & v);
+        Plane(const Vector3 & v, float d);
+        Plane(const Vector3 & normal, const Vector3 & point);
+        Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2);
+
+        const Plane & operator=(const Plane & v);
+
+        Vector3 vector() const;
+        float offset() const;
+
+        void operator*=(float s);
+
+        Vector4 v;
+    };
+
+    Plane transformPlane(const Matrix &, const Plane &);
+
+    Vector3 planeIntersection(const Plane & a, const Plane & b, const Plane & c);
+
+
+} // nv namespace
+
+#endif // NV_MATH_PLANE_H

+ 49 - 0
3rdparty/nvtt/nvmath/plane.inl

@@ -0,0 +1,49 @@
+// This code is in the public domain -- Ignacio Castaño <[email protected]>
+
+#pragma once
+#ifndef NV_MATH_PLANE_INL
+#define NV_MATH_PLANE_INL
+
+#include "Plane.h"
+#include "Vector.inl"
+
+namespace nv
+{
+    inline Plane::Plane() {}
+    inline Plane::Plane(float x, float y, float z, float w) : v(x, y, z, w) {}
+    inline Plane::Plane(const Vector4 & v) : v(v) {}
+    inline Plane::Plane(const Vector3 & v, float d) : v(v, d) {}
+    inline Plane::Plane(const Vector3 & normal, const Vector3 & point) : v(normal, -dot(normal, point)) {}
+    inline Plane::Plane(const Vector3 & v0, const Vector3 & v1, const Vector3 & v2) {
+        Vector3 n = cross(v1-v0, v2-v0);
+        float d = -dot(n, v0);
+        v = Vector4(n, d);
+    }
+
+    inline const Plane & Plane::operator=(const Plane & p) { v = p.v; return *this; }
+
+    inline Vector3 Plane::vector() const { return v.xyz(); }
+    inline float Plane::offset() const { return v.w; }
+
+    // Normalize plane.
+    inline Plane normalize(const Plane & plane, float epsilon = NV_EPSILON)
+    {
+        const float len = length(plane.vector());
+        const float inv = isZero(len, epsilon) ? 0 : 1.0f / len;
+        return Plane(plane.v * inv);
+    }
+
+    // Get the signed distance from the given point to this plane.
+    inline float distance(const Plane & plane, const Vector3 & point)
+    {
+        return dot(plane.vector(), point) + plane.offset();
+    }
+
+    inline void Plane::operator*=(float s)
+    {
+        v *= s;
+    }
+
+} // nv namespace
+
+#endif // NV_MATH_PLANE_INL
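
The three-point constructor and distance() above define the plane as dot(n, p) + d = 0, with n = cross(v1 - v0, v2 - v0) and d = -dot(n, v0). A standalone numeric sketch of those formulas (plain structs, no nv types):

#include <cmath>
#include <cstdio>

struct V3 { float x, y, z; };

static V3    sub  (V3 a, V3 b) { return { a.x - b.x, a.y - b.y, a.z - b.z }; }
static V3    cross(V3 a, V3 b) { return { a.y*b.z - a.z*b.y, a.z*b.x - a.x*b.z, a.x*b.y - a.y*b.x }; }
static float dot  (V3 a, V3 b) { return a.x*b.x + a.y*b.y + a.z*b.z; }

int main()
{
    // Three points on the plane z = 1.
    V3 v0{0, 0, 1}, v1{1, 0, 1}, v2{0, 1, 1};
    V3 n = cross(sub(v1, v0), sub(v2, v0)); // (0, 0, 1)
    float d = -dot(n, v0);                  // -1
    float len = std::sqrt(dot(n, n));       // |n|, used to normalize as in normalize() above

    V3 p{5, -3, 4};
    float dist = (dot(n, p) + d) / len;     // signed distance, as in distance() above
    printf("distance: %g\n", dist);         // prints: 3
    return 0;
}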

+ 148 - 0
3rdparty/nvtt/nvmath/vector.h

@@ -0,0 +1,148 @@
+// This code is in the public domain -- [email protected]
+
+#ifndef NV_MATH_VECTOR_H
+#define NV_MATH_VECTOR_H
+
+#include "nvmath.h"
+
+namespace nv
+{
+    class NVMATH_CLASS Vector2
+    {
+    public:
+        typedef Vector2 const & Arg;
+
+        Vector2();
+        explicit Vector2(float f);
+        Vector2(float x, float y);
+        Vector2(Vector2::Arg v);
+
+        //template <typename T> explicit Vector2(const T & v) : x(v.x), y(v.y) {}
+        //template <typename T> operator T() const { return T(x, y); }
+
+        const Vector2 & operator=(Vector2::Arg v);
+
+        const float * ptr() const;
+
+        void set(float x, float y);
+
+        Vector2 operator-() const;
+        void operator+=(Vector2::Arg v);
+        void operator-=(Vector2::Arg v);
+        void operator*=(float s);
+        void operator*=(Vector2::Arg v);
+
+        friend bool operator==(Vector2::Arg a, Vector2::Arg b);
+        friend bool operator!=(Vector2::Arg a, Vector2::Arg b);
+
+        union {
+            struct {
+                float x, y;
+            };
+            float component[2];
+        };
+    };
+
+    class NVMATH_CLASS Vector3
+    {
+    public:
+        typedef Vector3 const & Arg;
+
+        Vector3();
+        explicit Vector3(float x);
+        //explicit Vector3(int x) : x(float(x)), y(float(x)), z(float(x)) {}
+        Vector3(float x, float y, float z);
+        Vector3(Vector2::Arg v, float z);
+        Vector3(Vector3::Arg v);
+
+        //template <typename T> explicit Vector3(const T & v) : x(v.x), y(v.y), z(v.z) {}
+        //template <typename T> operator T() const { return T(x, y, z); }
+
+        const Vector3 & operator=(Vector3::Arg v);
+
+        Vector2 xy() const;
+
+        const float * ptr() const;
+
+        void set(float x, float y, float z);
+
+        Vector3 operator-() const;
+        void operator+=(Vector3::Arg v);
+        void operator-=(Vector3::Arg v);
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator*=(Vector3::Arg v);
+        void operator/=(Vector3::Arg v);
+
+        friend bool operator==(Vector3::Arg a, Vector3::Arg b);
+        friend bool operator!=(Vector3::Arg a, Vector3::Arg b);
+
+        union {
+            struct {
+                float x, y, z;
+            };
+            float component[3];
+        };
+    };
+
+    class NVMATH_CLASS Vector4
+    {
+    public:
+        typedef Vector4 const & Arg;
+
+        Vector4();
+        explicit Vector4(float x);
+        Vector4(float x, float y, float z, float w);
+        Vector4(Vector2::Arg v, float z, float w);
+        Vector4(Vector2::Arg v, Vector2::Arg u);
+        Vector4(Vector3::Arg v, float w);
+        Vector4(Vector4::Arg v);
+        //	Vector4(const Quaternion & v);
+
+        //template <typename T> explicit Vector4(const T & v) : x(v.x), y(v.y), z(v.z), w(v.w) {}
+        //template <typename T> operator T() const { return T(x, y, z, w); }
+
+        const Vector4 & operator=(Vector4::Arg v);
+
+        Vector2 xy() const;
+        Vector2 zw() const;
+        Vector3 xyz() const;
+
+        const float * ptr() const;
+
+        void set(float x, float y, float z, float w);
+
+        Vector4 operator-() const;
+        void operator+=(Vector4::Arg v);
+        void operator-=(Vector4::Arg v);
+        void operator*=(float s);
+        void operator/=(float s);
+        void operator*=(Vector4::Arg v);
+        void operator/=(Vector4::Arg v);
+
+        friend bool operator==(Vector4::Arg a, Vector4::Arg b);
+        friend bool operator!=(Vector4::Arg a, Vector4::Arg b);
+
+        union {
+            struct {
+                float x, y, z, w;
+            };
+            float component[4];
+        };
+    };
+
+} // nv namespace
+
+// If we had these functions, they would be ambiguous; the compiler would not know which one to pick:
+//template <typename T> Vector2 to(const T & v) { return Vector2(v.x, v.y); }
+//template <typename T> Vector3 to(const T & v) { return Vector3(v.x, v.y, v.z); }
+//template <typename T> Vector4 to(const T & v) { return Vector4(v.x, v.y, v.z, v.z); }
+
+// We could use a cast operator so that we could infer the expected type, but that doesn't work the same way in all compilers and produces horrible error messages.
+
+// Instead we simply have explicit casts:
+template <typename T> T to(const nv::Vector2 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector2)); return T(v.x, v.y); }
+template <typename T> T to(const nv::Vector3 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector3)); return T(v.x, v.y, v.z); }
+template <typename T> T to(const nv::Vector4 & v) { NV_COMPILER_CHECK(sizeof(T) == sizeof(nv::Vector4)); return T(v.x, v.y, v.z, v.w); }
+
+#endif // NV_MATH_VECTOR_H
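
A usage sketch for the explicit to<T>() casts declared above. MyFloat3 is a hypothetical caller-side type, not part of the library, and the include assumes 3rdparty/nvtt is on the include path, as the texturec.lua change below adds.

#include "nvmath/vector.h"

// Hypothetical destination type; it must match nv::Vector3 in size and accept
// three floats in its constructor, since to<T>() returns T(v.x, v.y, v.z).
struct MyFloat3
{
    MyFloat3(float x_, float y_, float z_) : x(x_), y(y_), z(z_) {}
    float x, y, z;
};

void convertExample(const nv::Vector3 & v)
{
    MyFloat3 m = to<MyFloat3>(v); // compile-time size check, then constructs MyFloat3 from x, y, z
    (void)m;
}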

+ 95 - 0
3rdparty/nvtt/nvtt.cpp

@@ -0,0 +1,95 @@
+/*
+ * Copyright 2011-2015 Branimir Karadzic. All rights reserved.
+ * License: http://www.opensource.org/licenses/BSD-2-Clause
+ */
+
+#include "nvtt.h"
+
+#include <string.h>
+#include <bx/uint32_t.h>
+
+#include "bc6h/zoh.h"
+#include "bc7/avpcl.h"
+#include "nvmath/vector.inl"
+
+NVCORE_API int nvAbort(const char *, const char *, int , const char *, const char *, ...) __attribute__((format (printf, 5, 6)))
+{
+	abort();
+	return 0;
+}
+
+namespace nvtt
+{
+	using namespace nv;
+
+	void compressBC6H(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output)
+	{
+		const uint8_t* src = (const uint8_t*)_input;
+		char* dst = (char*)_output;
+
+		for (uint32_t yy = 0; yy < _height; yy += 4)
+		{
+			for (uint32_t xx = 0; xx < _width; xx += 4)
+			{
+				const Vector4* rgba = (const Vector4*)&src[yy*_stride + xx*sizeof(float)*4];
+
+				ZOH::Utils::FORMAT = ZOH::UNSIGNED_F16;
+				ZOH::Tile zohTile(4, 4);
+
+				memset(zohTile.data, 0, sizeof(zohTile.data) );
+				memset(zohTile.importance_map, 0, sizeof(zohTile.importance_map) );
+
+				for (uint32_t blockY = 0; blockY < 4; ++blockY)
+				{
+					for (uint32_t blockX = 0; blockX < 4; ++blockX)
+					{
+						Vector4 color = rgba[blockY*4 + blockX];
+						uint16 rHalf = bx::halfFromFloat(color.x);
+						uint16 gHalf = bx::halfFromFloat(color.y);
+						uint16 bHalf = bx::halfFromFloat(color.z);
+						zohTile.data[blockY][blockX].x = ZOH::Tile::half2float(rHalf);
+						zohTile.data[blockY][blockX].y = ZOH::Tile::half2float(gHalf);
+						zohTile.data[blockY][blockX].z = ZOH::Tile::half2float(bHalf);
+						zohTile.importance_map[blockY][blockX] = 1.0f;
+					}
+				}
+
+				ZOH::compress(zohTile, &dst[( (yy*_width) + xx)/4 * 16]);
+			}
+		}
+	}
+
+	void compressBC7(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output)
+	{
+		const uint8_t* src = (const uint8_t*)_input;
+		char* dst = (char*)_output;
+
+		for (uint32_t yy = 0; yy < _height; yy += 4)
+		{
+			for (uint32_t xx = 0; xx < _width; xx += 4)
+			{
+				const Vector4* rgba = (const Vector4*)&src[yy*_stride + xx*sizeof(float)*4];
+
+				AVPCL::mode_rgb     = false;
+				AVPCL::flag_premult = false;
+				AVPCL::flag_nonuniform     = false;
+				AVPCL::flag_nonuniform_ati = false;
+
+				AVPCL::Tile avpclTile(4, 4);
+				memset(avpclTile.data, 0, sizeof(avpclTile.data) );
+				for (uint32_t blockY = 0; blockY < 4; ++blockY)
+				{
+					for (uint32_t blockX = 0; blockX < 4; ++blockX)
+					{
+						Vector4 color = rgba[blockY*4 + blockX];
+						avpclTile.data[blockY][blockX] = color * 255.0f;
+						avpclTile.importance_map[blockY][blockX] = 1.0f;
+					}
+				}
+
+				AVPCL::compress(avpclTile, &dst[( (yy*_width) + xx)/4 * 16]);
+			}
+		}
+	}
+
+} //namespace nvtt

+ 13 - 0
3rdparty/nvtt/nvtt.h

@@ -0,0 +1,13 @@
+#ifndef NVTT_H
+#define NVTT_H
+
+#include <stdint.h>
+
+namespace nvtt
+{
+void compressBC6H(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output);
+void compressBC7(const void* _input, uint32_t _width, uint32_t _height, uint32_t _stride, void* _output);
+
+} // namespace nvtt
+
+#endif // NVTT_H
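
A minimal caller sketch for the two entry points declared above; this is an illustration under stated assumptions, not documented usage. Assumed here: the source is float RGBA (16 bytes per pixel), _width and _height are multiples of 4, _stride is the source row pitch in bytes, and BC6H/BC7 output takes 16 bytes per 4x4 block.

#include <stdint.h>
#include <stdlib.h>
#include <nvtt/nvtt.h>

// Compress a float-RGBA image to BC6H. Returns a malloc'd buffer the caller frees.
// Assumption: _width and _height are multiples of 4, _stride is the row pitch in bytes.
void* compressToBC6H(const float* _rgbaF32, uint32_t _width, uint32_t _height)
{
    const uint32_t blockCount = (_width/4) * (_height/4);
    void* output = malloc(blockCount*16); // 16 bytes per compressed 4x4 block

    nvtt::compressBC6H(_rgbaF32, _width, _height, _width*16, output);
    return output;
}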

+ 3 - 0
scripts/texturec.lua

@@ -12,6 +12,7 @@ project "texturec"
 		path.join(BGFX_DIR, "include"),
 		path.join(BGFX_DIR, "src"),
 		path.join(BGFX_DIR, "3rdparty"),
+		path.join(BGFX_DIR, "3rdparty/nvtt"),
 	}
 
 	files {
@@ -20,6 +21,8 @@ project "texturec"
 		path.join(BGFX_DIR, "3rdparty/libsquish/**.h"),
 		path.join(BGFX_DIR, "3rdparty/etc1/**.cpp"),
 		path.join(BGFX_DIR, "3rdparty/etc1/**.h"),
+		path.join(BGFX_DIR, "3rdparty/nvtt/**.cpp"),
+		path.join(BGFX_DIR, "3rdparty/nvtt/**.h"),
 		path.join(BGFX_DIR, "tools/texturec/**.cpp"),
 		path.join(BGFX_DIR, "tools/texturec/**.h"),
 	}

+ 32 - 0
tools/texturec/texturec.cpp

@@ -13,6 +13,7 @@
 #include "image.h"
 #include <libsquish/squish.h>
 #include <etc1/etc1.h>
+#include <nvtt/nvtt.h>
 
 #if 0
 #	define BX_TRACE(_format, ...) fprintf(stderr, "" _format "\n", ##__VA_ARGS__)
@@ -113,6 +114,14 @@ int main(int _argc, const char* _argv[])
 		{
 			format = TextureFormat::ETC1;
 		}
+		else if (0 == bx::stricmp(type, "bc6h") )
+		{
+			format = TextureFormat::BC6H;
+		}
+		else if (0 == bx::stricmp(type, "bc7") )
+		{
+			format = TextureFormat::BC7;
+		}
 	}
 
 	uint32_t size = (uint32_t)bx::getSize(&reader);
@@ -154,10 +163,33 @@ int main(int _argc, const char* _argv[])
 					);
 				break;
 
+			case TextureFormat::BC4:
+			case TextureFormat::BC5:
+				break;
+
+			case TextureFormat::BC6H:
+				nvtt::compressBC6H(rgba, mip.m_width, mip.m_height, 4, output);
+				break;
+
+			case TextureFormat::BC7:
+				nvtt::compressBC7(rgba, mip.m_width, mip.m_height, 4, output);
+				break;
+
 			case TextureFormat::ETC1:
 				etc1_encode_image(rgba, mip.m_width, mip.m_height, 4, mip.m_width*4, output);
 				break;
 
+			case TextureFormat::ETC2:
+			case TextureFormat::ETC2A:
+			case TextureFormat::ETC2A1:
+			case TextureFormat::PTC12:
+			case TextureFormat::PTC14:
+			case TextureFormat::PTC12A:
+			case TextureFormat::PTC14A:
+			case TextureFormat::PTC22:
+			case TextureFormat::PTC24:
+				break;
+
 			default:
 				break;
 			}